In [104]:
import datetime as dt
import pandas as pd
import numpy as np
import seaborn as sns
import joblib

from pandas_profiling import ProfileReport
from category_encoders import OrdinalEncoder
from category_encoders import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

Since all historical data is already in the DS database (not the production DB that's synced to the web backend), we connect to it, load all the data as a dataframe, and use that as the training data. This drastically reduces the amount of cleaning/wrangling we have to do compared with using All_data_with_exits.csv.

However, keep in mind that only data from the Members table in the DB was used for training. This is because the Families table in the DS database is incomplete (many of the columns in the production database that the backend is synced to are missing from the DS database). The Members tables in the production and DS databases, however, are mostly identical.

NOTE: It was Team A that created a separate database for DS and migrated the historical data there. We at Team B just ended up using their database. Their code setting up the database and migrating the historical data into it can be found here: https://github.com/Lambda-School-Labs/family-promise-spokane-ds-a/tree/main/migration

In [50]:
import os
import sqlalchemy

# The connection string is not defined anywhere in this notebook, so a fresh
# kernel would fail with NameError. Read it from the environment instead: that
# keeps the notebook runnable top-to-bottom and keeps credentials out of the file.
# A missing variable raises KeyError here, before any connection is attempted.
SQLALCHEMY_DATABASE_URL = os.environ['SQLALCHEMY_DATABASE_URL']

engine = sqlalchemy.create_engine(SQLALCHEMY_DATABASE_URL)
# Load the whole 'members' table as the raw training frame.
df = pd.read_sql_table('members', engine)


In [179]:
# Preview the first rows of the raw 'members' frame before any cleaning
df.head()

Unnamed: 0,id,date_of_enrollment,household_type,length_of_stay,demographics,barriers,schools,case_members,predicted_exit_destination,family_id,date_of_exit,income_at_exit,exit_destination
0,56550,2017-09-10,Household with Adults and Children,120,"{'race': 'White', 'gender': 'Male', 'income': ...","{'HIV_AIDs': '', 'drug_abuse': '', 'alcohol_ab...",{'enrolled_status': ''},6,,92775,2018-01-08,-1,Unknown/Other
1,36139,2017-09-10,Household with Adults and Children,136,"{'race': 'White', 'gender': 'Female', 'income'...","{'HIV_AIDs': '', 'drug_abuse': '', 'alcohol_ab...",{'enrolled_status': ''},6,,92775,2018-01-24,-1,Unknown/Other
2,36140,2017-09-10,Household with Adults and Children,120,"{'race': 'White', 'gender': 'Male', 'income': ...","{'HIV_AIDs': '', 'drug_abuse': '', 'alcohol_ab...",{'enrolled_status': ''},6,,92775,2018-01-08,-1,Unknown/Other
3,36141,2017-09-10,Household with Adults and Children,120,"{'race': 'White', 'gender': 'Female', 'income'...","{'HIV_AIDs': '', 'drug_abuse': 'Drug Abuse', '...",{'enrolled_status': ''},6,,92775,2018-01-08,1600,Unknown/Other
4,36142,2017-09-10,Household with Adults and Children,120,"{'race': 'White', 'gender': 'Male', 'income': ...","{'HIV_AIDs': '', 'drug_abuse': '', 'alcohol_ab...",{'enrolled_status': ''},6,,92775,2018-01-08,-1,Unknown/Other


# Cleaning Pipeline

In [180]:
def start_pipeline(df):
    '''First pipeline step: hand back a copy of ``df`` rather than ``df`` itself.'''
    working_copy = df.copy()
    return working_copy

def unpack_json_cols(df):
    '''Flatten the dict-valued columns and append the resulting columns to ``df``.

    The 'demographics', 'barriers' and 'schools' columns each hold one dict per
    row. Every dict key becomes its own column, placed to the right of the
    existing columns. The original dict columns are kept.

    Returns a new dataframe with the same rows and index as ``df``.
    '''
    json_cols = ['demographics', 'barriers', 'schools']

    expanded = []
    for col in json_cols:
        flat = pd.json_normalize(df[col].tolist())
        # json_normalize on a list numbers its rows 0..n-1. Re-attach the
        # caller's index so the concat below lines rows up one-to-one even when
        # ``df`` has been filtered or otherwise carries a non-default index
        # (otherwise concat would align on labels and emit extra NaN rows).
        flat.index = df.index
        expanded.append(flat)

    return pd.concat([df, *expanded], axis=1)

def delete_cols(df):
    '''Drop the raw dict columns and the columns that are excluded from training.

    Returns a new dataframe; ``df`` itself is left untouched so the function
    can be called on any frame without side effects.

    Raises KeyError if any of the listed columns is missing from ``df``.
    '''
    # Already expanded into flat columns by the previous pipeline step.
    json_cols = ['barriers', 'demographics', 'schools']
    # Identifiers and columns excluded from the model (leakage).
    # NOTE(review): 'length_of_stay' is kept, but in the data preview it equals
    # date_of_exit - date_of_enrollment, so it may leak the outcome too - confirm
    # that it is known at prediction time.
    not_used = ['id', 'predicted_exit_destination', 'family_id', 'date_of_exit', 'income_at_exit', 'date_of_enrollment']
    return df.drop(columns=[*json_cols, *not_used])

def barriers(df):
    '''Convert the barrier columns to booleans.

    A cell becomes True when it holds one of the known barrier labels and
    False for anything else (empty string, missing value, unknown label).

    Returns a new dataframe with the same columns in the same order; ``df``
    itself is not modified.
    '''
    has_barrier = ['Alcohol Abuse', 'Developmental Disability', 'Chronic Health', 'Drug Abuse', 'HIV/AIDS', 'Mental Illness', 'Physical Disability']
    barrier_cols = ['HIV_AIDs', 'drug_abuse', 'alcohol_abuse', 'mental_illness', 'chronic_health_issues', 'physical_disabilities', 'developmental_disabilities']

    # One vectorized membership test over all barrier columns instead of a
    # Python-level lambda per cell.
    flagged = df[barrier_cols].isin(has_barrier)
    return df.assign(**{col: flagged[col] for col in barrier_cols})

def replace_values(df):
    '''
    Replace the placeholders for missing/unknown values with NaN.
    Currently, missing values in the database are denoted as either -1.0 or "" (an empty string).

    Returns a new dataframe; ``df`` itself is not modified.
    '''
    replace_list = [-1.0, ""]
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return df.replace(replace_list, np.nan)

In [181]:
# Execute pipeline: run the cleaning steps in order, each one receiving the
# previous step's result. The raw `df` is only ever passed to start_pipeline.

df_clean = start_pipeline(df)
df_clean = unpack_json_cols(df_clean)
df_clean = delete_cols(df_clean)
df_clean = barriers(df_clean)
df_clean = replace_values(df_clean)

In [182]:
# Preview the cleaned frame: dict columns expanded, barrier columns boolean, placeholders replaced by NaN
df_clean.head()

Unnamed: 0,household_type,length_of_stay,case_members,exit_destination,race,gender,income,ethnicity,relationship,HIV_AIDs,drug_abuse,alcohol_abuse,mental_illness,chronic_health_issues,physical_disabilities,developmental_disabilities,enrolled_status
0,Household with Adults and Children,120,6,Unknown/Other,White,Male,,Non-Hispanic/Latino,Son,False,False,False,False,False,False,False,
1,Household with Adults and Children,136,6,Unknown/Other,White,Female,,Non-Hispanic/Latino,Daughter,False,False,False,False,False,False,False,
2,Household with Adults and Children,120,6,Unknown/Other,White,Male,,Non-Hispanic/Latino,Son,False,False,False,False,False,False,False,
3,Household with Adults and Children,120,6,Unknown/Other,White,Female,1200.0,Non-Hispanic/Latino,Significant Other (Non-Married),False,True,False,True,False,False,False,
4,Household with Adults and Children,120,6,Unknown/Other,White,Male,,Non-Hispanic/Latino,Son,False,False,False,False,False,False,False,


Prior Labs cohorts accidentally introduced data leakage: during the train/test split, members of the same family were assigned to different sets. To avoid this leakage, we train only on the head of household for each family (i.e., 'relationship' == 'Self').

In [183]:
# Keep one row per family (the head of household). `.copy()` makes df_training
# an independent frame rather than a slice of df_clean, so later edits to it
# neither warn (SettingWithCopyWarning) nor depend on df_clean.
df_training = df_clean[df_clean['relationship'] == 'Self'].copy()

Unfortunately, after correcting for leakage, the number of training examples shrinks to roughly 1/3 of its original size (from 1493 training samples to 464).

In [None]:
# Row counts before (all members) and after (heads of household only) the leakage fix
print(f"The number of training examples before fixing data leakage: {len(df_clean)}")
print(f"The number of training examples after fixing data leakage: {len(df_training)}")

# Fitting the Model

In [None]:
# Remove the 'relationship' column before training: after the head-of-household
# filter it is always 'Self', so it carries no information for the model.
# Reassign instead of dropping in place (an in-place drop on a filtered slice can
# trigger SettingWithCopyWarning); errors='ignore' keeps the cell safe to re-run.
df_training = df_training.drop(columns=['relationship'], errors='ignore')


In [188]:
# Label column to predict; every other remaining column is a feature.
target = 'exit_destination'

y = df_training[target]
X = df_training.drop(columns=[target])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [189]:
# Feature columns the model is trained on (the target has been removed)
X_train.columns

Index(['household_type', 'length_of_stay', 'case_members', 'race', 'gender',
       'income', 'ethnicity', 'HIV_AIDs', 'drug_abuse', 'alcohol_abuse',
       'mental_illness', 'chronic_health_issues', 'physical_disabilities',
       'developmental_disabilities', 'enrolled_status'],
      dtype='object')

In [190]:
# Random forest pipeline: encode the categorical columns as integers, impute the
# remaining missing values with SimpleImputer's default settings, then classify.
random_forest_model = Pipeline(steps=[
    ('ord', OrdinalEncoder()),
    ('imputer', SimpleImputer()),
    ('classifier', RandomForestClassifier(
        n_estimators=1000,
        n_jobs=-1,
        random_state=42,
        verbose=1,
    )),
])

In [191]:
# Fit the model: every pipeline step (encoder, imputer, classifier) is fitted on the training split only
random_forest_model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.6s finished


Pipeline(memory=None,
         steps=[('ord',
                 OrdinalEncoder(cols=['household_type', 'race', 'gender',
                                      'ethnicity', 'enrolled_status'],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value',
                                mapping=[{'col': 'household_type',
                                          'data_type': dtype('O'),
                                          'mapping': Household without Children            1
Household with Adults and Children    2
Household with Only Children          3
NaN                                  -2
dtype: int6...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
         

In [192]:
# Evaluate on the held-out test split; for a classifier, `score` reports mean accuracy
random_forest_model.score(X_test, y_test)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 1000 out of 1000 | elapsed:    0.2s finished


0.40860215053763443

Unfortunately, after correcting for the leakage, the number of training examples shrank considerably, leading to a low accuracy. Feature engineering and hyperparameter tuning, which we didn't get to this round, could lead to better model performance.

# Model Serialization

In [203]:
# joblib is already imported in the notebook's first cell, so it is not re-imported here.
# The whole fitted pipeline (encoder + imputer + classifier) is saved, not just the forest.
# NOTE: despite the .pkl extension the file is written by joblib, so read it back
# with joblib.load rather than pickle.load.
clf = random_forest_model
joblib_file = "model.pkl"
joblib.dump(clf, joblib_file)

['model.pkl']