# Data Importing

In [None]:
import datetime as dt
import pandas as pd
import numpy as np
import seaborn as sns
import joblib

from pandas_profiling import ProfileReport
from category_encoders import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Load the raw export that sits one directory above this notebook
data = pd.read_csv('../All_data_with_exits.csv')
df = pd.DataFrame(data)

# Lift pandas' display truncation so every column and every row is rendered
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Report the (rows, columns) shape of the raw frame, then preview its first rows
print("Original CSV Shape: ", df.shape)
df.head()

In [None]:
# Preview the last rows of the raw frame
df.tail()

# Target Recategorization


Because the target is initially recorded in a very granular manner, the target labels need to be recategorized into the 5 categories provided by the stakeholder:

- Permanent Exit
- Temporary Exit
- Emergency Shelter
- Transitional Housing
- Unknown/Other

**Permanent Exit**

- Staying or living with family, permanent tenure
- Staying or living with friends, permanent tenure
- Permanent housing (other than RRH) for formerly homeless persons
- Rental by client with RRH or equivalent subsidy
- Rental by client, no ongoing housing subsidy
- Rental by client, other ongoing housing subsidy
- Owned by client, no ongoing housing subsidy

**Temporary Exit**  

- Place not meant for habitation (e.g., a vehicle, an abandoned building, bus/train/subway station/airport or anywhere outside)
- Staying or living with family, temporary tenure (e.g., room, apartment or house)
- Staying or living with friends, temporary tenure (e.g., room, apartment or house)
- Hotel or Motel paid for without Emergency Shelter Voucher

**Emergency Shelter**  

- Emergency shelter, including hotel or motel paid for with emergency shelter voucher, or RHY-funded Host Home shelter 

**Transitional Housing**  

- Transitional Housing for homeless persons (including homeless youth)
- Safe Haven
- Substance Abuse Treatment or Detox Center
- Foster Care Home or Foster Care Group Home
- Psychiatric Hospital or Other Psychiatric Facility

**Unknown/Other**

- No exit interview completed
- Client refused
- Other
- Client doesn't know

*RRH = Rapid Re-Housing*

Because pandas has a built-in value-mapping function that is performant and consistent when given a flat dictionary of this form, we use one `original label : category` entry per label rather than a more DRY dictionary keyed by category with a list of labels as each value.  
e.g. `values_dict = {'Permanent Exit' : [some_value, some_value2]}`

In [None]:
# Lookup table from each raw 'Exit Destination' label to its target category.
# It is consumed with Series.map, so any label that has no entry here ends up
# as NaN in the mapped column.
# NOTE(review): the two labels marked below are listed under "Temporary Exit"
# in the category description in this notebook but are mapped to
# 'Unknown/Other' here -- confirm which grouping the stakeholder intends.
values_dict = {

    # Permanent Exits
    'Staying or living with family, permanent tenure': 'Permanent Exit',
    'Staying or living with friends, permanent tenure': 'Permanent Exit',
    'Permanent housing (other than RRH) for formerly homeless persons': 'Permanent Exit',
    'Rental by client with RRH or equivalent subsidy': 'Permanent Exit',
    'Rental by client, no ongoing housing subsidy': 'Permanent Exit',
    'Rental by client, other ongoing housing subsidy': 'Permanent Exit',
    'Owned by client, no ongoing housing subsidy': 'Permanent Exit',

    # Temporary Exits
    'Staying or living with family, temporary tenure (e.g., room, apartment or house)': 'Temporary Exit',
    'Staying or living with friends, temporary tenure (e.g., room, apartment or house)': 'Temporary Exit',

    # Emergency Shelter
    'Emergency shelter, including hotel or motel paid for with emergency shelter voucher, or RHY-funded Host Home shelter': 'Emergency Shelter',

    # Transitional Housing
    'Transitional Housing for homeless persons (including homeless youth)': 'Transitional Housing',
    'Safe Haven': 'Transitional Housing',
    'Substance Abuse Treatment or Detox Center': 'Transitional Housing',
    'Foster Care Home or Foster Care Group Home': 'Transitional Housing',
    'Psychiatric Hospital or Other Psychiatric Facility': 'Transitional Housing',

    # Unknown/Other
    'Hotel or Motel paid for without Emergency Shelter Voucher': 'Unknown/Other',  # NOTE(review): see header
    'Place not meant for habitation (e.g., a vehicle, an abandoned building, bus/train/subway station/airport or anywhere outside)': 'Unknown/Other',  # NOTE(review): see header
    'No exit interview completed': 'Unknown/Other',
    'Client refused': 'Unknown/Other',
    'Other': 'Unknown/Other',
    'Client doesn\'t know': 'Unknown/Other',
    # Missing destinations; np.nan is used because the np.NaN alias no longer
    # exists in NumPy 2.0
    np.nan: 'Unknown/Other',
}

In [None]:
# Columns holding dates; set_dtypes parses each of these and stores it as an ordinal
date_features = [
    'Enroll Date',
    'Exit Date',
    'CurrentDate',
    'Date of First Contact (Beta)',
    'Date of First ES Stay (Beta)',
    'Date of Last Contact (Beta)',
    'Date of Last ES Stay (Beta)',
    'Engagement Date',
    'Homeless Start Date',
]

In [None]:
# Features with artifacts remaining after the text filter was applied:
# each label below still carries one extra leading character
text_artifacts = [
    'RReferral Source',
    'RDate Status Determined',
    'REnroll Status',
    'RRunaway Youth',
    'RReason Why No Services Funded',
    'RSexual Orientation',
    'RLast Grade Completed',
    'RSchool Status',
    'REmployed Status',
    'RWhy Not Employed',
    'RType of Employment',
    'RLooking for Work',
    'RGeneral Health Status',
    'RDental Health Status',
    'RMental Health Status',
    'RPregnancy Status',
    'RPregnancy Due Date',
    'VLast Permanent Address',
    'VState',
    'VZip',
]

# Pair every artifact label with the same label minus its first character
rename_dict = dict(zip(text_artifacts, (label[1:] for label in text_artifacts)))

## Cleaning Pipeline 1

Cleaning has been split into multiple pipelines

**Pipeline 1 Goals:**
- Make column labels human readable and easily parsable
- Enforce Data types
- Create target exit destination

In [None]:
def start_pipeline(dataf):
    '''Returns a copy of the given dataframe so later pipeline steps work on the copy'''
    working_copy = dataf.copy()
    return working_copy

def column_cleaner(dataf):
    '''Removes numeric prefixes from the column names of ``dataf``.

    Every run of digits together with the single character that follows it is
    deleted from each label, e.g. "3.1 Name" becomes "Name". The frame is
    modified in place and also returned so the function can be used with
    ``DataFrame.pipe``.
    '''
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would leave every label
    # untouched. The unescaped "." matches any character (a dot or a space in
    # the example above) and is kept as-is to preserve the original matching.
    dataf.columns = dataf.columns.str.replace(r'\d+.', '', regex=True)
    return dataf

def column_rename(dataf):
    '''Renames the columns listed in rename_dict to their artifact-free labels'''
    return dataf.rename(columns=rename_dict)

def column_strip(dataf):
    '''Removes leading space characters from every column label (in place)'''
    stripped_labels = dataf.columns.str.lstrip(' ')
    dataf.columns = stripped_labels
    return dataf

def set_dtypes(dataf):
    '''Converts the date columns to ordinal day numbers.

    Each column named in ``date_features`` is parsed with ``pd.to_datetime``
    and replaced by the ``toordinal()`` value of every date. Missing dates are
    kept as NaN so that later null handling still sees them as missing; a
    column containing any missing date is therefore float rather than int.
    The frame is modified in place and returned.
    '''
    for column in date_features:
        # infer_datetime_format is no longer passed: the keyword is deprecated
        # since pandas 2.0.
        parsed = pd.to_datetime(dataf[column])
        # A missing date has no meaningful ordinal, so it is passed through as
        # NaN instead of being handed to toordinal().
        dataf[column] = parsed.map(
            lambda stamp: np.nan if pd.isna(stamp) else stamp.toordinal()
        )
    return dataf

def add_target(dataf):
    '''Adds a 'Target Exit Destination' column by looking up each 'Exit Destination' in values_dict'''
    recategorized = dataf['Exit Destination'].map(values_dict)
    dataf['Target Exit Destination'] = recategorized
    return dataf

Run pipeline

In [None]:
# Run the Pipeline 1 steps in order on a copy of the raw frame:
# clean, rename and strip the column labels, convert the date columns,
# then derive the target column.
df_pipeline1 = (df
    .pipe(start_pipeline)
    .pipe(column_cleaner)
    .pipe(column_rename)
    .pipe(column_strip)
    .pipe(set_dtypes)
    .pipe(add_target)
)

## Pipeline 1 Results Testing

In [None]:
# Print the dtype of every date feature after Pipeline 1.
# set_dtypes stores ordinal day numbers, so numeric dtypes are expected here
# rather than datetime64.
for column in date_features:
    print(df_pipeline1[column].dtypes)

In [None]:
# Count rows per target category; dropna=False also shows rows left as NaN,
# i.e. exit destinations that have no entry in values_dict
df_pipeline1['Target Exit Destination'].value_counts(dropna=False)

In [None]:
# Shape and first rows of the Pipeline 1 output
print(df_pipeline1.shape)
df_pipeline1.head()

# Second Cleaning & Pipeline
Cleaning has been split into multiple pipelines

**Pipeline 2 Goals:**
- Remove columns in the dataframe with high incidence of null values
- Remove columns that are contextually irrelevant to modeling
- Data re-binning
- Enforce column labels

In [None]:
# Columns to be removed from feature selection because they do not exist in the intake data
# (dropped by remove_final_columns)
not_in_intake = ['Utilization Tracking Method (Invalid)',
                 'Federal Grant Programs',
                 'Client Location',
                 'Engagement Date',
                 'Days Enrolled Until Engagement Date',
                 'RRH | Most Recent Enrollment',
                 'Coordinated Entry | Most Recent Enrollment',
                 'Emergency Shelter | Most Recent Enrollment',
                 'Bed Nights During Report Period',
                 'Count of Bed Nights - Entire Episode',
                 'Chronic Homeless Status_vHMISDatAssessment',
                 'Chronic Homeless Status_EvaluatevHMIS&HMISDA']

In [None]:
# Columns to be removed from feature selection for reasons described in column_removal_documentation.md
# (dropped by remove_final_columns). 'Exit Destination' is the raw column the
# target was derived from in add_target.
columns_not_selected = ['Current Age',
                        'Birthdate Quality',
                        'Information Release Status',
                        'InfoReleaseNo',
                        'Client Record Restricted',
                        'Contact Services',
                        'Date of Last Contact (Beta)',
                        'Date of First Contact (Beta)',
                        'Chronic Homeless Status',
                        'Exit Destination',
                        'Personal ID',
                        'Household ID'] 

In [None]:
# Columns flagged for further testing.
# NOTE(review): this list is not referenced by any pipeline step visible in
# this notebook section -- confirm whether it is still needed.
columns_need_testing = ['School Status', 
                        'Date of Last ES Stay (Beta)', 
                        'Date of First ES Stay (Beta)',
                        'Non-Cash Benefit Count',
                        'Non-Cash Benefit Count at Exit']

In [None]:
# Pipeline 2
def start_pipeline(dataf):
    '''Returns a copy of the given dataframe so later pipeline steps work on the copy'''
    working_copy = dataf.copy()
    return working_copy

def replace_values(dataf):
    '''Consolidates missing and non-answer values into "Unknown".

    In the 'Race', 'Ethnicity' and 'Length of Stay' columns, the values
    'Client refused', "Client doesn't know", 'Data not collected' and NaN are
    all replaced with the single label "Unknown". Other columns are left
    untouched. The frame is modified in place and returned.
    '''
    # List of columns that needs values consolidated and replaced
    column_replace_list = ['Race', 'Ethnicity', 'Length of Stay']
    # List of values to replace with "Unknown" (np.nan: the np.NaN alias no
    # longer exists in NumPy 2.0)
    value_replace_list = ['Client refused', 'Client doesn\'t know', 'Data not collected', np.nan]
    # Assign the result back instead of calling replace(inplace=True) on
    # dataf[column]: that chained form acts on an intermediate object and is
    # not guaranteed to update dataf under pandas Copy-on-Write.
    dataf[column_replace_list] = dataf[column_replace_list].replace(value_replace_list, 'Unknown')
    return dataf

def remove_null_columns(dataf, threshold=0.90):
    '''Removes columns whose share of null values reaches the threshold.

    Parameters
    ----------
    dataf : DataFrame to clean; it is modified in place.
    threshold : fraction of rows (default 0.90). A column is dropped when its
        number of nulls is greater than or equal to ``threshold * row count``.

    Returns
    -------
    The same DataFrame, without the dropped columns.
    '''
    row_count = dataf.shape[0]
    # With zero rows the cut-off would be 0 and every column (0 nulls >= 0)
    # would be dropped, so leave an empty frame untouched.
    if row_count == 0:
        return dataf
    # Number of null values per column
    null_counts = dataf.isnull().sum()
    # Labels of the columns whose null count reaches the cut-off
    null_columns_list = null_counts[null_counts >= threshold * row_count].index
    dataf.drop(columns=null_columns_list, inplace=True)
    return dataf

def remove_one_value_columns(dataf):
    '''Removes columns with a cardinality of 1 (NaN is counted as a value)'''
    # Number of distinct values per column, with NaN included in the count
    distinct_counts = dataf.nunique(dropna=False)
    # Labels of the columns that hold exactly one distinct value
    constant_columns = [label for label, n_distinct in distinct_counts.items() if n_distinct == 1]
    # Drop those columns from the frame in place
    dataf.drop(columns=constant_columns, inplace=True)
    return dataf

def remove_final_columns(dataf):
    '''Removes columns that either do not appear in the intake or were not selected for modeling'''
    excluded_columns = not_in_intake + columns_not_selected
    return dataf.drop(columns=excluded_columns)

In [None]:
# Execute Pipeline 2 on a copy of the Pipeline 1 output:
# consolidate unknown values, drop mostly-null and single-value columns,
# then drop the columns excluded from modeling.
df_pipeline2 = (df_pipeline1
    .pipe(start_pipeline)
    .pipe(replace_values)
    .pipe(remove_null_columns)
    .pipe(remove_one_value_columns)
    .pipe(remove_final_columns)
)

# Pipeline 2 Results Testing

In [None]:
# (rows, columns) remaining after Pipeline 2
df_pipeline2.shape

In [None]:
# First rows of the Pipeline 2 output
df_pipeline2.head()

## Initial Visualizations  

Final Visualizations will need to be formatted with proper object usage and syntax

In [None]:
# Value Distribution: bar chart of the row count in each target category
df_pipeline2['Target Exit Destination'].value_counts().plot(kind='bar');

In [None]:
# Scatter of entry income (x) against the target category (y)
sns.scatterplot(
    data=df_pipeline2,
    x='Income Total at Entry',
    y='Target Exit Destination',
).set_title('Exit Destination vs. Income Total at Entry');

# Feature Engineering

In [None]:
# List of income columns; income_sum adds these up per row into 'Income Total'
income = ['Earned Income',
          'Supplemental Security Income', 
          'Social Security Disability Income', 
          'VA Disability Compensation', 
          'Private Disability Income', 
          'Workers Compensation', 
          'TANF', 
          'General Assistance' ,
          'Child Support', 
          'Other Income']


# # Write a function that calculates the sum of these columns
# dataf['Income Total'] = df.loc[income].sum(axis=1)

# Pipeline 3

In [None]:
# Pipeline 3

def start_pipeline(dataf):
    '''Returns a copy of the given dataframe so later pipeline steps work on the copy'''
    working_copy = dataf.copy()
    return working_copy

def income_sum(dataf):
    '''Creates an 'Income Total' column holding the row-wise sum of the income columns.

    The columns to add up are the labels in the module-level ``income`` list.
    The frame is modified in place and returned.
    '''
    # Select the income COLUMNS with dataf[income]; dataf.loc[income] would
    # look the labels up in the row index instead.
    dataf['Income Total'] = dataf[income].sum(axis=1)
    return dataf


In [None]:
# Execute Pipeline 3
# The income_sum step is commented out, so this stage currently only copies
# the Pipeline 2 output.
df_pipeline3 = (df_pipeline2
    .pipe(start_pipeline)
#     .pipe(income_sum)
)

In [None]:
# Shape and first rows of the Pipeline 3 output
print(df_pipeline3.shape)
df_pipeline3.head()

# Feature Selection

In [None]:
# features = ['CaseMembers','Race', 'Ethnicity', 
#             'Current Age', 'Gender', 'Length of Stay', 
#             'Days Enrolled in Project','Household Type', 
#             'Barrier Count at Entry']

In [None]:
# Name of the label column created by add_target
target = 'Target Exit Destination'

In [None]:
# Labels: the target column; features: every remaining column
y = df_pipeline3[target]
X = df_pipeline3.drop(columns=target)

# Baseline

In [None]:
# Share of rows in each target category; the largest share is the accuracy
# of always predicting the most common category
y.value_counts(normalize=True)

# Data Split

In [None]:
# Train, Test, Validation Split
# Overall proportions: 60% train, 20% validation, 20% test.

# First split : Train, Test (20% of all rows held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split : Train, Val (25% of the remaining 80% = 20% of all rows)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Modeling

Modeling Strategy: 
- Implement SKL pipeline to add modularity to workflow
- Begin with random forest implementation
- Update model choices using combinations of cross-validation, loss metrics, hyperparameter tuning

In [None]:
# Random forest workflow: ordinal-encode, impute, then classify
random_forest_model = Pipeline(steps=[
    ('ord', OrdinalEncoder()),
    ('imputer', SimpleImputer()),
    ('classifier', RandomForestClassifier(
        n_estimators=1000,
        n_jobs=-1,
        random_state=42,
        verbose=1,
    )),
])


In [None]:
# Fit the model (every pipeline step) on the training split only
random_forest_model.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the validation and test splits
print("Validation: ", random_forest_model.score(X_val, y_val))
print("Test: ", random_forest_model.score(X_test, y_test))

In [None]:
# Setup for classification report metrics
y_true = y_val
y_pred = random_forest_model.predict(X_val)
# classification_report applies target_names positionally to the sorted set
# of labels found in y_true and y_pred. A hand-ordered list attaches the wrong
# name to each row, so the names are derived from the data in sorted order.
target_names = sorted(set(y_true) | set(y_pred))

In [None]:
# Per-class metrics for the validation predictions
print(classification_report(y_true, y_pred, target_names=target_names))

# Model Serialization


In [None]:
# Extract the classifier step from the pipeline and save it on its own.
# Only the 'classifier' step goes into this file; the 'ord' and 'imputer'
# steps are not included.
clf = random_forest_model['classifier']
joblib_file = "randomforest_modelv1.pkl"
joblib.dump(clf, joblib_file)

In [None]:
# Save the whole fitted pipeline (all steps) as a compressed joblib file
joblib.dump(random_forest_model, "randomforest_model.joblib", compress=True)

In [None]:
# df_pipeline3.to_csv("../All_data_with_exits_cleaned.csv", index=False)

# Reading the files

In [None]:
# This file was written with joblib.dump, so it is read back with joblib.load
# rather than a plain pickle reader such as pd.read_pickle.
model_pkl = joblib.load(r'randomforest_modelv1.pkl')

In [None]:
# Display the object loaded from randomforest_modelv1.pkl
model_pkl

In [None]:
# Reload the full pipeline saved to randomforest_model.joblib
model_job = joblib.load('randomforest_model.joblib')

In [None]:
# Display the reloaded pipeline
model_job

# Labs30

In [None]:
%config IPCompleter.greedy=True

In [None]:
# Setup:
import eli5
import numpy as np
import pandas as pd
import streamlit as st

# Plot:
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning:
from catboost import CatBoostClassifier
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Interpretation:
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
from pdpbox.pdp import pdp_isolate, pdp_interact, pdp_plot, pdp_interact_plot
import shap

In [None]:
# First rows of the modeling frame
df_pipeline3.head()

In [None]:
# Partial dependence inputs for a single feature, computed with the reloaded
# pipeline on the validation features
feature = "CaseMembers"

isolated = pdp_isolate(
    model=model_job,
    dataset=X_val,
    model_features=X_val.columns,
    feature = feature
)

In [None]:
# Plot the partial dependence result.
# NOTE(review): isolated[:1] assumes pdp_isolate returned a sequence and keeps
# only its first entry -- confirm against the pdpbox version in use.
pdp_plot(isolated[:1], feature_name=feature)

In [None]:
# Count how many rows share each Household ID, then plot a histogram of
# those counts
test_df = pd.DataFrame(df_pipeline1['Household ID'].value_counts())
sns.histplot(test_df);

In [None]:
# Shape and first rows of the modeling frame
print(df_pipeline3.shape)
df_pipeline3.head()

In [None]:
# df_pipeline3[:500].to_csv("../visuals/All_data_with_exits_cleaned_500r.csv", index=False)