# Mental Health in Tech Part 3 - Modelling
Exported from Filament on Sun, 13 Mar 2022 17:31:02 GMT

---

This workbook applies a random forest model and a logistic regression model to identify which workplace factors most influence whether a person seeks treatment for their mental health.

**Data import and cleaning**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("survey.csv")

In [None]:
def invalid_entries(df):
    ''' Locate rows that pair an out-of-range age with one fixed answer pattern.

    A row is reported only when its Age is below 18 or above 75 AND every
    survey column listed below holds exactly the value shown.  The return
    value is what ``np.where`` produces for that boolean mask: a one-element
    tuple holding the array of matching row positions.
    '''
    required_answers = {
        'self_employed': 'Yes',
        'family_history': 'Yes',
        'treatment': 'Yes',
        'work_interfere': 'Often',
        'no_employees': '1-5',
        'remote_work': 'Yes',
        'tech_company': 'Yes',
        'benefits': 'Yes',
        'care_options': 'Yes',
        'wellness_program': 'Yes',
        'seek_help': 'Yes',
        'anonymity': 'Yes',
        'leave': 'Very easy',
        'mental_health_consequence': 'Yes',
        'phys_health_consequence': 'Yes',
        'coworkers': 'Yes',
        'supervisor': 'Yes',
        'mental_health_interview': 'Yes',
        'phys_health_interview': 'Yes',
        'mental_vs_physical': 'Yes',
        'obs_consequence': 'Yes',
    }

    # Start from the age check, then narrow the mask one column at a time.
    flagged = (df['Age'] < 18) | (df['Age'] > 75)
    for column, answer in required_answers.items():
        flagged = flagged & (df[column] == answer)

    return np.where(flagged)

In [None]:
invalid_entries(df)

In [None]:
def check_duplicates(df):
    ''' Return the positions of rows that repeat an earlier row.

    The Timestamp column is left out of the comparison.  The result is the
    tuple produced by ``np.where`` on the duplicate mask.
    '''
    compared = [name for name in df.columns if name != 'Timestamp']
    repeated = df[compared].duplicated()
    return np.where(repeated)

In [None]:
check_duplicates(df)

In [None]:
# Drop the 6 redundant rows from the dataframe.
# NOTE(review): the positions are hard-coded; presumably they were read off the
# invalid_entries / check_duplicates output above -- confirm they still match
# if survey.csv changes.

df.drop(df.index[[1127,989,821,860,1134,1218]],inplace=True)

# Fail loudly if the row count is not the expected one (for example when this
# cell is executed a second time and drops six more rows).
if df.shape[0]!=(1259-6):  # 1259 = original number of rows
    raise Exception(f'unexpected number of rows: {df.shape[0]}')

# reset_index without drop=True keeps the old labels in a new 'index' column;
# that column is excluded from the feature lists later in the notebook.
df.reset_index(inplace=True)

In [None]:
def invalid_ages(age):
    ''' Return the positions of ages below 18 or above 75 (np.where tuple) '''
    too_young = age < 18
    too_old = age > 75
    return np.where(too_young | too_old)

In [None]:
invalid_ages(df['Age'])

In [None]:
# Replace invalid ages with median age from train, 31
# (X_train['Age'].median() is evaluated in a later cell, after the split).
# NOTE(review): the row labels are hard-coded; presumably they come from the
# invalid_ages(df['Age']) output above -- confirm.

df.loc[[143,364,390,715,734,1087],'Age']=31

# Check dataframe: after reset_index the labels used by .loc and the positions
# used by .iloc coincide, so this displays the rows that were just modified.

df[['Age']].iloc[[143,364,390,715,734,1087]]

In [None]:
# Group genders
#
# Answers are matched after trimming surrounding whitespace and ignoring case,
# so variants such as 'Male ' or 'FEMALE' land in the intended group instead
# of falling through to 'Other'.

Male = ['Male', 'male', 'M', 'm']
Female = ['Female', 'female', 'F', 'f']

male_tokens = {label.casefold() for label in Male}
female_tokens = {label.casefold() for label in Female}

# Normalised key used only for matching.  Missing answers become the string
# 'nan', which matches neither group and therefore ends up in 'Other'.
gender_key = df['Gender'].astype(str).str.strip().str.casefold()

# Replace all Gender values with Male, Female or Other

df['Gender'] = np.where(gender_key.isin(male_tokens), 'Male',
                        np.where(gender_key.isin(female_tokens), 'Female', 'Other'))

df['Gender'].unique()

In [None]:
# Keep the listed countries as their own categories and fold every other
# value into a single 'Other' bucket.
countries = ['United States', 'United Kingdom'] # country categories to keep

is_kept = df['Country'].isin(countries)
df['Country'] = df['Country'].where(is_kept, 'Other') # combine remaining

df['Country'].unique()

**Feature engineering**

In [None]:
# Columns left out of the category listing below
exclude = ['index','Timestamp','Age','state','comments']

# Every remaining column, in dataframe order (reused later for the heatmap)
include = [name for name in df.columns if name not in exclude]

# Show the distinct values found in each included column
for col in include:
    distinct = df[col].unique()
    print(col, distinct)

In [None]:
# A missing self_employed answer is treated as 'No'; a missing work_interfere
# answer is marked with the placeholder 0.
for column, placeholder in (('self_employed', 'No'), ('work_interfere', 0)):
    df[column] = df[column].fillna(placeholder)

# Confirm that neither column has nulls left
if not df['self_employed'].isnull().any():
    print("There are no nulls in the self_employed column")
if not df['work_interfere'].isnull().any():
    print("There are no nulls in the work_interfere column")

In [None]:
# Making data fields numerical
#
# Every column is re-assigned from the result of Series.replace.  Calling
# replace(..., inplace=True) on df[col] operates on a selected column object,
# and pandas warns that this stops updating the parent frame under
# copy-on-write.

yn_cols = ['self_employed', 'family_history', 'treatment', 'remote_work',
           'tech_company', 'obs_consequence']

dk_cols = ['benefits', 'wellness_program', 'seek_help', 'anonymity', 'mental_vs_physical']

mb_cols = ['mental_health_consequence', 'phys_health_consequence',
           'mental_health_interview', 'phys_health_interview']

sm_cols = ['coworkers', 'supervisor']

# Answer -> code tables shared by groups of columns
yes_no = {'Yes': 1, 'No': 0}
yes_dontknow_no = {'Yes': 1, "Don't know": 0, 'No': -1}
yes_maybe_no = {'Yes': 1, 'Maybe': 0, 'No': -1}
yes_some_no = {'Yes': 1, 'Some of them': 0, 'No': -1}

# Column -> answer/code table
encodings = {
    'Gender': {'Male': -1, 'Other': 0, 'Female': 1},
    'Country': {'United States': 1, 'United Kingdom': 0, 'Other': -1},
    'care_options': {'Yes': 1, 'Not sure': 0, 'No': -1},
    'no_employees': {'1-5': 1, '6-25': 2, '26-100': 3, '100-500': 4,
                     '500-1000': 5, 'More than 1000': 6},
    # the placeholder 0 used for missing answers is not a key, so it is kept
    'work_interfere': {'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4},
    'leave': {"Don't know": 0, 'Very easy': 1, 'Somewhat easy': 2,
              'Somewhat difficult': 3, 'Very difficult': 4},
}
encodings.update({col: yes_no for col in yn_cols})
encodings.update({col: yes_dontknow_no for col in dk_cols})
encodings.update({col: yes_maybe_no for col in mb_cols})
encodings.update({col: yes_some_no for col in sm_cols})

for col, codes in encodings.items():
    # infer_objects converts a column that now holds only numbers to a numeric
    # dtype, which the correlation heatmap and the models rely on
    df[col] = df[col].replace(codes).infer_objects()

In [None]:
sns.heatmap(df[include].corr(), annot=False)
plt.show()

**Train-test split**

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [None]:
# Rows whose work_interfere is the placeholder 0 are left out of modelling
has_interfere_answer = df['work_interfere'] != 0
model_df = df[has_interfere_answer]

# Columns that are not predictors; 'treatment' is the target
non_feature_cols = ['index','Timestamp','state','comments','treatment']

feature_cols = [name for name in df.columns if name not in non_feature_cols]

In [None]:
# Hold out 20% of the modelling rows for testing; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(model_df[feature_cols], 
                                                    model_df['treatment'], # target variable
                                                    test_size = 0.2, # 20%
                                                    random_state = 1)

In [None]:
X_train['Age'].median()

**Representative test set?**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Tag each split with a 'train' indicator (1 = training rows, 0 = test rows).
# assign() works on copies, so X_train / X_test themselves are left untouched.
train = X_train.assign(train=1)
test = X_test.assign(train=0)

# Stack the two tagged splits and show how many rows each one contributes
train_test = pd.concat([train, test])
train_test.groupby('train')[['Age']].count()

In [None]:
# The split indicator is the target; every other column is a predictor
y = train_test['train']
X = train_test.drop(columns='train')

# Small forest that tries to tell training rows apart from test rows
rfc = RandomForestClassifier(random_state=1, n_estimators=10)
rfc.fit(X, y)

In [None]:
# Cross validation: 5-fold ROC AUC of the train-vs-test classifier
cv_results = cross_val_score(rfc, X, y, scoring='roc_auc', cv=5)
mean_auc = np.mean(cv_results)

print('Cross validation results:', cv_results)
print('Average:', mean_auc)
print('Difference from 0.5:', 0.5 - mean_auc)

# An average close to 0.5 means the classifier cannot tell the two splits
# apart, therefore the test set is representative

# Random Forest

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, tree

In [None]:
# Configure a shallow decision tree; nothing is fitted here, the tree is
# trained on the train split by the dt.fit call in the next cell.
dt = DecisionTreeClassifier(max_depth=3, 
                            min_samples_leaf=2, 
                            min_samples_split=5, 
                            random_state=1) # dt variable name for decision tree

In [None]:
#Fitting our chosen model using train data
dt.fit(X_train, y_train)

In [None]:
# Class labels the fitted tree can predict (the distinct values of the target)
dt.classes_

In [None]:
# Plotting our tree
# NOTE(review): class_names are applied in the order of dt.classes_, so
# ['no','yes'] assumes that order is [0, 1] -- confirm against the
# dt.classes_ output above.

fig = plt.figure(figsize=(15,10))
picture = tree.plot_tree(dt, 
                   feature_names=feature_cols,  
                   class_names=['no','yes'],
                   filled=True)

plt.show()

In [None]:
# Model accuracy (the value returned by dt.score) on train and test
print(f'Score on training set: {dt.score(X_train, y_train)}')
print(f'Score on testing set: {dt.score(X_test, y_test)}')

In [None]:
# Feature importance
# Pair every feature name with its importance, most important first
importance = sorted(zip(feature_cols, dt.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)

# Keep only the features the tree actually used (importance above zero)
important = [pair for pair in importance if pair[1] > 0]

print('Features used in classification model in order of decreasing importance: ')
important

## Bagging

In [None]:
rf = RandomForestClassifier(random_state=1)

In [None]:
## Gridsearch used to tune hyperparameters

# these test values were chosen based on previous testing with lower max depths
rf_params = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 4, 5],
)

# 10-fold cross-validated search that ranks candidates by precision
gs = GridSearchCV(estimator=rf, param_grid=rf_params, cv=10, scoring='precision')
gs.fit(X_train, y_train)

print(gs.best_score_)

gs.best_params_

In [None]:
# Build the forest from the grid-search winner instead of hard-coded values, so
# this model (used for the feature importances) always has the same
# hyperparameters as the estimator behind gs.predict in the evaluation cells.
rf = RandomForestClassifier(random_state=1, **gs.best_params_)
rf.fit(X_train, y_train)

### Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
## defining function

def apr(y_pred, y_real):
    " Print and return the accuracy, precision, recall and F1 score of model predictions "
    # All four metrics are evaluated before anything is printed.
    accuracy, precision, recall, f1 = (
        scorer(y_real, y_pred)
        for scorer in (
            metrics.accuracy_score,
            metrics.precision_score,
            metrics.recall_score,
            metrics.f1_score,
        )
    )

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

    return accuracy, precision, recall, f1


## defining function 

def produce_confusion(positive_label, negative_label, cut_off, df, y_pred_name, y_real_name):
    """ Draw a confusion-matrix heatmap and return the accuracy of the predictions.

    Parameters:
        positive_label, negative_label: tick labels; both axes are labelled
            [negative_label, positive_label] in that order.
        cut_off: the string 'binary' when df[y_pred_name] is used as the
            prediction as-is; otherwise a threshold, where values strictly
            above it become 1 and all others 0.
        df: dataframe holding the predictions and the real labels.  A
            'pred_binary' column is added to (or overwritten in) this frame.
        y_pred_name: name of the prediction column in df.
        y_real_name: name of the real-label column in df.

    Returns:
        The accuracy_score of df['pred_binary'] against df[y_real_name].
    """
    # Set pred to 0 or 1 depending on whether it's higher than the cut_off point.
    if cut_off != 'binary':
        df['pred_binary'] = np.where(df[y_pred_name] > cut_off, 1, 0)
    else:
        df['pred_binary'] = df[y_pred_name]

    actual = df[y_real_name]
    predicted = df['pred_binary']

    # Build the CM
    cm = confusion_matrix(actual, predicted)

    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g')

    # labels, title, ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('Real labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels([negative_label, positive_label])
    ax.yaxis.set_ticklabels([negative_label, positive_label])

    # Computed once; the label is neutral because the function is called on
    # both train and test predictions.
    accuracy = accuracy_score(actual, predicted)
    print('Accuracy = ', accuracy)

    return accuracy

In [None]:
## Predict on Train
## Check Accuracy, Precision, Recall & F1

# One row per training sample: predicted class, real class and column 1 of
# predict_proba, all aligned on the X_train index
predictions_rf_train = pd.DataFrame(
    {
        'Pred': gs.predict(X_train),
        'Actual': y_train,
        'Prob': gs.predict_proba(X_train)[:,1],
    },
    index=X_train.index,
)

apr(predictions_rf_train['Pred'],predictions_rf_train['Actual'])

In [None]:
## Predict on Test
## Check Accuracy, Precision, Recall & F1

# One row per test sample: predicted class, real class and column 1 of
# predict_proba, all aligned on the X_test index
predictions_rf_test = pd.DataFrame(
    {
        'Pred': gs.predict(X_test),
        'Actual': y_test,
        'Prob': gs.predict_proba(X_test)[:,1],
    },
    index=X_test.index,
)

apr(predictions_rf_test['Pred'],predictions_rf_test['Actual'])

In [None]:
# Confusion matrix on train data
produce_confusion('Yes', 'No', 'binary', predictions_rf_train, 'Pred', 'Actual')

In [None]:
# Confusion matrix on test data
produce_confusion('Yes', 'No', 'binary', predictions_rf_test, 'Pred', 'Actual')

In [None]:
# Feature importance
# Pair every feature name with its importance in the forest, largest first
importance = sorted(zip(feature_cols, rf.feature_importances_),
                    key=lambda pair: pair[1], reverse=True)
print('Features listed in order of decreasing importance: ')
importance

# Logistic regression

In [None]:
import statsmodels.api as sm #modelling logistic regression

In [None]:
# Add the constant (intercept) column to both feature frames.
# NOTE(review): add_constant's default has_constant='skip' leaves a frame
# unchanged when it already contains a constant column -- confirm that neither
# split has one, otherwise train and test would end up with different columns.
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
# Targets become plain Python lists (the pandas index is dropped)
y_train = list(y_train)
y_test = list(y_test)

In [None]:
# Model fitting
lg_reg_MN = sm.MNLogit(y_train, X_train).fit()

In [None]:
# Model Summary
lg_reg_MN.summary()

### Logistic regression evaluation

In [None]:
#Model predictions
train_pred_MN = lg_reg_MN.predict(X_train)
test_pred_MN = lg_reg_MN.predict(X_test)

In [None]:
# make probabilities binary with cutoff 0.5
#
# astype(int) on the boolean mask gives the 0/1 labels in a single assignment.
# This avoids replace(..., inplace=True) on a selected column, which pandas
# warns will stop updating the parent frame under copy-on-write.

# train
train_pred_MN['Pred'] = (train_pred_MN[1] > 0.5).astype(int)

# test
test_pred_MN['Pred'] = (test_pred_MN[1] > 0.5).astype(int)

In [None]:
# train
apr(train_pred_MN['Pred'],y_train)

In [None]:
# test
apr(test_pred_MN['Pred'],y_test)

In [None]:
train_pred_MN['Actual'] = y_train
test_pred_MN['Actual'] = y_test

In [None]:
# Confusion matrix on train data
produce_confusion('Yes', 'No', 'binary', train_pred_MN, 'Pred', 'Actual')

In [None]:
# Confusion matrix on test data
produce_confusion('Yes', 'No', 'binary', test_pred_MN, 'Pred', 'Actual')

# Conclusions

* Models perform similarly in terms of accuracy (\~0.8); random forest performs slightly better in terms of recall and logistic regression in terms of precision. 

* The degree to which mental health interfered with work was a strong predictor of seeking treatment, followed by a family history of mental illness. 

* The random forest model identified employer-provided care options as the third most important feature for predicting whether someone seeks treatment.

* Improving the logistic regression model and trying additional classification models may produce a better model.