In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Loading data
file_path = Path("../Resources/cleaned_data.csv")
mental_health_df = pd.read_csv(file_path)
mental_health_df.head()

Unnamed: 0,I am currently employed at least part-time,I identify as having a mental illness,Education,I have my own computer separate from a smart phone,I have been hospitalized before for my mental illness,How many days were you hospitalized for your mental illness,I am legally disabled,I have my regular access to the internet,I live with my parents,I have a gap in my resume,...,Obsessive thinking,Mood swings,Panic attacks,Compulsive behavior,Tiredness,Age,Gender,Household Income,Region,Device Type
0,0,0,High School or GED,0,0,0.0,0,1,0,1,...,1.0,0.0,1.0,0.0,0.0,30-44,Male,"$25,000-$49,999",Mountain,Android Phone / Tablet
1,1,1,Some Phd,1,0,0.0,0,1,0,0,...,0.0,0.0,1.0,0.0,1.0,18-29,Male,"$50,000-$74,999",East South Central,MacOS Desktop / Laptop
2,1,0,Completed Undergraduate,1,0,0.0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,30-44,Male,"$150,000-$174,999",Pacific,MacOS Desktop / Laptop
3,0,0,Some Undergraduate,1,0,,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,30-44,Male,"$25,000-$49,999",New England,Windows Desktop / Laptop
4,1,1,Completed Undergraduate,1,1,35.0,1,1,0,1,...,1.0,1.0,1.0,1.0,1.0,30-44,Male,"$25,000-$49,999",East North Central,iOS Phone / Tablet


In [4]:
# Drop NaN 
mental_health_df = mental_health_df.dropna()

In [5]:
# Binary encoding using Pandas 
mental_health_df = pd.get_dummies(mental_health_df, columns=["Education", "Age", "Gender", "Household Income", "Region", "Device Type"])
mental_health_df.head()

Unnamed: 0,I am currently employed at least part-time,I identify as having a mental illness,I have my own computer separate from a smart phone,I have been hospitalized before for my mental illness,How many days were you hospitalized for your mental illness,I am legally disabled,I have my regular access to the internet,I live with my parents,I have a gap in my resume,Total length of any gaps in my resume in months.,...,Region_New England,Region_Pacific,Region_South Atlantic,Region_West North Central,Region_West South Central,Device Type_Android Phone / Tablet,Device Type_MacOS Desktop / Laptop,Device Type_Other,Device Type_Windows Desktop / Laptop,Device Type_iOS Phone / Tablet
0,0,0,0,0,0.0,0,1,0,1,24,...,0,0,0,0,0,1,0,0,0,0
1,1,1,1,0,0.0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1,0,1,0,0.0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,1,1,1,1,35.0,1,1,0,1,33,...,0,0,0,0,0,0,0,0,0,1
5,1,0,1,0,0.0,0,1,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [6]:
mental_health_df.count()

I am currently employed at least part-time                     294
I identify as having a mental illness                          294
I have my own computer separate from a smart phone             294
I have been hospitalized before for my mental illness          294
How many days were you hospitalized for your mental illness    294
                                                              ... 
Device Type_Android Phone / Tablet                             294
Device Type_MacOS Desktop / Laptop                             294
Device Type_Other                                              294
Device Type_Windows Desktop / Laptop                           294
Device Type_iOS Phone / Tablet                                 294
Length: 64, dtype: int64

In [7]:
# Define features set
X = mental_health_df.copy()
X = X.drop("I am currently employed at least part-time", axis=1)
X.head()

Unnamed: 0,I identify as having a mental illness,I have my own computer separate from a smart phone,I have been hospitalized before for my mental illness,How many days were you hospitalized for your mental illness,I am legally disabled,I have my regular access to the internet,I live with my parents,I have a gap in my resume,Total length of any gaps in my resume in months.,Annual income (including any social welfare programs) in USD,...,Region_New England,Region_Pacific,Region_South Atlantic,Region_West North Central,Region_West South Central,Device Type_Android Phone / Tablet,Device Type_MacOS Desktop / Laptop,Device Type_Other,Device Type_Windows Desktop / Laptop,Device Type_iOS Phone / Tablet
0,0,0,0,0.0,0,1,0,1,24,35,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0.0,0,1,0,0,1,22,...,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0.0,0,1,0,0,0,100,...,0,1,0,0,0,0,1,0,0,0
4,1,1,1,35.0,1,1,0,1,33,32,...,0,0,0,0,0,0,0,0,0,1
5,0,1,0,0.0,0,1,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [8]:
# Define the target set.
y = mental_health_df["I am currently employed at least part-time"].ravel()
y[:5]

array([0, 1, 1, 1, 1], dtype=int64)

In [9]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [10]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(220, 63)
(74, 63)
(220,)
(74,)


In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [13]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [14]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [15]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,17,7
Actual 1,0,50


In [16]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [17]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,17,7
Actual 1,0,50


Accuracy Score : 0.9054054054054054
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.71      0.83        24
           1       0.88      1.00      0.93        50

    accuracy                           0.91        74
   macro avg       0.94      0.85      0.88        74
weighted avg       0.92      0.91      0.90        74



In [18]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.00589858, 0.01313699, 0.00421379, 0.01333561, 0.03375328,
       0.00672086, 0.01018717, 0.01491712, 0.0485364 , 0.0642805 ,
       0.26890378, 0.01009955, 0.02664996, 0.02107627, 0.00432961,
       0.011155  , 0.00901987, 0.00766369, 0.00869792, 0.00915697,
       0.00551985, 0.0054752 , 0.00479067, 0.01190685, 0.01166331,
       0.00147381, 0.01331016, 0.00969028, 0.00129766, 0.01090515,
       0.00476389, 0.00144081, 0.00725891, 0.01190383, 0.01895622,
       0.04718849, 0.01429693, 0.01415622, 0.02159291, 0.00655611,
       0.01053938, 0.00680456, 0.00525782, 0.0021444 , 0.00797458,
       0.00913118, 0.00875196, 0.00562761, 0.01239505, 0.00977057,
       0.00386713, 0.00669113, 0.00468   , 0.00818171, 0.00959043,
       0.01186709, 0.00177036, 0.00869318, 0.01215398, 0.00726494,
       0.0043512 , 0.01269183, 0.01391971])

In [19]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2689037822487102, 'I am unemployed'),
 (0.06428049727848961,
  'Annual income (including any social welfare programs) in USD'),
 (0.04853640458780085, 'Total length of any gaps in my resume in\xa0months.'),
 (0.04718849431420585, 'Age_> 60'),
 (0.033753282761891386, 'I am legally disabled'),
 (0.026649960738385815, 'Annual income from social welfare programs'),
 (0.021592906019362063, 'Household Income_$0-$9,999'),
 (0.021076269535172904, 'I receive food stamps'),
 (0.01895622487150085, 'Age_45-60'),
 (0.014917117998768454, 'I have a gap in my resume'),
 (0.014296934367397039, 'Gender_Female'),
 (0.014156221693450464, 'Gender_Male'),
 (0.013919712895856627, 'Device Type_iOS Phone / Tablet'),
 (0.013335609929756681,
  'How many days were you hospitalized for your mental illness'),
 (0.013310163306610202, 'Education_Completed Undergraduate'),
 (0.013136991455704615, 'I have my own computer separate from a smart phone'),
 (0.01269183165542353, 'Device Type_Windows Desktop / Laptop'),
