# Unsupervised Machine Learning Case Study: Mental Health in Technology-related jobs

## Data Analysis: Feature Importance

#### This is a supplementary notebook to the main data analysis; it covers feature importance only.

## Notebook Objectives:

1. Load the encoded and cluster-labeled data set.
2. Transform the cluster labels into binary (one-vs-rest) targets for supervised machine learning.
3. Use `RandomForestClassifier` to find the feature importance for each cluster.

This analysis provides insight into which features were most important in the formation of each individual cluster, and it lays the foundation for the data visualization of the Mental Health in Technology dataset.

In [26]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.model_selection import feature_importances


In [27]:
# Read the input table. The path is relative, so the CSV must sit in the
# notebook's working directory. Later cells read its 'clusters' column.
DATA_PATH = 'tech_df_encoded_scaled_clustered.csv'
data = pd.read_csv(DATA_PATH)

In [28]:
# Number of rows assigned to each cluster label, largest cluster first.
cluster_sizes = data['clusters'].value_counts()
cluster_sizes

clusters
1    495
2    329
0    322
Name: count, dtype: int64

In [29]:
# One-vs-rest encoding of the cluster labels: for every cluster id, add a
# 'Binary Cluster <id>' column that is 1 for rows in that cluster and 0 for
# rows in either of the other two.
cluster_ids = (0, 1, 2)

for target in cluster_ids:
    # Lookup table sending the target id to 1 and the remaining ids to 0.
    one_vs_rest = {cid: int(cid == target) for cid in cluster_ids}
    data[f'Binary Cluster {target}'] = data['clusters'].map(one_vs_rest)

# Sanity check: the count of 1s in each column should equal that cluster's size.
for target in cluster_ids:
    print(data[f'Binary Cluster {target}'].value_counts())


Binary Cluster 0
0    824
1    322
Name: count, dtype: int64
Binary Cluster 1
0    651
1    495
Name: count, dtype: int64
Binary Cluster 2
0    817
1    329
Name: count, dtype: int64


In [30]:
# Remove the original multi-class label: the cells below build the feature
# matrix from every column except the binary targets, so leaving 'clusters'
# in place would hand the classifier the answer as a feature.
# errors='ignore' keeps this cell safe to re-run; without it a second
# execution raises KeyError because 'clusters' has already been dropped.
data = data.drop(columns='clusters', errors='ignore')

In [31]:
# Cluster 0 vs. the rest

# Features are every column except the three binary target columns.
label_cols = ['Binary Cluster 0', 'Binary Cluster 1', 'Binary Cluster 2']
X = data.drop(columns=label_cols)
y = data['Binary Cluster 0']

# fit() returns the estimator itself, so `rf` is the fitted forest.
# random_state is fixed so repeated runs give the same importances.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Per-feature importances reported by the fitted forest, in X's column order.
feature_importance_0 = rf.feature_importances_

# Pair each feature name with its importance and rank from most to least
# important.
feature_importance_df_0 = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance_0})
    .sort_values('Importance', ascending=False)
)

# Show the ten highest-ranked features.
print(feature_importance_df_0.head(10))



                                            Feature  Importance
80           at_any_point_diagnosed_MH_disorder_Yes    0.130364
81                        treatment_MH_disorder_Yes    0.077209
87                continent_residence_North America    0.073267
86                       continent_residence_Europe    0.069905
94                     continent_work_North America    0.069535
14  interference_with_work_with_effective_treatment    0.068771
15   interference_with_work_NOT_effective_treatment    0.068550
25                       past_history_mental_health    0.068495
93                            continent_work_Europe    0.049908
26                              current_MH_disorder    0.040726


In [32]:
# Cluster 1 vs. the rest

# Features are every column except the three binary target columns.
label_cols = ['Binary Cluster 0', 'Binary Cluster 1', 'Binary Cluster 2']
X = data.drop(columns=label_cols)
y = data['Binary Cluster 1']

# fit() returns the estimator itself, so `rf` is the fitted forest.
# random_state is fixed so repeated runs give the same importances.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Per-feature importances reported by the fitted forest, in X's column order.
feature_importance_1 = rf.feature_importances_

# Pair each feature name with its importance and rank from most to least
# important.
feature_importance_df_1 = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance_1})
    .sort_values('Importance', ascending=False)
)

# Show the ten highest-ranked features.
print(feature_importance_df_1.head(10))



                                            Feature  Importance
80           at_any_point_diagnosed_MH_disorder_Yes    0.133929
81                        treatment_MH_disorder_Yes    0.087944
25                       past_history_mental_health    0.081373
94                     continent_work_North America    0.067157
14  interference_with_work_with_effective_treatment    0.065631
26                              current_MH_disorder    0.056945
86                       continent_residence_Europe    0.054580
15   interference_with_work_NOT_effective_treatment    0.053639
87                continent_residence_North America    0.052287
93                            continent_work_Europe    0.049063


In [33]:
# Cluster 2 vs. the rest

# Features are every column except the three binary target columns.
label_cols = ['Binary Cluster 0', 'Binary Cluster 1', 'Binary Cluster 2']
X = data.drop(columns=label_cols)
y = data['Binary Cluster 2']

# fit() returns the estimator itself, so `rf` is the fitted forest.
# random_state is fixed so repeated runs give the same importances.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Per-feature importances reported by the fitted forest, in X's column order.
feature_importance_2 = rf.feature_importances_

# Pair each feature name with its importance and rank from most to least
# important.
feature_importance_df_2 = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance_2})
    .sort_values('Importance', ascending=False)
)

# Show the ten highest-ranked features.
print(feature_importance_df_2.head(10))



                                              Feature  Importance
94                       continent_work_North America    0.238214
87                  continent_residence_North America    0.169134
86                         continent_residence_Europe    0.164744
93                              continent_work_Europe    0.157633
43                         mental_health_benefits_Yes    0.016889
45           mental_health_coverage_awareness_Missing    0.016059
2            previous_employer_mental_health_benefits    0.013399
41                          mental_health_benefits_No    0.012118
52                         mental_health_resources_No    0.009188
3   previous_employer_mental_health_coverage_aware...    0.008340
