# Unsupervised Machine Learning Case Study: Mental Health in Technology-related jobs

## Data Analysis: Feature Importance

#### This is a purely supplementary notebook to the main data analysis; it covers feature importance only.

## Notebook Objectives:

1. Load encoded and labeled data set
2. Transform cluster labels into binary values for Supervised Machine Learning
3. Use RandomForestClassifier to find Feature Importance for each cluster. 

This analysis provides insight into which features are most important to the formation of each individual cluster, and it lays the foundation for the data visualization of the Mental Health in Technology dataset.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.model_selection import feature_importances


In [2]:
# Read the cluster-labelled dataset (encoded and scaled, per its file name)
# from the notebook's working directory.
clustered_csv = 'tech_df_encoded_scaled_clustered.csv'
data = pd.read_csv(clustered_csv)

In [3]:
# Number of rows assigned to each cluster label, largest cluster first.
cluster_sizes = data['clusters'].value_counts()
cluster_sizes

clusters
1    495
2    329
0    322
Name: count, dtype: int64

In [4]:
# Build one-vs-rest targets: for each cluster, a column that is 1 for rows
# in that cluster and 0 for rows in either of the other two.
positive_cluster_by_column = {
    'Binary Cluster 0': 0,
    'Binary Cluster 1': 1,
    'Binary Cluster 2': 2,
}
cluster_labels = tuple(positive_cluster_by_column.values())

for target_column, positive_cluster in positive_cluster_by_column.items():
    # Explicit label -> 0/1 lookup, e.g. {0: 1, 1: 0, 2: 0} for cluster 0.
    one_vs_rest = {label: int(label == positive_cluster) for label in cluster_labels}
    data[target_column] = data['clusters'].map(one_vs_rest)

# Sanity check: the count of 1s in each column should equal that cluster's size.
for target_column in positive_cluster_by_column:
    print(data[target_column].value_counts())


Binary Cluster 0
0    824
1    322
Name: count, dtype: int64
Binary Cluster 1
0    651
1    495
Name: count, dtype: int64
Binary Cluster 2
0    817
1    329
Name: count, dtype: int64


In [5]:
# Remove the original multi-class label so it cannot be used as a feature
# when the one-vs-rest models are fitted below.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
data = data.drop(columns='clusters', errors='ignore')

In [6]:
# Cluster 0: one-vs-rest feature importance

# Features are every column except the three binary cluster targets.
target_columns = ['Binary Cluster 0', 'Binary Cluster 1', 'Binary Cluster 2']
X = data.drop(columns=target_columns)
y = data['Binary Cluster 0']

# A fixed random_state keeps the forest, and so the ranking, reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Per-feature importance scores from the fitted forest, in X.columns order.
feature_importance_0 = rf.feature_importances_

# Pair each feature name with its score, ranked from most to least important.
feature_importance_df_0 = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance_0})
    .sort_values('Importance', ascending=False)
)

# Show the ten highest-ranked features.
print(feature_importance_df_0.head(10))



                                            Feature  Importance
80           at_any_point_diagnosed_MH_disorder_Yes    0.106030
81                        treatment_MH_disorder_Yes    0.084050
86                         country_residence_Europe    0.077845
25                       past_history_mental_health    0.072465
15   interference_with_work_NOT_effective_treatment    0.071033
87                  country_residence_North America    0.066372
14  interference_with_work_with_effective_treatment    0.066268
94                       country_work_North America    0.060696
93                              country_work_Europe    0.048889
26                              current_MH_disorder    0.041684


In [7]:
# Cluster 1: one-vs-rest feature importance

# Features are every column except the three binary cluster targets.
target_columns = ['Binary Cluster 0', 'Binary Cluster 1', 'Binary Cluster 2']
X = data.drop(columns=target_columns)
y = data['Binary Cluster 1']

# A fixed random_state keeps the forest, and so the ranking, reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Per-feature importance scores from the fitted forest, in X.columns order.
feature_importance_1 = rf.feature_importances_

# Pair each feature name with its score, ranked from most to least important.
feature_importance_df_1 = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance_1})
    .sort_values('Importance', ascending=False)
)

# Show the ten highest-ranked features.
print(feature_importance_df_1.head(10))



                                            Feature  Importance
80           at_any_point_diagnosed_MH_disorder_Yes    0.127322
81                        treatment_MH_disorder_Yes    0.087234
25                       past_history_mental_health    0.081337
15   interference_with_work_NOT_effective_treatment    0.061380
86                         country_residence_Europe    0.060016
93                              country_work_Europe    0.058181
87                  country_residence_North America    0.057977
14  interference_with_work_with_effective_treatment    0.056222
26                              current_MH_disorder    0.052933
94                       country_work_North America    0.050135


In [8]:
# Cluster 2: one-vs-rest feature importance

# Features are every column except the three binary cluster targets.
target_columns = ['Binary Cluster 0', 'Binary Cluster 1', 'Binary Cluster 2']
X = data.drop(columns=target_columns)
y = data['Binary Cluster 2']

# A fixed random_state keeps the forest, and so the ranking, reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Per-feature importance scores from the fitted forest, in X.columns order.
feature_importance_2 = rf.feature_importances_

# Pair each feature name with its score, ranked from most to least important.
feature_importance_df_2 = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance_2})
    .sort_values('Importance', ascending=False)
)

# Show the ten highest-ranked features.
print(feature_importance_df_2.head(10))



                                              Feature  Importance
94                         country_work_North America    0.241472
93                                country_work_Europe    0.169463
86                           country_residence_Europe    0.167634
87                    country_residence_North America    0.149658
43                         mental_health_benefits_Yes    0.016082
45           mental_health_coverage_awareness_Missing    0.013989
2            previous_employer_mental_health_benefits    0.013045
41                          mental_health_benefits_No    0.012310
3   previous_employer_mental_health_coverage_aware...    0.010109
52                         mental_health_resources_No    0.009953
