In [37]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Dependencies for interaction with database:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from config import password

# Machine Learning dependencies:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Create engine and link to local postgres database:
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/Final_project_mental_health')
connect = engine.connect()

In [3]:
# Create session:
session = Session(engine)

In [4]:
# Import clean_dataset_2016 table:
clean_2016_df = pd.read_sql("SELECT * FROM clean_dataset_2016", connect)

In [5]:
clean_2016_df.head()

Unnamed: 0,index,self_employed,company_size,tech_or_not,related_tech_it,employer_provide_mh_coverage,employer_discus_mh,employers_options_help,protected_anonymity_mh,leave,...,suportive_response_to_mh,tech_industry_support_mh,age,gender,country_live,state_live,ethnicity,country_work,state_work,key
0,0,0,100-500,1.0,1.0,No,No,I don't know,I don't know,I don't know,...,"Yes, I experienced",1.0,27.0,Female,United Kingdom,,,United Kingdom,,45
1,1,0,100-500,1.0,1.0,Yes,No,No,I don't know,I don't know,...,Maybe/Not sure,2.0,31.0,male,United Kingdom,,,United Kingdom,,45
2,2,0,6-25,1.0,1.0,I don't know,I don't know,No,Yes,Difficult,...,"Yes, I experienced",1.0,36.0,male,United States of America,Missouri,White,United States of America,Missouri,45
3,3,0,More than 1000,1.0,1.0,Yes,I don't know,I don't know,Yes,Difficult,...,Maybe/Not sure,2.0,22.0,Male,United States of America,Washington,White,United States of America,Washington,45
4,4,1,,,,,,,,,...,"Yes, I observed",1.0,52.0,female,United States of America,Illinois,More than one of the above,United States of America,Illinois,45


In [8]:
# Check data for insights:
print(clean_2016_df.shape)
print(clean_2016_df.columns.tolist())
print(clean_2016_df.value_counts)

(756, 50)
['index', 'self_employed', 'company_size', 'tech_or_not', 'related_tech_it', 'employer_provide_mh_coverage', 'employer_discus_mh', 'employers_options_help', 'protected_anonymity_mh', 'leave', 'mh_supervisor', 'mh_employer', 'mh_coworkers', 'employer_importance_ph', 'employer_importance_mh', 'private_mh_insurance', 'reveal_mh_clients', 'reveal_affect_relationship', 'reveal_mh_coworkers', 'reveal_impact_relationship', 'prev_employer', 'prev_tech_or_not', 'prev_provided_mh_coverage', 'prev_wellness_program', 'prev_anonymity_preserved_mh', 'prev_discuss_supervisors', 'prev_discuss_employer', 'prev_discuss_coworkers', 'prev_importance_ph', 'prev_importance_mh', 'currently_mh_disorder', 'dx_mh_disorder', 'treatment_mh_from_professional', 'mh_family_history', 'ph_interview', 'why_why_not', 'mh_interview', 'why_whynot', 'identified_with_mh', 'unsuportive_reponse_to_mh', 'suportive_response_to_mh', 'tech_industry_support_mh', 'age', 'gender', 'country_live', 'state_live', 'ethnicity',

### Test:
- Dataset filtered on tech_or_not = "yes"
- Target: dx_mh_disorder 
- Features: company_size, age, gender, country_live, identified_with_mh
 

In [9]:
# Filter tech_or_not columns:
clean_2016_df["tech_or_not"].head()

0    1.0
1    1.0
2    1.0
3    1.0
4    NaN
Name: tech_or_not, dtype: float64

In [12]:
tech_df = pd.read_sql("SELECT * FROM clean_dataset_2016 WHERE tech_or_not = 1", connect)
tech_df.shape

(501, 50)

In [14]:
ml_df = tech_df[["company_size", "age", "gender", "country_live", "identified_with_mh", "dx_mh_disorder"]]
ml_df

Unnamed: 0,company_size,age,gender,country_live,identified_with_mh,dx_mh_disorder
0,100-500,27.0,Female,United Kingdom,0.0,
1,100-500,31.0,male,United Kingdom,0.0,
2,6-25,36.0,male,United States of America,1.0,Yes
3,More than 1000,22.0,Male,United States of America,0.0,Yes
4,100-500,30.0,male,United States of America,0.0,
...,...,...,...,...,...,...
496,100-500,34.0,Male,United States of America,0.0,
497,26-100,29.0,Male,United States of America,0.0,Yes
498,26-100,41.0,Male,United Kingdom,0.0,
499,6-25,40.0,Male,United States of America,1.0,Yes


In [22]:
# Encode dataset:

# Create label encoder instance:
le = LabelEncoder()

# Make a copy of desire data:
encoded_df = ml_df.copy()

# Apply encoder:
encoded_df["age"] = le.fit_transform(encoded_df["age"] )
encoded_df["company_size"] = le.fit_transform(encoded_df["company_size"])
encoded_df["gender"] = le.fit_transform(encoded_df["gender"])
encoded_df["country_live"] = le.fit_transform(encoded_df["country_live"])
encoded_df["identified_with_mh"] = le.fit_transform(encoded_df["identified_with_mh"])
encoded_df["dx_mh_disorder"] = le.fit_transform(encoded_df["dx_mh_disorder"])

# Check:
encoded_df.head()

Unnamed: 0,company_size,age,gender,country_live,identified_with_mh,dx_mh_disorder
0,1,7,6,33,0,2
1,1,11,35,33,0,2
2,4,16,35,34,1,1
3,5,2,13,34,0,1
4,1,10,35,34,0,2


In [27]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse = False)

encoded_df1 = ml_df.copy()

# Apply encoder:
encoded_df1 = encoder.fit_transform(encoded_df1)
encoded_df1

array([[0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 1.]])

In [29]:
# Create our target:
y = encoded_df["dx_mh_disorder"]

# Create our features:
X = encoded_df.drop(columns = "dx_mh_disorder", axis =1)

In [30]:
# Split the data:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, test_size = 0.25)

In [31]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Balanced Random Forest Classifier:

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier:
rf_model = RandomForestClassifier(n_estimators=100, random_state=1) 

# Fitting the model:
rf_model = rf_model.fit(X_train, y_train)

# Making predictions using the testing data:
predictions = rf_model.predict(X_test)

In [45]:
X.describe()

Unnamed: 0,company_size,age,gender,country_live,identified_with_mh
count,501.0,501.0,501.0,501.0,501.0
mean,2.548902,14.241517,17.936128,29.47505,0.10978
std,1.607515,7.838849,10.785356,9.176812,0.312928
min,0.0,0.0,0.0,0.0,0.0
25%,1.0,9.0,13.0,33.0,0.0
50%,2.0,13.0,13.0,34.0,0.0
75%,4.0,19.0,30.0,34.0,0.0
max,5.0,41.0,39.0,34.0,1.0


In [34]:
# Calculated the balanced accuracy score
from sklearn.metrics import accuracy_score

y_pred = predictions
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.6031746031746031


In [44]:
# Display the confusion matrix:
# Calculating the confusion matrix:
confusion_matrix(y_test, predictions)

array([[ 0,  0,  1],
       [ 0, 29, 21],
       [ 0, 28, 47]], dtype=int64)

In [38]:
# Print the imbalanced classification report:
from imblearn.metrics import classification_report_imbalanced

print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print("------------------------------------------------------------------------------------")
print(f"Classification report:           Balanced Random Forest Classifier   ")
print("------------------------------------------------------------------------------------")
print(classification_report_imbalanced(y_test, y_pred))
print("------------------------------------------------------------------------------------")

Accuracy score: 0.6031746031746031
------------------------------------------------------------------------------------
Classification report:           Balanced Random Forest Classifier   
------------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00         1
          1       0.51      0.58      0.63      0.54      0.61      0.36        50
          2       0.68      0.63      0.57      0.65      0.60      0.36        75

avg / total       0.61      0.60      0.60      0.60      0.60      0.36       126

------------------------------------------------------------------------------------


### ClusterCentroids resampler:

In [48]:
# Resample the data using the ClusterCentroids resampler:
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 3, 1: 3, 2: 3})

In [49]:
# Train the Logistic Regression model using the resampled data:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [50]:
# Calculated the balanced accuracy score:
y_pred = model.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.3968253968253968


In [52]:
# Print the imbalanced classification report:

print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print("------------------------------------------------------------------------------------")
print(f"Classification report:           Cluster Centroids Undersampling   ")
print("------------------------------------------------------------------------------------")
print(classification_report_imbalanced(y_test, y_pred))
print("------------------------------------------------------------------------------------")

Accuracy score: 0.3968253968253968
------------------------------------------------------------------------------------
Classification report:           Cluster Centroids Undersampling   
------------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      0.78      0.00      0.00      0.00         1
          1       0.38      0.28      0.70      0.32      0.44      0.19        50
          2       0.59      0.48      0.51      0.53      0.49      0.24        75

avg / total       0.50      0.40      0.59      0.44      0.47      0.22       126

------------------------------------------------------------------------------------
