In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Dependencies for interaction with database:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from config import password

# Machine Learning dependencies:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Create engine and link to local postgres database:
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/Final_project_mental_health')
connect = engine.connect()

In [3]:
# Create session:
session = Session(engine)

In [4]:
# Import clean_dataset_2016 table:
clean_2016_df = pd.read_sql("SELECT * FROM clean_dataset_2016", connect)

In [5]:
clean_2014_df =  pd.read_sql("SELECT * FROM clean_dataset_2016", connect)

In [6]:
clean_2014_df.head()

Unnamed: 0,index,self_employed,company_size,tech_or_not,related_tech_it,employer_provide_mh_coverage,employer_discus_mh,employers_options_help,protected_anonymity_mh,leave,...,suportive_response_to_mh,tech_industry_support_mh,age,gender,country_live,state_live,ethnicity,country_work,state_work,key
0,0,0,100-500,1.0,1.0,No,No,I don't know,I don't know,I don't know,...,"Yes, I experienced",1.0,27.0,Female,United Kingdom,,,United Kingdom,,45
1,1,0,100-500,1.0,1.0,Yes,No,No,I don't know,I don't know,...,Maybe/Not sure,2.0,31.0,male,United Kingdom,,,United Kingdom,,45
2,2,0,6-25,1.0,1.0,I don't know,I don't know,No,Yes,Difficult,...,"Yes, I experienced",1.0,36.0,male,United States of America,Missouri,White,United States of America,Missouri,45
3,3,0,More than 1000,1.0,1.0,Yes,I don't know,I don't know,Yes,Difficult,...,Maybe/Not sure,2.0,22.0,Male,United States of America,Washington,White,United States of America,Washington,45
4,4,1,,,,,,,,,...,"Yes, I observed",1.0,52.0,female,United States of America,Illinois,More than one of the above,United States of America,Illinois,45


In [7]:
clean_2016_df.head()

Unnamed: 0,index,self_employed,company_size,tech_or_not,related_tech_it,employer_provide_mh_coverage,employer_discus_mh,employers_options_help,protected_anonymity_mh,leave,...,suportive_response_to_mh,tech_industry_support_mh,age,gender,country_live,state_live,ethnicity,country_work,state_work,key
0,0,0,100-500,1.0,1.0,No,No,I don't know,I don't know,I don't know,...,"Yes, I experienced",1.0,27.0,Female,United Kingdom,,,United Kingdom,,45
1,1,0,100-500,1.0,1.0,Yes,No,No,I don't know,I don't know,...,Maybe/Not sure,2.0,31.0,male,United Kingdom,,,United Kingdom,,45
2,2,0,6-25,1.0,1.0,I don't know,I don't know,No,Yes,Difficult,...,"Yes, I experienced",1.0,36.0,male,United States of America,Missouri,White,United States of America,Missouri,45
3,3,0,More than 1000,1.0,1.0,Yes,I don't know,I don't know,Yes,Difficult,...,Maybe/Not sure,2.0,22.0,Male,United States of America,Washington,White,United States of America,Washington,45
4,4,1,,,,,,,,,...,"Yes, I observed",1.0,52.0,female,United States of America,Illinois,More than one of the above,United States of America,Illinois,45


In [8]:
# Check data for insights:
print(clean_2016_df.shape)
print(clean_2016_df.columns.tolist())
#print(clean_2016_df.value_counts)

(756, 50)
['index', 'self_employed', 'company_size', 'tech_or_not', 'related_tech_it', 'employer_provide_mh_coverage', 'employer_discus_mh', 'employers_options_help', 'protected_anonymity_mh', 'leave', 'mh_supervisor', 'mh_employer', 'mh_coworkers', 'employer_importance_ph', 'employer_importance_mh', 'private_mh_insurance', 'reveal_mh_clients', 'reveal_affect_relationship', 'reveal_mh_coworkers', 'reveal_impact_relationship', 'prev_employer', 'prev_tech_or_not', 'prev_provided_mh_coverage', 'prev_wellness_program', 'prev_anonymity_preserved_mh', 'prev_discuss_supervisors', 'prev_discuss_employer', 'prev_discuss_coworkers', 'prev_importance_ph', 'prev_importance_mh', 'currently_mh_disorder', 'dx_mh_disorder', 'treatment_mh_from_professional', 'mh_family_history', 'ph_interview', 'why_why_not', 'mh_interview', 'why_whynot', 'identified_with_mh', 'unsuportive_reponse_to_mh', 'suportive_response_to_mh', 'tech_industry_support_mh', 'age', 'gender', 'country_live', 'state_live', 'ethnicity',

In [9]:
clean_2016_df["leave"].value_counts()

Somewhat easy                 164
Very easy                     144
I don't know                  105
Neither easy nor difficult    102
Somewhat difficult             71
Difficult                      57
Name: leave, dtype: int64

In [10]:
values = clean_2016_df["leave"].tolist()
values

["I don't know",
 "I don't know",
 'Difficult',
 'Difficult',
 None,
 'Somewhat easy',
 'Very easy',
 'Somewhat easy',
 'Very easy',
 None,
 "I don't know",
 'Difficult',
 'Neither easy nor difficult',
 'Neither easy nor difficult',
 'Very easy',
 'Somewhat easy',
 'Neither easy nor difficult',
 None,
 'Somewhat easy',
 'Neither easy nor difficult',
 'Neither easy nor difficult',
 'Somewhat difficult',
 'Difficult',
 'Difficult',
 "I don't know",
 'Very easy',
 'Somewhat difficult',
 'Somewhat easy',
 'Neither easy nor difficult',
 'Very easy',
 None,
 'Somewhat difficult',
 'Very easy',
 'Somewhat easy',
 None,
 "I don't know",
 'Difficult',
 'Difficult',
 'Somewhat easy',
 None,
 'Somewhat easy',
 'Very easy',
 'Neither easy nor difficult',
 "I don't know",
 None,
 'Very easy',
 'Very easy',
 'Difficult',
 'Somewhat difficult',
 None,
 'Difficult',
 'Difficult',
 "I don't know",
 'Somewhat easy',
 'Difficult',
 'Very easy',
 'Somewhat difficult',
 'Somewhat difficult',
 None,
 'Very 

In [11]:
clean_2016_df["leave"].value_counts()
values = clean_2016_df["leave"].tolist()
clean_2016_df["leave"] = clean_2016_df["leave"].replace(["Somewhat easy", "Somewhat difficult"],["Very easy", "Difficult"])
clean_2016_df["leave"].tolist()
clean_2016_df["leave"].value_counts

<bound method IndexOpsMixin.value_counts of 0      I don't know
1      I don't know
2         Difficult
3         Difficult
4              None
           ...     
751       Very easy
752       Very easy
753            None
754       Very easy
755            None
Name: leave, Length: 756, dtype: object>

In [12]:
# Filter tech_or_not columns:
clean_2016_df["tech_or_not"].head()

tech_df = pd.read_sql("SELECT * FROM clean_dataset_2016 WHERE tech_or_not = 1", connect)
tech_df.shape

(501, 50)

In [13]:
# Log-in tech_df:
table_name = "work_in_tech_2016"
tech_df.to_sql(
    table_name,
    engine,
    if_exists = "replace")

In [14]:
role_tech = pd.read_sql("SELECT * FROM work_in_tech_2016 WHERE related_tech_it = 1", connect)
print(role_tech.shape)

(460, 51)


In [15]:
role_it = pd.read_sql("SELECT * FROM work_in_tech_2016 WHERE related_tech_it = 0", connect)
print(role_it.shape)

(41, 51)


In [16]:
ml_df =  pd.read_sql("SELECT * FROM clean_dataset_2016 WHERE tech_or_not = 1", connect)

In [17]:
ml_df["leave"] = clean_2016_df["leave"]
ml_df["leave"].value_counts()

Very easy                     209
Difficult                      83
Neither easy nor difficult     69
I don't know                   64
Name: leave, dtype: int64

In [18]:
# Target: dx_mh_disorder Can we predict if an individual can take a leave based on his mental health conditions prev and current employer.
# features: company_size/employer_provide_mh_coverage/protected_anonymity_mh/leave/employer_importance_mh/prev_employer/prev_provided_mh_coverage
#prev_anonymity_preserved_mh/prev_importance_mh/currently_mh_disorder/dx_mh_disorder/treatment_mh_from_professional/mh_family_history/age/gender
#identified_with_mh/employers_options_help/country_live/ethnicity/country_work/state_work

In [19]:
ml_df = ml_df[["identified_with_mh", "employer_provide_mh_coverage", "protected_anonymity_mh", "leave", "employer_importance_mh", "prev_employer",
                  "prev_provided_mh_coverage", "prev_anonymity_preserved_mh", "prev_importance_mh", "currently_mh_disorder", "dx_mh_disorder", "treatment_mh_from_professional",
                  "mh_family_history", "age", "gender", "country_live", "country_work"]]
ml_df["leave"] = ml_df["leave"].replace(["Somewhat easy", "Somewhat difficult", "I don't know"],["Very easy", "Difficult", "Neither easy nor difficult"])                
ml_df.head()
print(ml_df.shape)
print(ml_df["leave"].value_counts())

(501, 17)
Very easy                     209
Neither easy nor difficult    133
Difficult                      83
Name: leave, dtype: int64


In [20]:
ml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   identified_with_mh              501 non-null    float64
 1   employer_provide_mh_coverage    501 non-null    object 
 2   protected_anonymity_mh          501 non-null    object 
 3   leave                           425 non-null    object 
 4   employer_importance_mh          501 non-null    float64
 5   prev_employer                   501 non-null    int64  
 6   prev_provided_mh_coverage       434 non-null    object 
 7   prev_anonymity_preserved_mh     434 non-null    object 
 8   prev_importance_mh              434 non-null    float64
 9   currently_mh_disorder           501 non-null    object 
 10  dx_mh_disorder                  204 non-null    object 
 11  treatment_mh_from_professional  501 non-null    int64  
 12  mh_family_history               501 

In [21]:
# Encode dataset:

# Create label encoder instance:
le = LabelEncoder()

# Make a copy of desire data:
encoded_df = ml_df.copy()

# Encode all desired columns:
features = encoded_df.columns.tolist()
for feature in features:
    encoded_df[feature] = le.fit_transform(encoded_df[feature])

# Check:
encoded_df.head()

Unnamed: 0,identified_with_mh,employer_provide_mh_coverage,protected_anonymity_mh,leave,employer_importance_mh,prev_employer,prev_provided_mh_coverage,prev_anonymity_preserved_mh,prev_importance_mh,currently_mh_disorder,dx_mh_disorder,treatment_mh_from_professional,mh_family_history,age,gender,country_live,country_work
0,0,1,0,1,0,1,0,3,3,2,2,1,1,7,6,33,32
1,0,3,0,1,2,1,2,0,2,2,2,0,1,11,35,33,32
2,1,0,2,0,1,1,2,0,0,3,1,1,2,16,35,34,33
3,0,3,2,0,5,0,4,4,11,3,1,1,0,2,13,34,33
4,0,3,2,3,5,1,1,3,3,1,2,0,2,10,35,34,33


In [22]:
# Create our target:
y = encoded_df["leave"]

# Create our features:
X = encoded_df.drop(columns = "leave", axis =1)

In [23]:
# Split the data:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, test_size = 0.25)

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier:
rf_model = RandomForestClassifier(n_estimators=100, random_state=1) 

# Fitting the model:
rf_model = rf_model.fit(X_train, y_train)

# Making predictions using the testing data:
predictions = rf_model.predict(X_test)

In [25]:
X.describe()

Unnamed: 0,identified_with_mh,employer_provide_mh_coverage,protected_anonymity_mh,employer_importance_mh,prev_employer,prev_provided_mh_coverage,prev_anonymity_preserved_mh,prev_importance_mh,currently_mh_disorder,dx_mh_disorder,treatment_mh_from_professional,mh_family_history,age,gender,country_live,country_work
count,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0
mean,0.10978,1.886228,0.706587,5.315369,0.866267,1.844311,1.263473,4.702595,1.916168,1.58483,0.58483,1.191617,14.241517,17.936128,29.47505,28.588822
std,0.312928,1.316446,0.929374,2.369883,0.340705,1.288298,1.571764,3.340265,1.037766,0.509205,0.493244,0.819274,7.838849,10.785356,9.176812,9.018791
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,4.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,0.0,9.0,13.0,33.0,32.0
50%,0.0,3.0,0.0,5.0,1.0,2.0,0.0,4.0,2.0,2.0,1.0,1.0,13.0,13.0,34.0,33.0
75%,0.0,3.0,2.0,7.0,1.0,3.0,3.0,6.0,3.0,2.0,1.0,2.0,19.0,30.0,34.0,33.0
max,1.0,3.0,2.0,10.0,1.0,4.0,4.0,11.0,3.0,2.0,1.0,2.0,41.0,39.0,34.0,33.0


In [26]:
# Predict outcomes for test data set:
predictions = rf_model.predict(X_test)
pd.DataFrame({"Actual": y_test, "Prediction": predictions})

Unnamed: 0,Actual,Prediction
135,2,1
182,2,1
145,1,2
57,0,1
146,2,2
...,...,...
424,2,2
108,0,0
283,1,2
262,0,1


In [27]:
# Calculated the balanced accuracy score
from sklearn.metrics import accuracy_score

y_pred = predictions
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.3968253968253968


In [28]:
# Print the imbalanced classification report:
from imblearn.metrics import classification_report_imbalanced

print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print("------------------------------------------------------------------------------------")
print(f"Classification report:           Balanced Random Forest Classifier   ")
print("------------------------------------------------------------------------------------")
print(classification_report_imbalanced(y_test, y_pred))
print("------------------------------------------------------------------------------------")

Accuracy score: 0.3968253968253968
------------------------------------------------------------------------------------
Classification report:           Balanced Random Forest Classifier   
------------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

          0       0.12      0.05      0.93      0.07      0.21      0.04        21
          1       0.27      0.21      0.80      0.24      0.41      0.16        33
          2       0.48      0.77      0.40      0.59      0.55      0.32        53
          3       0.14      0.05      0.94      0.08      0.22      0.05        19

avg / total       0.32      0.40      0.67      0.34      0.41      0.19       126

------------------------------------------------------------------------------------


### Naive Random Oversampling

In [29]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({1: 156, 2: 156, 0: 156, 3: 156})

In [30]:
# Train the Logistic Regression model using the resampled data:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [31]:
# Predict outcomes for test data set
predictions = model.predict(X_test)
pd.DataFrame({"Actual": y_test, "Prediction": predictions,})

Unnamed: 0,Actual,Prediction
135,2,3
182,2,3
145,1,2
57,0,1
146,2,3
...,...,...
424,2,0
108,0,0
283,1,0
262,0,2


In [32]:
# Calculated the balanced accuracy score:

y_pred = model.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.29365079365079366


In [33]:
# Print the imbalanced classification report:

from imblearn.metrics import classification_report_imbalanced

print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print("------------------------------------------------------------------------------------")
print(f"Classification report:           Naive Random Oversampling   ")
print("------------------------------------------------------------------------------------")
print(classification_report_imbalanced(y_test, y_pred))
print("------------------------------------------------------------------------------------")

Accuracy score: 0.29365079365079366
------------------------------------------------------------------------------------
Classification report:           Naive Random Oversampling   
------------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

          0       0.17      0.19      0.81      0.18      0.39      0.14        21
          1       0.34      0.39      0.73      0.37      0.54      0.28        33
          2       0.42      0.32      0.68      0.37      0.47      0.21        53
          3       0.12      0.16      0.80      0.14      0.36      0.12        19

avg / total       0.31      0.29      0.74      0.30      0.46      0.20       126

------------------------------------------------------------------------------------


### Clusters centroids Undersampling:

In [34]:
# Resample the data using the ClusterCentroids resampler:
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 57, 1: 57, 2: 57, 3: 57})

In [35]:
# Train the Logistic Regression model using the resampled data:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [37]:
# Calculated the balanced accuracy score:
y_pred = model.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.24603174603174602


In [38]:
# Print the imbalanced classification report:

print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print("------------------------------------------------------------------------------------")
print(f"Classification report:           Cluster Centroids Undersampling   ")
print("------------------------------------------------------------------------------------")
print(classification_report_imbalanced(y_test, y_pred))
print("------------------------------------------------------------------------------------")

Accuracy score: 0.24603174603174602
------------------------------------------------------------------------------------
Classification report:           Cluster Centroids Undersampling   
------------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

          0       0.21      0.19      0.86      0.20      0.40      0.15        21
          1       0.29      0.36      0.69      0.32      0.50      0.24        33
          2       0.33      0.11      0.84      0.17      0.31      0.09        53
          3       0.19      0.47      0.64      0.27      0.55      0.30        19

avg / total       0.28      0.25      0.77      0.23      0.41      0.17       126

------------------------------------------------------------------------------------


### Combination (Over and Under) Sampling

In [39]:
# Resample the training data with SMOTEENN:
# Warning: This is a large dataset, and this step may take some time to complete:

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

In [40]:
# Train the Logistic Regression model using the resampled data:

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [41]:
# Calculated the balanced accuracy score:

y_pred = model.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.2619047619047619


In [42]:
# Print the imbalanced classification report:

print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print("------------------------------------------------------------------------------------")
print(f"Classification report:           SMOTEENN Combination (Over and Under) Sampling   ")
print("------------------------------------------------------------------------------------")
print(classification_report_imbalanced(y_test, y_pred))
print("------------------------------------------------------------------------------------")

Accuracy score: 0.2619047619047619
------------------------------------------------------------------------------------
Classification report:           SMOTEENN Combination (Over and Under) Sampling   
------------------------------------------------------------------------------------
                   pre       rec       spe        f1       geo       iba       sup

          0       0.30      0.33      0.85      0.32      0.53      0.27        21
          1       0.23      0.18      0.78      0.20      0.38      0.13        33
          2       0.52      0.21      0.86      0.30      0.42      0.17        53
          3       0.16      0.47      0.56      0.24      0.52      0.26        19

avg / total       0.36      0.26      0.79      0.27      0.44      0.19       126

------------------------------------------------------------------------------------
