In [3]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Dependencies for interaction with database:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session


# Machine Learning dependencies:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Validation libraries
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve
from sklearn.model_selection import cross_val_score

from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Create engine and open a connection to the Postgres database.
# The SQLAlchemy URL (user, password, host, port, database) is read from the
# DATABASE_URL environment variable so that credentials are never stored in
# the notebook or its version history.
# NOTE(review): the password that was previously embedded in this cell should
# be treated as leaked and rotated.
import os

DATABASE_URL = os.environ.get("DATABASE_URL")
if not DATABASE_URL:
    raise RuntimeError(
        "DATABASE_URL is not set; export the SQLAlchemy URL of the survey "
        "database (postgresql://<user>:<password>@<host>:5432/postgres) "
        "before running this notebook."
    )

engine = create_engine(DATABASE_URL)
connect = engine.connect()

In [5]:
# Open an ORM session bound to the engine created above:
session = Session(bind=engine)


In [6]:
# Load every row and column of the survey_2016 table into a DataFrame:
survey_2016_query = 'SELECT * FROM "survey_2016"'
clean_2016_df = pd.read_sql(survey_2016_query, con=connect)

In [7]:
# Preview the first five rows of the loaded survey table:
clean_2016_df.head(n=5)

Unnamed: 0,SurveyID,new_id,self_employed,company_size,tech_company,mh_coverage,mh_coverage_awareness,mh_employer_discussion,mh_resources_provided,mh_anonimity,...,age,gender,country_live,live_us_state,country_work,work_us_state,work_position,remote,quantile_age_1,quantile_age_2
0,2016,1,0,26-100,1,Not eligible for coverage / N/A,I am not sure,No,No,I don't know,...,39,male,United Kingdom,none,United Kingdom,none,Back-end Developer,Sometimes,"(38.0, 99.0]","(37.0, 39.0]"
1,2016,2,0,25-Jun,1,No,Yes,Yes,Yes,Yes,...,29,male,United States of America,Illinois,United States of America,Illinois,Back-end Developer|Front-end Developer,Never,"(28.0, 32.0]","(27.0, 29.0]"
2,2016,3,0,25-Jun,1,No,I am not sure,No,No,I don't know,...,38,male,United Kingdom,none,United Kingdom,none,Back-end Developer,Always,"(32.0, 38.0]","(37.0, 39.0]"
3,2016,4,0,25-Jun,0,Yes,Yes,No,No,No,...,43,female,United States of America,Illinois,United States of America,Illinois,Executive Leadership|Supervisor/Team Lead|Dev ...,Sometimes,"(38.0, 99.0]","(39.0, 44.0]"
4,2016,5,0,More than 1000,1,Yes,I am not sure,No,Yes,Yes,...,42,male,United Kingdom,none,United Kingdom,none,DevOps/SysAdmin|Support|Back-end Developer|Fro...,Sometimes,"(38.0, 99.0]","(39.0, 44.0]"


In [8]:
# Check data for insights: dimensions, column names and row frequencies.
print(clean_2016_df.shape)
print(clean_2016_df.columns.tolist())
# value_counts must be *called*; printing the bare attribute only shows the
# bound-method repr (which embeds a dump of the whole frame).
# NOTE(review): on a whole DataFrame this counts identical rows; a per-column
# call such as clean_2016_df["tech_company"].value_counts() may be more useful.
print(clean_2016_df.value_counts())

(1004, 55)
['SurveyID', 'new_id', 'self_employed', 'company_size', 'tech_company', 'mh_coverage', 'mh_coverage_awareness', 'mh_employer_discussion', 'mh_resources_provided', 'mh_anonimity', 'mh_medical_leave', 'mh_discussion_negative_impact', 'ph_discussion_negative_impact', 'mh_discussion_coworkers', 'mh_discussion_supervisors', 'mh_equal_ph', 'mh_observed_consequences_coworkers', 'prev_employers', 'prev_mh_benefits', 'prev_mh_benefits_awareness', 'prev_mh_discussion', 'prev_mh_resources', 'prev_mh_anonimity', 'prev_mh_discuss_negative_consequences', 'prev_ph_discuss_negative_consequences', 'prev_mh_discussion_coworkers', 'prev_mh_discussion_supervisors', 'prev_mh_importance_employer', 'prev_mh_consequences_coworkers', 'future_ph_specification', 'future_mh_specification', 'mh_hurt_on_career', 'mh_neg_view_coworkers', 'mh_sharing_friends_family', 'mh_bad_response_workplace', 'mh_for_others_bad_response_workplace', 'mh_family_history', 'mh_dx_past', 'mh_dx_current', 'yes_what_dx?', 'mh_

In [9]:
##Test:
#Dataset filtered on tech_company = 1 (i.e. "yes")
#Target: mh_sought_pro_tx
#Features: company_size, age, gender, country_live, identified_with_mh, mh_employer, employer_discus_mh, employer_provide_mh_coverage,treatment_mh_from_professional, employers_options_help, protected_anonymity_mh

In [10]:
# Preview the first values of the tech_company column:
clean_2016_df.loc[:, "tech_company"].head()

0    1
1    1
2    1
3    0
4    1
Name: tech_company, dtype: int64

In [11]:
# Pull only the respondents whose tech_company flag equals 1:
tech_only_query = 'SELECT * FROM "survey_2016" WHERE "tech_company" = 1'
tech_df = pd.read_sql(sql=tech_only_query, con=connect)
tech_df.shape

(768, 55)

In [12]:
# Columns kept for modelling; the first one is used as the target further down,
# the remaining ones are the candidate predictors.
ml_columns = [
    "mh_sought_pro_tx",
    "mh_dx_pro",
    "company_size",
    "mh_discussion_coworkers",
    "mh_discussion_supervisors",
    "mh_employer_discussion",
    "prev_mh_discussion_coworkers",
    "prev_mh_discussion_supervisors",
    "mh_sharing_friends_family",
]
ml_df = tech_df[ml_columns]
ml_df

Unnamed: 0,mh_sought_pro_tx,mh_dx_pro,company_size,mh_discussion_coworkers,mh_discussion_supervisors,mh_employer_discussion,prev_mh_discussion_coworkers,prev_mh_discussion_supervisors,mh_sharing_friends_family
0,0,Yes,26-100,Maybe,Yes,No,Some of my previous employers,Some of my previous employers,Somewhat open
1,1,Yes,25-Jun,Maybe,Yes,Yes,"No, at none of my previous employers",Some of my previous employers,Somewhat open
2,1,No,25-Jun,Maybe,Maybe,No,Some of my previous employers,I don't know,Somewhat open
3,1,No,More than 1000,Maybe,Yes,No,"No, at none of my previous employers","No, at none of my previous employers",Somewhat open
4,0,No,26-100,Maybe,Yes,No,Some of my previous employers,"Yes, at all of my previous employers",Not applicable to me (I do not have a mental i...
...,...,...,...,...,...,...,...,...,...
763,0,No,100-500,Maybe,Yes,Yes,Some of my previous employers,"No, at none of my previous employers",Somewhat not open
764,1,Yes,100-500,Maybe,Maybe,No,Some of my previous employers,Some of my previous employers,Somewhat open
765,1,Yes,500-1000,Yes,Yes,No,Some of my previous employers,"No, at none of my previous employers",Very open
766,1,Yes,100-500,Yes,Yes,Yes,Some of my previous employers,Some of my previous employers,Somewhat open


In [13]:
# Encode dataset: replace every column of ml_df with integer category codes.

# A single LabelEncoder is re-fitted for each column in turn, so after this
# cell `le` only remembers the classes of the last column.
le = LabelEncoder()

# Every column of the modelling frame is encoded (target included).
features = ml_df.columns.tolist()

# assign() returns a new frame, so ml_df itself keeps its original labels.
# NOTE(review): the integer codes follow the encoder's label ordering, not a
# meaningful ranking of the answers - confirm this is acceptable for the model.
encoded_df = ml_df.assign(
    **{name: le.fit_transform(ml_df[name]) for name in features}
)

# Check:
encoded_df.head()

Unnamed: 0,mh_sought_pro_tx,mh_dx_pro,company_size,mh_discussion_coworkers,mh_discussion_supervisors,mh_employer_discussion,prev_mh_discussion_coworkers,prev_mh_discussion_supervisors,mh_sharing_friends_family
0,0,1,2,0,2,1,1,2,4
1,1,1,1,0,2,2,0,2,4
2,1,0,1,0,0,1,1,0,4
3,1,0,5,0,2,1,0,1,4
4,0,0,2,0,2,1,1,3,1


In [14]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every column of ml_df into a dense NumPy array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new keyword first and fall back to the
# old one for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# fit_transform does not modify its input, so ml_df can be passed directly.
# Despite its name, encoded_df1 is a 2-D ndarray, not a DataFrame.
encoded_df1 = encoder.fit_transform(ml_df)
encoded_df1

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 1., 0.]])

In [15]:
# Separate the label-encoded frame into target vector and feature matrix.
target_column = "mh_sought_pro_tx"

# Target:
y = encoded_df[target_column]

# Features: every column except the target.
X = encoded_df.drop(columns=[target_column])

In [23]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split with a fixed seed so the partition is repeatable;
# the test fraction is left at train_test_split's default.
split_parts = train_test_split(X, y, stratify=y, random_state=40)
X_train, X_test, y_train, y_test = split_parts
X_train.shape

(576, 8)

In [24]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression model; hyperparameters are gathered in one mapping.
logreg_params = {"solver": 'lbfgs', "max_iter": 200, "random_state": 1}
classifier = LogisticRegression(**logreg_params)

In [25]:
# Train the classifier on the training partition (the fitted estimator is
# returned and shown as the cell output):
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=200, random_state=1)

In [26]:
# Predict the held-out rows and line the predictions up with the true labels.
y_pred = classifier.predict(X_test)

# Both columns are plain arrays, so the frame gets a fresh 0..n-1 index.
results = pd.DataFrame(
    {
        "Prediction": y_pred,
        "Actual": y_test.to_numpy(),
    }
)
results.head(20)


Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,0
3,0,0
4,1,1
5,1,1
6,0,0
7,1,1
8,1,1
9,1,1


In [27]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label equals the true label:
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)

0.859375


In [28]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix for the held-out rows (rows = actual, columns = predicted):
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[69  7]
 [20 96]]


In [29]:
# Per-class precision, recall and F1 on the held-out rows:
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.91      0.84        76
           1       0.93      0.83      0.88       116

    accuracy                           0.86       192
   macro avg       0.85      0.87      0.86       192
weighted avg       0.87      0.86      0.86       192

