In [1]:
import os
import warnings
from collections import Counter
from urllib.parse import quote_plus

import pandas as pd

warnings.filterwarnings('ignore')

# Dependencies for interaction with database:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Machine Learning dependencies:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Create engine and link to local postgres database.
# The password is read from the PGPASSWORD environment variable so it is never
# stored in the notebook (a missing variable raises KeyError instead of silently
# connecting with a baked-in secret). The remaining settings fall back to the
# local defaults this notebook was written against.
db_user = os.environ.get("PGUSER", "postgres")
db_password = quote_plus(os.environ["PGPASSWORD"])  # escape URL-special characters such as '@' or '/'
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "Final_mh_project")

engine = create_engine(f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}")
connect = engine.connect()

In [3]:
# Open an ORM session bound to the engine created above.
session = Session(bind=engine)


In [4]:
# Load the full 2016 survey table from the database into a DataFrame.
clean_2016_df = pd.read_sql(
    'SELECT * FROM public."2016_surveydata"',
    con=connect,
)

In [6]:
# Preview the first five rows of the loaded survey data.
clean_2016_df.head(n=5)

Unnamed: 0,SurveyID,new_id,self_employed,company_size,tech_company,mh_coverage,mh_coverage_awareness,mh_employer_discussion,mh_resources_provided,mh_anonimity,...,age,gender,country_live,live_us_state,country_work,work_us_state,work_position,remote,quantile_age_1,quantile_age_2
0,2016,1,0,26-100,1,Not eligible for coverage / N/A,I am not sure,No,No,I don't know,...,39,male,United Kingdom,none,United Kingdom,none,Back-end Developer,Sometimes,"(38.0, 99.0]","(37.0, 39.0]"
1,2016,2,0,25-Jun,1,No,Yes,Yes,Yes,Yes,...,29,male,United States of America,Illinois,United States of America,Illinois,Back-end Developer|Front-end Developer,Never,"(28.0, 32.0]","(27.0, 29.0]"
2,2016,3,0,25-Jun,1,No,I am not sure,No,No,I don't know,...,38,male,United Kingdom,none,United Kingdom,none,Back-end Developer,Always,"(32.0, 38.0]","(37.0, 39.0]"
3,2016,4,0,25-Jun,0,Yes,Yes,No,No,No,...,43,female,United States of America,Illinois,United States of America,Illinois,Executive Leadership|Supervisor/Team Lead|Dev ...,Sometimes,"(38.0, 99.0]","(39.0, 44.0]"
4,2016,5,0,More than 1000,1,Yes,I am not sure,No,Yes,Yes,...,42,male,United Kingdom,none,United Kingdom,none,DevOps/SysAdmin|Support|Back-end Developer|Fro...,Sometimes,"(38.0, 99.0]","(39.0, 44.0]"


In [7]:
# Check data for insights:
print(clean_2016_df.shape)
print(clean_2016_df.columns.tolist())
# value_counts is a method and must be called; printing the bare attribute
# only shows the bound-method repr rather than any counts.
print(clean_2016_df.value_counts())

(1004, 55)
['SurveyID', 'new_id', 'self_employed', 'company_size', 'tech_company', 'mh_coverage', 'mh_coverage_awareness', 'mh_employer_discussion', 'mh_resources_provided', 'mh_anonimity', 'mh_medical_leave', 'mh_discussion_negative_impact', 'ph_discussion_negative_impact', 'mh_discussion_coworkers', 'mh_discussion_supervisors', 'mh_equal_ph', 'mh_observed_consequences_coworkers', 'prev_employers', 'prev_mh_benefits', 'prev_mh_benefits_awareness', 'prev_mh_discussion', 'prev_mh_resources', 'prev_mh_anonimity', 'prev_mh_discuss_negative_consequences', 'prev_ph_discuss_negative_consequences', 'prev_mh_discussion_coworkers', 'prev_mh_discussion_supervisors', 'prev_mh_importance_employer', 'prev_mh_consequences_coworkers', 'future_ph_specification', 'future_mh_specification', 'mh_hurt_on_career', 'mh_neg_view_coworkers', 'mh_sharing_friends_family', 'mh_bad_response_workplace', 'mh_for_others_bad_response_workplace', 'mh_family_history', 'mh_dx_past', 'mh_dx_current', 'yes_what_dx?', 'mh_

In [23]:
## Test:
# Dataset filtered on tech_company = 1 (i.e. "yes")
# Target: dx_mh_disorder
# Features: company_size, age, gender, country_live, identified_with_mh, mh_employer, employer_discus_mh, employer_provide_mh_coverage, treatment_mh_from_professional, employers_options_help, protected_anonymity_mh

In [24]:
# Preview the tech_company flag column (first five values) before filtering on it.
clean_2016_df.loc[:, "tech_company"].head(n=5)

0    1
1    1
2    1
3    0
4    1
Name: tech_company, dtype: int64

In [25]:
# Pull only the rows whose tech_company flag equals 1 straight from the database.
tech_df = pd.read_sql(
    sql='SELECT * FROM "2016_surveydata" WHERE "tech_company" = 1',
    con=connect,
)
# Display (rows, columns) of the filtered frame as the cell output.
tech_df.shape

(768, 55)