# Setup

In [None]:
import numpy as np
import pandas as pd
import datetime

In [None]:
# Load the raw OSMI survey responses into a DataFrame.
# NOTE(review): the variable is called data_2018 but the file name says 2019 —
# confirm which survey year this notebook is meant to analyse.
data_2018 = pd.read_csv(filepath_or_buffer='data/OSMI 2019 Mental Health in Tech Survey Results.csv')

In [None]:
# Show the (rows, columns) dimensions of the loaded survey frame.
data_2018.shape

In [None]:
# Show the column index (the survey question texts) of the loaded frame.
data_2018.columns

In [None]:
# Preview the first rows of the raw survey data.
data_2018.head()

In [None]:
# Summary statistics of the raw frame (with pandas' defaults, describe() on a
# mixed-dtype frame reports the numeric columns only).
data_2018.describe()

In [None]:
# Show every column name as a plain Python list (not truncated like the Index repr).
[*data_2018.columns]

In [None]:
# Collapse the block of per-disorder columns (from 'Anxiety Disorder ... .1'
# through 'Other.1') into a single list per respondent, keeping only the
# cells that hold a string; every non-string cell is dropped.
believed_rows = data_2018.loc[:, 'Anxiety Disorder (Generalized, Social, Phobia, etc).1':'Other.1'].values.tolist()
data_2018['believed_mh_disorder'] = [
    [answer for answer in row if isinstance(answer, str)]
    for row in believed_rows
]

In [None]:
# Collapse a block of per-disorder columns into a single list per respondent,
# keeping only the cells that hold a string.
# NOTE(review): this selects exactly the same '.1' column range as the
# `believed_mh_disorder` cell, so both derived columns end up with identical
# contents. This looks like a copy-paste slip — confirm which column block
# (e.g. the unsuffixed or '.2' disorder columns) holds the diagnosed disorders.
diagnosed_rows = data_2018.loc[:, 'Anxiety Disorder (Generalized, Social, Phobia, etc).1':'Other.1'].values.tolist()
data_2018['diagnosed_mh_disorder'] = [
    [answer for answer in row if isinstance(answer, str)]
    for row in diagnosed_rows
]

In [None]:
# Display the frame to check the two list-valued disorder columns added above.
data_2018

# Data Cleaning

## 2018 Data
### Dropping Columns


In [None]:
# Columns kept for the analysis; everything else in the raw survey is dropped.
valid_columns = [
    # Employment context
    '<strong>Are you self-employed?</strong>',
    'How many employees does your company or organization have?',
    'Is your employer primarily a tech company/organization?',
    'Is your primary role within your company related to tech/IT?',
    # Employer mental-health support
    'Does your employer provide mental health benefits as part of healthcare coverage?',
    'Do you know the options for mental health care available under your employer-provided health coverage?',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
    'Does your employer offer resources to learn more about mental health disorders and options for seeking help?',
    'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?',
    # Talking about mental health at work
    'Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?',
    'Have you ever discussed your mental health with your employer?',
    'Would you feel comfortable discussing a mental health issue with your coworkers?',
    'Have you ever discussed your mental health with coworkers?',
    "Have you ever had a coworker discuss their or another coworker's mental health with you?",
    'Overall, how much importance does your employer place on mental health?',
    # Respondent's own mental health (the two list columns are derived earlier in the notebook)
    'Do you currently have a mental health disorder?',
    'believed_mh_disorder',
    'diagnosed_mh_disorder',
    # Demographics and location
    'What is your age?',
    'What is your gender?',
    'What country do you <strong>live</strong> in?',
    'What US state or territory do you <strong>live</strong> in?',
    'What country do you <strong>work</strong> in?',
    'What US state or territory do you <strong>work</strong> in?',
]
# Keep all rows, restrict to the selected columns (in the order listed above).
data_2018 = data_2018.loc[:, valid_columns]

In [None]:
# Display the frame after restricting it to the selected columns.
data_2018

### Standardizing Data

In [None]:
# Short snake_case names for the long survey-question column headers.
# NOTE(review): three retained columns have no entry here and therefore keep
# their full question text as the column name:
#   'Have you ever discussed your mental health with your employer?'
#   'Have you ever discussed your mental health with coworkers?'
#   "Have you ever had a coworker discuss their or another coworker's mental health with you?"
# Confirm whether that is intentional before adding short names for them.
column_name_mapping = {
    '<strong>Are you self-employed?</strong>': 'self_employed',
    'How many employees does your company or organization have?': 'employee_count',
    'Is your employer primarily a tech company/organization?': 'is_tech_company',
    'Is your primary role within your company related to tech/IT?': 'is_tech_role',
    'Does your employer provide mental health benefits as part of healthcare coverage?': 'provide_mh_benefits',
    'Do you know the options for mental health care available under your employer-provided health coverage?': 'know_mh_coverage',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?': 'formal_discuss_mh',
    'Does your employer offer resources to learn more about mental health disorders and options for seeking help?': 'offer_mh_learning_resources',
    'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?': 'mh_leave_difficult',
    'Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?': 'discuss_mh_with_supervisor',
    'Would you feel comfortable discussing a mental health issue with your coworkers?': 'discuss_mh_with_coworkers',
    'Overall, how much importance does your employer place on mental health?': 'mh_importance',
    'Do you currently have a mental health disorder?': 'has_mh_disorder',
    'What is your age?': 'age',
    'What is your gender?': 'gender',
    'What country do you <strong>live</strong> in?': 'live_country',
    'What US state or territory do you <strong>live</strong> in?': 'live_state',
    'What country do you <strong>work</strong> in?': 'work_country',
    'What US state or territory do you <strong>work</strong> in?': 'work_state',
}
# rename() returns a new frame; columns without a mapping entry are left untouched.
data_2018 = data_2018.rename(column_name_mapping, axis='columns')
data_2018.head()

Several columns contain null values; these need to be handled first, before the answers are encoded.

In [None]:
# Replace every missing value with the sentinel -1 (applied to all columns).
data_2018 = data_2018.fillna(value=-1)

In [None]:
# Print the distinct values of each column so the categorical answers can be
# inspected before encoding. Columns whose first cell is a list are skipped.
for col in data_2018:
    # Use positional .iloc[0] rather than the label lookup [0]: the label form
    # raises KeyError as soon as the index no longer contains the label 0
    # (e.g. after rows have been filtered or the index has been changed).
    first_value = data_2018[col].iloc[0]
    if not isinstance(first_value, list):
        print(f"{col}: {data_2018[col].unique()}")

In [None]:
# Ordinal encoding of the company-size brackets, smallest (0) to largest (5).
employee_count_dict = {
    '1-5': 0,
    '6-25': 1,
    '26-100': 2,
    '100-500': 3,
    '500-1000': 4,
    'More than 1000': 5,
}
# replace() leaves any value that is not a key of the mapping unchanged.
employee_sizes = data_2018['employee_count']
data_2018['employee_count'] = employee_sizes.replace(to_replace=employee_count_dict)
data_2018

In [None]:
# Frequency of each distinct value in the is_tech_company column.
data_2018['is_tech_company'].value_counts()

In [None]:
# Frequency of each distinct value in the is_tech_role column.
data_2018['is_tech_role'].value_counts()

In [None]:
# Numeric codes for the yes/no-style answers. Note that "No" and
# "Not eligible for coverage / NA" share the code 0.
yes_no_mapping = {
    "Yes": 2,
    "I don't know": 1,
    "No": 0,
    "Not eligible for coverage / NA": 0,
}

# Apply the same encoding to each employer-support column; values that are
# not keys of the mapping are left unchanged by replace().
for yes_no_col in (
    'provide_mh_benefits',
    'know_mh_coverage',
    'formal_discuss_mh',
    'offer_mh_learning_resources',
):
    data_2018[yes_no_col] = data_2018[yes_no_col].replace(yes_no_mapping)
data_2018

In [None]:
# Numeric codes for the medical-leave question: higher means easier.
# "I don't know" shares the midpoint code 3 with 'Neither easy nor difficult'.
difficulty_mapping = {
    'Very easy': 5,
    'Somewhat easy': 4,
    "I don't know": 3,
    'Neither easy nor difficult': 3,
    'Somewhat difficult': 2,
    'Difficult': 1,
    'Very difficult': 0,
}
leave_answers = data_2018['mh_leave_difficult']
data_2018['mh_leave_difficult'] = leave_answers.replace(difficulty_mapping)
data_2018

In [None]:
# Numeric codes for the "would you feel comfortable discussing" questions.
yes_no_maybe_mapping = {
    'Yes': 2,
    'Maybe': 1,
    'No': 0,
}
# Same encoding for both the supervisor and the coworker question.
for comfort_col in ('discuss_mh_with_supervisor', 'discuss_mh_with_coworkers'):
    data_2018[comfort_col] = data_2018[comfort_col].replace(yes_no_maybe_mapping)
data_2018

In [None]:
# Numeric codes for the "currently have a mental health disorder" answers,
# from 'No' (0) up to 'Yes' (3).
yes_no_possibly_mapping = {
    'Yes': 3,
    'Possibly': 2,
    "Don't Know": 1,
    'No': 0,
}
disorder_answers = data_2018['has_mh_disorder']
data_2018['has_mh_disorder'] = disorder_answers.replace(yes_no_possibly_mapping)
data_2018

In [None]:
# Free-text spellings that are treated as "male" / "female". The lists are
# kept as-is for anything else that reads them; matching uses the
# normalised sets below.
male = ['Male', 'male', 'mail', 'M', 'm', 'mostly male', 'cisgender male', 'MAle']
female = ['female', 'Female', 'F', 'f', 'Woman', 'FEMALE', 'female, she/her',]

# Stripped, case-folded versions of the spellings above, so that variants that
# differ only in case or surrounding whitespace ("MALE", " Female ", "woman")
# are recognised instead of silently falling into the "other" bucket.
_male_normalized = frozenset(label.strip().casefold() for label in male)
_female_normalized = frozenset(label.strip().casefold() for label in female)


def gender_standardization(gender):
    """Map a raw gender answer to a numeric code.

    Parameters
    ----------
    gender : str or int
        The raw survey answer, or the integer sentinel -1 used for a
        missing answer.

    Returns
    -------
    int
        0 for a recognised male spelling, 1 for a recognised female spelling,
        -1 for the missing-value sentinel, and 2 for anything else.
    """
    if isinstance(gender, str):
        normalized = gender.strip().casefold()
        if normalized in _male_normalized:
            return 0
        if normalized in _female_normalized:
            return 1
        return 2
    # Accept numpy integers too: a -1 read back out of an integer-typed
    # column is a numpy scalar, not a Python int.
    if isinstance(gender, (int, np.integer)) and gender == -1:
        return -1
    return 2
    
# Encode each raw gender answer with gender_standardization (element-wise).
# Re-running this cell on already-encoded values would send 0 and 1 to 2,
# so run it once per fresh load.
gender_codes = data_2018['gender'].map(gender_standardization)
data_2018['gender'] = gender_codes
data_2018