# Setup

In [None]:
import numpy as np
import pandas as pd
import datetime

In [None]:
# Load the raw OSMI survey exports. The paths are relative, so they resolve
# against the directory the notebook kernel is running in.
_survey_csv_by_year = {
    2019: 'data/OSMI 2019 Mental Health in Tech Survey Results.csv',
    2020: 'data/OSMI 2020 Mental Health in Tech Survey Results.csv',
}
data_2019 = pd.read_csv(_survey_csv_by_year[2019])
data_2020 = pd.read_csv(_survey_csv_by_year[2020])

In [None]:
data_2019.shape

In [None]:
data_2019.columns

In [None]:
data_2019.head()

In [None]:
data_2019.describe()

In [None]:
data_2020.shape

In [None]:
list(data_2020.columns)

In [None]:
data_2020.head()

In [None]:
data_2020.describe()

# Data Cleaning

## 2019 Data
### Dropping Columns


In [None]:
# Survey questions (raw column headers) kept for the 2019 analysis;
# every other column of the export is dropped.
valid_columns = [
    '*Are you self-employed?*',
    'How many employees does your company or organization have?',
    'Is your employer primarily a tech company/organization?',
    'Is your primary role within your company related to tech/IT?',
    'Does your employer provide mental health benefits as part of healthcare coverage?',
    'Do you know the options for mental health care available under your employer-provided health coverage?',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
    'Does your employer offer resources to learn more about mental health disorders and options for seeking help?',
    'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?',
    'Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?',
    'Would you feel comfortable discussing a mental health issue with your coworkers?',
    'Overall, how much importance does your employer place on mental health?',
    'Do you *currently* have a mental health disorder?',
    '*If possibly, what disorder(s) do you believe you have?*',
    '*If so, what disorder(s) were you diagnosed with?*',
    'What is your age?',
    'What is your gender?',
    'What country do you *live* in?',
    'What US state or territory do you *live* in?',
    'What country do you *work* in?',
    'What US state or territory do you *work* in?',
]
# Column selection by label list; raises KeyError if any header is missing.
data_2019 = data_2019.loc[:, valid_columns]

In [None]:
data_2019

### Standardizing Data

In [None]:
# Raw survey question -> short snake_case column name used from here on.
column_name_mapping = {
    '*Are you self-employed?*': 'self_employed',
    'How many employees does your company or organization have?': 'employee_count',
    'Is your employer primarily a tech company/organization?': 'is_tech_company',
    'Is your primary role within your company related to tech/IT?': 'is_tech_role',
    'Does your employer provide mental health benefits as part of healthcare coverage?': 'provide_mh_benefits',
    'Do you know the options for mental health care available under your employer-provided health coverage?': 'know_mh_coverage',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?': 'formal_discuss_mh',
    'Does your employer offer resources to learn more about mental health disorders and options for seeking help?': 'offer_mh_learning_resources',
    'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?': 'mh_leave_difficult',
    'Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?': 'discuss_mh_with_supervisor',
    'Would you feel comfortable discussing a mental health issue with your coworkers?': 'discuss_mh_with_coworkers',
    'Overall, how much importance does your employer place on mental health?': 'mh_importance',
    'Do you *currently* have a mental health disorder?': 'has_mh_disorder',
    '*If possibly, what disorder(s) do you believe you have?*': 'believed_mh_disorder',
    '*If so, what disorder(s) were you diagnosed with?*': 'diagnosed_mh_disorder',
    'What is your age?': 'age',
    'What is your gender?': 'gender',
    'What country do you *live* in?': 'live_country',
    'What US state or territory do you *live* in?': 'live_state',
    'What country do you *work* in?': 'work_country',
    'What US state or territory do you *work* in?': 'work_state',
}
# rename() returns a new frame; headers without an entry above are kept as-is.
data_2019 = data_2019.rename(columns=column_name_mapping)

In [None]:
data_2019.isna().any()

Several columns contain null values, which need to be addressed before the data can be standardized.

In [None]:
# Use -1 as the missing-value placeholder in every column; the later
# standardization cells (gender, disorders) test for this integer explicitly.
data_2019 = data_2019.fillna(value=-1)

In [None]:
# List the distinct values of every column to see which raw answers
# still need to be mapped to numeric codes.
for col in data_2019:
    distinct_values = data_2019[col].unique()
    print(f"{col}: {distinct_values}")

In [None]:
# map_dict = {}
# map_dict['Not eligible for coverage / N/A'] = -1
# map_dict['N/A'] = -1
# # map_dict["I don't know"] = 1
# map_dict['I am not sure'] = 1
# map_dict["Don't Know"] = 1
# map_dict['Maybe'] = 1
# map_dict["Yes"] = 2
# map_dict["No"] = 0
# map_dict.update({'1-5': 0, '6-25': 1, '26-100': 2, '100-500': 3, '500-1000': 4, 'More than 1000': 5})
# # need to specify 'I don't know' in 'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:'
# map_dict.update({'Very easy': 5, 'Somewhat easy': 4, 'Neither easy nor difficult': 3, 'Somewhat Difficult': 2, 'Difficult': 1, 'Very difficult': 0})

In [None]:
def standardize_bool(boolean):
    """Return the integer form of a cell via int() (True -> 1, False -> 0, -1 stays -1)."""
    return int(boolean)


# The three flag columns get the same element-wise integer conversion.
for flag_column in ('self_employed', 'is_tech_company', 'is_tech_role'):
    data_2019[flag_column] = data_2019[flag_column].apply(standardize_bool)
data_2019

In [None]:
# Company-size brackets encoded as ordinal codes, 0 for the smallest bracket.
# replace() only touches values that appear as keys, so the -1 placeholder
# is left unchanged.
employee_count_dict = {
    '1-5': 0,
    '6-25': 1,
    '26-100': 2,
    '100-500': 3,
    '500-1000': 4,
    'More than 1000': 5,
}
data_2019['employee_count'] = data_2019['employee_count'].replace(to_replace=employee_count_dict)
data_2019

In [None]:
# Answer -> code for the employer-benefit questions.
# "No" and "Not eligible for coverage / NA" deliberately share code 0.
yes_no_mapping = {
    "Yes": 2,
    "I don't know": 1,
    "No": 0,
    "Not eligible for coverage / NA": 0,
}

# Apply the same mapping to each of the four benefit-related columns.
for benefit_column in (
    'provide_mh_benefits',
    'know_mh_coverage',
    'formal_discuss_mh',
    'offer_mh_learning_resources',
):
    data_2019[benefit_column] = data_2019[benefit_column].replace(yes_no_mapping)
data_2019


In [None]:
# Ease-of-leave scale: higher code = easier (note the column is named
# 'mh_leave_difficult' even though 5 means "Very easy").
# "I don't know" shares the neutral code 3 with "Neither easy nor difficult".
difficulty_mapping = {
    'Very easy': 5,
    'Somewhat easy': 4,
    "I don't know": 3,
    'Neither easy nor difficult': 3,
    'Somewhat difficult': 2,
    'Difficult': 1,
    'Very difficult': 0,
}
data_2019['mh_leave_difficult'] = data_2019['mh_leave_difficult'].replace(to_replace=difficulty_mapping)
data_2019

In [None]:
# Three-level comfort scale used by both "would you feel comfortable
# discussing..." questions.
yes_no_maybe_mapping = {
    'Yes': 2,
    'Maybe': 1,
    'No': 0,
}
for comfort_column in ('discuss_mh_with_supervisor', 'discuss_mh_with_coworkers'):
    data_2019[comfort_column] = data_2019[comfort_column].replace(yes_no_maybe_mapping)
data_2019

In [None]:
# Convert every age cell with int(), then tabulate how often each age occurs.
data_2019['age'] = data_2019['age'].map(int)
data_2019['age'].value_counts()

In [None]:
# Free-text gender answers counted as "male". Matching is verbatim
# (case- and whitespace-sensitive), hence variants such as 'Male ' and 'Make'.
male = [
    'Male', 'male', 'm', 'M',
    'Let\'s keep it simple and say "male"', 'Identify as male',
    'Male ', 'Masculine', 'Cishet male',
    'Man', 'cis male', 'Cis Male', 'Trans man', 'man',
    'masculino', 'Make',
    'CIS Male',
]
# Free-text gender answers counted as "female" (also matched verbatim).
female = [
    'female', 'Female', 'F', 'f',
    'Woman', 'Female-identified', 'woman', 'cis woman',
    'Agender trans woman', 'Female ', 'femmina',
    # A missing comma used to fuse these two into the single entry
    # 'FemileFemale (cis)', so neither answer was ever recognised.
    'Femile', 'Female (cis)',
]


def gender_standardization(gender):
    """
    Map a raw gender answer to a numeric category.

    Returns:
        0  if the answer is listed in `male`
        1  if the answer is listed in `female`
        -1 if the answer is the integer missing-value placeholder -1
        2  for every other answer
    """
    if gender in male:
        return 0
    elif gender in female:
        return 1
    elif isinstance(gender, int) and gender == -1:
        # Placeholder written by fillna(-1); keep it distinguishable from "other".
        return -1
    else:
        return 2
    
# Replace each free-text gender answer with its numeric category code.
gender_codes = data_2019['gender'].map(gender_standardization)
data_2019['gender'] = gender_codes
data_2019

In [None]:
data_2019['gender'].value_counts()

In [None]:
# Four-level scale for "Do you currently have a mental health disorder?".
yes_no_possibly_mapping = {
    'Yes': 3,
    'Possibly': 2,
    "Don't Know": 1,
    'No': 0,
}
data_2019['has_mh_disorder'] = data_2019['has_mh_disorder'].replace(to_replace=yes_no_possibly_mapping)
data_2019['has_mh_disorder'].value_counts()

In [None]:
def standardize_disorders(disorders):
    """
    Split a comma-separated disorder string into a list of disorder names.

    Commas inside parentheses do not split an entry. The parenthesis
    characters themselves are dropped while the text between them is kept,
    e.g. "Mood Disorder (Depression, Bipolar Disorder, etc)" becomes
    "Mood Disorder Depression, Bipolar Disorder, etc".

    Args:
        disorders: the raw cell value. A string is parsed; a list is taken
            to be already standardized and is returned as a copy (so the
            calling cell can be re-run safely); anything else (the -1
            placeholder, NaN, None) is treated as "no disorders".

    Returns:
        A list of stripped, non-empty disorder names.
    """
    if isinstance(disorders, list):
        return list(disorders)
    if not isinstance(disorders, str):
        return []

    disorder_list = []
    current_chars = []
    depth = 0  # parenthesis nesting level; commas only split at depth 0
    for char in disorders:
        if char == "(":
            depth += 1
        elif char == ")":
            # Never go negative on an unbalanced closing parenthesis.
            depth = max(depth - 1, 0)
        elif char == "," and depth == 0:
            disorder_list.append("".join(current_chars).strip())
            current_chars = []
        else:
            current_chars.append(char)
    disorder_list.append("".join(current_chars).strip())
    # Drop empty names produced by "", trailing commas or doubled commas.
    return [name for name in disorder_list if name]

def to_1D(series):
    """
    Flatten an iterable of lists into a single pandas Series, so that
    Series methods such as value_counts can be used on the individual items.
    """
    flattened = []
    for items in series:
        flattened.extend(items)
    return pd.Series(flattened)

In [None]:
# Turn the comma-separated disorder strings of both columns into lists.
for disorder_column in ('believed_mh_disorder', 'diagnosed_mh_disorder'):
    data_2019[disorder_column] = data_2019[disorder_column].apply(standardize_disorders)

In [None]:
data_2019.describe()

## 2020 Data

### Fixing Data Inconsistency
The 2020 survey changes how disorders are reported, so we need to wrangle that data to match the other years.

In [None]:
def _string_cells_per_row(column_block):
    """For each row of the given frame slice, keep only the cells that hold a string."""
    return [
        [cell for cell in row if isinstance(cell, str)]
        for row in column_block.values.tolist()
    ]


# Collapse the per-disorder columns into one list-valued column per respondent;
# non-string cells (e.g. NaN) are dropped.
data_2020['believed_mh_disorder'] = _string_cells_per_row(
    data_2020.loc[:, 'Anxiety Disorder (Generalized, Social, Phobia, etc).1':'Other.1']
)
# NOTE(review): this reads exactly the same '.1' column range as
# 'believed_mh_disorder' above, so the two columns come out identical.
# Looks like a copy-paste slip; confirm which column range of the 2020
# export holds the *diagnosed* disorders and slice that instead.
data_2020['diagnosed_mh_disorder'] = _string_cells_per_row(
    data_2020.loc[:, 'Anxiety Disorder (Generalized, Social, Phobia, etc).1':'Other.1']
)
data_2020

### Dropping Columns

In [None]:
# Columns kept for the 2020 analysis: the raw survey questions plus the two
# list-valued disorder columns that already carry their final names.
valid_columns = [
    '*Are you self-employed?*',
    'How many employees does your company or organization have?',
    'Is your employer primarily a tech company/organization?',
    'Is your primary role within your company related to tech/IT?',
    'Does your employer provide mental health benefits as part of healthcare coverage?',
    'Do you know the options for mental health care available under your employer-provided health coverage?',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
    'Does your employer offer resources to learn more about mental health disorders and options for seeking help?',
    'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?',
    'Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?',
    'Have you ever discussed your mental health with your employer?',
    'Would you feel comfortable discussing a mental health issue with your coworkers?',
    'Have you ever discussed your mental health with coworkers?',
    "Have you ever had a coworker discuss their or another coworker's mental health with you?",
    'Overall, how much importance does your employer place on mental health?',
    'Do you *currently* have a mental health disorder?',
    'believed_mh_disorder',
    'diagnosed_mh_disorder',
    'What is your age?',
    'What is your gender?',
    'What country do you *live* in?',
    'What US state or territory do you *live* in?',
    'What country do you *work* in?',
    'What US state or territory do you *work* in?',
]
# Column selection by label list; raises KeyError if any header is missing.
data_2020 = data_2020.loc[:, valid_columns]

In [None]:
data_2020.head()

In [None]:
# Raw survey question -> short snake_case column name (2020 frame).
# NOTE(review): the three "Have you ever discussed ..." / "Have you ever had a
# coworker discuss ..." columns selected into this frame have no entry here,
# so they keep their full question text as column names — confirm whether
# they should be renamed or dropped.
column_name_mapping = {
    '*Are you self-employed?*': 'self_employed',
    'How many employees does your company or organization have?': 'employee_count',
    'Is your employer primarily a tech company/organization?': 'is_tech_company',
    'Is your primary role within your company related to tech/IT?': 'is_tech_role',
    'Does your employer provide mental health benefits as part of healthcare coverage?': 'provide_mh_benefits',
    'Do you know the options for mental health care available under your employer-provided health coverage?': 'know_mh_coverage',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?': 'formal_discuss_mh',
    'Does your employer offer resources to learn more about mental health disorders and options for seeking help?': 'offer_mh_learning_resources',
    'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?': 'mh_leave_difficult',
    'Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?': 'discuss_mh_with_supervisor',
    'Would you feel comfortable discussing a mental health issue with your coworkers?': 'discuss_mh_with_coworkers',
    'Overall, how much importance does your employer place on mental health?': 'mh_importance',
    'Do you *currently* have a mental health disorder?': 'has_mh_disorder',
    'What is your age?': 'age',
    'What is your gender?': 'gender',
    'What country do you *live* in?': 'live_country',
    'What US state or territory do you *live* in?': 'live_state',
    'What country do you *work* in?': 'work_country',
    'What US state or territory do you *work* in?': 'work_state',
}
# rename() returns a new frame; headers without an entry above are kept as-is.
data_2020 = data_2020.rename(columns=column_name_mapping)
data_2020.head()

There are still NaNs, so we fill them with -1 as a placeholder.

In [None]:
# Use -1 as the missing-value placeholder in every column of the 2020 frame.
data_2020 = data_2020.fillna(value=-1)

In [None]:
# List the distinct values of every column, skipping list-valued columns
# (detected from the cell at index label 0).
for col in data_2020:
    first_cell = data_2020[col][0]
    if isinstance(first_cell, list):
        continue
    distinct_values = data_2020[col].unique()
    print(f"{col}: {distinct_values}")

In [None]:
# Company-size brackets encoded as ordinal codes, 0 for the smallest bracket
# (same encoding as used for the 2019 frame).
employee_count_dict = {
    '1-5': 0,
    '6-25': 1,
    '26-100': 2,
    '100-500': 3,
    '500-1000': 4,
    'More than 1000': 5,
}
data_2020['employee_count'] = data_2020['employee_count'].replace(to_replace=employee_count_dict)
data_2020

In [None]:
# Inspect the distribution of the tech-company flag in the 2020 frame.
# NOTE(review): the 2019 frame passes self_employed / is_tech_company /
# is_tech_role through an int() conversion, but no matching conversion for
# the 2020 frame is visible in this part of the notebook — confirm both
# years end up with the same encoding before merging.
data_2020['is_tech_company'].value_counts()

In [None]:
data_2020['is_tech_role'].value_counts()

In [None]:
# Answer -> code for the employer-benefit questions (2020 frame).
# "No" and "Not eligible for coverage / NA" deliberately share code 0.
yes_no_mapping = {
    "Yes": 2,
    "I don't know": 1,
    "No": 0,
    "Not eligible for coverage / NA": 0,
}

# Apply the same mapping to each of the four benefit-related columns.
for benefit_column in (
    'provide_mh_benefits',
    'know_mh_coverage',
    'formal_discuss_mh',
    'offer_mh_learning_resources',
):
    data_2020[benefit_column] = data_2020[benefit_column].replace(yes_no_mapping)
data_2020


In [None]:
# Ease-of-leave scale: higher code = easier.
# "I don't know" shares the neutral code 3 with "Neither easy nor difficult".
difficulty_mapping = {
    'Very easy': 5,
    'Somewhat easy': 4,
    "I don't know": 3,
    'Neither easy nor difficult': 3,
    'Somewhat difficult': 2,
    'Difficult': 1,
    'Very difficult': 0,
}
data_2020['mh_leave_difficult'] = data_2020['mh_leave_difficult'].replace(to_replace=difficulty_mapping)
data_2020

In [None]:
# Three-level comfort scale used by both "would you feel comfortable
# discussing..." questions (2020 frame).
yes_no_maybe_mapping = {
    'Yes': 2,
    'Maybe': 1,
    'No': 0,
}
for comfort_column in ('discuss_mh_with_supervisor', 'discuss_mh_with_coworkers'):
    data_2020[comfort_column] = data_2020[comfort_column].replace(yes_no_maybe_mapping)
data_2020

In [None]:
# Four-level scale for "Do you currently have a mental health disorder?".
yes_no_possibly_mapping = {
    'Yes': 3,
    'Possibly': 2,
    "Don't Know": 1,
    'No': 0,
}
data_2020['has_mh_disorder'] = data_2020['has_mh_disorder'].replace(to_replace=yes_no_possibly_mapping)
data_2020

In [None]:
# Free-text gender answers counted as "male" / "female" for the 2020 frame.
# Matching is verbatim, so spelling and case variants are listed one by one.
male = [
    'Male', 'male', 'mail', 'M', 'm',
    'mostly male', 'cisgender male', 'MAle',
]
female = [
    'female', 'Female', 'F', 'f',
    'Woman', 'FEMALE', 'female, she/her',
]


def gender_standardization(gender):
    """
    Map a raw gender answer to a numeric category:
    0 = listed in `male`, 1 = listed in `female`,
    -1 = the integer missing-value placeholder, 2 = anything else.
    """
    if gender in male:
        code = 0
    elif gender in female:
        code = 1
    else:
        is_placeholder = isinstance(gender, int) and gender == -1
        code = -1 if is_placeholder else 2
    return code
    
# Replace each free-text gender answer with its numeric category code.
gender_codes_2020 = data_2020['gender'].apply(gender_standardization)
data_2020['gender'] = gender_codes_2020
data_2020

# Merging Dataframes

In [None]:
# Tag every row of each survey frame with its survey year, stored as a
# datetime set to January 1st of that year.
for _frame, _survey_year in ((data_2019, 2019), (data_2020, 2020)):
    _frame['year'] = datetime.datetime(_survey_year, 1, 1)

In [None]:
data_2019

In [None]:
data_2020['mh_leave_difficult'].value_counts()