In [None]:
import numpy as np
import pandas as pd

# Setup

In [None]:
# Load the raw OSMI 2016 survey results; the path is relative to the notebook's working directory.
data_2016 = pd.read_csv('data/OSMI_2016_Mental_Health_in_Tech_Survey_Results.csv')

In [None]:
# Size of the loaded frame as (rows, columns).
data_2016.shape

In [None]:
# Preview the first rows to sanity-check the load.
data_2016.head()

In [None]:
# List the raw column labels (the survey question texts) before selecting a subset.
data_2016.columns

# Data Cleaning
## 2016 Data
### Dropping columns

In [None]:
# Survey questions (verbatim column headers) kept for the analysis;
# every other column of the raw export is dropped by the selection below.
valid_columns = [
    'Are you self-employed?',
    'How many employees does your company or organization have?',
    'Is your employer primarily a tech company/organization?',
    'Is your primary role within your company related to tech/IT?',
    'Does your employer provide mental health benefits as part of healthcare coverage?',
    'Do you know the options for mental health care available under your employer-provided coverage?',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
    'Does your employer offer resources to learn more about mental health concerns and options for seeking help?',
    'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:',
    'Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?',
    'Would you feel comfortable discussing a mental health disorder with your coworkers?',
    'Do you currently have a mental health disorder?',
    'If maybe, what condition(s) do you believe you have?',
    'If yes, what condition(s) have you been diagnosed with?',
    'What is your age?',
    'What is your gender?',
    'What country do you live in?',
    'What US state or territory do you live in?',
    'What country do you work in?',
    'What US state or territory do you work in?',
]
data_2016 = data_2016[valid_columns]

### Standardizing data

In [None]:
# Verbatim survey question -> short snake_case column name used by the rest of the notebook.
column_name_mapping = {
    'Are you self-employed?':
        'self_employed',
    'How many employees does your company or organization have?':
        'employee_count',
    'Is your employer primarily a tech company/organization?':
        'is_tech_company',
    'Is your primary role within your company related to tech/IT?':
        'is_tech_role',
    'Does your employer provide mental health benefits as part of healthcare coverage?':
        'provide_mh_benefits',
    'Do you know the options for mental health care available under your employer-provided coverage?':
        'know_mh_coverage',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?':
        'formal_discuss_mh',
    'Does your employer offer resources to learn more about mental health concerns and options for seeking help?':
        'offer_mh_learning_resources',
    'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:':
        'mh_leave_difficult',
    'Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?':
        'discuss_mh_with_supervisor',
    'Would you feel comfortable discussing a mental health disorder with your coworkers?':
        'discuss_mh_with_coworkers',
    'Do you currently have a mental health disorder?':
        'has_mh_disorder',
    'If maybe, what condition(s) do you believe you have?':
        'believed_mh_disorder',
    'If yes, what condition(s) have you been diagnosed with?':
        'diagnosed_mh_disorder',
    'What is your age?':
        'age',
    'What is your gender?':
        'gender',
    'What country do you live in?':
        'live_country',
    'What US state or territory do you live in?':
        'live_state',
    'What country do you work in?':
        'work_country',
    'What US state or territory do you work in?':
        'work_state',
}
data_2016 = data_2016.rename(columns=column_name_mapping)

In [None]:
# Per-column flag: True where the column contains at least one missing value.
data_2016.isna().any()

In [None]:
# Replace every missing value, in every column, with the placeholder -1.
# Later cells treat -1 as the "no answer" marker, so it must stay distinct from real codes.
data_2016 = data_2016.fillna(-1)

In [None]:
# Print the distinct values of every column to see which ones need recoding.
for col in data_2016.columns:
    print(f"{col}: {data_2016[col].unique()}")

In [None]:
def standardize_float(value):
    """Cast a single cell value to a Python int via ``int(value)``."""
    return int(value)


# Apply the int cast element-wise to the columns that should hold whole numbers.
for column in ('is_tech_company', 'is_tech_role', 'age'):
    data_2016[column] = data_2016[column].apply(standardize_float)

In [None]:
# Company-size brackets listed from smallest to largest; each label's
# position in the list becomes its ordinal code (0..5).
employee_count_dict = {
    bracket: rank
    for rank, bracket in enumerate(
        ['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000']
    )
}
data_2016['employee_count'] = data_2016['employee_count'].replace(employee_count_dict)

In [None]:
# Ordinal codes for the employer-support questions:
# 2 = yes, 1 = respondent unsure, 0 = no / not applicable.
yes_no_mapping = {
    "Yes": 2,
    "I don't know": 1,
    "I am not sure": 1,
    "No": 0,
    "Not eligible for coverage / N/A": 0,
}

# Every one of these columns is recoded with the same mapping.
for column in (
    'provide_mh_benefits',
    'know_mh_coverage',
    'formal_discuss_mh',
    'offer_mh_learning_resources',
):
    data_2016[column] = data_2016[column].replace(yes_no_mapping)

In [None]:
# Ease of requesting mental-health leave: higher code = easier.
# "I don't know" shares the neutral midpoint code with "Neither easy nor difficult".
difficulty_mapping = {
    'Very easy': 5,
    'Somewhat easy': 4,
    'Neither easy nor difficult': 3,
    "I don't know": 3,
    'Somewhat difficult': 2,
    'Difficult': 1,
    'Very difficult': 0,
}
data_2016['mh_leave_difficult'] = data_2016['mh_leave_difficult'].replace(difficulty_mapping)

In [None]:
# Comfort discussing mental health: 2 = yes, 1 = maybe, 0 = no.
yes_no_maybe_mapping = dict(Yes=2, Maybe=1, No=0)

for column in ('discuss_mh_with_supervisor', 'discuss_mh_with_coworkers'):
    data_2016[column] = data_2016[column].replace(yes_no_maybe_mapping)

In [None]:
# Inspect the raw free-text gender answers before bucketing them.
data_2016['gender'].unique()

In [None]:
# Raw free-text answers bucketed as male (code 0). Matching is exact, so
# case and trailing-space variants are listed individually.
male = ['Male', 'male', 'Male ', 'M', 'm', 'man',  'Cis male',
        'Male.', 'Male (cis)','Man', 'Sex is male', 'Male (trans, FtM)',
        'cis male', 'Malr', 'Dude', 'Male/genderqueer',
        "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
        'mail', 'M|', 'male ', 'Cis Male', 'cisdude', 'cis man', 'MALE']
# Raw free-text answers bucketed as female (code 1); exact matching as above.
female = ['Female', 'female', 'I identify as female.', 'female ', 'Female assigned at birth ',
         'F', 'Woman', 'fm', 'f', 'Cis female ', 'Transitioned, M2F', 'Genderfluid (born female)', 'Female or Multi-Gender Femme',
         'Female ', 'woman', 'female/woman', 'Cisgender Female', 'fem', 'Female (props for making this a freeform field, though)',
         ' Female', 'Cis-woman', 'Genderflux demi-girl', 'female-bodied; no feelings about gender',
         'Transgender woman', 'genderqueer woman']

# Answers that decline to state a gender; coded like a missing answer (-1).
na = ['none of your business', 'Human', 'human']
def gender_standardization(gender):
    """
    Map one raw gender answer to a numeric code.

    Parameters
    ----------
    gender : str or placeholder
        A free-text survey answer, or a non-string placeholder for a
        missing answer (the -1 sentinel in any numeric type, NaN, None).

    Returns
    -------
    int
        0 for answers listed in ``male``, 1 for answers listed in ``female``,
        -1 for answers listed in ``na`` and for missing answers, and 2 for
        any other text.

    Notes
    -----
    A value that is already one of the codes 0, 1 or 2 is returned unchanged,
    so applying this function to an already-coded column does not alter it.
    """
    if not isinstance(gender, str):
        # Already-coded value (e.g. the cell was re-run): keep it as is.
        # NaN and None compare unequal to every code and fall through.
        if gender in (0, 1, 2):
            return int(gender)
        # Everything else non-textual is a missing answer, whatever its
        # numeric type (int -1, numpy int, float -1.0, NaN) or None.
        return -1
    if gender in male:
        return 0
    if gender in female:
        return 1
    if gender in na:
        return -1
    # Any other free-text answer.
    return 2
# Replace each raw gender answer with its numeric code (overwrites the column).
data_2016['gender'] = data_2016['gender'].apply(gender_standardization)

In [None]:
# Count how many respondents fall under each gender code.
data_2016['gender'].value_counts()

In [None]:
# Self-reported current disorder: 3 = yes, 2 = maybe, 0 = no.
# NOTE(review): code 1 is skipped here, unlike yes_no_maybe_mapping
# (Yes=2, Maybe=1, No=0) -- confirm the gap is intentional.
yes_no_possibly_mapping = dict(Yes=3, Maybe=2, No=0)
data_2016['has_mh_disorder'] = (
    data_2016['has_mh_disorder'].replace(yes_no_possibly_mapping)
)
data_2016['has_mh_disorder'].value_counts()

In [None]:
def standardize_disorders(disorders):
    """
    Standardizes the list of disorders
    from a string into a list of strings.

    The text is split on commas that sit outside parentheses. The
    parenthesis characters themselves are dropped, while the text between
    them (commas included) is kept as part of the current item. Each item
    is stripped of surrounding whitespace.

    Parameters
    ----------
    disorders : str, list, or placeholder
        The raw answer text. A list is treated as already standardized.
        Any other non-string value (the -1 sentinel in any numeric type,
        NaN, None) is treated as a missing answer.

    Returns
    -------
    list of str
        The parsed items; ``[]`` for a missing answer. An empty string
        yields ``['']``, because the trailing item is always appended.
    """
    # Already parsed (e.g. the cell was re-run): return a copy unchanged
    # rather than iterating over the list items as if they were characters.
    if isinstance(disorders, list):
        return list(disorders)
    # Missing answer. Checking "not a string" also covers numpy integers
    # and NaN, which a plain isinstance(..., int) test would let through.
    if not isinstance(disorders, str):
        return []

    disorder_list = []
    current = []  # characters of the item being built
    in_parenthesis = False
    for char in disorders:
        if char == "(":
            in_parenthesis = True
        elif char == ")":
            in_parenthesis = False
        elif char == "," and not in_parenthesis:
            # Top-level comma: close the current item and start a new one.
            disorder_list.append("".join(current).strip())
            current = []
        else:
            current.append(char)
    # Whatever follows the last top-level comma is the final item.
    disorder_list.append("".join(current).strip())
    return disorder_list

def to_1D(series):
    """
    Flatten an iterable of lists into one pandas Series, so that
    Series methods such as value_counts can be used on the individual items.
    """
    flattened = []
    for items in series:
        flattened.extend(items)
    return pd.Series(flattened)


In [None]:
# Parse both free-text condition columns into lists of condition names.
for column in ('believed_mh_disorder', 'diagnosed_mh_disorder'):
    data_2016[column] = data_2016[column].apply(standardize_disorders)