In [96]:
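# This notebook preprocesses the data. It includes the following steps:
# 1. Loading the data
# 2. Checking for missing values
# 3. Removing self-employed respondents and unused columns
# 4. Filling the remaining missing values
# 5. Saving the cleaned data to a CSV file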

In the next cell we will import the required libraries for all the preprocessing steps.

In [97]:
# Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile


In [98]:
# Create the Kaggle API client and authenticate it so the dataset can be
# downloaded in the next cell.
# NOTE(review): authenticate() presumably reads credentials from the local
# Kaggle configuration (e.g. kaggle.json or environment variables) — confirm.
# No credentials are hard-coded in this notebook.

api = KaggleApi()
api.authenticate()

In [99]:
# Download the dataset archive from Kaggle.
kaggle_user = 'osmi'
kaggle_project = 'mental-health-in-tech-2016'
api.dataset_download_files(kaggle_user + '/' + kaggle_project)

# Unzip the dataset into the current working directory.
# A context manager is used so the archive handle is closed as soon as the
# extraction finishes. extractall() returns None, so its result is not bound
# to a name (and the builtin `zip` is not shadowed).
with zipfile.ZipFile(kaggle_project + '.zip') as dataset_archive:
    dataset_archive.extractall()


Dataset URL: https://www.kaggle.com/datasets/osmi/mental-health-in-tech-2016


In [100]:
# Load the survey responses extracted from the Kaggle archive.
# NOTE(review): the file name is spelled 'mental-heath' (not 'health'). The
# recorded shape output below shows this load succeeded, so the spelling
# presumably matches the file inside the archive — do not "correct" it
# without checking the extracted file name.
tech_df = pd.read_csv('mental-heath-in-tech-2016_20161114.csv')

In [101]:
# Work on a copy of the data so that the raw frame `tech_df` is left
# untouched by the cleaning steps applied to `tech_df_copy`.
tech_df_copy = tech_df.copy()

In [102]:
# The dataset has values that are the string 'N/A'; these need to be treated as
# missing values, so they are replaced with NaN.
# Only cells whose entire value is 'N/A' are replaced: longer answers that merely
# contain it (e.g. 'Not eligible for coverage / N/A', still visible in the
# head() output below) are kept as they are.
# NOTE(review): pandas.read_csv treats the literal 'N/A' as missing by default,
# so this step may already be a no-op — confirm.
tech_df_copy = tech_df_copy.replace('N/A', np.nan)

In [103]:
# Sanity check on the loaded data:
# display (rows, columns) to make sure the number of rows and columns is correct.
tech_df_copy.shape

(1433, 63)

In [104]:
# Preview the first rows to check that the column names were read correctly.
# The column names are the full survey question texts.
tech_df_copy.head()

Unnamed: 0,Are you self-employed?,How many employees does your company or organization have?,Is your employer primarily a tech company/organization?,Is your primary role within your company related to tech/IT?,Does your employer provide mental health benefits as part of healthcare coverage?,Do you know the options for mental health care available under your employer-provided coverage?,"Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?",Does your employer offer resources to learn more about mental health concerns and options for seeking help?,Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?,"If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:",...,"If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?","If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?",What is your age?,What is your gender?,What country do you live in?,What US state or territory do you live in?,What country do you work in?,What US state or territory do you work in?,Which of the following best describes your work position?,Do you work remotely?
0,0,26-100,1.0,,Not eligible for coverage / N/A,,No,No,I don't know,Very easy,...,Not applicable to me,Not applicable to me,39,Male,United Kingdom,,United Kingdom,,Back-end Developer,Sometimes
1,0,6-25,1.0,,No,Yes,Yes,Yes,Yes,Somewhat easy,...,Rarely,Sometimes,29,male,United States of America,Illinois,United States of America,Illinois,Back-end Developer|Front-end Developer,Never
2,0,6-25,1.0,,No,,No,No,I don't know,Neither easy nor difficult,...,Not applicable to me,Not applicable to me,38,Male,United Kingdom,,United Kingdom,,Back-end Developer,Always
3,1,,,,,,,,,,...,Sometimes,Sometimes,43,male,United Kingdom,,United Kingdom,,Supervisor/Team Lead,Sometimes
4,0,6-25,0.0,1.0,Yes,Yes,No,No,No,Neither easy nor difficult,...,Sometimes,Sometimes,43,Female,United States of America,Illinois,United States of America,Illinois,Executive Leadership|Supervisor/Team Lead|Dev ...,Sometimes


In [105]:
# List the name of each column (each name is the full text of a survey question).
tech_df_copy.columns 

Index(['Are you self-employed?',
       'How many employees does your company or organization have?',
       'Is your employer primarily a tech company/organization?',
       'Is your primary role within your company related to tech/IT?',
       'Does your employer provide mental health benefits as part of healthcare coverage?',
       'Do you know the options for mental health care available under your employer-provided coverage?',
       'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
       'Does your employer offer resources to learn more about mental health concerns and options for seeking help?',
       'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?',
       'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:',
       'Do you think that dis

In [106]:
# Check the data type of each column
# (object = text answers, int64/float64 = numeric answers).
tech_df_copy.dtypes

Are you self-employed?                                                                 int64
How many employees does your company or organization have?                            object
Is your employer primarily a tech company/organization?                              float64
Is your primary role within your company related to tech/IT?                         float64
Does your employer provide mental health benefits as part of healthcare coverage?     object
                                                                                      ...   
What US state or territory do you live in?                                            object
What country do you work in?                                                          object
What US state or territory do you work in?                                            object
Which of the following best describes your work position?                             object
Do you work remotely?                                                 

In [107]:
# Give an overview of the dataset.
# describe() only looks at the numerical columns.
# NOTE(review): the recorded output shows ages ranging from 3 to 323, which look
# like data-entry errors; none of the cells shown in this notebook clean the
# age column.
tech_df_copy.describe()

Unnamed: 0,Are you self-employed?,Is your employer primarily a tech company/organization?,Is your primary role within your company related to tech/IT?,Do you have medical coverage (private insurance or state-provided) which includes treatment of mental health issues?,Do you have previous employers?,Have you ever sought treatment for a mental health issue from a mental health professional?,What is your age?
count,1433.0,1146.0,263.0,287.0,1433.0,1433.0,1433.0
mean,0.200279,0.770506,0.942966,0.644599,0.882066,0.585485,34.286113
std,0.400349,0.420691,0.23235,0.479471,0.322643,0.49281,11.290931
min,0.0,0.0,0.0,0.0,0.0,0.0,3.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,28.0
50%,0.0,1.0,1.0,1.0,1.0,1.0,33.0
75%,0.0,1.0,1.0,1.0,1.0,1.0,39.0
max,1.0,1.0,1.0,1.0,1.0,1.0,323.0


In [108]:
# Count the missing (NaN) entries in every column and display the result.
missing_values = tech_df_copy.isna().sum()
missing_values

Are you self-employed?                                                                  0
How many employees does your company or organization have?                            287
Is your employer primarily a tech company/organization?                               287
Is your primary role within your company related to tech/IT?                         1170
Does your employer provide mental health benefits as part of healthcare coverage?     287
                                                                                     ... 
What US state or territory do you live in?                                            593
What country do you work in?                                                            0
What US state or territory do you work in?                                            582
Which of the following best describes your work position?                               0
Do you work remotely?                                                                   0
Length: 63

In [109]:
# Count the respondents per answer to "Are you self-employed?"
# (1 = self-employed, 0 = not self-employed).
tech_df_copy['Are you self-employed?'].value_counts()

Are you self-employed?
0    1146
1     287
Name: count, dtype: int64

In [110]:
# Keep only the rows where "Are you self-employed?" is 0.
# Rows where it is 1 are deleted because they are otherwise empty rows.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in the following cells do not write into a slice of the previous
# frame (this avoids pandas' SettingWithCopyWarning).
tech_df_copy = tech_df_copy[tech_df_copy['Are you self-employed?'] == 0].copy()



In [111]:
# Recount the missing (NaN) entries per column now that rows have been removed.
missing_values = tech_df_copy.isna().sum()
missing_values

Are you self-employed?                                                                 0
How many employees does your company or organization have?                             0
Is your employer primarily a tech company/organization?                                0
Is your primary role within your company related to tech/IT?                         883
Does your employer provide mental health benefits as part of healthcare coverage?      0
                                                                                    ... 
What US state or territory do you live in?                                           437
What country do you work in?                                                           0
What US state or territory do you work in?                                           430
Which of the following best describes your work position?                              0
Do you work remotely?                                                                  0
Length: 63, dtype: in

In [112]:
tech_df_copy.shape
# Check that the number of rows and columns is correct
# (after removing the rows of self-employed respondents).

(1146, 63)

In [113]:
# Missing answers in the gender column become the explicit category "missing".
gender_col = 'What is your gender?'
tech_df_copy[gender_col] = tech_df_copy[gender_col].fillna('missing')

In [114]:
# Blank answers are assumed MNAR and kept as their own "missing" category.
unsupportive_response_col = 'Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?'
tech_df_copy[unsupportive_response_col] = tech_df_copy[unsupportive_response_col].fillna('missing')

In [115]:
# Recount the missing (NaN) entries per column after the fills above.
missing_values = tech_df_copy.isna().sum()
missing_values


Are you self-employed?                                                                 0
How many employees does your company or organization have?                             0
Is your employer primarily a tech company/organization?                                0
Is your primary role within your company related to tech/IT?                         883
Does your employer provide mental health benefits as part of healthcare coverage?      0
                                                                                    ... 
What US state or territory do you live in?                                           437
What country do you work in?                                                           0
What US state or territory do you work in?                                           430
Which of the following best describes your work position?                              0
Do you work remotely?                                                                  0
Length: 63, dtype: in

In [116]:
# Eight survey questions that are removed from the frame.
# NOTE(review): presumably these are the questions only asked of self-employed
# respondents, which are empty once those rows are filtered out — confirm.
columns_to_delete = [
    'Do you believe your productivity is ever affected by a mental health issue?',
    'If yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue?',
    'Do you know local or online resources to seek help for a mental health disorder?',
    'If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to clients or business contacts?',
    'If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees?',
    'Do you have medical coverage (private insurance or state-provided) which includes treatment of  mental health issues?',
    'If you have revealed a mental health issue to a client or business contact, do you believe this has impacted you negatively?',
    'If you have revealed a mental health issue to a coworker or employee, do you believe this has impacted you negatively?',
]

# `columns=` already selects the column axis, so no separate `axis` argument is needed.
tech_df_copy = tech_df_copy.drop(columns=columns_to_delete)

In [117]:
# Check the data: eight columns were dropped in the previous cell, so 55 of
# the original 63 columns should remain.
tech_df_copy.shape

(1146, 55)

In [118]:
# A blank US state is assumed to be MNAR: the respondent does not live or work
# in the US. Such blanks are encoded as the string '0'.
us_state_columns = (
    'What US state or territory do you work in?',
    'What US state or territory do you live in?',
)
for state_column in us_state_columns:
    tech_df_copy[state_column] = tech_df_copy[state_column].fillna('0')

In [119]:
def _fill_missing_where(frame, row_mask, column, fill_value):
    """Fill missing entries of ``column`` with ``fill_value`` on the rows selected by ``row_mask``.

    Rows outside ``row_mask`` and entries that already hold a value are left
    unchanged. ``frame`` is modified in place; nothing is returned.
    """
    frame.loc[row_mask, column] = frame[column].fillna(fill_value)


# Answers to the two gating questions. Neither column is written in this cell,
# so they can be read once and reused for every mask.
diagnosed_answer = tech_df_copy['Have you been diagnosed with a mental health condition by a medical professional?']
current_answer = tech_df_copy['Do you currently have a mental health disorder?']

# Follow-up (free-text) questions that depend on the answers above.
diagnosed_col = 'If so, what condition(s) were you diagnosed with?'
current_yes_col = 'If yes, what condition(s) have you been diagnosed with?'
current_maybe_col = 'If maybe, what condition(s) do you believe you have?'

# The follow-up does not apply to the respondent, so it should not have an
# answer: replace the missing value with '0'.
_fill_missing_where(tech_df_copy, diagnosed_answer == 'No', diagnosed_col, '0')
_fill_missing_where(tech_df_copy, current_answer.isin(['No', 'Maybe']), current_yes_col, '0')
_fill_missing_where(tech_df_copy, current_answer.isin(['No', 'Yes']), current_maybe_col, '0')

# The follow-up applies but was left blank: replace the missing value with
# 'missing' (assumed MNAR).
_fill_missing_where(tech_df_copy, diagnosed_answer == 'Yes', diagnosed_col, 'missing')
_fill_missing_where(tech_df_copy, current_answer == 'Yes', current_yes_col, 'missing')
_fill_missing_where(tech_df_copy, current_answer == 'Maybe', current_maybe_col, 'missing')


In [120]:
# Percentage of missing values in each column:
# (missing entries per column / number of rows) * 100.
row_count = len(tech_df_copy)
missing_per_column = tech_df_copy.isna().sum()
missing_percentage = missing_per_column / row_count * 100
missing_percentage

Are you self-employed?                                                                                                                                                               0.000000
How many employees does your company or organization have?                                                                                                                           0.000000
Is your employer primarily a tech company/organization?                                                                                                                              0.000000
Is your primary role within your company related to tech/IT?                                                                                                                        77.050611
Does your employer provide mental health benefits as part of healthcare coverage?                                                                                                    0.000000
Do you know the options for mental health care ava

In [121]:
# Drop columns with more than 60% missing values.
# The column is listed by hand; it is about 77% missing in the percentages
# displayed by the previous cell.
columns_to_delete = ['Is your primary role within your company related to tech/IT?']
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
tech_df_copy = tech_df_copy.drop(columns=columns_to_delete)


In [122]:
# Impute missing values with the category "missing" (assumed MNAR).
# For the first column about 55% of the values are missing: filling with the
# mode would introduce too much bias, so treating the blanks as their own
# separate category is more appropriate.
# The two "Why or why not?" columns are handled the same way (assumed MNAR).
missing_category_columns = (
    'Have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace?',
    'Why or why not?.1',
    'Why or why not?',
)
for category_column in missing_category_columns:
    tech_df_copy[category_column] = tech_df_copy[category_column].fillna('missing')



In [123]:
# Missing values on a non-sensitive question: treated as MNAR and filled in
# with "missing".
coverage_options_col = 'Do you know the options for mental health care available under your employer-provided coverage?'
tech_df_copy[coverage_options_col] = tech_df_copy[coverage_options_col].fillna('missing')

In [124]:
# All remaining columns with missing values relate to previous employers and
# share the same number of missing participants (131). The missing values are
# therefore suspected to come from respondents without a previous employer,
# so they are filled with "0" (assumed non-applicable).
columns_to_fill = [
    'Have your previous employers provided mental health benefits?',
    'Were you aware of the options for mental health care provided by your previous employers?',
    'Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?',
    'Did your previous employers provide resources to learn more about mental health issues and how to seek help?',
    'Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?',
    'Do you think that discussing a mental health disorder with previous employers would have negative consequences?',
    'Do you think that discussing a physical health issue with previous employers would have negative consequences?',
    'Would you have been willing to discuss a mental health issue with your previous co-workers?',
    'Would you have been willing to discuss a mental health issue with your direct supervisor(s)?',
    'Did you feel that your previous employers took mental health as seriously as physical health?',
    'Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?',
]

for previous_employer_column in columns_to_fill:
    tech_df_copy[previous_employer_column] = tech_df_copy[previous_employer_column].fillna('0')


In [125]:
# Final check: count the missing (NaN) entries that remain in each column.
missing_values = tech_df_copy.isna().sum()
missing_values

Are you self-employed?                                                                                                                                                              0
How many employees does your company or organization have?                                                                                                                          0
Is your employer primarily a tech company/organization?                                                                                                                             0
Does your employer provide mental health benefits as part of healthcare coverage?                                                                                                   0
Do you know the options for mental health care available under your employer-provided coverage?                                                                                     0
Has your employer ever formally discussed mental health (for example, as part of a wellnes

In [126]:
# Save the cleaned DataFrame to a CSV file for the next module
# (Encode_and_Scale). The row index is not written to the file.
output_csv_path = 'tech_df_copy.csv'
tech_df_copy.to_csv(output_csv_path, index=False)