##***Downloading and Saving Kaggle Dataset to Google Drive in Google Colab***

In [3]:
# Mounting Google Drive:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Installing Kaggle and uploading the api in the next cell
!pip install kaggle

In [None]:
from google.colab import files
files.upload()

In [None]:
#downloading the Dataset:
!kaggle datasets download -d sivas1292/mental-health-in-tech-dashboard-2016-to-2020

In [None]:
#Unzipping the Dataset:
import zipfile

zip_path = "/content/mental-health-in-tech-dashboard-2016-to-2020.zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/mental-health-in-tech-dashboard-2016-to-2020')


In [None]:
#Copying the Dataset to Google Drive:
!cp -r /content/mental-health-in-tech-dashboard-2016-to-2020 /content/drive/MyDrive/research_internship_dataset

##***Pre processing and cleaning the dataset***

In [94]:
#Reading mental health survey data from CSV files for years 2016-2020 into Pandas DataFrames and printing the number of columns in each DataFrame:
#before this have added a year field to each csv file too, for identification reasons, since all were to be combined in one:
import numpy as np
import pandas as pd

survey_2016 = pd.read_csv("/content/drive/MyDrive/research_internship_dataset/mental-health-in-tech-dashboard-2016-to-2020/OSMI Mental Health in Tech Survey 2016.csv", na_values = np.nan)
survey_2017 = pd.read_csv("/content/drive/MyDrive/research_internship_dataset/mental-health-in-tech-dashboard-2016-to-2020/OSMI Mental Health in Tech Survey 2017.csv", na_values = np.nan)
survey_2018 = pd.read_csv("/content/drive/MyDrive/research_internship_dataset/mental-health-in-tech-dashboard-2016-to-2020/OSMI Mental Health in Tech Survey 2018.csv", na_values = np.nan)
survey_2019 = pd.read_csv("/content/drive/MyDrive/research_internship_dataset/mental-health-in-tech-dashboard-2016-to-2020/OSMI Mental Health in Tech Survey 2019.csv", na_values = np.nan)
survey_2020 = pd.read_csv("/content/drive/MyDrive/research_internship_dataset/mental-health-in-tech-dashboard-2016-to-2020/OSMI Mental Health in Tech Survey 2020 .csv", na_values = np.nan)
survey_data_ls = [survey_2016, survey_2017, survey_2018, survey_2019, survey_2020]
print(f"survey_2016.columns =  {len(survey_2016.columns)}")
print(f"survey_2017.columns =  {len(survey_2017.columns)}")
print(f"survey_2018.columns =  {len(survey_2018.columns)}")
print(f"survey_2019.columns =  {len(survey_2019.columns)}")
print(f"survey_2020.columns =  {len(survey_2020.columns)}")


survey_2016.columns =  64
survey_2017.columns =  124
survey_2018.columns =  124
survey_2019.columns =  83
survey_2020.columns =  121


***cleaning column/field names***

In [95]:
#removing HTML tags (<b>...</b>) and asterisks (*) from the column names of multiple survey datasets stored in survey_data_ls
import re as re
def remove_tags(string):
    result = re.sub('<.*?>|\*','',string)
    return result
for survey in survey_data_ls:
    renamed_cols = []
    for i in range (len(survey.columns)):
        renamed_cols.append(remove_tags(survey.columns[i]))
    survey.columns = renamed_cols

***identifying relevant fields to help analyze workplace mental health trends***

In [96]:
 #selecting key features related to demographics, mental health history, workplace support
#Since the 2016 dataset has too few common fields, it is excluded. The datasets from 2017 to 2020 are standardized by keeping only shared columns
#and then combined into a single CSV file for further analysis.
important_feat = [
    # Demographics
    "What is your age?",
    "What is your gender?",
    "What country do you live in?",
    "What country do you work in?",
    "year",

    # Mental Health History
    "Do you currently have a mental health disorder?",
    "Have you ever been diagnosed with a mental health disorder?",
    "What disorder(s) have you been diagnosed with?",
    "If possibly, what disorder(s) do you believe you have?",
    "Have you had a mental health disorder in the past?",
    "Do you have a family history of mental illness?",

    # Workplace Mental Health Support
    "Is your employer primarily a tech company/organization?",
    "Is your primary role within your company related to tech/IT?",
    "Does your employer provide mental health benefits as part of healthcare coverage?",
    "Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?",
    "Does your employer offer resources to learn more about mental health disorders and options for seeking help?",
    "If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?",
    "Overall, how much importance does your employer place on physical health?",
    "Overall, how much importance does your employer place on mental health?",
    "Have you observed or experienced a supportive or well-handled response to a mental health issue in your current or previous workplace?",
    "Overall, how well do you think the tech industry supports employees with mental health issues?",
    "Briefly describe what you think the industry as a whole and/or employers could do to improve mental health support for employees?",
    "Describe the circumstances of the badly handled or unsupportive response.",
    "If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to clients or business contacts?",
    "If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or friends?",
    "Did your previous employers provide resources to learn more about mental health disorders and how to seek help?",
    "Would you have been willing to discuss your mental health with your direct supervisor(s)?",
    "Would you have felt more comfortable talking to your previous employer about your physical health or your mental health?",

    # Impact on Work and Career
    "Do you believe your productivity is ever affected by a mental health issue?",
    "If yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue?",
    "If you have a mental health disorder, how often do you feel that it interferes with your work when being treated effectively?",
    "If you have a mental health disorder, how often do you feel that it interferes with your work when NOT being treated effectively (i.e., when you are experiencing symptoms)?",
    "Have your observations of how another individual who discussed a mental health issue made you less likely to reveal a mental health issue yourself in your current workplace?",
    "Are you openly identified at work as a person with a mental health issue?",
    "Has being identified as a person with a mental health issue affected your career?",
    "How has it affected your career?",
    "If they knew you suffered from a mental health disorder, how do you think that your team members/co-workers would react?",

    # Willingness to Disclose Mental Health Issues
    "How willing would you be to share with friends and family that you have a mental illness?",
    "Would you be willing to bring up a physical health issue with a potential employer in an interview?",
    "Would you bring up your mental health with a potential employer in an interview?",

    # Anonymity and Privacy
    "Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?",
    "Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?",

    #policy awareness
    "Do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?",
    "Do you know local or online resources to seek help for a mental health disorder?",
    "Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?"

]

common_cols = survey_2017.columns.intersection(survey_2018.columns)
common_cols = common_cols.intersection(survey_2019.columns)
common_cols = common_cols.intersection(survey_2020.columns)
common_cols = common_cols.intersection(important_feat)

survey_2017 = survey_2017.reindex(columns=common_cols)
survey_2018 = survey_2018.reindex(columns=common_cols)
survey_2019 = survey_2019.reindex(columns=common_cols)
survey_2020 = survey_2020.reindex(columns=common_cols)


survey = pd.concat([survey_2017[common_cols], survey_2018[common_cols], survey_2019[common_cols], survey_2020[common_cols]], axis=0)
print(f"survey final columns =  {len(survey.columns)}")

output_path = "/content/drive/MyDrive/research_internship_dataset/mental-health-survey-2017-2020_t2.csv"
survey.to_csv(output_path, index=False)

print(f"Combined dataset saved to: {output_path}")


survey final columns =  36
Combined dataset saved to: /content/drive/MyDrive/research_internship_dataset/mental-health-survey-2017-2020_t2.csv


In [97]:
common_cols

Index(['Is your employer primarily a tech company/organization?',
       'Is your primary role within your company related to tech/IT?',
       'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
       'Does your employer offer resources to learn more about mental health disorders and options for seeking help?',
       'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?',
       'If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?',
       'Overall, how much importance does your employer place on physical health?',
       'Overall, how much importance does your employer place on mental health?',
       'Do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?',
       '

***Handling missing data***

In [98]:
# Calculates the percentage of missing values for each column, sorts them in descending order,
# and filters only columns with missing values for further analysis.
nulls_df = ((survey.isnull().sum()/survey.shape[0])*100).sort_values(ascending=False).reset_index().rename(columns = { 0 : "Nulls_Count"})
nulls_df[nulls_df['Nulls_Count'] > 0]

Unnamed: 0,index,Nulls_Count
0,How has it affected your career?,95.601173
1,"If yes, what percentage of your work time (tim...",89.266862
2,Has being identified as a person with a mental...,87.97654
3,Do you believe your productivity is ever affec...,85.806452
4,Do you have medical coverage (private insuranc...,85.806452
5,If you have been diagnosed or treated for a me...,85.806452
6,Describe the circumstances of the badly handle...,77.360704
7,Have you ever been diagnosed with a mental hea...,58.181818
8,Have your observations of how another individu...,21.290323
9,Would you have been willing to discuss your me...,14.31085


In [99]:
 #Removes rows where missing values are less than or equal to 14% (minor data loss).
# - Drops columns where more than 50% of values are missing (to avoid unreliable data).
# - Displays remaining columns with missing values for further inspection.

nulls_df = ((survey.isnull().sum()/survey.shape[0])*100).sort_values(ascending=False).reset_index().rename(columns = { 0 : "Nulls_Count"})
# drop na values which have null values < threshold
dropped_rows = nulls_df[nulls_df['Nulls_Count'] <= 14.0]
dropped_rows = list(dropped_rows[dropped_rows['Nulls_Count'] > 0].values[:,0])
## when the number of nulls in a row is very small, we can simply drop this small number of rows
survey = survey.dropna(axis = 0, subset = dropped_rows)


# drop the columns which have null values > threshold
dropped_cols = nulls_df[nulls_df['Nulls_Count'] > 50]
dropped_cols = list(dropped_cols.values[:,0])
survey = survey.drop(columns = dropped_cols)
nulls_df[nulls_df['Nulls_Count'] > 0]

Unnamed: 0,index,Nulls_Count
0,How has it affected your career?,95.601173
1,"If yes, what percentage of your work time (tim...",89.266862
2,Has being identified as a person with a mental...,87.97654
3,Do you believe your productivity is ever affec...,85.806452
4,Do you have medical coverage (private insuranc...,85.806452
5,If you have been diagnosed or treated for a me...,85.806452
6,Describe the circumstances of the badly handle...,77.360704
7,Have you ever been diagnosed with a mental hea...,58.181818
8,Have your observations of how another individu...,21.290323
9,Would you have been willing to discuss your me...,14.31085


In [58]:
pd.set_option('display.max_columns', None)
survey.head()


Unnamed: 0,Is your employer primarily a tech company/organization?,Is your primary role within your company related to tech/IT?,"Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?",Does your employer offer resources to learn more about mental health disorders and options for seeking help?,Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?,"If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?","Overall, how much importance does your employer place on physical health?","Overall, how much importance does your employer place on mental health?",Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?,Did your previous employers provide resources to learn more about mental health disorders and how to seek help?,Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?,Would you have felt more comfortable talking to your previous employer about your physical health or your mental health?,Would you have been willing to discuss your mental health with your direct supervisor(s)?,Do you currently have a mental health disorder?,Have you had a mental health disorder in the past?,Do you have a family history of mental illness?,"If you have a mental health disorder, how often do you feel that it interferes with your work when being treated effectively?",Have your observations of how another individual who discussed a mental health issue made you less likely to reveal a mental health issue yourself in your current workplace?,How willing would you be to share with friends and family that you have a mental illness?,Would you be willing to bring up a physical health issue with a potential employer in an interview?,Would you bring up your mental health with a potential employer in an interview?,Are you openly identified at work as a person with a mental health issue?,"Overall, how well do you think the tech industry supports employees with mental health issues?",What is your age?,What is your gender?,What country do you live in?,What country do you work in?,year
0,1.0,1.0,No,I don't know,I don't know,I don't know,6.0,0.0,Some did,Some did,"Yes, always",Physical health,"Yes, all of my previous supervisors",Possibly,Possibly,No,Sometimes,No,5,Yes,No,0.0,1.0,27.0,Female,United Kingdom,United Kingdom,2017
1,1.0,1.0,No,No,I don't know,I don't know,7.0,2.0,None did,None did,I don't know,Physical health,"No, none of my previous supervisors",Possibly,Possibly,No,Not applicable to me,No,4,Yes,No,0.0,2.0,31.0,male,United Kingdom,United Kingdom,2017
2,1.0,1.0,I don't know,No,Yes,Difficult,0.0,1.0,None did,None did,I don't know,Physical health,"No, none of my previous supervisors",Yes,Yes,Yes,Sometimes,Yes,5,Maybe,No,1.0,1.0,36.0,male,United States of America,United States of America,2017
3,1.0,1.0,I don't know,I don't know,Yes,Difficult,7.0,5.0,,,,,,Yes,No,I don't know,Sometimes,No,10,No,No,0.0,2.0,22.0,Male,United States of America,United States of America,2017
4,,,,,,,,,None did,None did,"Yes, always",Same level of comfort for each,"No, none of my previous supervisors",No,Yes,Yes,Often,No,10,Maybe,No,0.0,1.0,52.0,female,United States of America,United States of America,2017


In [127]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Assuming 'survey' is your original dataset
# Impute NaN values with the most frequent value (mode) on every row
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp.fit(survey)

# Apply the transformation to the dataset
imp_data = pd.DataFrame(data = imp.transform(survey), columns = survey.columns)

prep_data = imp_data.copy()

# Check if there are still missing values
missing_values_after_imputation = prep_data.isna().sum()


print(missing_values_after_imputation)


Is your employer primarily a tech company/organization?                                                                                                                          0
Is your primary role within your company related to tech/IT?                                                                                                                     0
Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?                                           0
Does your employer offer resources to learn more about mental health disorders and options for seeking help?                                                                     0
Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?                                   0
If a mental health issue prompted you to request a medical leave from work, how easy or difficult would i

***Cleaning data and handling inconsistencies***

In [132]:
# - Renames columns for better readability.
# - Recodes gender values into three categories: Male (1), Female (2), and Other (3) to handle inconsistencies.
# - Standardizes country names (e.g., "United States of America" → "USA").
# - Fixes unrealistic age values (e.g., 3, 323) by replacing them with the dataset's mean age (excluding outliers).
# - Saves the cleaned dataset as a CSV file.
# Column rename
df = prep_data.copy()
renamed_columns = ['tech_comp_flag', 'tech_role_flag', 'mh_employer_discussion_or_wellness_campaign', 'mh_resources_provided','is_anonimity_protected_or_not','mh_issue_request_medical_leave',
'ph_employer_rate', 'mh_employer_rate','prev_mh_discussion','prev_mh_resources-learn','was_anonimity_protected_or_not','mh_or_ph','mh_direct_superviser', 'curr_have_mh_disorder', 'past_have_mh_disorder', 'fam_history',
'how often do you feel that it interferes with your work when being treated effectively?', 'disscussion_reveal_ur_mh', 'admit_mh_issue_with_friends/family','admit_ph_issue_in_interview',
'admit_mh_issue_in_interview', 'openly_id_as_person_wth_mh_issue', 'rate_tech_industry_support',
'age', 'gender', 'country_live_in', 'country_work_in', 'year']
df.columns = renamed_columns
# gender column needs to be recoded (number of unique values = 70)
df['gender'].replace(to_replace = ['Male', 'male', 'Male ', 'M', 'm',
       'man', 'Cis male', 'Male.', 'male 9:1 female, roughly', 'Male (cis)', 'Man', 'Sex is male',
       'cis male', 'Malr', 'Dude', "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
       'mail', 'M|', 'Male/genderqueer', 'male ', 'cisgender male', 'MAle'
       'Cis Male', 'Male (trans, FtM)', 'mostly male', 'Masculine', 'Let\'s keep it simple and say "male"', 'cis-male', 'Mail', 'male/androgynous ',
       'cis hetero male',
       'cisdude', 'cis man', 'MALE'], value = 1, inplace = True)

df['gender'].replace(to_replace = ['Female', 'female', 'I identify as female.', 'female ',
       'Female assigned at birth ', 'F', 'Woman', 'fm', 'f', 'Cis female ', 'Transitioned, M2F',
       'Genderfluid (born female)', 'Female or Multi-Gender Femme', 'Female ', 'woman', 'female/woman',
       'Cisgender Female', 'fem', 'Female (props for making this a freeform field, though)',
       'Female (cis)', 'FEMALE', 'femmina' 'Agender trans woman', 'Female-identified',
       ' Female', 'Cis-woman', 'female-bodied; no feelings about gender', 'female, she/her',
       'Nonbinary/femme', 'gender non-conforming woman', 'Female/gender non-binary.',
       'Cis woman', 'Female (cisgender)', 'Cis-Female','Cisgendered woman', 'Trans woman',
       'uhhhhhhhhh fem genderqueer?', 'My sex is female.', 'femalw', 'femail', 'cis female', 'femmina','Female-ish',
       'Woman-identified', 'female (cisgender)', 'Female (cis) ','F, cisgender', 'Trans female', 'Femile',
       'AFAB'], value = 2, inplace = True)

other = list(df['gender'].unique())
other = [o for o in other if o not in (1, 2)]
df['gender'].replace(to_replace = other , value = 3, inplace = True)

# Recode Comp size & country columns (for ease when doing plots)
df['country_live_in'].replace(to_replace = ['United States of America'], value = 'USA', inplace = True)
df['country_live_in'].replace(to_replace = ['United Kingdom'], value = 'UK', inplace = True)
df['country_work_in'].replace(to_replace = ['United States of America'], value = 'USA', inplace = True)
df['country_work_in'].replace(to_replace = ['United Kingdom'], value = 'UK', inplace = True)

# Max age is 323, min age is 3.
# There are only 5 people that have weird ages (3yo, 15yo, or 99yo or 323 yo.)
# These people will take the average age of the dataset (the correct calculated one, w/out outliers)
mean_age = df[(df['age'] >= 18) | (df['age'] <= 75)]['age'].mean()
df['age'].replace(to_replace = df[(df['age'] < 18) | (df['age'] > 75)]['age'].tolist(),
                          value = mean_age, inplace = True)

df.to_csv('/content/drive/MyDrive/research_internship_dataset/prep_data_t2.csv', index=False)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [133]:
# Check if there are still missing values
missing_values_after_imputation = df.isna().sum()

# Print the result
print(missing_values_after_imputation)

tech_comp_flag                                                                             0
tech_role_flag                                                                             0
mh_employer_discussion_or_wellness_campaign                                                0
mh_resources_provided                                                                      0
is_anonimity_protected_or_not                                                              0
mh_issue_request_medical_leave                                                             0
ph_employer_rate                                                                           0
mh_employer_rate                                                                           0
prev_mh_discussion                                                                         0
prev_mh_resources-learn                                                                    0
was_anonimity_protected_or_not                                        

***FILTERING DATA***

In [134]:
csv_path='/content/drive/MyDrive/research_internship_dataset/prep_data_t2.csv'
df_fltr_data = pd.read_csv(csv_path)
# Keeping only countries with at least 20 responses to ensure reliable analysis.
# Countries retained: USA, UK, India, Canada, Germany.
df_fltr_data = df_fltr_data[df_fltr_data['country_work_in'].isin(['USA', 'UK', 'India', 'Canada', 'Germany'])]
df_fltr_data = df_fltr_data[df_fltr_data['country_live_in'].isin(['USA', 'UK', 'India', 'Canada', 'Germany'])]



In [135]:
# Check if there are still missing values
missing_values_after_imputation = df_fltr_data.isna().sum()

# Print the result
print(missing_values_after_imputation)

tech_comp_flag                                                                             0
tech_role_flag                                                                             0
mh_employer_discussion_or_wellness_campaign                                                0
mh_resources_provided                                                                      0
is_anonimity_protected_or_not                                                              0
mh_issue_request_medical_leave                                                             0
ph_employer_rate                                                                           0
mh_employer_rate                                                                           0
prev_mh_discussion                                                                         0
prev_mh_resources-learn                                                                    0
was_anonimity_protected_or_not                                        

***Identifying Numerical and Categorical Data:***

In [136]:
# - `num_data`: Stores column names with numerical data (excluding object types).
# - `cat_data`: Stores column names with categorical data (only object types).

# Categorizing Categorical Data:
# - `nominal_data`: List of unordered categorical variables (categories with no specific order).
# - `ordinal_data`: List of ordered categorical variables (categories with a meaningful sequence).

# Finally, the code prints all identified numerical, nominal, and ordinal data categories.

num_data = list(df_fltr_data.select_dtypes(exclude=['object']).columns)
cat_data = list(df_fltr_data.select_dtypes(include=['object']).columns)


nominal_data = ['year', 'country_live_in', 'country_work_in', 'gender',
                'mh_or_ph', 'tech_comp_flag', 'tech_role_flag', 'mh_employer_discussion_or_wellness_campaign',
                'mh_resources_provided', 'is_anonimity_protected_or_not', 'was_anonimity_protected_or_not',
                'mh_direct_superviser', 'curr_have_mh_disorder', 'past_have_mh_disorder', 'fam_history',
                'admit_mh_issue_with_friends/family', 'admit_ph_issue_in_interview', 'admit_mh_issue_in_interview',
                'openly_id_as_person_wth_mh_issue'
               ]

ordinal_data = list(set(cat_data) - set(nominal_data))

print("Numerical Data:", num_data)
print("Nominal Data:", nominal_data)
print("Ordinal Data:", ordinal_data)


Numerical Data: ['ph_employer_rate', 'mh_employer_rate', 'admit_mh_issue_with_friends/family', 'openly_id_as_person_wth_mh_issue', 'rate_tech_industry_support', 'age', 'gender', 'year']
Nominal Data: ['year', 'country_live_in', 'country_work_in', 'gender', 'mh_or_ph', 'tech_comp_flag', 'tech_role_flag', 'mh_employer_discussion_or_wellness_campaign', 'mh_resources_provided', 'is_anonimity_protected_or_not', 'was_anonimity_protected_or_not', 'mh_direct_superviser', 'curr_have_mh_disorder', 'past_have_mh_disorder', 'fam_history', 'admit_mh_issue_with_friends/family', 'admit_ph_issue_in_interview', 'admit_mh_issue_in_interview', 'openly_id_as_person_wth_mh_issue']
Ordinal Data: ['prev_mh_resources-learn', 'mh_issue_request_medical_leave', 'disscussion_reveal_ur_mh', 'prev_mh_discussion', 'how often do you feel that it interferes with your work when being treated effectively?']


***ENCODING***

In [137]:
#preprocesses data by encoding ordinal columns using Label Encoding and keeping other columns unchanged.
#transforming categorical data into numerical values for machine learning,
# Importing OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

def encode(df, ordinal_data):
    encoded_data = pd.DataFrame() # to dataframe
    le = LabelEncoder()

    # Iterate through ordinal columns and encode if present in the DataFrame
    for col in ordinal_data:
        if col in df.columns:
            encoded_data[col] = le.fit_transform(df[col])
        else:
            print(f"Warning: Column '{col}' not found in DataFrame. Skipping encoding.")

    return encoded_data

encoded_data = encode(df_fltr_data, ordinal_data)

# Preprocessed data
# Ensure that num_data and nominal_data columns are present in prep_data
available_columns = df_fltr_data.columns
data_not_encode_df = df_fltr_data[[col for col in nominal_data + num_data if col in available_columns]]
df_prep_data = pd.concat(objs = [encoded_data, data_not_encode_df], axis = 1)

In [138]:
df_fltr_data = df_fltr_data.loc[:, ~df_fltr_data.columns.duplicated()]
print(df_fltr_data.columns)

Index(['tech_comp_flag', 'tech_role_flag',
       'mh_employer_discussion_or_wellness_campaign', 'mh_resources_provided',
       'is_anonimity_protected_or_not', 'mh_issue_request_medical_leave',
       'ph_employer_rate', 'mh_employer_rate', 'prev_mh_discussion',
       'prev_mh_resources-learn', 'was_anonimity_protected_or_not', 'mh_or_ph',
       'mh_direct_superviser', 'curr_have_mh_disorder',
       'past_have_mh_disorder', 'fam_history',
       'how often do you feel that it interferes with your work when being treated effectively?',
       'disscussion_reveal_ur_mh', 'admit_mh_issue_with_friends/family',
       'admit_ph_issue_in_interview', 'admit_mh_issue_in_interview',
       'openly_id_as_person_wth_mh_issue', 'rate_tech_industry_support', 'age',
       'gender', 'country_live_in', 'country_work_in', 'year'],
      dtype='object')


In [139]:
output_path = "/content/drive/MyDrive/research_internship_dataset/prep_data_t2.csv"
df_fltr_data.to_csv(output_path, index=False)

In [140]:
# Check if there are still missing values
missing_values_after_imputation = df_fltr_data.isna().sum()

# Print the result
print(missing_values_after_imputation)

tech_comp_flag                                                                             0
tech_role_flag                                                                             0
mh_employer_discussion_or_wellness_campaign                                                0
mh_resources_provided                                                                      0
is_anonimity_protected_or_not                                                              0
mh_issue_request_medical_leave                                                             0
ph_employer_rate                                                                           0
mh_employer_rate                                                                           0
prev_mh_discussion                                                                         0
prev_mh_resources-learn                                                                    0
was_anonimity_protected_or_not                                        

##***Yearly Trends***

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import pandas as pd
!pip install -U kaleido



In [4]:
# Load the dataset (assuming it's in CSV format)
df_trends = pd.read_csv("/content/drive/MyDrive/research_internship_dataset/prep_data_t2.csv")

In [187]:
df_trends.head()

Unnamed: 0,tech_comp_flag,tech_role_flag,mh_employer_discussion_or_wellness_campaign,mh_resources_provided,is_anonimity_protected_or_not,mh_issue_request_medical_leave,ph_employer_rate,mh_employer_rate,prev_mh_discussion,prev_mh_resources-learn,was_anonimity_protected_or_not,mh_or_ph,mh_direct_superviser,curr_have_mh_disorder,past_have_mh_disorder,fam_history,how often do you feel that it interferes with your work when being treated effectively?,disscussion_reveal_ur_mh,admit_mh_issue_with_friends/family,admit_ph_issue_in_interview,admit_mh_issue_in_interview,openly_id_as_person_wth_mh_issue,rate_tech_industry_support,age,gender,country_live_in,country_work_in,year
0,1.0,1.0,No,I don't know,I don't know,I don't know,6.0,0.0,Some did,Some did,"Yes, always",Physical health,"Yes, all of my previous supervisors",Possibly,Possibly,No,Sometimes,No,5,Yes,No,0.0,1.0,27.0,2,UK,UK,2017
1,1.0,1.0,No,No,I don't know,I don't know,7.0,2.0,None did,None did,I don't know,Physical health,"No, none of my previous supervisors",Possibly,Possibly,No,Not applicable to me,No,4,Yes,No,0.0,2.0,31.0,1,UK,UK,2017
2,1.0,1.0,I don't know,No,Yes,Difficult,0.0,1.0,None did,None did,I don't know,Physical health,"No, none of my previous supervisors",Yes,Yes,Yes,Sometimes,Yes,5,Maybe,No,1.0,1.0,36.0,1,USA,USA,2017
3,1.0,1.0,I don't know,I don't know,Yes,Difficult,7.0,5.0,None did,None did,I don't know,Physical health,Some of my previous supervisors,Yes,No,I don't know,Sometimes,No,10,No,No,0.0,2.0,22.0,1,USA,USA,2017
4,1.0,1.0,No,No,I don't know,Somewhat easy,7.0,5.0,None did,None did,"Yes, always",Same level of comfort for each,"No, none of my previous supervisors",No,Yes,Yes,Often,No,10,Maybe,No,0.0,1.0,52.0,2,USA,USA,2017


In [159]:
# Check if there are still missing values
missing_values_after_imputation = df_trends.isna().sum()

# Print the result
print(missing_values_after_imputation)

tech_comp_flag                                                                             0
tech_role_flag                                                                             0
mh_employer_discussion_or_wellness_campaign                                                0
mh_resources_provided                                                                      0
is_anonimity_protected_or_not                                                              0
mh_issue_request_medical_leave                                                             0
ph_employer_rate                                                                           0
mh_employer_rate                                                                           0
prev_mh_discussion                                                                         0
prev_mh_resources-learn                                                                    0
was_anonimity_protected_or_not                                        

In [160]:
df_trends['year'] = pd.to_numeric(df_trends['year'], errors='coerce')
print(df_trends['year'].dtype)

int64


In [161]:
print("Unique Years:", years)

Unique Years: [2017 2018 2019 2020]


In [162]:
print(df_trends['curr_have_mh_disorder'].unique())

['Possibly' 'Yes' 'No' "Don't Know"]


***Prevalence of Mental Health Conditions***



In [9]:


# Clean and prepare data
df_trends['fam_history'] = df_trends['fam_history'].str.strip().str.lower()
df_trends['past_have_mh_disorder'] = df_trends['past_have_mh_disorder'].str.strip().str.lower()
df_trends['curr_have_mh_disorder'] = df_trends['curr_have_mh_disorder'].str.strip().str.lower()
df_trends['tech_role_flag'] = pd.to_numeric(df_trends['tech_role_flag'], errors='coerce')

# Get unique years in sorted order
years = sorted(df_trends['year'].unique())

# Compute percentage of "Yes" responses for each category per year
percent_fam_history = [
    (df_trends[(df_trends['year'] == year) & (df_trends['fam_history'] == 'yes')]['fam_history'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

percent_past_mh = [
    (df_trends[(df_trends['year'] == year) & (df_trends['past_have_mh_disorder'] == 'yes')]['past_have_mh_disorder'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

percent_curr_mh = [
    (df_trends[(df_trends['year'] == year) & (df_trends['curr_have_mh_disorder'] == 'yes')]['curr_have_mh_disorder'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

# Create a figure with three bar charts
fig = go.Figure()

# Family History of Mental Health Issues (Percentage)
fig.add_trace(go.Bar(
    x=years,
    y=percent_fam_history,
    name="Family History (%)",
    marker=dict(color='blue')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_fam_history,
    mode='lines+markers',
    name="Trend Line (Family History)",
    line=dict(color='red', width=2, dash='dash')
))

# Had Mental Health Disorder in the Past (Percentage)
fig.add_trace(go.Bar(
    x=years,
    y=percent_past_mh,
    name="Past MH Disorder (%)",
    marker=dict(color='green')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_past_mh,
    mode='lines+markers',
    name="Trend Line (Past MH Disorder)",
    line=dict(color='darkgreen', width=2, dash='dash')
))

# Currently Have Mental Health Disorder (Percentage)
fig.add_trace(go.Bar(
    x=years,
    y=percent_curr_mh,
    name="Current MH Disorder (%)",
    marker=dict(color='orange')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_curr_mh,
    mode='lines+markers',
    name="Trend Line (Current MH Disorder)",
    line=dict(color='darkorange', width=2, dash='dash')
))

# Update layout
fig.update_layout(
    title="Mental Health Disorder Trends Over the Years (Percentage)",
    xaxis_title="Year",
    yaxis_title="Percentage of 'Yes' Responses",
    showlegend=True,
    height=600, width=900,
    barmode='group'  # Bars grouped together
)

# Show plot
fig.show()
fig.write_image("Mental Health Disorder Trends Over the Years (Percentage).png")


 ***Workplace Support***

In [10]:


df_trends['mh_employer_discussion_or_wellness_campaign'] = df_trends['mh_employer_discussion_or_wellness_campaign'].str.strip().str.lower()
df_trends['mh_resources_provided'] = df_trends['mh_resources_provided'].str.strip().str.lower()
df_trends['is_anonimity_protected_or_not'] = df_trends['is_anonimity_protected_or_not'].str.strip().str.lower()

years = sorted(df_trends['year'].unique())

percent_employer_discussion = [
    (df_trends[(df_trends['year'] == year) & (df_trends['mh_employer_discussion_or_wellness_campaign'] == 'yes')]['mh_employer_discussion_or_wellness_campaign'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

percent_resources_provided = [
    (df_trends[(df_trends['year'] == year) & (df_trends['mh_resources_provided'] == 'yes')]['mh_resources_provided'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

percent_anonymity_protected = [
    (df_trends[(df_trends['year'] == year) & (df_trends['is_anonimity_protected_or_not'] == 'yes')]['is_anonimity_protected_or_not'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=years,
    y=percent_employer_discussion,
    name="Employer Discussion (%)",
    marker=dict(color='blue')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_employer_discussion,
    mode='lines+markers',
    name="Trend Line (Employer Discussion)",
    line=dict(color='darkblue', width=2, dash='dash')
))

fig.add_trace(go.Bar(
    x=years,
    y=percent_resources_provided,
    name="Resources Provided (%)",
    marker=dict(color='green')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_resources_provided,
    mode='lines+markers',
    name="Trend Line (Resources Provided)",
    line=dict(color='darkgreen', width=2, dash='dash')
))

fig.add_trace(go.Bar(
    x=years,
    y=percent_anonymity_protected,
    name="Anonymity Protected (%)",
    marker=dict(color='orange')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_anonymity_protected,
    mode='lines+markers',
    name="Trend Line (Anonymity Protected)",
    line=dict(color='darkorange', width=2, dash='dash')
))

fig.update_layout(
    title="Workplace Mental Health Support Trends Over the Years (Percentage)",
    xaxis_title="Year",
    yaxis_title="Percentage of 'Yes' Responses",
    showlegend=True,
    height=600, width=900,
    barmode='group'
)

fig.show()

fig.write_image("Workplace Mental Health Support Trends Over the Years (Percentage).png")


In [11]:

df_trends['mh_employer_rate'] = pd.to_numeric(df_trends['mh_employer_rate'], errors='coerce')
df_trends['ph_employer_rate'] = pd.to_numeric(df_trends['ph_employer_rate'], errors='coerce')

years = sorted(df_trends['year'].unique())

avg_mh_employer_rate = [
    df_trends[df_trends['year'] == year]['mh_employer_rate'].mean()
    for year in years
]

avg_ph_employer_rate = [
    df_trends[df_trends['year'] == year]['ph_employer_rate'].mean()
    for year in years
]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=years,
    y=avg_mh_employer_rate,
    name="Mental Health Employer Importance (Avg Rating)",
    marker=dict(color='royalblue')
))

fig.add_trace(go.Scatter(
    x=years,
    y=avg_mh_employer_rate,
    mode='lines+markers',
    name="Trend Line (MH Employer Support)",
    line=dict(color='darkblue', width=2, dash='dash')
))

fig.add_trace(go.Bar(
    x=years,
    y=avg_ph_employer_rate,
    name="Physical Health Employer Importance (Avg Rating)",
    marker=dict(color='lightgreen')
))

fig.add_trace(go.Scatter(
    x=years,
    y=avg_ph_employer_rate,
    mode='lines+markers',
    name="Trend Line (PH Employer Importance)",
    line=dict(color='darkgreen', width=2, dash='dash')
))

fig.update_layout(
    title="Employer Importance Ratings for Mental & Physical Health Over the Years",
    xaxis_title="Year",
    yaxis_title="Average Rating (1 to 10)",
    showlegend=True,
    height=600, width=900,
    barmode='group'
)

fig.show()
fig.write_image("Employer Importance Ratings for Mental & Physical Health Over the Years.png")

In [198]:
print(df_trends['mh_issue_request_medical_leave'].unique())

["I don't know" 'Difficult' 'Somewhat easy' 'Very easy'
 'Neither easy nor difficult' 'Somewhat difficult']


In [199]:
print(df_trends['mh_issue_request_medical_leave'].isna().sum())



0


In [12]:
import plotly.graph_objects as go
import pandas as pd

# Check if the column 'mh_issue_request_medical_leave' is numeric
if not pd.api.types.is_numeric_dtype(df_trends['mh_issue_request_medical_leave']):
    # Mapping categorical values to numeric values
    leave_mapping = {
        "I don't know": 0,
        "Difficult": 1,
        "Somewhat difficult": 2,
        "Neither easy nor difficult": 3,
        "Somewhat easy": 4,
        "Very easy": 5
    }

    # Apply the mapping to convert categorical data to numeric
    df_trends['mh_issue_request_medical_leave'] = df_trends['mh_issue_request_medical_leave'].map(leave_mapping)

# Get unique years in sorted order
years = sorted(df_trends['year'].unique())

# Compute the average rating per year
avg_mh_leave_by_year = [
    df_trends[df_trends['year'] == year]['mh_issue_request_medical_leave'].mean()
    for year in years
]

# Create figure
fig = go.Figure()

# Bar chart for average rating by year
fig.add_trace(go.Bar(
    x=years,
    y=avg_mh_leave_by_year,
    name="Average Request MH Issue Medical Leave",
    marker=dict(color='teal')
))

# Update layout
fig.update_layout(
    title="Average easiness in Requesting Medical Leave for MH Issues Over the Years",
    xaxis_title="Year",
    yaxis_title="Average Rating (1 to 5)",
    xaxis=dict(tickmode='array', tickvals=years, ticktext=years),
    showlegend=True,
    height=600, width=900
)

# Show plot
fig.show()
fig.write_image("Average easiness in Requesting Medical Leave for MH Issues Over the Years.png")


### ***comfort discussing mental health***

In [13]:
df_trends['admit_mh_issue_in_interview'] = df_trends['admit_mh_issue_in_interview'].astype(str).str.strip().str.lower()
df_trends['admit_ph_issue_in_interview'] = df_trends['admit_ph_issue_in_interview'].astype(str).str.strip().str.lower()
df_trends['openly_id_as_person_wth_mh_issue'] = pd.to_numeric(df_trends['openly_id_as_person_wth_mh_issue'], errors='coerce')

years = sorted(df_trends['year'].unique())

percent_admit_mh = [
    (df_trends[(df_trends['year'] == year) & (df_trends['admit_mh_issue_in_interview'] == 'yes')]['admit_mh_issue_in_interview'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

percent_admit_ph = [
    (df_trends[(df_trends['year'] == year) & (df_trends['admit_ph_issue_in_interview'] == 'yes')]['admit_ph_issue_in_interview'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

percent_openly_id_mh = [
    (df_trends[(df_trends['year'] == year) & (df_trends['openly_id_as_person_wth_mh_issue'] == 1)]['openly_id_as_person_wth_mh_issue'].count() /
     df_trends[df_trends['year'] == year].shape[0]) * 100
    for year in years
]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=years,
    y=percent_admit_mh,
    name="Admit MH Issue (%)",
    marker=dict(color='purple')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_admit_mh,
    mode='lines+markers',
    name="Trend Line (Admit MH Issue)",
    line=dict(color='mediumblue', width=2, dash='dash')
))

fig.add_trace(go.Bar(
    x=years,
    y=percent_admit_ph,
    name="Admit PH Issue (%)",
    marker=dict(color='cyan')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_admit_ph,
    mode='lines+markers',
    name="Trend Line (Admit PH Issue)",
    line=dict(color='darkcyan', width=2, dash='dash')
))

fig.add_trace(go.Bar(
    x=years,
    y=percent_openly_id_mh,
    name="Openly Identify as MH Person (%)",
    marker=dict(color='orange')
))
fig.add_trace(go.Scatter(
    x=years,
    y=percent_openly_id_mh,
    mode='lines+markers',
    name="Trend Line (Openly Identify MH)",
    line=dict(color='darkorange', width=2, dash='dash')
))

fig.update_layout(
    title="Openness in Discussing Mental and Physical Health in Interviews Over the Years",
    xaxis_title="Year",
    yaxis_title="Percentage of 'Yes' or '1' Responses",
    showlegend=True,
    height=600, width=900,
    barmode='group'
)

fig.show()
fig.write_image("Openness in Discussing Mental and Physical Health in Interviews Over the Years.png")


In [15]:
df_trends['admit_mh_issue_with_friends/family'] = pd.to_numeric(df_trends['admit_mh_issue_with_friends/family'], errors='coerce')

years = sorted(df_trends['year'].unique())

avg_admit_mh_friends_family = [
    df_trends[df_trends['year'] == year]['admit_mh_issue_with_friends/family'].mean()
    for year in years
]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=years,
    y=avg_admit_mh_friends_family,
    name="Average Admit MH Issue with Friends/Family",
    marker=dict(color='royalblue')
))

fig.add_trace(go.Scatter(
    x=years,
    y=avg_admit_mh_friends_family,
    mode='lines+markers',
    name="Trend Line",
    line=dict(color='darkblue', width=2, dash='dash')
))

fig.update_layout(
    title="Average Comfort in Discussing Mental Health Issues with Friends/Family Over the Years",
    xaxis_title="Year",
    yaxis_title="Average Rating (1 to 10)",
    showlegend=True,
    height=600, width=900
)

fig.show()
fig.write_image("Average_Comfort_in_Discussing_Mental_Health_Issues_with_Friends_Family_Over_the_Years.png")


In [16]:
df_trends['mh_or_ph'] = df_trends['mh_or_ph'].astype(str).str.strip().str.lower()

years = sorted(df_trends['year'].unique())

labels = df_trends['mh_or_ph'].unique()

data_dict = {label: [] for label in labels}

for year in years:
    total_responses = df_trends[df_trends['year'] == year].shape[0]
    for label in labels:
        count = df_trends[(df_trends['year'] == year) & (df_trends['mh_or_ph'] == label)].shape[0]
        percentage = (count / total_responses) * 100 if total_responses > 0 else 0
        data_dict[label].append(percentage)

fig = go.Figure()

for label in labels:
    fig.add_trace(go.Bar(
        x=years,
        y=data_dict[label],
        name=label
    ))

fig.update_layout(
    title="Trend of MH or PH Over the Years",
    xaxis_title="Year",
    yaxis_title="Percentage of Responses",
    barmode='group',
    showlegend=True,
    height=600, width=900
)

fig.show()

fig.write_image("Trend of MH or PH Over the Years.png")

# ***Demographic Insights based on gender***

In [17]:
df_trends['fam_history'] = df_trends['fam_history'].str.strip().str.lower()
df_trends['past_have_mh_disorder'] = df_trends['past_have_mh_disorder'].str.strip().str.lower()
df_trends['curr_have_mh_disorder'] = df_trends['curr_have_mh_disorder'].str.strip().str.lower()
df_trends['tech_role_flag'] = pd.to_numeric(df_trends['tech_role_flag'], errors='coerce')

genders = sorted(df_trends['gender'].unique())

percent_past_mh_gender = [
    (df_trends[(df_trends['gender'] == gender) & (df_trends['past_have_mh_disorder'] == 'yes')]['past_have_mh_disorder'].count() /
     df_trends[df_trends['gender'] == gender].shape[0]) * 100
    for gender in genders
]

percent_curr_mh_gender = [
    (df_trends[(df_trends['gender'] == gender) & (df_trends['curr_have_mh_disorder'] == 'yes')]['curr_have_mh_disorder'].count() /
     df_trends[df_trends['gender'] == gender].shape[0]) * 100
    for gender in genders
]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=genders,
    y=percent_past_mh_gender,
    name="Past MH Disorder (%)",
    marker=dict(color='green')
))

fig.add_trace(go.Bar(
    x=genders,
    y=percent_curr_mh_gender,
    name="Current MH Disorder (%)",
    marker=dict(color='orange')
))

fig.update_layout(
    title="Mental Health Disorder by Gender (Percentage)",
    xaxis_title="Gender",
    yaxis_title="Percentage of 'Yes' Responses",
    showlegend=True,
    height=600, width=900,
    barmode='group'
)

fig.show()
fig.write_image("Mental Health Disorder by Gender (Percentage).png")

In [20]:
df_trends['admit_mh_issue_with_friends/family'] = pd.to_numeric(df_trends['admit_mh_issue_with_friends/family'], errors='coerce')

genders = sorted(df_trends['gender'].unique())

avg_admit_mh_friends_family_gender = [
    df_trends[df_trends['gender'] == gender]['admit_mh_issue_with_friends/family'].mean()
    for gender in genders
]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=genders,
    y=avg_admit_mh_friends_family_gender,
    name="Average Admit MH Issue with Friends/Family",
    marker=dict(color='royalblue')
))

fig.add_trace(go.Scatter(
    x=genders,
    y=avg_admit_mh_friends_family_gender,
    mode='lines+markers',
    name="Trend Line",
    line=dict(color='darkblue', width=2, dash='dash')
))

fig.update_layout(
    title="Average Comfort in Discussing Mental Health Issues with Friends/Family by Gender",
    xaxis_title="Gender",
    yaxis_title="Average Rating (1 to 10)",
    xaxis=dict(tickmode='array', tickvals=genders, ticktext=["Male", "Female", "Other"]),
    showlegend=True,
    height=600, width=900
)

fig.show()
fig.write_image("Average_Comfort_in_Discussing_Mental_Health_Issues_with_FriendsFamily_by_Gender.png")

In [19]:
df_trends['admit_mh_issue_in_interview'] = df_trends['admit_mh_issue_in_interview'].astype(str).str.strip().str.lower()
df_trends['admit_ph_issue_in_interview'] = df_trends['admit_ph_issue_in_interview'].astype(str).str.strip().str.lower()

genders = sorted(df_trends['gender'].unique())

percent_admit_mh = [
    (df_trends[(df_trends['gender'] == gender) & (df_trends['admit_mh_issue_in_interview'] == 'yes')]['admit_mh_issue_in_interview'].count() /
     df_trends[df_trends['gender'] == gender].shape[0]) * 100
    for gender in genders
]

percent_admit_ph = [
    (df_trends[(df_trends['gender'] == gender) & (df_trends['admit_ph_issue_in_interview'] == 'yes')]['admit_ph_issue_in_interview'].count() /
     df_trends[df_trends['gender'] == gender].shape[0]) * 100
    for gender in genders
]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=genders,
    y=percent_admit_mh,
    name="Admit MH Issue (%)",
    marker=dict(color='purple')
))

fig.add_trace(go.Bar(
    x=genders,
    y=percent_admit_ph,
    name="Admit PH Issue (%)",
    marker=dict(color='cyan')
))

fig.update_layout(
    title="Openness in Discussing Mental and Physical Health in Interviews by Gender",
    xaxis_title="Gender",
    yaxis_title="Percentage of 'Yes' or '1' Responses",
    xaxis=dict(tickmode='array', tickvals=genders, ticktext=["Male", "Female", "Other"]),
    showlegend=True,
    height=600, width=900,
    barmode='group'
)

fig.show()

fig.write_image("Openness in Discussing Mental and Physical Health in Interviews by Gender.png")


In [21]:

df_trends['mh_or_ph'] = df_trends['mh_or_ph'].astype(str).str.strip().str.lower()
df_trends['gender'] = df_trends['gender'].astype(str).str.strip().str.lower()

genders = sorted(df_trends['gender'].unique())
labels = df_trends['mh_or_ph'].unique()

data_dict = {label: [] for label in labels}

for gender in genders:
    total_responses = df_trends[df_trends['gender'] == gender].shape[0]
    for label in labels:
        count = df_trends[(df_trends['gender'] == gender) & (df_trends['mh_or_ph'] == label)].shape[0]
        percentage = (count / total_responses) * 100 if total_responses > 0 else 0
        data_dict[label].append(percentage)

fig = go.Figure()

for label in labels:
    fig.add_trace(go.Bar(
        x=genders,
        y=data_dict[label],
        name=label
    ))

fig.update_layout(
    title="Trend of MH or PH by Gender",
    xaxis_title="Gender",
    yaxis_title="Percentage of Responses",
    barmode='group',
    showlegend=True,
    height=600, width=900
)

fig.show()


fig.write_image("plot_mh_or_ph_by_gender.png")
