In this notebook, I will perform the following tasks:
1.) Encode Data 
2.) Standardize Data
3.) Feature Selection and Feature Engineering
4.) Fit Models
5.) Evaluate Models

In [92]:
# Import Libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

In [93]:
# Load Data

data = pd.read_csv('tech_df_cleaned.csv', index_col=None)
data.describe(include='all')

Unnamed: 0,number_of_employees,tech_company,mental_health_benefits,mental_health_coverage_awareness,mental_health_offical_communication,mental_health_resources,mental_health_anonymity,asking_for_leave_for_mental_health,discussing_mental_health_with_employer,discussing_physical_health_with_employer,...,at_any_point_diagnosed_MH_disorder,treatment_MH_disorder,interference_with_work_with_effective_treatment,interference_with_work_NOT_effective_treatment,gender,country_residence,country_work,work_position,remote_work,age_group
count,1146,1146,1146,1146,1146,1146,1146,1146,1146,1146,...,1146,1146,1146,1146,1146,1146,1146,1146,1146,1146
unique,6,2,4,4,3,3,3,6,3,3,...,2,2,5,5,3,7,7,180,3,6
top,26-100,Yes,Yes,No,No,No,I don't know,Somewhat easy,Maybe,No,...,No,Yes,Not applicable to me,Often,Male,North America,North America,Back-end Developer,Sometimes,30-39
freq,292,883,531,354,813,531,742,281,487,837,...,579,657,455,422,846,774,779,238,611,543


In [94]:
# Encode Data
# Encode Work Positions using MultiLabelBinarizer() because some participants have multiple roles.

one_hot_multiclass = MultiLabelBinarizer()
data_encoded = one_hot_multiclass.fit_transform(data['work_position'].str.split('|'))
data = pd.concat([data.drop(columns=['work_position']), pd.DataFrame(data_encoded, columns=one_hot_multiclass.classes_)], axis=1)
one_hot_multiclass.classes_

array(['Back-end Developer', 'Designer', 'Dev Evangelist/Advocate',
       'DevOps/SysAdmin', 'Executive Leadership', 'Front-end Developer',
       'HR', 'One-person shop', 'Other', 'Sales', 'Supervisor/Team Lead',
       'Support'], dtype=object)

In [95]:
# One hot encode categorical variables
# Use if_binary to drop one of the binary columns to avoid multicollinearity

categorical_columns = data.select_dtypes(include='object').columns.tolist()
encoder = OneHotEncoder(sparse_output=False, drop='if_binary')

one_hot_encoded = encoder.fit_transform(data[categorical_columns])

one_hot_data =pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

data = pd.concat([data.drop(columns=categorical_columns), one_hot_data], axis=1)

data.head()


Unnamed: 0,Back-end Developer,Designer,Dev Evangelist/Advocate,DevOps/SysAdmin,Executive Leadership,Front-end Developer,HR,One-person shop,Other,Sales,...,country_work_South America,remote_work_Always,remote_work_Never,remote_work_Sometimes,age_group_20-29,age_group_30-39,age_group_40-49,age_group_50-59,age_group_60+,age_group_< 20
0,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,0,0,0,0,1,0,0,0,0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1,0,1,1,1,1,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,1,0,1,0,1,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [96]:
# Inverse transform data to verify encoding

"""data_encoded = pd.concat([data.drop(columns=one_hot_data.columns), pd.DataFrame(encoder.inverse_transform(one_hot_data), columns=categorical_columns)], axis=1)

data_encoded.head()"""


'data_encoded = pd.concat([data.drop(columns=one_hot_data.columns), pd.DataFrame(encoder.inverse_transform(one_hot_data), columns=categorical_columns)], axis=1)\n\ndata_encoded.head()'

In [97]:
# Standardize Data



Unnamed: 0,Back-end Developer,Designer,Dev Evangelist/Advocate,DevOps/SysAdmin,Executive Leadership,Front-end Developer,HR,One-person shop,Other,Sales,...,country_work_South America,remote_work_Always,remote_work_Never,remote_work_Sometimes,age_group_20-29,age_group_30-39,age_group_40-49,age_group_50-59,age_group_60+,age_group_< 20
count,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,...,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0,1146.0
mean,0.524433,0.077661,0.071553,0.19459,0.051483,0.348168,0.008726,0.02007,0.129145,0.009599,...,0.013962,0.189354,0.277487,0.533159,0.337696,0.473822,0.152705,0.027051,0.006108,0.002618
std,0.499621,0.267755,0.257859,0.396057,0.221078,0.476597,0.093045,0.1403,0.335507,0.097544,...,0.117383,0.391961,0.447954,0.499117,0.473131,0.499532,0.35986,0.162302,0.07795,0.05112
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
