<p><strong><span style="font-size: 30px;">Imports</span></strong></p>

In [None]:
pip install kagglehub

In [1]:
import pandas as pd
import os
import kagglehub
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report

In [2]:
import statsmodels.api as sm

In [3]:
path = kagglehub.dataset_download("osmi/mental-health-in-tech-survey")

print("Path to dataset files:", path)

Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/osmi/mental-health-in-tech-survey/versions/3


In [4]:
print(os.listdir("/home/jovyan/.cache/kagglehub/datasets/osmi/mental-health-in-tech-survey/versions/3"))

['survey.csv']


<p><strong><span style="font-size: 30px;">Exploratory Data Analysis and Cleaning</span></strong></p>

In [107]:
data_frame = pd.read_csv("/home/jovyan/.cache/kagglehub/datasets/osmi/mental-health-in-tech-survey/versions/3/survey.csv")

In [108]:
pd.set_option("display.max_columns", None)

In [109]:
print(data_frame.dtypes)

Timestamp                    object
Age                           int64
Gender                       object
Country                      object
state                        object
self_employed                object
family_history               object
treatment                    object
work_interfere               object
no_employees                 object
remote_work                  object
tech_company                 object
benefits                     object
care_options                 object
wellness_program             object
seek_help                    object
anonymity                    object
leave                        object
mental_health_consequence    object
phys_health_consequence      object
coworkers                    object
supervisor                   object
mental_health_interview      object
phys_health_interview        object
mental_vs_physical           object
obs_consequence              object
comments                     object
dtype: object


In [110]:
data_frame.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [111]:
data_frame.iloc[5].T

Timestamp                    2014-08-27 11:31:22
Age                                           33
Gender                                      Male
Country                            United States
state                                         TN
self_employed                                NaN
family_history                               Yes
treatment                                     No
work_interfere                         Sometimes
no_employees                                6-25
remote_work                                   No
tech_company                                 Yes
benefits                                     Yes
care_options                            Not sure
wellness_program                              No
seek_help                             Don't know
anonymity                             Don't know
leave                                 Don't know
mental_health_consequence                     No
phys_health_consequence                       No
coworkers           

In [112]:
data_frame.shape

(1259, 27)

In [113]:
skip_cols = ["Timestamp", "comments"]

for col in data_frame.columns:
    if col not in skip_cols:
        unique_vals = sorted(data_frame[col].dropna().unique())
        print(f"\n\033[1mColumn:\033[0m {col}")
        print(unique_vals)



[1mColumn:[0m Age
[np.int64(-1726), np.int64(-29), np.int64(-1), np.int64(5), np.int64(8), np.int64(11), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(60), np.int64(61), np.int64(62), np.int64(65), np.int64(72), np.int64(329), np.int64(99999999999)]

[1mColumn:[0m Gender
['A little about you', 'Agender', 'All', 'Androgyne', 'Cis Female', 'Cis Male', 'Cis Man', 'Enby', 'F', 'Femake', 'Female', 'Female ', 'Female (cis)', 'Female (trans)', 'Genderqueer', 'Guy (-ish) ^_^'

In [114]:
df = data_frame[(data_frame["Age"] >= 15) & (data_frame["Age"] <= 100)].copy()

In [115]:
def clean_gender(g):
    
    if pd.isnull(g):
        return "Other"
    
    g = g.strip().lower()
    
    male_terms = {
        'male', 'm', 'man', 'cis male', 'cis man', 'msle', 'malr', 'mal', 'maile', 
        'make', 'mail', 'male ', 'male (cis)'
    }
    
    female_terms = {
        'female', 'f', 'woman', 'cis female', 'cis-female/femme', 'femail', 
        'femake', 'female ', 'female (cis)', 'female (trans)', 
        'cis woman'
    }
    
    if g in male_terms:
        return "Male"
    elif g in female_terms:
        return "Female"
    else:
        return "Other"

df['Gender'] = df['Gender'].astype(str)
df['Gender'] = df['Gender'].apply(clean_gender)

In [116]:
df.head(70)

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,More than 1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,2014-08-27 11:51:07,40,Male,United States,CA,No,Yes,No,Sometimes,More than 1000,Yes,Yes,Yes,Yes,Yes,Yes,Don't know,Don't know,Yes,Maybe,Some of them,No,No,No,No,Yes,
66,2014-08-27 11:51:34,23,Female,Australia,,No,Yes,Yes,Often,1-5,Yes,Yes,No,Not sure,No,No,Don't know,Very easy,No,No,Some of them,Yes,No,Maybe,Yes,No,Thanks for doing this research.
67,2014-08-27 11:52:07,36,Male,United States,TX,No,No,No,Sometimes,100-500,Yes,Yes,Yes,No,Don't know,Yes,Don't know,Don't know,Maybe,No,Some of them,Some of them,Maybe,Yes,Yes,No,
68,2014-08-27 11:52:41,31,Female,United States,NM,No,No,No,,26-100,Yes,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,Some of them,No,No,Maybe,Don't know,No,


In [117]:
df = df.drop(columns=["Timestamp", "comments"])
df = df[df["work_interfere"].notna()]

<p><strong><span style="font-size: 30px;">Primary Learning Process</span></strong></p>

<p><strong><span style="font-size: 15px;">We are trying to predict the "Work Interference Column" </span></strong></p>

In [118]:
df.columns

Index(['Age', 'Gender', 'Country', 'state', 'self_employed', 'family_history',
       'treatment', 'work_interfere', 'no_employees', 'remote_work',
       'tech_company', 'benefits', 'care_options', 'wellness_program',
       'seek_help', 'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [119]:
# df['Age'] = df['Age'].astype(int)

# df['Gender'] = df['Gender'].astype(str)
# #df['Country'] = df['Country'].astype(str)
# #df['state'] = df['state'].astype(str)

# columns_to_convert = ['self_employed', 'family_history', 'treatment', 'work_interfere', 'no_employees', 'remote_work',
#        'tech_company', 'benefits', 'care_options', 'wellness_program',
#        'seek_help', 'anonymity', 'leave', 'mental_health_consequence',
#        'phys_health_consequence', 'coworkers', 'supervisor',
#        'mental_health_interview', 'phys_health_interview',
#        'mental_vs_physical', 'obs_consequence']
# df[columns_to_convert] = df[columns_to_convert].astype(str)

In [120]:
# label_encoders = {}
# for col in X.columns:
#     if X[col].dtype == "object":
#         le = LabelEncoder()
#         X[col] = le.fit_transform(X[col].astype(str))
#         label_encoders[col] = le

In [121]:
# le = LabelEncoder()
# y = le.fit_transform(y)

In [122]:
df['Gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [123]:


# 2. Ordinal mappings
ordinal_maps = {
    'no_employees': {
        '1-5': 1,
        '6-25': 2,
        '26-100': 3,
        '100-500': 4,
        '500-1000': 5,
        'More than 1000': 6
    },
    'leave': {
        "Don't know": 0,
        'Very difficult': 1,
        'Somewhat difficult': 2,
        'Somewhat easy': 3,
        'Very easy': 4
    }    
}

# 3. Binary or ordinal columns (Label Encoding)
label_encode_cols = [
    'self_employed', 'family_history', 'treatment',
    'remote_work', 'tech_company', 'benefits',
    'care_options', 'wellness_program', 'seek_help',
    'anonymity', 'mental_health_consequence', 'phys_health_consequence',
    'coworkers', 'supervisor', 'mental_health_interview',
    'phys_health_interview', 'mental_vs_physical', 'obs_consequence'
]

# 4. Nominal (One-Hot Encode) columns
one_hot_encode_cols = ['Gender']

# 5. Leave numerical columns as-is
numeric_cols = ['Age']

# -- Apply encoding --

# A. Ordinal columns with mappings
for col, mapping in ordinal_maps.items():
    df[col] = df[col].map(mapping)

# B. Label Encoding for binary/categorical yes/no/maybe columns
le = LabelEncoder()
for col in label_encode_cols:
    df[col] = le.fit_transform(df[col].astype(str))  # Convert to string in case of NaNs or mixed types

# C. One-Hot Encoding
df = pd.get_dummies(df, columns=one_hot_encode_cols)


# Convert boolean columns to integers (if necessary)
df['Gender_Female'] = df['Gender_Female'].astype(int)
df['Gender_Male'] = df['Gender_Male'].astype(int)
df['Gender_Other'] = df['Gender_Other'].astype(int)

X = df.drop(columns=["work_interfere", "Country", "state"])
y = df["work_interfere"]



In [124]:
X = df.drop(columns=["work_interfere", "Country", "state"])
y = df["work_interfere"]

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [126]:
print(X_train.dtypes)

Age                          int64
self_employed                int64
family_history               int64
treatment                    int64
no_employees                 int64
remote_work                  int64
tech_company                 int64
benefits                     int64
care_options                 int64
wellness_program             int64
seek_help                    int64
anonymity                    int64
leave                        int64
mental_health_consequence    int64
phys_health_consequence      int64
coworkers                    int64
supervisor                   int64
mental_health_interview      int64
phys_health_interview        int64
mental_vs_physical           int64
obs_consequence              int64
Gender_Female                int64
Gender_Male                  int64
Gender_Other                 int64
dtype: object


In [127]:
# # EDA (checking which features are signficant by checking OLS regression results)
# # OLS only works with numerical data 

# model1 = sm.OLS(y_train, X_train).fit() # (OLS) ordinary least square
# print(model1.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [81]:
# Try with only significant columns first

# sig_cols = ["Gender", "family_history", "treatment", "tech_company", "mental_health_interview"]

# X_train = X_train[sig_cols]
# X_test = X_test[sig_cols]

In [128]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import loguniform
from sklearn.metrics import accuracy_score

In [156]:
# Hyperparameter tuning for base models

# Define the base models
#dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression(random_state=42)

In [157]:
# Hyperparameter tuning for random foresr

# Define the parameter distribution
param_dist = {
    'n_estimators': randint(100, 500),          # number of trees
    'max_depth': randint(4, 8),                # max depth of each tree
    'min_samples_split': randint(2, 10),        # min samples to split a node
    'min_samples_leaf': randint(1, 10),         # min samples at a leaf node
    'max_features': ['sqrt', 'log2', None],     # number of features to consider at split
    'bootstrap': [True],                 # use bootstrap sampling
    'criterion': ['gini', 'entropy']            # impurity function
}

# Randomized search setup for RandomForestClassifier()
search_obj_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,                 # number of random combinations
    cv=5,                      # 5-fold cross-validation
    scoring='f1_macro',        # or 'accuracy', 'f1_weighted'
    n_jobs=-1,                 # use all CPU cores
    random_state=42,
    verbose=1
)

In [158]:
search_obj_rf.fit(X_train, y_train)
best_rf = search_obj_rf.best_estimator_
best_rf

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [132]:
# Hyperparameter tuning for logistic regression

# Base model
lr = LogisticRegression(multi_class='multinomial', max_iter=1000)

# Define parameter distributions
param_dist = {
    'C': loguniform(1e-4, 1e3),           # Regularization strength (inverse)
    'penalty': ['l2'],                   # For multinomial, only 'l2' is supported by most solvers
    'solver': ['lbfgs', 'saga'],         # Solvers that support multi_class='multinomial'
}

# Randomized search setup
search_obj_lr = RandomizedSearchCV(
    estimator=lr,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='f1_macro',       # good for multiclass with balanced classes
    n_jobs=-1,
    random_state=42
)

In [133]:
search_obj_lr.fit(X_train, y_train)
best_lr = search_obj_lr.best_estimator_
best_lr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [160]:

# Ensemble method
# Random Forest + Logistic Regression 
subset_columns = ["Gender", "family_history", "treatment", "tech_company", "mental_health_interview"]

estimators = []
estimators.append(('rf', best_rf))
estimators.append(('lr', best_lr))
VC = VotingClassifier(estimators = estimators, voting='soft')
VC.fit(X_train, y_train)



In [161]:
y_train_pred = VC.predict(X_train)

# Training accuracy
print(accuracy_score(y_train, y_train_pred))

0.5967130214917825


In [162]:
# Test accuracy
y_test_pred = VC.predict(X_test)
print(accuracy_score(y_test, y_test_pred))

0.5909090909090909


In [137]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [138]:
# Precision, recall, and F-1 stats

# Get metrics (for binary or multiclass)
precision = precision_score(y_test, y_test_pred, average='macro')  # or 'micro', 'weighted'
recall = recall_score(y_test, y_test_pred, average='macro')
f1 = f1_score(y_test, y_test_pred, average='macro')

print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")

Precision: 0.46
Recall:    0.44
F1 Score:  0.38


In [None]:
# Feature engineering

In [159]:
# Accuracy of just random forest

# Make predictions on the train set
y_rf_train_pred = best_rf.predict(X_train)

# Make predictions on the test set
y_rf_test_pred = best_rf.predict(X_test)

# Evaluate the model
accuracy_train = accuracy_score(y_train, y_rf_train_pred)
accuracy_test = accuracy_score(y_test, y_rf_test_pred)
print(f"RF Train Accuracy: {accuracy_train:.2f}")
print(f"RF Test Accuracy: {accuracy_test:.2f}")



# # Print classification report
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))

# # Print confusion matrix
# print("\nConfusion Matrix:")
# print(confusion_matrix(y_test, y_pred))

RF Train Accuracy: 0.63
RF Test Accuracy: 0.60


In [150]:
# Accuracy of just logistic regression

# Make predictions on the train set
y_lr_train_pred = best_lr.predict(X_train)

# Make predictions on the test set
y_lr_test_pred = best_lr.predict(X_test)

# Evaluate the model
accuracy_train = accuracy_score(y_train, y_lr_train_pred)
accuracy_test = accuracy_score(y_test, y_lr_test_pred)
print(f"LR Train Accuracy: {accuracy_train:.2f}")
print(f"LR Test Accuracy: {accuracy_test:.2f}")



# # Print classification report
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))

# # Print confusion matrix
# print("\nConfusion Matrix:")
# print(confusion_matrix(y_test, y_pred))

RF Train Accuracy: 0.56
RF Test Accuracy: 0.57
