In [39]:
import pandas as pd

import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")

spliting train set to train and test, since the given test set has no labels

In [40]:
data_dir = "./data/"
data_df = pd.read_csv(data_dir + "train.csv")
train, test = train_test_split(data_df, test_size=0.2,shuffle=True)


In [41]:
train.to_csv('./data/initial_train.csv')  
test.to_csv('./data/initial_test.csv') 

In [42]:
data_dir = "./data/"

train_df = pd.read_csv(data_dir + "initial_train.csv")
test_df = pd.read_csv(data_dir + "initial_test.csv")


**Train dataset - Preprocessing**

In [43]:
train_col = train_df.columns.tolist()
test_col = test_df.columns.tolist()

train_df.shape, test_df.shape

((112560, 21), (28140, 21))

In [44]:
print('Duplicates in train:', train_df.duplicated().sum())
print('Duplicates in test:', test_df.duplicated().sum())

Duplicates in train: 0
Duplicates in test: 0


In [45]:
print(train_df["Depression"].value_counts())
print(test_df["Depression"].value_counts())

Depression
0    92089
1    20471
Name: count, dtype: int64
Depression
0    23044
1     5096
Name: count, dtype: int64


In [46]:
missing_values_train = train_df.isnull().sum()
print(missing_values_train)


Unnamed: 0                                   0
id                                           0
Name                                         0
Gender                                       0
Age                                          0
City                                         0
Working Professional or Student              0
Profession                               29272
Academic Pressure                        90251
Work Pressure                            22324
CGPA                                     90249
Study Satisfaction                       90250
Job Satisfaction                         22318
Sleep Duration                               0
Dietary Habits                               4
Degree                                       1
Have you ever had suicidal thoughts ?        0
Work/Study Hours                             0
Financial Stress                             4
Family History of Mental Illness             0
Depression                                   0
dtype: int64


In [47]:
print("Num of rows with missing value in Depression (target column) in training dataset:", train_df["Depression"].isnull().sum())

Num of rows with missing value in Depression (target column) in training dataset: 0


In [48]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112560 entries, 0 to 112559
Data columns (total 21 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Unnamed: 0                             112560 non-null  int64  
 1   id                                     112560 non-null  int64  
 2   Name                                   112560 non-null  object 
 3   Gender                                 112560 non-null  object 
 4   Age                                    112560 non-null  float64
 5   City                                   112560 non-null  object 
 6   Working Professional or Student        112560 non-null  object 
 7   Profession                             83288 non-null   object 
 8   Academic Pressure                      22309 non-null   float64
 9   Work Pressure                          90236 non-null   float64
 10  CGPA                                   22311 non-null   

In [49]:
train_df.head(10)

Unnamed: 0.1,Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,...,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,26473,26473,Ansh,Male,36.0,Ahmedabad,Working Professional,Business Analyst,,5.0,...,,1.0,5-6 hours,Healthy,MBA,Yes,7.0,3.0,No,1
1,138186,138186,Anushka,Female,35.0,Indore,Working Professional,HR Manager,,3.0,...,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,No,0
2,140584,140584,Vibha,Female,22.0,Kanpur,Student,,4.0,,...,2.0,,More than 8 hours,Unhealthy,M.Com,Yes,6.0,5.0,No,1
3,95466,95466,Lavanya,Female,49.0,Kalyan,Working Professional,Research Analyst,,2.0,...,,2.0,7-8 hours,Unhealthy,MBA,Yes,8.0,3.0,No,0
4,99540,99540,Vrinda,Female,40.0,Indore,Working Professional,,,4.0,...,,5.0,7-8 hours,Healthy,Class 12,No,6.0,1.0,No,0
5,29267,29267,Gauri,Female,50.0,Kalyan,Working Professional,Entrepreneur,,4.0,...,,2.0,Less than 5 hours,Healthy,BBA,No,10.0,5.0,Yes,0
6,97874,97874,Rashi,Female,57.0,Indore,Working Professional,Teacher,,5.0,...,,5.0,7-8 hours,Healthy,PhD,Yes,9.0,5.0,Yes,0
7,45563,45563,Ishan,Male,46.0,Delhi,Working Professional,Educational Consultant,,4.0,...,,4.0,More than 8 hours,Healthy,M.Ed,Yes,11.0,4.0,No,0
8,40646,40646,Aahana,Female,48.0,Kalyan,Working Professional,Teacher,,2.0,...,,4.0,More than 8 hours,Unhealthy,M.Pharm,Yes,4.0,3.0,Yes,0
9,109604,109604,Anushka,Female,46.0,Kolkata,Working Professional,Content Writer,,1.0,...,,2.0,Less than 5 hours,Healthy,B.Ed,No,1.0,2.0,Yes,0


In [50]:
train_df["City"].unique()

array(['Ahmedabad', 'Indore', 'Kanpur', 'Kalyan', 'Delhi', 'Kolkata',
       'Ludhiana', 'Surat', 'Agra', 'Rajkot', 'Bhopal', 'Bangalore',
       'Ghaziabad', 'Mumbai', 'Vadodara', 'Faridabad', 'Thane',
       'Vasai-Virar', 'Meerut', 'Nagpur', 'Jaipur', 'Lucknow', 'Pune',
       'Varanasi', 'Visakhapatnam', 'Nashik', 'Chennai', 'Srinagar',
       'Patna', 'Hyderabad', 'Less than 5 Kalyan', 'Mihir', 'Galesabad',
       'Molkata', 'Kagan', 'Kibara', 'Keshav', 'MSc', 'Parth', 'Tolkata',
       'Less Delhi', 'Researcher', 'Nandini', 'Pratyush', 'Aaradhya',
       'MCA', 'Vaishnavi', 'No', 'Saanvi', 'City', 'Jhanvi', 'Itheg',
       'Ayush', '3.0', 'Mira', 'Khaziabad', 'Mahi', 'Tushar', 'Plata',
       'Raghavendra', 'Dhruv', 'Atharv', 'Nalini', 'Moreadhyay', 'Ayansh',
       'Vidya', 'Krishna', 'Shrey', 'Vaanya', 'Gurgaon', 'Anvi', 'Bhavna',
       'ME', 'M.Com', 'Armaan', 'Harsha', 'Chhavi', 'Ivaan', 'Reyansh',
       'Malyansh', 'M.Tech'], dtype=object)

In [51]:
train_df["Gender"].unique()

array(['Male', 'Female'], dtype=object)

In [52]:
train_df["Working Professional or Student"].unique()

array(['Working Professional', 'Student'], dtype=object)

In [53]:
train_df["Profession"].unique()

array(['Business Analyst', 'HR Manager', nan, 'Research Analyst',
       'Entrepreneur', 'Teacher', 'Educational Consultant',
       'Content Writer', 'Lawyer', 'Marketing Manager', 'UX/UI Designer',
       'Chemist', 'Graphic Designer', 'Travel Consultant', 'Pharmacist',
       'Pilot', 'Electrician', 'Chef', 'Architect', 'Sales Executive',
       'Civil Engineer', 'Doctor', 'Mechanical Engineer',
       'Digital Marketer', 'Plumber', 'Consultant', 'Finanancial Analyst',
       'Judge', 'Data Scientist', 'Financial Analyst', 'Researcher',
       'Software Engineer', 'Manager', 'Customer Support', 'Accountant',
       'Profession', 'LLM', 'Investment Banker', 'Medical Doctor',
       'Student', 'Academic', 'PhD', 'Yogesh', 'Unemployed',
       'Working Professional', 'BCA', 'Yuvraj', 'Dev',
       'Family Consultant', 'B.Ed', 'MBBS', 'BE', 'FamilyVirar',
       'Moderate', 'City Manager', 'Pranav', 'M.Ed', 'Nagpur', 'B.Com'],
      dtype=object)

In [54]:
train_df["Academic Pressure"].unique()

array([nan,  4.,  3.,  5.,  2.,  1.])

In [55]:
train_df["Work Pressure"].unique()

array([ 5.,  3., nan,  2.,  4.,  1.])

In [56]:
train_df["Study Satisfaction"].unique()

array([nan,  2.,  5.,  3.,  1.,  4.])

In [57]:
train_df["Job Satisfaction"].unique()

array([ 1., nan,  2.,  5.,  4.,  3.])

In [58]:
train_df["Sleep Duration"].unique()

array(['5-6 hours', 'More than 8 hours', '7-8 hours', 'Less than 5 hours',
       '35-36 hours', '6-7 hours', 'Work_Study_Hours', '6-8 hours',
       '2-3 hours', '1-3 hours', '4-5 hours', '10-11 hours', '4-6 hours',
       '3-4 hours', '55-66 hours', '1-6 hours', '8-9 hours', 'Moderate',
       'No', 'Indore', 'Unhealthy', '3-6 hours', '10-6 hours',
       '9-6 hours', '45', 'than 5 hours', '40-45 hours', '9-5',
       'Sleep_Duration', '9-11 hours', '9-5 hours', '49 hours'],
      dtype=object)

In [59]:
train_df["Dietary Habits"].unique()

array(['Healthy', 'Unhealthy', 'Moderate', 'No', 'Less Healthy',
       'Class 12', 'Hormonal', 'Less than Healthy', 'Mihir', nan,
       'Pratham', 'Yes', 'Indoor', 'More Healthy', 'Gender', '1.0',
       'Male', '3', 'M.Tech', 'BSc', 'Electrician'], dtype=object)

In [60]:
train_df["Degree"].unique()

array(['MBA', 'BBA', 'M.Com', 'Class 12', 'PhD', 'M.Ed', 'M.Pharm',
       'B.Ed', 'BCA', 'LLB', 'MA', 'LLM', 'ME', 'B.Arch', 'B.Tech', 'MSc',
       'MBBS', 'BHM', 'MD', 'BE', 'M.Tech', 'MHM', 'BSc', 'B.Com',
       'B.Pharm', 'MCA', 'BA', 'S.Tech', 'LLEd', '20', 'BArch', 'B.03',
       'B B.Com', '7.06', '0', 'MEd', 'LLS', 'M_Tech', '5.88', 'M.Arch',
       'Esha', 'Unite', 'Nalini', 'S.Pharm', 'P.Com', 'LLCom', 'LLBA',
       'L.Ed', 'LLTech', 'HCA', 'B.Sc', 'B.3.79', 'Veda', 'Mihir',
       'Badhya', 'Pihu', 'Bhopal', 'LL B.Ed', '29', 'B.Student', '5.61',
       'S.Arch', 'B', 'Navya', 'Data Scientist', 'K.Ed', 'HR Manager',
       'UX/UI Designer', 'Kalyan', 'Ritik', 'BEd', 'ACA', 'MPharm', nan,
       'Working Professional', 'Entrepreneur', 'LL.Com', 'Plumber', 'LHM',
       'M', 'MPA', 'E.Tech', 'MTech', 'Jhanvi', 'Degree', 'B BA',
       'Business Analyst', 'Class 11', 'H_Pharm', 'CGPA', 'Vrinda',
       'Brit', 'Advait', 'M. Business Analyst', 'Brithika', 'Vivaan',
       'BH'

In [61]:
train_df["Have you ever had suicidal thoughts ?"].unique()

array(['Yes', 'No'], dtype=object)

In [62]:
train_df["Work/Study Hours"].unique()

array([ 7.,  9.,  6.,  8., 10., 11.,  4.,  1.,  2., 12.,  0.,  3.,  5.])

In [63]:
train_df["Financial Stress"].unique()

array([ 3.,  4.,  5.,  1.,  2., nan])

In [64]:
train_df["Family History of Mental Illness"].unique()

array(['No', 'Yes'], dtype=object)

In [65]:
train_df["Depression"].unique()

array([1, 0])

In [66]:
num_features = ['Academic Pressure', 'Work Pressure','CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Financial Stress', 
                'Age', 'Work/Study Hours' ]
num_imputer = SimpleImputer(strategy="median")
#num_imputer = KNNImputer(n_neighbors=2, weights="uniform")
# Impute numerical fields
train_df[num_features] = num_imputer.fit_transform(train_df[num_features])

scaler = StandardScaler()
train_df[num_features] = scaler.fit_transform(train_df[num_features])

In [67]:
cat_features = ['Gender', 'City', 'Working Professional or Student','Profession', 'Sleep Duration', 'Dietary Habits', 
                'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
train_df[cat_features] = cat_imputer.fit_transform(train_df[cat_features])
print("Before encoding: ", train_df.shape)



Before encoding:  (112560, 21)


In [70]:
#OneHotEncoder
cat_features_oh = ['Gender', 'Working Professional or Student', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
encoder = OneHotEncoder( sparse=False, handle_unknown='ignore')
#encoder = OneHotEncoder( sparse_output=False, handle_unknown='ignore')
one_hot_encoded = encoder.fit_transform(train_df[cat_features_oh])
print(one_hot_encoded.shape)
one_hot_df = pd.DataFrame(one_hot_encoded, 
                          columns=encoder.get_feature_names(cat_features_oh))
# one_hot_df = pd.DataFrame(one_hot_encoded, 
#                           columns=encoder.get_feature_names_out(cat_features_oh))
print(one_hot_df.columns)
final_train_df = pd.concat([train_df.drop(cat_features_oh, axis=1), one_hot_df], axis=1)
print(final_train_df.columns)
print(final_train_df.shape)



(112560, 8)
Index(['Gender_Female', 'Gender_Male',
       'Working Professional or Student_Student',
       'Working Professional or Student_Working Professional',
       'Have you ever had suicidal thoughts ?_No',
       'Have you ever had suicidal thoughts ?_Yes',
       'Family History of Mental Illness_No',
       'Family History of Mental Illness_Yes'],
      dtype='object')
Index(['Unnamed: 0', 'id', 'Name', 'Age', 'City', 'Profession',
       'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
       'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree',
       'Work/Study Hours', 'Financial Stress', 'Depression', 'Gender_Female',
       'Gender_Male', 'Working Professional or Student_Student',
       'Working Professional or Student_Working Professional',
       'Have you ever had suicidal thoughts ?_No',
       'Have you ever had suicidal thoughts ?_Yes',
       'Family History of Mental Illness_No',
       'Family History of Mental Illness_Yes'],
  

In [71]:
# OrdinalEncoder
cat_features_ord = ['City', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree']
ord_encoder = OrdinalEncoder(dtype=np.int32, handle_unknown='use_encoded_value', unknown_value=-1)
final_train_df[cat_features_ord] = ord_encoder.fit_transform(final_train_df[cat_features_ord])
print(final_train_df.columns)
print(final_train_df.shape)


Index(['Unnamed: 0', 'id', 'Name', 'Age', 'City', 'Profession',
       'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
       'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree',
       'Work/Study Hours', 'Financial Stress', 'Depression', 'Gender_Female',
       'Gender_Male', 'Working Professional or Student_Student',
       'Working Professional or Student_Working Professional',
       'Have you ever had suicidal thoughts ?_No',
       'Have you ever had suicidal thoughts ?_Yes',
       'Family History of Mental Illness_No',
       'Family History of Mental Illness_Yes'],
      dtype='object')
(112560, 25)


In [72]:
final_train_df = final_train_df.drop(['id', 'Name'], axis=1)
print(final_train_df.columns)
print(final_train_df.shape)

Index(['Unnamed: 0', 'Age', 'City', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree', 'Work/Study Hours',
       'Financial Stress', 'Depression', 'Gender_Female', 'Gender_Male',
       'Working Professional or Student_Student',
       'Working Professional or Student_Working Professional',
       'Have you ever had suicidal thoughts ?_No',
       'Have you ever had suicidal thoughts ?_Yes',
       'Family History of Mental Illness_No',
       'Family History of Mental Illness_Yes'],
      dtype='object')
(112560, 23)


In [73]:
final_train_df.head()

Unnamed: 0.1,Unnamed: 0,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,...,Financial Stress,Depression,Gender_Female,Gender_Male,Working Professional or Student_Student,Working Professional or Student_Working Professional,Have you ever had suicidal thoughts ?_No,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_No,Family History of Mental Illness_Yes
0,26473,-0.353416,3,7,-0.044897,1.588657,0.034426,0.017201,-1.56478,13,...,0.007101,1,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,138186,-0.434164,23,27,-0.044897,0.000127,0.034426,0.017201,-1.56478,13,...,0.714395,0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,140584,-1.483885,30,58,1.574336,0.000127,-1.740702,-1.636193,0.013397,26,...,1.421689,1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,95466,0.696304,29,46,-0.044897,-0.794138,0.034426,0.017201,-0.775692,17,...,0.007101,0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,99540,-0.030425,23,58,-0.044897,0.794392,0.034426,0.017201,1.591574,17,...,-1.407488,0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [74]:
# Move target column ('Depression') to be the 1st column
column_to_move = final_train_df.pop("Depression")
final_train_df.insert(0, "Depression", column_to_move)
final_train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112560 entries, 0 to 112559
Data columns (total 23 columns):
 #   Column                                                Non-Null Count   Dtype  
---  ------                                                --------------   -----  
 0   Depression                                            112560 non-null  int64  
 1   Unnamed: 0                                            112560 non-null  int64  
 2   Age                                                   112560 non-null  float64
 3   City                                                  112560 non-null  int32  
 4   Profession                                            112560 non-null  int32  
 5   Academic Pressure                                     112560 non-null  float64
 6   Work Pressure                                         112560 non-null  float64
 7   CGPA                                                  112560 non-null  float64
 8   Study Satisfaction                          

In [75]:
final_train_df.head(10)

Unnamed: 0.1,Depression,Unnamed: 0,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,...,Work/Study Hours,Financial Stress,Gender_Female,Gender_Male,Working Professional or Student_Student,Working Professional or Student_Working Professional,Have you ever had suicidal thoughts ?_No,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_No,Family History of Mental Illness_Yes
0,1,26473,-0.353416,3,7,-0.044897,1.588657,0.034426,0.017201,-1.56478,...,0.192731,0.007101,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0,138186,-0.434164,23,27,-0.044897,0.000127,0.034426,0.017201,-1.56478,...,0.711776,0.714395,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1,140584,-1.483885,30,58,1.574336,0.000127,-1.740702,-1.636193,0.013397,...,-0.066792,1.421689,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0,95466,0.696304,29,46,-0.044897,-0.794138,0.034426,0.017201,-0.775692,...,0.452253,0.007101,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0,99540,-0.030425,23,58,-0.044897,0.794392,0.034426,0.017201,1.591574,...,-0.066792,-1.407488,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
5,0,29267,0.777052,29,21,-0.044897,0.794392,0.034426,0.017201,-0.775692,...,0.971299,1.421689,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6,0,97874,1.342286,23,51,-0.044897,1.588657,0.034426,0.017201,1.591574,...,0.711776,1.421689,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
7,0,45563,0.454061,15,19,-0.044897,0.794392,0.034426,0.017201,0.802485,...,1.230821,0.714395,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
8,0,40646,0.615557,29,51,-0.044897,-0.794138,0.034426,0.017201,0.802485,...,-0.585837,0.007101,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
9,0,109604,0.454061,34,13,-0.044897,-1.588403,0.034426,0.017201,-0.775692,...,-1.364405,-0.700194,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


**Test dataset - Preprocessing**

In [76]:
missing_values_test = test_df.isnull().sum()
print(missing_values_test)

Unnamed: 0                                   0
id                                           0
Name                                         0
Gender                                       0
Age                                          0
City                                         0
Working Professional or Student              0
Profession                                7358
Academic Pressure                        22552
Work Pressure                             5594
CGPA                                     22553
Study Satisfaction                       22553
Job Satisfaction                          5592
Sleep Duration                               0
Dietary Habits                               0
Degree                                       1
Have you ever had suicidal thoughts ?        0
Work/Study Hours                             0
Financial Stress                             0
Family History of Mental Illness             0
Depression                                   0
dtype: int64


In [77]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28140 entries, 0 to 28139
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Unnamed: 0                             28140 non-null  int64  
 1   id                                     28140 non-null  int64  
 2   Name                                   28140 non-null  object 
 3   Gender                                 28140 non-null  object 
 4   Age                                    28140 non-null  float64
 5   City                                   28140 non-null  object 
 6   Working Professional or Student        28140 non-null  object 
 7   Profession                             20782 non-null  object 
 8   Academic Pressure                      5588 non-null   float64
 9   Work Pressure                          22546 non-null  float64
 10  CGPA                                   5587 non-null   float64
 11  St

In [78]:
test_df.head(10)

Unnamed: 0.1,Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,...,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,77278,77278,Rupal,Female,54.0,Chennai,Working Professional,Teacher,,3.0,...,,4.0,More than 8 hours,Unhealthy,M.Tech,No,1.0,4.0,No,0
1,43230,43230,Ansh,Male,51.0,Varanasi,Working Professional,Pharmacist,,3.0,...,,3.0,More than 8 hours,Moderate,MBBS,Yes,6.0,3.0,No,0
2,71236,71236,Pallavi,Female,38.0,Lucknow,Working Professional,Doctor,,3.0,...,,5.0,More than 8 hours,Moderate,B.Pharm,No,10.0,4.0,Yes,0
3,4780,4780,Dev,Male,27.0,Srinagar,Student,,2.0,,...,3.0,,7-8 hours,Unhealthy,BHM,Yes,6.0,5.0,No,1
4,39527,39527,Riya,Female,57.0,Hyderabad,Working Professional,Teacher,,5.0,...,,4.0,More than 8 hours,Moderate,B.Arch,Yes,3.0,2.0,Yes,0
5,3118,3118,Anvi,Female,48.0,Kolkata,Working Professional,Content Writer,,5.0,...,,3.0,More than 8 hours,Healthy,B.Ed,Yes,6.0,2.0,No,0
6,67864,67864,Vikram,Male,33.0,Vasai-Virar,Working Professional,Mechanical Engineer,,4.0,...,,3.0,7-8 hours,Moderate,PhD,No,11.0,1.0,Yes,1
7,5222,5222,Anjali,Female,54.0,Chennai,Working Professional,Content Writer,,2.0,...,,2.0,5-6 hours,Healthy,M.Ed,Yes,9.0,2.0,Yes,0
8,54788,54788,Lakshay,Male,31.0,Varanasi,Working Professional,Accountant,,3.0,...,,5.0,7-8 hours,Healthy,B.Com,No,11.0,2.0,Yes,0
9,1239,1239,Ansh,Male,55.0,Nagpur,Working Professional,Teacher,,5.0,...,,3.0,7-8 hours,Unhealthy,B.Arch,No,12.0,2.0,Yes,0


In [79]:

num_imputer = SimpleImputer(strategy="median")
#num_imputer = KNNImputer(n_neighbors=2, weights="uniform")
# Impute numerical fields
test_df[num_features] = num_imputer.fit_transform(test_df[num_features])

scaler = StandardScaler()
test_df[num_features] = scaler.fit_transform(test_df[num_features])



In [80]:
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
test_df[cat_features] = cat_imputer.fit_transform(test_df[cat_features])
print("Before encoding: ", test_df.shape)

Before encoding:  (28140, 21)


In [82]:
#OneHotEncoder
#encoder = OneHotEncoder( sparse_output=False, handle_unknown='ignore')
encoder = OneHotEncoder( sparse=False, handle_unknown='ignore')
one_hot_encoded = encoder.fit_transform(test_df[cat_features_oh])
print(one_hot_encoded.shape)
# one_hot_df = pd.DataFrame(one_hot_encoded, 
#                           columns=encoder.get_feature_names_out(cat_features_oh))
one_hot_df = pd.DataFrame(one_hot_encoded, 
                          columns=encoder.get_feature_names(cat_features_oh))
print(one_hot_df.columns)
final_test_df = pd.concat([test_df.drop(cat_features_oh, axis=1), one_hot_df], axis=1)
print(final_test_df.columns)
print(final_test_df.shape)

(28140, 8)
Index(['Gender_Female', 'Gender_Male',
       'Working Professional or Student_Student',
       'Working Professional or Student_Working Professional',
       'Have you ever had suicidal thoughts ?_No',
       'Have you ever had suicidal thoughts ?_Yes',
       'Family History of Mental Illness_No',
       'Family History of Mental Illness_Yes'],
      dtype='object')
Index(['Unnamed: 0', 'id', 'Name', 'Age', 'City', 'Profession',
       'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
       'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree',
       'Work/Study Hours', 'Financial Stress', 'Depression', 'Gender_Female',
       'Gender_Male', 'Working Professional or Student_Student',
       'Working Professional or Student_Working Professional',
       'Have you ever had suicidal thoughts ?_No',
       'Have you ever had suicidal thoughts ?_Yes',
       'Family History of Mental Illness_No',
       'Family History of Mental Illness_Yes'],
   

In [83]:
ord_encoder = OrdinalEncoder(dtype=np.int32, handle_unknown='use_encoded_value', unknown_value=-1)
final_test_df[cat_features_ord] = ord_encoder.fit_transform(final_test_df[cat_features_ord])
print(final_test_df.columns)
print(final_test_df.shape)

Index(['Unnamed: 0', 'id', 'Name', 'Age', 'City', 'Profession',
       'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
       'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree',
       'Work/Study Hours', 'Financial Stress', 'Depression', 'Gender_Female',
       'Gender_Male', 'Working Professional or Student_Student',
       'Working Professional or Student_Working Professional',
       'Have you ever had suicidal thoughts ?_No',
       'Have you ever had suicidal thoughts ?_Yes',
       'Family History of Mental Illness_No',
       'Family History of Mental Illness_Yes'],
      dtype='object')
(28140, 25)


In [84]:
# Move target column ('Depression') to be the 1st column
column_to_move = final_test_df.pop("Depression")
final_test_df.insert(0, "Depression", column_to_move)
final_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28140 entries, 0 to 28139
Data columns (total 25 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Depression                                            28140 non-null  int64  
 1   Unnamed: 0                                            28140 non-null  int64  
 2   id                                                    28140 non-null  int64  
 3   Name                                                  28140 non-null  object 
 4   Age                                                   28140 non-null  float64
 5   City                                                  28140 non-null  int32  
 6   Profession                                            28140 non-null  int32  
 7   Academic Pressure                                     28140 non-null  float64
 8   Work Pressure                                         28

In [85]:
final_test_df = final_test_df.drop(['id', 'Name'], axis=1)
print(final_test_df.columns)
print(final_test_df.shape)

Index(['Depression', 'Unnamed: 0', 'Age', 'City', 'Profession',
       'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction',
       'Job Satisfaction', 'Sleep Duration', 'Dietary Habits', 'Degree',
       'Work/Study Hours', 'Financial Stress', 'Gender_Female', 'Gender_Male',
       'Working Professional or Student_Student',
       'Working Professional or Student_Working Professional',
       'Have you ever had suicidal thoughts ?_No',
       'Have you ever had suicidal thoughts ?_Yes',
       'Family History of Mental Illness_No',
       'Family History of Mental Illness_Yes'],
      dtype='object')
(28140, 23)


In [86]:
final_train_df.to_csv('./data/final_train.csv', index=False, header=False)  
final_test_df.to_csv('./data/final_test.csv', index=False, header=False)  

In [87]:
final_train_df

Unnamed: 0.1,Depression,Unnamed: 0,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,...,Work/Study Hours,Financial Stress,Gender_Female,Gender_Male,Working Professional or Student_Student,Working Professional or Student_Working Professional,Have you ever had suicidal thoughts ?_No,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_No,Family History of Mental Illness_Yes
0,1,26473,-0.353416,3,7,-0.044897,1.588657,0.034426,0.017201,-1.564780,...,0.192731,0.007101,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0,138186,-0.434164,23,27,-0.044897,0.000127,0.034426,0.017201,-1.564780,...,0.711776,0.714395,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1,140584,-1.483885,30,58,1.574336,0.000127,-1.740702,-1.636193,0.013397,...,-0.066792,1.421689,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0,95466,0.696304,29,46,-0.044897,-0.794138,0.034426,0.017201,-0.775692,...,0.452253,0.007101,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0,99540,-0.030425,23,58,-0.044897,0.794392,0.034426,0.017201,1.591574,...,-0.066792,-1.407488,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112555,0,80349,1.019295,80,51,-0.044897,0.000127,0.034426,0.017201,-0.775692,...,-1.364405,0.007101,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
112556,0,32293,1.019295,75,51,-0.044897,1.588657,0.034426,0.017201,1.591574,...,-0.585837,-0.700194,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
112557,0,23436,-0.757155,69,58,-3.283363,0.000127,2.513485,0.017201,0.013397,...,0.452253,0.007101,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
112558,0,71016,0.454061,64,12,-0.044897,-0.794138,0.034426,0.017201,0.802485,...,-1.364405,-1.407488,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


In [88]:
final_train_df

Unnamed: 0.1,Depression,Unnamed: 0,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,...,Work/Study Hours,Financial Stress,Gender_Female,Gender_Male,Working Professional or Student_Student,Working Professional or Student_Working Professional,Have you ever had suicidal thoughts ?_No,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_No,Family History of Mental Illness_Yes
0,1,26473,-0.353416,3,7,-0.044897,1.588657,0.034426,0.017201,-1.564780,...,0.192731,0.007101,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0,138186,-0.434164,23,27,-0.044897,0.000127,0.034426,0.017201,-1.564780,...,0.711776,0.714395,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1,140584,-1.483885,30,58,1.574336,0.000127,-1.740702,-1.636193,0.013397,...,-0.066792,1.421689,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0,95466,0.696304,29,46,-0.044897,-0.794138,0.034426,0.017201,-0.775692,...,0.452253,0.007101,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0,99540,-0.030425,23,58,-0.044897,0.794392,0.034426,0.017201,1.591574,...,-0.066792,-1.407488,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112555,0,80349,1.019295,80,51,-0.044897,0.000127,0.034426,0.017201,-0.775692,...,-1.364405,0.007101,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
112556,0,32293,1.019295,75,51,-0.044897,1.588657,0.034426,0.017201,1.591574,...,-0.585837,-0.700194,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
112557,0,23436,-0.757155,69,58,-3.283363,0.000127,2.513485,0.017201,0.013397,...,0.452253,0.007101,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
112558,0,71016,0.454061,64,12,-0.044897,-0.794138,0.034426,0.017201,0.802485,...,-1.364405,-1.407488,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
