In [1]:
# Ignoring the warnings
import warnings
warnings.filterwarnings('ignore')

# Importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Importing required AI/ML libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier as xgb
from sklearn.preprocessing import StandardScaler


In [2]:
# Location of the datasets.
# The folder is kept in one place (DATA_DIR) instead of being repeated inside each
# path. It can be overridden with the DEPRESSION_DATA_DIR environment variable so
# the notebook is not tied to a single machine; when the variable is not set the
# original Windows folder is used, which on Windows yields the same paths as before.
import os

DATA_DIR = os.environ.get('DEPRESSION_DATA_DIR', r'C:\Users\DELL')
train_data = os.path.join(DATA_DIR, 'train_depression.csv')
test_data = os.path.join(DATA_DIR, 'test_depression.csv')

In [3]:
# Read the training CSV into a DataFrame (the test file is not read in this cell)
df_train = pd.read_csv(filepath_or_buffer=train_data)

### EDA

In [4]:
# Percentage of missing values in each column
df_train.isna().sum().div(len(df_train)).mul(100)

id                                        0.000000
Name                                      0.000000
Gender                                    0.000000
Age                                       0.000000
City                                      0.000000
Working Professional or Student           0.000000
Profession                               26.034115
Academic Pressure                        80.172708
Work Pressure                            19.842217
CGPA                                     80.171997
Study Satisfaction                       80.172708
Job Satisfaction                         19.836532
Sleep Duration                            0.000000
Dietary Habits                            0.002843
Degree                                    0.001421
Have you ever had suicidal thoughts ?     0.000000
Work/Study Hours                          0.000000
Financial Stress                          0.002843
Family History of Mental Illness          0.000000
Depression                     

In [5]:
# For every column with at least one null, record a tuple of
# (column name, % missing rounded to 2 decimals, column dtype)
null_value_cols = [
    (column, round(100 * n_missing / len(df_train), 2), df_train[column].dtype)
    for column, n_missing in df_train.isna().sum().items()
    if n_missing > 0
]

In [6]:
# Display the (column, % missing, dtype) tuples collected for the columns that contain nulls
null_value_cols

[('Profession', 26.03, dtype('O')),
 ('Academic Pressure', 80.17, dtype('float64')),
 ('Work Pressure', 19.84, dtype('float64')),
 ('CGPA', 80.17, dtype('float64')),
 ('Study Satisfaction', 80.17, dtype('float64')),
 ('Job Satisfaction', 19.84, dtype('float64')),
 ('Dietary Habits', 0.0, dtype('O')),
 ('Degree', 0.0, dtype('O')),
 ('Financial Stress', 0.0, dtype('float64'))]

In [7]:
# Keep only the columns where less than 1% of the values are missing
[
    (column, dtype)
    for column, pct_missing, dtype in null_value_cols
    if pct_missing < 1
]

[('Dietary Habits', dtype('O')),
 ('Degree', dtype('O')),
 ('Financial Stress', dtype('float64'))]

In [8]:
# Inspect the distinct values of Financial Stress. The output shows only a few
# discrete levels (1-5) plus NaN, so the column is treated as a categorical variable
df_train.loc[:, 'Financial Stress'].unique()

array([ 2.,  3.,  1.,  4.,  5., nan])

In [9]:
# Cast Financial Stress to the object dtype so it is handled as a categorical feature
df_train['Financial Stress'] = df_train['Financial Stress'].astype('object')

In [10]:
# Impute the missing Financial Stress values with the most frequent level.
# In pandas 2.x, fillna() on an object-dtype column re-infers the dtype afterwards,
# so floats stored as object come back as float64 (pandas 2.2 emits a FutureWarning
# for this, but this notebook ignores all warnings). The explicit astype(object)
# keeps the column object-typed, i.e. categorical, whichever pandas version is used.
df_train['Financial Stress'] = (
    df_train['Financial Stress']
    .fillna(df_train['Financial Stress'].mode()[0])
    .astype(object)
)

In [11]:
# Re-check the percentage of missing values per column after imputing Financial Stress
df_train.isna().sum().div(len(df_train)).mul(100)

id                                        0.000000
Name                                      0.000000
Gender                                    0.000000
Age                                       0.000000
City                                      0.000000
Working Professional or Student           0.000000
Profession                               26.034115
Academic Pressure                        80.172708
Work Pressure                            19.842217
CGPA                                     80.171997
Study Satisfaction                       80.172708
Job Satisfaction                         19.836532
Sleep Duration                            0.000000
Dietary Habits                            0.002843
Degree                                    0.001421
Have you ever had suicidal thoughts ?     0.000000
Work/Study Hours                          0.000000
Financial Stress                          0.000000
Family History of Mental Illness          0.000000
Depression                     