### Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, root_mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


### Load data 

In [4]:
# Location of the training data, relative to the notebook's working directory.
TRAIN_CSV_PATH = 'train.csv'

# Load the training set into the frame used by the cells below.
df = pd.read_csv(TRAIN_CSV_PATH)

In [5]:
# Preview the first five rows (the default row count, spelled out).
df.head(n=5)

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


### Data Exploration

In [6]:
# Report the frame dimensions: the raw (rows, columns) tuple first, then a
# readable sentence, each on its own line.
print(
    df.shape,
    f'Data set has {df.shape[0]} rows and {df.shape[1]} columns',
    sep='\n',
)

(140700, 20)
Data set has 140700 rows and 20 columns


In [7]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

### Checking for null values

In [8]:
# Number of missing values in each column.
df.isna().sum()

id                                            0
Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

In [12]:
# Rows labelled 'Student' whose 'Academic Pressure' value is missing.
df.loc[
    df['Working Professional or Student'].eq('Student')
    & df['Academic Pressure'].isna()
]

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
17549,17549,Tanya,Female,20.0,Patna,Student,,,,5.55,,,5-6 hours,Moderate,Class 12,No,0.0,3.0,Yes,0
21880,21880,Aarush,Male,38.0,Chennai,Student,,,5.0,,,4.0,5-6 hours,Healthy,Class 12,No,2.0,3.0,No,0
70453,70453,Veda,Female,20.0,Ahmedabad,Student,,,,,,2.0,Less than 5 hours,Moderate,Class 12,Yes,12.0,3.0,Yes,1
75007,75007,Aarav,Male,21.0,Lucknow,Student,,,2.0,,,1.0,7-8 hours,Moderate,Class 12,Yes,3.0,3.0,Yes,0
105773,105773,Anand,Male,18.0,Ahmedabad,Student,,,,,,1.0,Less than 5 hours,Moderate,Class 12,Yes,9.0,5.0,No,1
122983,122983,Saanvi,Female,30.0,Ghaziabad,Student,,,,5.47,2.0,,Less than 5 hours,Unhealthy,B.Com,Yes,5.0,1.0,No,0
129756,129756,Kian,Male,18.0,Rajkot,Student,,,5.0,,,4.0,7-8 hours,Moderate,Class 12,Yes,9.0,4.0,No,1
134830,134830,Aaradhya,Female,24.0,Meerut,Student,,,,,,2.0,More than 8 hours,Unhealthy,Class 12,No,0.0,5.0,No,0
137013,137013,Vikram,Male,36.0,Varanasi,Student,,,,8.54,3.0,,More than 8 hours,Moderate,Class 12,Yes,8.0,5.0,Yes,1


In [18]:
# Columns skipped when printing value counts.
excluded_cols = {'id', 'Name', 'Age', 'City', 'CGPA', 'Work/Study Hours'}

# Print the value counts of every remaining column, one blank line between
# them. Iterating df.columns directly yields the same labels in the same order
# as df.describe(include='all').columns, without computing summary statistics
# for the whole frame just to read its column index.
for col in df.columns:
    if col in excluded_cols:
        continue
    print(df[col].value_counts())
    print()

Gender
Male      77464
Female    63236
Name: count, dtype: int64

Working Professional or Student
Working Professional    112799
Student                  27901
Name: count, dtype: int64

Profession
Teacher           24906
Content Writer     7814
Architect          4370
Consultant         4229
HR Manager         4022
                  ...  
BBA                   1
City Manager          1
FamilyVirar           1
B.Com                 1
Yuvraj                1
Name: count, Length: 64, dtype: int64

Academic Pressure
3.0    7463
5.0    6296
4.0    5158
1.0    4801
2.0    4179
Name: count, dtype: int64

Work Pressure
2.0    24373
4.0    22512
5.0    22436
3.0    21899
1.0    21562
Name: count, dtype: int64

Study Satisfaction
4.0    6360
2.0    5840
3.0    5823
1.0    5451
5.0    4423
Name: count, dtype: int64

Job Satisfaction
2.0    24783
5.0    22812
1.0    22324
3.0    21951
4.0    20920
Name: count, dtype: int64

Sleep Duration
Less than 5 hours    38784
7-8 hours            36969
More