In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides pandas assignment warnings raised by later cells — consider narrowing it.
warnings.filterwarnings('ignore')

In [3]:
# Load the training split; the path is relative to the current working directory.
df_train = pd.read_csv('train.csv')

In [4]:
# Load the test split; the path is relative to the current working directory.
df_test = pd.read_csv('test.csv')

In [5]:
# Inspect column dtypes and non-null counts of the raw training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

In [6]:
# Inspect column dtypes and non-null counts of the raw test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93800 entries, 0 to 93799
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     93800 non-null  int64  
 1   Name                                   93800 non-null  object 
 2   Gender                                 93800 non-null  object 
 3   Age                                    93800 non-null  float64
 4   City                                   93800 non-null  object 
 5   Working Professional or Student        93800 non-null  object 
 6   Profession                             69168 non-null  object 
 7   Academic Pressure                      18767 non-null  float64
 8   Work Pressure                          75022 non-null  float64
 9   CGPA                                   18766 non-null  float64
 10  Study Satisfaction                     18767 non-null  float64
 11  Jo

In [7]:
# Count missing values per column in the training frame.
df_train.isna().sum()

id                                            0
Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

In [8]:
# Relative frequency of each observed 'Profession' value in the training frame;
# used below as the sampling distribution for filling missing professions.
probabilities = df_train['Profession'].value_counts(normalize=True)


def impute_with_probabilities(series, value_probs, rng=None):
    """Fill the missing entries of ``series`` by sampling from ``value_probs``.

    Parameters
    ----------
    series : pd.Series
        Column that may contain missing values. It is left untouched; the
        result is returned as a new Series.
    value_probs : pd.Series
        Candidate fill values as the index and their sampling probabilities as
        the values (e.g. the result of ``value_counts(normalize=True)``).
    rng : numpy.random.Generator, optional
        Source of randomness. Pass a seeded generator for reproducible fills;
        when omitted, NumPy's global random state is used.

    Returns
    -------
    pd.Series
        Copy of ``series`` in which every missing entry has been replaced by
        an independently sampled value.

    Raises
    ------
    ValueError
        If ``series`` has missing entries but ``value_probs`` is empty.
    """
    missing = series.isna()
    na_count = int(missing.sum())

    # Work on a copy: writing through ``.loc`` on the caller's Series would
    # silently modify an object the caller still holds (e.g. a DataFrame column).
    filled = series.copy()
    if na_count == 0:
        return filled

    if len(value_probs) == 0:
        raise ValueError(
            "cannot impute missing values: value_probs contains no candidate values"
        )

    choose = np.random.choice if rng is None else rng.choice
    sampled = choose(value_probs.index, size=na_count, p=value_probs.values)
    filled.loc[missing] = sampled
    return filled

# Replace missing training professions with values sampled from the observed
# training distribution computed above.
df_train['Profession'] = impute_with_probabilities(df_train['Profession'], probabilities)

In [9]:
# Same imputation for the test frame, but sampling from the test frame's own
# observed 'Profession' distribution (this rebinds `probabilities`).
probabilities = df_test['Profession'].value_counts(normalize=True)
df_test['Profession'] = impute_with_probabilities(df_test['Profession'], probabilities)

In [10]:
# 'Academic Pressure' is missing for most training rows (see the null counts
# above), so the column is removed from both frames.
df_train = df_train.drop(columns=['Academic Pressure'])
df_test = df_test.drop(columns=['Academic Pressure'])

In [11]:
# 'CGPA' is missing for most training rows (see the null counts above), so the
# column is removed from both frames.
df_train = df_train.drop(columns=['CGPA'])
df_test = df_test.drop(columns=['CGPA'])

In [12]:
# 'Study Satisfaction' is missing for most training rows (see the null counts
# above), so the column is removed from both frames.
df_train = df_train.drop(columns=['Study Satisfaction'])
df_test = df_test.drop(columns=['Study Satisfaction'])

In [13]:
# Re-check the training frame after imputing 'Profession' and dropping columns.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 17 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             140700 non-null  object 
 7   Work Pressure                          112782 non-null  float64
 8   Job Satisfaction                       112790 non-null  float64
 9   Sleep Duration                         140700 non-null  object 
 10  Dietary Habits                         140696 non-null  

In [14]:
# Remaining missing values per column in the training frame.
df_train.isna().sum()

id                                           0
Name                                         0
Gender                                       0
Age                                          0
City                                         0
Working Professional or Student              0
Profession                                   0
Work Pressure                            27918
Job Satisfaction                         27910
Sleep Duration                               0
Dietary Habits                               4
Degree                                       2
Have you ever had suicidal thoughts ?        0
Work/Study Hours                             0
Financial Stress                             4
Family History of Mental Illness             0
Depression                                   0
dtype: int64

In [15]:
# Median-impute 'Work Pressure'; each frame is filled with its own median.
for frame in (df_train, df_test):
    column = frame['Work Pressure']
    frame['Work Pressure'] = column.fillna(column.median())

In [16]:
# Median-impute 'Job Satisfaction'; each frame is filled with its own median.
for frame in (df_train, df_test):
    column = frame['Job Satisfaction']
    frame['Job Satisfaction'] = column.fillna(column.median())

In [17]:
# Frequency of each 'Degree' value in the training frame.
df_train['Degree'].value_counts()

Degree
Class 12    14729
B.Ed        11691
B.Arch       8742
B.Com        8113
B.Pharm      5856
            ...  
LCA             1
B B.Com         1
RCA             1
Mihir           1
Advait          1
Name: count, Length: 115, dtype: int64

In [18]:
# Fill every missing 'Dietary Habits' entry with its own random draw. Calling
# np.random.choice without `size` yields a single value, so passing it straight
# to fillna would reuse one category for all gaps.
diet_missing_train = df_train['Dietary Habits'].isna()
if diet_missing_train.any():
    df_train.loc[diet_missing_train, 'Dietary Habits'] = np.random.choice(
        a=['Moderate', 'Unhealthy', 'Healthy'],
        p=[0.35, 0.35, 0.3],
        size=int(diet_missing_train.sum()),
    )

In [19]:
# Fill every missing 'Dietary Habits' entry with its own random draw. Calling
# np.random.choice without `size` yields a single value, so passing it straight
# to fillna would reuse one category for all gaps.
diet_missing_test = df_test['Dietary Habits'].isna()
if diet_missing_test.any():
    df_test.loc[diet_missing_test, 'Dietary Habits'] = np.random.choice(
        a=['Moderate', 'Unhealthy', 'Healthy'],
        p=[0.35, 0.35, 0.3],
        size=int(diet_missing_test.sum()),
    )

In [20]:
# Missing degrees become 'Class 12', the most frequent value in the training
# value_counts shown above.
for frame in (df_train, df_test):
    frame['Degree'] = frame['Degree'].fillna('Class 12')

In [21]:
# Median-impute 'Financial Stress'; each frame is filled with its own median.
for frame in (df_train, df_test):
    column = frame['Financial Stress']
    frame['Financial Stress'] = column.fillna(column.median())

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
# Fitted encoder per categorical column, keyed by column name.
label_encoders = {}

# Pair train and test columns by *name*. Zipping the two column lists pairs
# them by position and stops at the shorter list, so any difference in column
# order would silently encode the wrong test column.
categorical_cols = [col for col in df_train.columns if df_train[col].dtype == 'object']

for col in categorical_cols:
    encoder = LabelEncoder()

    # Fit on the training values only, then replace them with integer codes.
    df_train[col] = encoder.fit_transform(df_train[col].astype(str))

    # Register the placeholder for categories that never occurred in training.
    known = set(encoder.classes_)
    if 'Unknown' not in known:
        encoder.classes_ = np.append(encoder.classes_, 'Unknown')

    if col in df_test.columns:
        test_values = df_test[col].astype(str)
        # Collapse unseen test categories into the single 'Unknown' class.
        test_values = test_values.where(test_values.isin(known), 'Unknown')
        df_test[col] = encoder.transform(test_values)

    label_encoders[col] = encoder

In [24]:
# Confirm the training frame's dtypes and null counts after encoding.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 17 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  int64  
 2   Gender                                 140700 non-null  int64  
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  int64  
 5   Working Professional or Student        140700 non-null  int64  
 6   Profession                             140700 non-null  int64  
 7   Work Pressure                          140700 non-null  float64
 8   Job Satisfaction                       140700 non-null  float64
 9   Sleep Duration                         140700 non-null  int64  
 10  Dietary Habits                         140700 non-null  

In [25]:
# Confirm the test frame's dtypes and null counts after encoding.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93800 entries, 0 to 93799
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     93800 non-null  int64  
 1   Name                                   93800 non-null  int64  
 2   Gender                                 93800 non-null  int64  
 3   Age                                    93800 non-null  float64
 4   City                                   93800 non-null  int64  
 5   Working Professional or Student        93800 non-null  int64  
 6   Profession                             93800 non-null  int64  
 7   Work Pressure                          93800 non-null  float64
 8   Job Satisfaction                       93800 non-null  float64
 9   Sleep Duration                         93800 non-null  int64  
 10  Dietary Habits                         93800 non-null  int64  
 11  De

In [26]:
# Split features from the 'Depression' target. 'id' is presumably just a row
# identifier (TODO confirm), so it is kept out of the feature matrix rather than
# letting the model split on it.
X_train = df_train.drop(columns=['id', 'Depression'])
y_train = df_train['Depression']

# Select the test features by the training column names so both matrices have
# the same columns in the same order.
X_test = df_test[X_train.columns].copy()

In [28]:
from sklearn.tree import DecisionTreeClassifier

In [29]:
# Shallow decision tree (depth capped at 3) with a fixed random_state.
model = DecisionTreeClassifier(max_depth=3, random_state=42)

In [30]:
# Fit the tree on the training features and the 'Depression' target.
model.fit(X_train,y_train)

In [31]:
# Predicted class probabilities for the test rows.
# NOTE(review): predict_proba returns one column per class, not a 1-D vector.
y_pred_proba = model.predict_proba(X_test)

In [32]:
# Load the submission template; its 'Depression' column is overwritten below.
sub = pd.read_csv('sample_submission.csv')

In [33]:
# predict_proba yields an (n_samples, n_classes) array, which cannot be stored
# in a single DataFrame column. Keep column 1: the probability of the second
# entry of model.classes_, i.e. label 1 assuming 'Depression' is a binary 0/1
# target (TODO confirm). If the submission expects hard labels instead, use
# model.predict(X_test).
sub['Depression'] = y_pred_proba[:, 1]

In [34]:
# Write the submission file without the DataFrame index.
sub.to_csv('sub.csv', index = False, index_label = False)