# Depression among Indian students

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
# Load the raw dataset from a CSV file located next to the notebook.
df = pd.read_csv("student_depression_dataset.csv")

### Exploratory Data Analysis

In [4]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
df.info(verbose=True, show_counts=True)
# (rows, columns) of the raw frame
display(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

None

(27901, 18)

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Missing values per column (the author found none in this dataset).
df.isna().sum()

In [None]:
# Rows that are exact duplicates of an earlier row (the author found none).
df.loc[df.duplicated()]

In [None]:
# Replace the column labels with shorter names.
# The assignment is positional: the i-th name below is given to the i-th column.
short_column_names = [
    'id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',
    'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
    'Sleep Duration', 'Dietary Habits', 'Degree',
    'Suicidal Thoughts', 'Study Hours',
    'Financial Stress', 'Family History', 'Depression',
]
df.columns = short_column_names

In [None]:
# Share of each profession in the sample. Students make up 99.9% of the rows,
# so the rest of the analysis focuses on students only.
df["Profession"].value_counts(normalize=True)

In [None]:
# Student-only subset used for the rest of the analysis.
# .copy() makes df_s an independent frame, so the column changes applied to it
# later neither raise SettingWithCopyWarning nor depend on the state of df.
df_s = df.loc[df["Profession"] == "Student", :].copy()
# Show a preview rather than the whole frame.
df_s.head()

In [None]:
# Do we care about work pressure?
# Only very few students appear to be working and therefore report high work pressure,
# so this column is deleted in the next cell.
df_s["Work Pressure"].value_counts()

In [None]:
# Remove the near-constant "Work Pressure" column.
# Reassigning instead of using inplace=True avoids mutating a slice of df, and
# errors="ignore" lets the cell be re-run without a KeyError once the column is gone.
df_s = df_s.drop(columns=["Work Pressure"], errors="ignore")

#### Depressed students

In [None]:
# Fraction of student rows whose Depression value is 1.
depressed_share = df_s["Depression"].value_counts(normalize=True).loc[1]
f'Depressed students amount to {round(depressed_share,2)} of the total'

#### Gender

In [None]:
# Share of each gender among the students.
df_s["Gender"].value_counts(normalize=True)

In [None]:
# Number of student rows whose Depression value is 1.
depression_counts = df_s["Depression"].value_counts()
stud_depression = depression_counts.loc[1]

In [None]:
# Sum of the Depression column per gender, and the overall sum across genders.
depressed_students = df_s.groupby("Gender")["Depression"].sum()
total_depressed_students = depressed_students.sum()

In [None]:
# In our sample, 56% of all depressed students are men and 44% are women.
# depressed_students is indexed by gender label, so positions are read with .iloc
# (plain [0]/[1] on a non-integer index is deprecated positional indexing).
# Position 0 is assumed to be the female group and 1 the male group, as in the
# original messages (groupby sorts its keys) — confirm against the Gender labels.
# The wording states what is actually computed: each gender's share of all depressed students.
display(f'The share of depressed students who are female is {round(depressed_students.iloc[0]/stud_depression,2)}')
display(f'The share of depressed students who are male is {round(depressed_students.iloc[1]/stud_depression,2)}')

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# removed the old names, so pick whichever name this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

In [None]:
# Each gender's share of all depressed students, drawn as one bar per gender.
gender_share = depressed_students/total_depressed_students
ax = gender_share.plot(kind="bar", figsize=(8,6), fontsize=12, color=["pink","cyan"], lw=1)

# ax.patches holds one rectangle per bar; write each bar's value just above it.
for rect in ax.patches:
    bar_top = rect.get_height()
    x_center = rect.get_x() + rect.get_width() / 2  # horizontal middle of the bar
    # va='bottom' anchors the bottom of the text at bar_top, so the label sits on the bar
    ax.text(x_center, bar_top, f'{bar_top:.2f}', ha='center', va='bottom', fontsize=12)
plt.title("Percentage of male and female depressed students").set_size(15);

#### Age

In [None]:
# Number of students at each age, ordered by age.
df_s["Age"].value_counts().sort_index()

In [None]:
# Sum of Depression per age (rows) and gender (columns), restricted to ages 18-34.
# Age is a float column, so an integer slice with plain [] is ambiguous between
# label-based and positional selection across pandas versions; .loc makes the
# selection explicitly label-based (both ends inclusive).
age_depression_gender = (
    df_s.pivot_table(values="Depression", index="Age", columns="Gender", aggfunc="sum")
    .sort_index()
    .loc[18:34]
)

In [None]:
# Stacked bars: one bar per age, split by gender.
ax = age_depression_gender.plot.bar(
    stacked=True,
    color=["pink", "cyan"],
    figsize=(10, 7),
    fontsize=15,
)
ax.set_xlabel("Age", fontsize=15)
ax.set_title("Age and Depression, by gender", fontsize=20)
ax.legend(fontsize=15)
plt.show()

#### Academic Pressure

In [None]:
# Among the students who are depressed, how many report high academic pressure?
# Start with the number of students at each academic-pressure level.
df_s["Academic Pressure"].value_counts().sort_index()

In [None]:
# Cross-tabulate academic pressure (levels 0-5) against depression, with "All" margins.
# The categorical version is kept in a separate Series instead of overwriting
# df_s["Academic Pressure"]: replacing the numeric column would remove it from the
# numeric correlation matrix computed later in the notebook.
academic_pressure = pd.Series(
    pd.Categorical(df_s["Academic Pressure"], categories=[0,1,2,3,4,5]),
    index=df_s.index,
    name="Academic Pressure",
)
crosstab = pd.crosstab(academic_pressure, df_s["Depression"], margins=True)
crosstab

In [None]:
# Convert the counts into shares of the grand total (the "All"/"All" margin cell).
# NOTE(review): this overwrites `crosstab` with a value derived from itself, so the
# cell needs the "All" margins to still be present when it runs.
crosstab = crosstab/crosstab.loc["All","All"]
crosstab

In [None]:
# Keep only pressure levels 0-5 and depression classes 0/1; this drops the "All" margins.
crosstab = crosstab.loc[[0,1,2,3,4,5],[0,1]]

In [None]:
# Author's finding: depressed students with the highest academic pressure are 20% of all students.
ax = crosstab.plot(kind="bar", figsize=(8,6))

# Label every bar with its value, centred just above the bar.
for rect in ax.patches:
    bar_top = rect.get_height()
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, bar_top, f'{bar_top:.2f}', ha='center', va='bottom', fontsize=10)

plt.xticks(rotation=0);


#### Dietary Habits

In [None]:
# Share of each dietary habit among students, leaving out the "Others" category.
# "Others" is removed by label rather than by taking the first three rows, so the
# result does not depend on "Others" being the least frequent category.
# The shares are still computed over all students, "Others" included.
df_s["Dietary Habits"].value_counts(normalize=True).drop("Others", errors="ignore")

In [None]:
# Sum of Depression by gender (rows) and dietary habit (columns), restricted to the
# Healthy / Moderate / Unhealthy columns and divided by a fixed denominator.
# NOTE(review): 7289 is an unexplained hard-coded denominator — confirm which total it
# represents and derive it from the data instead.
pivot_table_diet=pd.pivot_table(df_s, values="Depression", columns="Dietary Habits" , index="Gender", aggfunc="sum", margins=True).loc[:,["Healthy", "Moderate","Unhealthy"]]/7289
# Keep the first two rows only (presumably the two genders; margins=True appends an "All" row).
pivot_table_diet=pivot_table_diet.iloc[0:2,:]
pivot_table_diet

In [None]:
# Author's finding: depressed male students eat less healthily than depressed female
# students; of all unhealthy depressed students, 60% are male.
ax = pivot_table_diet.plot.bar()
ax.set_title("Depressed students' dietary habits, by gender");

In [None]:
# Sanity check: sum of Depression per gender across all dietary habits, and the overall sum.
# Only the last expression of a cell is shown automatically, so both values are
# displayed explicitly; the pivot is also computed once instead of twice.
depressed_by_gender = pd.pivot_table(
    df_s, values="Depression", columns="Dietary Habits", index="Gender", aggfunc="sum"
).sum(axis=1)
display(depressed_by_gender)
display(depressed_by_gender.sum())

In [None]:
# Counts of suicidal-thoughts answers per dietary habit, computed on the full frame `df`,
# with the "Others" diet row removed.
diet_vs_suicidal = pd.crosstab(df["Dietary Habits"], df["Suicidal Thoughts"])
diet_vs_suicidal = diet_vs_suicidal.drop(["Others"])
diet_vs_suicidal.plot(kind="bar");
plt.ylabel("Number of people")
plt.title("Suicidal Thoughts and Dietary Habits")
plt.tight_layout()
plt.show()

#### Study Satisfaction

In [None]:
# Sum of Depression at each study-satisfaction level.
df_s.groupby(["Study Satisfaction"])["Depression"].sum()

In [None]:
# Sum of Depression by study satisfaction (rows) and gender (columns), with "All" margins.
depressed_by_satisfaction = pd.pivot_table(
    df_s, values="Depression", columns="Gender", index="Study Satisfaction",
    aggfunc="sum", margins=True,
)
# Express each cell as a share of the grand total. The "All"/"All" margin cell holds
# that total, which replaces the previously hard-coded 16308 and stays correct if
# the data or the filtering changes.
pivot_table_ss = depressed_by_satisfaction / depressed_by_satisfaction.loc["All", "All"]
# Keep rows at positions 1-5 (skipping the first row and the "All" margin) and the
# first two columns (dropping the "All" column).
pivot_table_ss = pivot_table_ss.iloc[[1,2,3,4,5],[0,1]]
pivot_table_ss

In [None]:
# Grouped bars: study-satisfaction level on the x axis, one bar per gender.
ax = pivot_table_ss.plot(kind="bar", color=["pink","cyan"], figsize=(8,6));
plt.xticks(rotation=0);
plt.title("Study satisfaction in depressed female and male students");

# Write each bar's value just above it.
for rect in ax.patches:
    bar_top = rect.get_height()
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, bar_top, f'{bar_top:.2f}', ha='center', va='bottom', fontsize=10)

In [None]:
# Students whose Depression value is 0.
df_0=df_s[df_s["Depression"]==0]

In [None]:
# Counts of non-depressed students by study satisfaction (rows) and gender (columns).
non_depressed_counts = pd.crosstab(df_0["Study Satisfaction"], df_0["Gender"], margins=True)
# Shares of the grand total, taken from the "All"/"All" margin cell instead of the
# previously hard-coded 11562.
non_depressed_share = non_depressed_counts / non_depressed_counts.loc["All", "All"]
# Same row/column selection as for the depressed students: rows at positions 1-5, first two columns.
ax = non_depressed_share.iloc[[1,2,3,4,5],[0,1]].plot(kind="bar", color=["pink","cyan"], figsize=(8,6));
plt.xticks(rotation=0);
# A title so the figure can be told apart from the depressed-students chart.
plt.title("Study satisfaction in non-depressed female and male students");

# Write each bar's value just above it.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f}', ha='center', va='bottom', fontsize=10)

In [None]:
# Rebuild the full study-satisfaction pivot (all rows and the "All" margins) as shares
# of the grand total. The total is read from the "All"/"All" margin cell rather than
# hard-coded as 16308.
# NOTE(review): this overwrites the trimmed pivot_table_ss from the earlier cell and is
# not used afterwards in the visible part of the notebook.
pivot_table_ss = pd.pivot_table(df_s, values="Depression", columns="Gender", index="Study Satisfaction", aggfunc="sum", margins=True)
pivot_table_ss = pivot_table_ss / pivot_table_ss.loc["All", "All"]


#### City and Depression

In [None]:
# Sum of Depression per city, largest first.
df_s.groupby(["City"])["Depression"].sum().sort_values(ascending=False)

#### Sleep Duration

In [None]:
# Strip the single quotes that surround the Sleep Duration labels.
# The vectorised .str accessor replaces the per-row lambda.
# This updates `df` only; df_s was derived from df in an earlier cell.
df["Sleep Duration"] = df["Sleep Duration"].str.strip("'")
# Show a preview rather than the whole frame.
df.head()

In [None]:
# Count of each Depression value per sleep-duration label (one column per Depression
# value, missing combinations filled with 0), with the "Others" row removed.
group = (
    df.groupby(["Sleep Duration"])["Depression"]
    .value_counts()
    .unstack(fill_value=0)
    .drop(["Others"])
)
group

In [None]:
# Put the rows in order of increasing sleep duration.
sleep_order = ["Less than 5 hours", "5-6 hours", "7-8 hours", "More than 8 hours"]
group = group.loc[sleep_order, :]
group

In [None]:
# Grouped bars: sleep duration on the x axis, one bar per Depression value.
ax = group.plot(kind="bar", figsize=(10,6), colormap="viridis")
ax.set_xlabel("Sleep Duration")
ax.set_ylabel("Number of People")
ax.set_title("Depression Levels by Sleep Duration")
ax.legend(title="Depression")
plt.tight_layout()
plt.show()

In [None]:
# Column totals of the sleep table: number of people per Depression value.
group.sum(axis=0)

In [None]:
# Cross-check: Depression counts over all rows whose sleep duration is not "Others".
df.loc[df["Sleep Duration"].ne("Others"), "Depression"].value_counts()

In [None]:
# Degree vs. depression.
# normalize=True gives each cell as a share of the whole sample (plotted below).
# The figure is drawn and shown directly: wrapping the plot call in display()
# only printed the Axes object's repr.
degree_share_of_sample = pd.crosstab(df["Degree"], df["Depression"], normalize=True)
degree_share_of_sample.plot(kind="bar")
plt.ylabel("Share of all people in the sample")
plt.title("Degree and Depression")
plt.tight_layout()
plt.show()
# normalize="index" gives, within each degree, the share of each Depression value.
display(pd.crosstab(df["Degree"], df["Depression"], normalize="index"))
# Author's reading: almost 16% of the sample are people with depression who only have a
# Class 12 diploma (first table); of Class 12 graduates in our sample, 70% have depression
# (second table).

In [None]:
# Rows whose Degree is 'Class 12' (the label includes the single quotes).
is_class12 = df["Degree"]=="'Class 12'"
df_class12 = df.loc[is_class12,:]
# Counts by gender (rows) and Depression value (columns); note this rebinds `crosstab`.
crosstab = pd.crosstab(df_class12["Gender"], df_class12["Depression"])
ax = crosstab.plot(kind="bar", figsize=(8,6))

# Write each bar's count (as an integer) just above it.
for rect in ax.patches:
    bar_top = rect.get_height()
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, bar_top, f'{int(bar_top)}', ha='center', va='bottom',fontsize=10)

plt.ylabel("Count")
plt.title("Depression by Gender")
plt.legend(title="Depression")
plt.tight_layout()
plt.show()

#### Study Hours

In [None]:
# Number of students per study-hours value (rows) and Depression value (columns).
crosstab_study_depression = pd.crosstab(
    index=df_s["Study Hours"],
    columns=df_s["Depression"],
)
crosstab_study_depression

In [None]:
# Author's finding: depressed students tend to study longer hours (6+).
crosstab_study_depression.plot.bar(stacked=True)

In [None]:
# Author's finding: depressed students tend to study longer hours; most depressed
# students study 10, 11 or 12 hours.
# Sum of Depression per study-hours value (rows) and gender (columns).
pivot_depression_studyhours=df_s.pivot_table(values="Depression", index="Study Hours", columns="Gender", aggfunc="sum")
# Rows 10, 11 and 12 hours combined (so "more than 10 hours" below includes exactly 10).
more_than_10_hours = pivot_depression_studyhours.loc[[ 10.0, 11.0, 12.0],:].sum(axis=0)
# Share of each gender's depressed students who fall in those rows.
male_and_females_studying_more_than_10_hours=more_than_10_hours/pivot_depression_studyhours.sum(axis=0)
# The Series is indexed by gender label, so positions are read with .iloc
# (plain [0]/[1] on a non-integer index is deprecated positional indexing).
# Position 0 is assumed to be the female column and 1 the male column, as in the
# original messages — confirm against the Gender labels.
print(f'Depressed female students studying more than 10 hours are {round(male_and_females_studying_more_than_10_hours.iloc[0],2)} of the total')
print(f'Depressed male students studying more than 10 hours are {round(male_and_females_studying_more_than_10_hours.iloc[1],2)} of the total')

In [None]:
# Grouped bars: study hours on the x axis, one bar per gender.
pivot_depression_studyhours.plot.bar();

#### Suicidal Thoughts

In [None]:
# Number of students per suicidal-thoughts answer, split by Depression value.
pd.crosstab(df_s["Suicidal Thoughts"], df_s["Depression"]).plot(kind="bar")


In [None]:
# Author's reading: students who study longer hours are more likely to have suicidal thoughts.
# Number of students per study-hours value, split by suicidal-thoughts answer.
pd.crosstab(df_s["Study Hours"], df_s["Suicidal Thoughts"]).plot(kind="bar")

In [None]:
# Do students pursuing certain degrees tend to have more suicidal thoughts and to be depressed more often? (Could be checked with a pivot table.)

In [None]:
# do people that have a family history of depression have a higher tendency to be depressed themselves?

In [None]:
# Number of students per family-history answer (rows) and Depression value (columns).
# Author's reading: there doesn't seem to be a strong relation here.
pd.crosstab(df_s["Family History"], df_s["Depression"])

## Machine Learning Models

#### Questions

#### Features I look at:

In [None]:
#is a student with these features likely to experience depression?

In [None]:
# Which factors are most strongly associated with depression?
# I can fit a logistic regression model and look at its coefficients, or fit a random forest and look at its feature importances,
# to understand which feature is the most important.

In [None]:
# Policy outcome and proposal: which students need the most help, and how can we help them?

### Classification

### Logistic Regression

### Random Forest model

### Regression

In [None]:
# Pairwise correlations between the numeric columns of the student frame.
# numeric_only=True is required because df_s also holds text columns (e.g. Gender,
# City): from pandas 2.0 on, corr() no longer drops them silently and raises instead.
df_s.corr(numeric_only=True)

In [None]:
# Correlations worth noting (only academic pressure vs. depression is moderate; the others are weak):
# age and depression (negative): -0.2
# academic pressure and depression (positive): +0.47
# study satisfaction and depression (negative): -0.168
# academic pressure and study satisfaction (negative): -0.11
# study hours and depression (positive): +0.2


In [None]:
# Heat map of the correlations between the numeric columns of the student frame.
# numeric_only=True keeps corr() from failing on the text columns (pandas >= 2.0).
# linewidths is set so the requested black cell borders (linecolor) are actually drawn.
sns.heatmap(df_s.corr(numeric_only=True), vmin=-1, vmax=+1, annot=True, linewidths=0.5, linecolor="black", cbar=True);

### Dimensionality Reduction