In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load YOUR specific dataset
df = pd.read_csv('student_depression_dataset.csv')

# Check the data
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())

print("\nColumns and Data Types:")
print(df.info())

Dataset Shape: (27901, 18)

First 5 rows:


Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0



Columns and Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 

In [3]:
# Phase 3: Data Inspection (Corrected for your 18-column dataset)

# 1. Drop the 'id' column (it is not needed)
df.drop('id', axis=1, inplace=True, errors='ignore')

# 2. Check the text inside the columns so we can clean them
# We need to see if it says "Yes/No" or "True/False"
cols_to_check = ['Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Financial Stress']

for col in cols_to_check:
    print(f"\n--- {col} ---")
    print(df[col].unique())


--- Sleep Duration ---
["'5-6 hours'" "'Less than 5 hours'" "'7-8 hours'" "'More than 8 hours'"
 'Others']

--- Dietary Habits ---
['Healthy' 'Moderate' 'Unhealthy' 'Others']

--- Degree ---
['B.Pharm' 'BSc' 'BA' 'BCA' 'M.Tech' 'PhD' "'Class 12'" 'B.Ed' 'LLB' 'BE'
 'M.Ed' 'MSc' 'BHM' 'M.Pharm' 'MCA' 'MA' 'B.Com' 'MD' 'MBA' 'MBBS' 'M.Com'
 'B.Arch' 'LLM' 'B.Tech' 'BBA' 'ME' 'MHM' 'Others']

--- Have you ever had suicidal thoughts ? ---
['Yes' 'No']

--- Financial Stress ---
['1.0' '2.0' '5.0' '3.0' '4.0' '?']


In [4]:
# Phase 3: Data Cleaning & Encoding (The "A+" Code)
from sklearn.preprocessing import LabelEncoder

# 1. Clean 'Financial Stress' (Fixing the hidden '?')
# We replace '?' with '3' (average stress) so the AI can read it as a number
df['Financial Stress'] = df['Financial Stress'].replace('?', '3')
df['Financial Stress'] = df['Financial Stress'].astype(float)

# 2. Map Text to Numbers (Manual Mapping)
# This helps the AI understand orders (e.g., Healthy is better than Unhealthy)
sleep_map = {'Less than 5 hours': 0, '5-6 hours': 1, '7-8 hours': 2, 'More than 8 hours': 3, 'Others': 1}
diet_map = {'Unhealthy': 0, 'Moderate': 1, 'Healthy': 2, 'Others': 1}
suicide_map = {'Yes': 1, 'No': 0}

df['Sleep Duration'] = df['Sleep Duration'].map(sleep_map)
df['Dietary Habits'] = df['Dietary Habits'].map(diet_map)
df['Have you ever had suicidal thoughts ?'] = df['Have you ever had suicidal thoughts ?'].map(suicide_map)

# 3. Convert remaining text (Degree, City, etc.) to Numbers automatically
le = LabelEncoder()
cols_to_encode = ['Degree', 'City', 'Profession', 'Gender']

for col in cols_to_encode:
    df[col] = le.fit_transform(df[col].astype(str))

# 4. Final Check
print("Data is cleaned! Here is the new format:")
display(df.head())
print(df.info())

Data is cleaned! Here is the new format:


Unnamed: 0,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,1,33.0,51,12,5.0,0.0,8.97,2.0,0.0,,2,4,1,3.0,1.0,No,1
1,0,24.0,5,12,2.0,0.0,5.9,5.0,0.0,,1,11,0,3.0,2.0,Yes,0
2,1,31.0,44,12,3.0,0.0,7.03,5.0,0.0,,2,6,0,9.0,1.0,Yes,0
3,0,28.0,49,12,3.0,0.0,5.59,2.0,0.0,,1,8,1,4.0,5.0,Yes,1
4,0,25.0,18,12,4.0,0.0,8.13,3.0,0.0,,1,17,1,1.0,1.0,No,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 17 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 27901 non-null  int64  
 1   Age                                    27901 non-null  float64
 2   City                                   27901 non-null  int64  
 3   Profession                             27901 non-null  int64  
 4   Academic Pressure                      27901 non-null  float64
 5   Work Pressure                          27901 non-null  float64
 6   CGPA                                   27901 non-null  float64
 7   Study Satisfaction                     27901 non-null  float64
 8   Job Satisfaction                       27901 non-null  float64
 9   Sleep Duration                         18 non-null     float64
 10  Dietary Habits                         27901 non-null  int64  
 11  De

In [None]:
# Exploratory Data Analysis (EDA) - Requirement 'b'
sns.set_style("whitegrid")

# 1. Summary Statistics (Understanding the numbers)
print("Summary Statistics:")
display(df.describe())

# 2. Univariate Analysis: Pie Chart for Depression
# Purpose: To see how balanced the dataset is (Unbalanced data is hard for ML)
plt.figure(figsize=(6, 6))
colors = ['#66b3ff','#ff9999'] # Blue for No, Red for Yes
df['Depression'].value_counts().plot.pie(autopct='%1.1f%%', colors=colors, startangle=90, labels=['No Depression', 'Depression'])
plt.title('Distribution of Target Variable (Depression)')
plt.ylabel('')
plt.show()

# 3. Bivariate Analysis: Violin Plot (Sleep vs. Academic Pressure)
# Purpose: To see if pressure affects students who sleep less
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='Sleep Duration', y='Academic Pressure', hue='Depression', split=True, palette='muted')
plt.title('Interaction: Sleep Duration vs. Academic Pressure on Depression')
plt.xlabel('Sleep Duration (0=Low, 3=High)')
plt.show()

# 4. Bivariate Analysis: Box Plot (CGPA vs Depression)
# Purpose: To see if there is a grade difference between depressed/non-depressed students
plt.figure(figsize=(8, 5))
sns.boxplot(data=df, x='Depression', y='CGPA', palette='pastel')
plt.title('CGPA Distribution by Depression Status')
plt.show()