In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read student_depression.csv (resolved relative to the working directory)
# into the DataFrame that every later cell operates on.
df = pd.read_csv(filepath_or_buffer="student_depression.csv")

0        33.0
1        24.0
2        31.0
3        28.0
4        25.0
         ... 
27896    27.0
27897    27.0
27898    31.0
27899    18.0
27900    27.0
Name: Age, Length: 27901, dtype: float64

In [4]:
# Partition the columns by dtype so later cells can loop over each group.
# select_dtypes is used instead of comparing each dtype against 'object':
# that comparison sends every non-object dtype (bool, or text stored with a
# dedicated string dtype) into num_col, where min/max and quantile
# computations make no sense.
# 'Profession' is deliberately left out of the categorical list.
cat_col = [
    name
    for name in df.select_dtypes(exclude='number').columns
    if name != 'Profession'
]
num_col = df.select_dtypes(include='number').columns.tolist()
print(cat_col)
print(num_col)

['Gender', 'City', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Financial Stress', 'Family History of Mental Illness']
['id', 'Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Depression']


HANDLING CATEGORICAL DATA

In [21]:
# Print the observed range of every numeric column as a quick sanity check.
for col in num_col:
    print("Column is: ", col)
    values = df[col]
    # Series.max()/min() are vectorised and skip NaN. The Python builtins
    # walk the Series element by element and return a result that depends on
    # where a NaN happens to sit, because every comparison with NaN is False.
    print(f'Max value in {col} is {values.max()},\nMin value is {values.min()}')
    print()

Column is:  id
Max value in id is 140699,
Min value is 2

Column is:  Age
Max value in Age is 59.0,
Min value is 18.0

Column is:  Academic Pressure
Max value in Academic Pressure is 5.0,
Min value is 0.0

Column is:  Work Pressure
Max value in Work Pressure is 5.0,
Min value is 0.0

Column is:  CGPA
Max value in CGPA is 10.0,
Min value is 0.0

Column is:  Study Satisfaction
Max value in Study Satisfaction is 5.0,
Min value is 0.0

Column is:  Job Satisfaction
Max value in Job Satisfaction is 4.0,
Min value is 0.0

Column is:  Work/Study Hours
Max value in Work/Study Hours is 12.0,
Min value is 0.0

Column is:  Depression
Max value in Depression is 1,
Min value is 0



In [None]:
# Draw one count plot per categorical column, limited to its most frequent
# values so that high-cardinality columns stay readable.
top_n = 10  # upper limit on the number of categories drawn per column

for col in cat_col:
    value_counts = df[col].value_counts().nlargest(top_n)
    filtered = df[df[col].isin(value_counts.index)]
    # A column can hold fewer than top_n distinct values; use the real number
    # in the title instead of always claiming top_n.
    shown = len(value_counts)

    plt.figure(figsize=(max(12, len(df[col].unique()) * 0.2), 6))
    # Bars follow the frequency ranking that selected them, most common first.
    sns.countplot(data=filtered, x=col, order=value_counts.index.tolist())
    plt.title(f'{col} - Top {shown} Categories')
    plt.xticks(rotation=45, ha='right')
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

In [None]:
# Number of rows per Gender value.
sns.countplot(data=df, x='Gender')

In [None]:
# Share of each Gender value as a pie chart, labelled with one-decimal percentages.
df['Gender'].value_counts().plot(kind='pie', autopct='%1.1f%%')

In [None]:
# Histogram of Age with a kernel density estimate drawn on top.
sns.histplot(data=df, x='Age', kde=True)

In [None]:
# Box plot of CGPA for each Gender value.
sns.boxplot(data=df, x='Gender', y='CGPA')

In [None]:
# Violin plot of Study Satisfaction for each Depression value.
sns.violinplot(data=df, x='Depression', y='Study Satisfaction')


In [None]:
# Stacked bars: one bar per Gender value, split by Depression value.
pd.crosstab(index=df['Gender'], columns=df['Depression']).plot.bar(stacked=True)

In [None]:
# Age histogram split by Depression value, drawn as step outlines with a
# KDE curve per group, on an explicitly created 10x5 inch Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=df,
    x='Age',
    hue='Depression',
    stat='count',
    element='step',
    kde=True,
    ax=ax,
)
ax.set_title('Age Distribution by Depression Status')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

In [None]:
import numpy as np

print(num_col)

# Maps column name -> percentage of rows that fall outside the IQR fences.
outlier_percentage = {}

# Flag outliers in every numeric column using fences at 1.5 * IQR beyond
# the first and third quartiles.
for col in num_col:
    col_data = df[col]
    # Series.quantile skips NaN. np.percentile returns NaN for both quartiles
    # as soon as one value is missing, which makes every comparison below
    # False and reports zero outliers without any error.
    q1 = col_data.quantile(0.25)
    q3 = col_data.quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    # Boolean mask for outliers (NaN rows compare False and are not flagged)
    outliers_mask = (col_data < lower) | (col_data > upper)
    outlier_count = outliers_mask.sum()
    # The percentage is relative to all rows of the column, missing ones included.
    outlier_percentage[col] = (outlier_count / len(col_data)) * 100

    print(f"Count of outliers in column '{col}': {outlier_count}")
    print(f"Percentage of outliers in column '{col}': {outlier_percentage[col]:.2f}%")
    print(f"Lower bound: {lower}")
    print(f"Upper bound: {upper}")
    print(f"Outlier data:\n{col_data[outliers_mask].values}")
    print("\n==============================================\n")
