In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset from the CSV that sits next to the notebook.
csv_path = "student_depression.csv"
df = pd.read_csv(csv_path)

In [None]:
# Split the columns into categorical (non-numeric) and numeric groups.
#
# The split is made with `is_numeric_dtype` instead of comparing against the
# literal 'object' dtype: when pandas stores text in its dedicated string dtype
# (the default in pandas 3), text columns are no longer 'object', so an
# `== 'object'` test would leave `cat_col` empty and push text columns into
# `num_col`. Integer, float and boolean columns are classified as numeric,
# exactly as before.
#
# 'Profession' is deliberately kept out of the categorical list (it is skipped
# by the per-category plots that iterate over `cat_col`).
numeric_flags = {col: pd.api.types.is_numeric_dtype(df[col]) for col in df.columns}

cat_col = [col for col in df.columns if not numeric_flags[col] and col != 'Profession']
num_col = [col for col in df.columns if numeric_flags[col]]
print(cat_col)
print(num_col)

In [None]:
# Number of most frequent categories drawn for each categorical column.
# Defined once, outside the loop, because it does not depend on the column.
top_n = 10

# One count plot per categorical column, restricted to its most common values.
for col in cat_col:
    # `value_counts()` ignores missing values and sorts by frequency, so the
    # index of `value_counts` is the list of kept categories, most common first.
    value_counts = df[col].value_counts().nlargest(top_n)
    filtered = df[df[col].isin(value_counts.index)]

    # Size the figure from the categories that are actually drawn; using the
    # total number of unique values would stretch the canvas for
    # high-cardinality columns even though at most `top_n` bars are shown.
    plt.figure(figsize=(max(12, len(value_counts) * 0.2), 6))

    # Pass an explicit order so the bars appear ranked by frequency, matching
    # the "Top N" title, rather than in order of first appearance in the data.
    sns.countplot(data=filtered, x=col, order=value_counts.index.tolist())
    plt.title(f'{col} - Top {top_n} Categories')
    plt.xticks(rotation=45, ha='right')
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

In [None]:
# Bar chart of the number of rows per 'Gender' value.
sns.countplot(data=df, x='Gender')

In [None]:
# Pie chart of the 'Gender' value counts, each slice labelled with its
# percentage to one decimal place.
gender_counts = df['Gender'].value_counts()
gender_counts.plot.pie(autopct='%1.1f%%')

In [None]:
# Histogram of the 'Age' column with a kernel density estimate overlaid.
sns.histplot(data=df['Age'], kde=True)

In [None]:
# Box plot of 'CGPA', one box per 'Gender' value.
sns.boxplot(data=df, x='Gender', y='CGPA')

In [None]:
# Violin plot of 'Study Satisfaction', one violin per 'Depression' value.
sns.violinplot(data=df, x='Depression', y='Study Satisfaction')


In [None]:
# Contingency table of 'Gender' (rows) against 'Depression' (columns), drawn
# as one stacked bar per gender.
gender_by_depression = pd.crosstab(df['Gender'], df['Depression'])
gender_by_depression.plot(stacked=True, kind='bar')

In [None]:
# Step-style count histograms of 'Age' with KDE curves, coloured by the
# 'Depression' value so the groups can be compared on one set of axes.
plt.figure(figsize=(10, 5))
ax = sns.histplot(
    data=df,
    x='Age',
    hue='Depression',
    stat='count',
    element='step',
    kde=True,
)
ax.set_title('Age Distribution by Depression Status')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

In [17]:
import numpy as np

print(num_col)

# Maps each numeric column name to the percentage of its non-missing values
# that fall outside the IQR fences computed below.
outlier_percentage = {}

# Report outliers for every numeric column using the 1.5 * IQR rule:
# a value is an outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
# NOTE(review): `num_col` is used as-is, so identifier-like or binary columns
# are analysed too; their "outliers" are probably not meaningful - confirm.
for col in num_col:
    # Drop missing values before computing quartiles. `np.percentile`
    # propagates NaN, so a single missing value would turn both fences into
    # NaN and the column would silently report zero outliers.
    col_data = df[col].dropna()

    if col_data.empty:
        # Nothing to analyse; record an undefined percentage instead of
        # dividing by zero below.
        outlier_percentage[col] = float("nan")
        print(f"Column '{col}' has no non-missing values; skipping outlier check.")
        print("\n==============================================\n")
        continue

    q1, q3 = np.percentile(col_data, [25, 75])
    iqr = q3 - q1
    # When IQR is 0 (more than half the values are identical) both fences
    # collapse onto that value and every other value is flagged.
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    # Boolean mask for outliers
    outliers_mask = (col_data < lower) | (col_data > upper)
    outlier_count = outliers_mask.sum()
    # Percentage is relative to the non-missing observations of the column.
    outlier_percentage[col] = (outlier_count / len(col_data)) * 100

    print(f"Count of outliers in column '{col}': {outlier_count}")
    print(f"Percentage of outliers in column '{col}': {outlier_percentage[col]:.2f}%")
    print(f"Lower bound: {lower}")
    print(f"Upper bound: {upper}")
    print(f"Outlier data:\n{col_data[outliers_mask].values}")
    print("\n==============================================\n")


['id', 'Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Depression']
Count of outliers in column 'id': 0
Percentage of outliers in column 'id': 0.00%
Lower bound: -71129.5
Upper bound: 211986.5
Outlier data:
[]


Count of outliers in column 'Age': 12
Percentage of outliers in column 'Age': 0.04%
Lower bound: 7.5
Upper bound: 43.5
Outlier data:
[58. 49. 51. 44. 46. 59. 54. 48. 56. 46. 48. 48.]


Count of outliers in column 'Academic Pressure': 0
Percentage of outliers in column 'Academic Pressure': 0.00%
Lower bound: -1.0
Upper bound: 7.0
Outlier data:
[]


Count of outliers in column 'Work Pressure': 3
Percentage of outliers in column 'Work Pressure': 0.01%
Lower bound: 0.0
Upper bound: 0.0
Outlier data:
[5. 2. 5.]


Count of outliers in column 'CGPA': 9
Percentage of outliers in column 'CGPA': 0.03%
Lower bound: 2.345
Upper bound: 12.865
Outlier data:
[0. 0. 0. 0. 0. 0. 0. 0. 0.]


Count of outliers in column 'Study Sa