In [1]:
# 🧹 Final Data Processing - Student Mental Health (Depression Prediction)
# ---------------------------------------------------------
# Dataset cleaned & ready for regression / classification models

import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder

def _coerce_numeric_fill_median(values: pd.Series) -> pd.Series:
    """Convert a column to numeric and impute gaps with its median.

    Entries that cannot be parsed as numbers become NaN (``errors='coerce'``)
    and are then replaced, together with any other missing entries, by the
    median of the values that did parse.

    Args:
        values: Raw column to convert (strings in this notebook).

    Returns:
        A new numeric Series; the input is not modified.
    """
    numeric = pd.to_numeric(values, errors='coerce')
    # Assign the result instead of calling fillna(..., inplace=True) on a
    # column selection: the chained in-place form is deprecated and does not
    # update the parent frame under pandas copy-on-write.
    return numeric.fillna(numeric.median())


# 1. Load dataset
df = pd.read_csv("Student Mental health.csv")
print("‚úÖ Dataset Loaded. Shape:", df.shape)

# 2. Rename columns for consistency.
# Names are assigned by position, so this relies on the CSV having exactly
# these 11 columns in this order (pandas raises on a length mismatch).
df.columns = ['Timestamp', 'Gender', 'Age', 'Course', 'Year', 'CGPA', 'Marital', 'Depression', 'Anxiety', 'Panic', 'Specialist']
print("Columns after rename:", df.columns.tolist())

# 3. Clean numeric columns (Age, Year, CGPA)

# --- Age --- accept a decimal comma by turning it into a decimal point.
df['Age'] = _coerce_numeric_fill_median(
    df['Age'].astype(str).str.replace(',', '.', regex=False)
)

# --- Year of Study --- keep the first run of digits found in each entry.
# expand=False makes extract return a Series rather than a one-column frame.
df['Year'] = _coerce_numeric_fill_median(
    df['Year'].astype(str).str.extract(r'(\d+)', expand=False)
)

# --- CGPA --- take the first valid number in each entry (decimal or integer).
# NOTE(review): if an entry holds several numbers (e.g. a range), only the
# first one is kept - confirm that this is the intended representation.
df['CGPA'] = _coerce_numeric_fill_median(
    df['CGPA']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .str.extract(r'(\d+\.\d+|\d+)', expand=False)
)

# 4. Encode Gender & Depression as 0/1
gender_map = {'Male': 1, 'Female': 0}
depression_map = {'Yes': 1, 'No': 0}

# Labels are trimmed and case-normalised ('male', 'MALE' -> 'Male') before the
# lookup so that spelling variants are not silently treated as unknown.
# Anything still unmapped (including missing values) is encoded as 0, i.e. the
# same code as 'Female' / 'No'.
# NOTE(review): for Depression this imputes the target column - confirm.
df['Gender'] = (
    df['Gender'].astype(str).str.strip().str.capitalize().map(gender_map).fillna(0)
)
df['Depression'] = (
    df['Depression'].astype(str).str.strip().str.capitalize().map(depression_map).fillna(0)
)

# 5. Encode Course & Marital (one-hot, first category of each dropped)
# NOTE(review): Course is encoded as-is; labels that differ only by case or
# surrounding whitespace will produce separate columns - consider normalising.
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[['Course', 'Marital']])
encoded_cols = encoder.get_feature_names_out(['Course', 'Marital'])
# Reuse df's index so the concat below aligns row-for-row.
encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)

# Drop the raw categorical columns plus the columns not used for this target,
# then append the one-hot columns.
df_final = pd.concat([df.drop(columns=['Timestamp', 'Course', 'Marital', 'Anxiety', 'Panic', 'Specialist']), encoded_df], axis=1)

# 6. Check results
print("\n‚úÖ Data Cleaning Completed!")
print(df_final[['Age', 'Year', 'CGPA']].describe())
print("\nUnique Year Values:", sorted(df_final['Year'].unique()))
print("\nUnique CGPA Sample:", df_final['CGPA'].unique()[:10])

# 7. Save cleaned dataset
output_path = 'cleaned_student_mental_health_depression.csv'
df_final.to_csv(output_path, index=False)
print(f"\n‚úÖ Final cleaned data saved to '{output_path}' with shape {df_final.shape}")


‚úÖ Dataset Loaded. Shape: (101, 11)
Columns after rename: ['Timestamp', 'Gender', 'Age', 'Course', 'Year', 'CGPA', 'Marital', 'Depression', 'Anxiety', 'Panic', 'Specialist']

‚úÖ Data Cleaning Completed!
              Age        Year        CGPA
count  101.000000  101.000000  101.000000
mean    20.514851    1.970297    3.079208
std      2.488429    0.994540    0.709692
min     18.000000    1.000000    0.000000
25%     18.000000    1.000000    3.000000
50%     19.000000    2.000000    3.000000
75%     23.000000    3.000000    3.500000
max     24.000000    4.000000    3.500000

Unique Year Values: [np.int64(1), np.int64(2), np.int64(3), np.int64(4)]

Unique CGPA Sample: [3.  3.5 2.5 2.  0. ]

‚úÖ Final cleaned data saved to 'cleaned_student_mental_health_depression.csv' with shape (101, 54)


In [2]:
import pandas as pd
# Read the exported CSV back from disk and print a 10-row preview followed by
# summary statistics, as a quick check of what was written.
# Note: this rebinds `df`, which the previous cell used for the raw data.
df = pd.read_csv("cleaned_student_mental_health_depression.csv")
print(df.head(10), df.describe(), sep="\n")


   Gender   Age  Year  CGPA  Depression  Course_Accounting   Course_BCS  \
0       0  18.0     1   3.0           1                 0.0         0.0   
1       1  21.0     2   3.0           0                 0.0         0.0   
2       1  19.0     1   3.0           1                 0.0         0.0   
3       0  22.0     3   3.0           1                 0.0         0.0   
4       1  23.0     4   3.0           0                 0.0         0.0   
5       1  19.0     2   3.5           0                 0.0         0.0   
6       0  23.0     2   3.5           1                 0.0         0.0   
7       0  18.0     1   3.5           0                 0.0         1.0   
8       0  19.0     2   2.5           0                 0.0         0.0   
9       1  18.0     1   3.5           0                 0.0         0.0   

   Course_BENL  Course_BIT  Course_Banking Studies  ...  \
0          0.0         0.0                     0.0  ...   
1          0.0         0.0                     0.0  ... 