# Mental Health Data Preprocessing

In [8]:
import copy
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from xgboost import XGBClassifier

# Make the project's src/ directory importable before loading the local helpers.
sys.path.append("../src")
from preprocess import *

In [9]:
# Locations of the raw CSV files, given relative to the working directory.
train_path, test_path = "../data/raw/train.csv", "../data/raw/test.csv"

# Read both files into DataFrames using pandas' default parsing options.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

### Train, validation set split

In [10]:
# Separate the target from the features, then hold out 20% of the rows for validation.
# NOTE: `pop` removes "Depression" from `train` in place and X is the very same
# object as `train`, so after this cell `train` no longer carries the target and
# re-running the cell without reloading the data raises a KeyError.
y = train.pop("Depression")
X = train

# stratify=y keeps the class proportions of y equal in both parts;
# random_state=0 makes the shuffle reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, random_state=0, test_size=0.2
)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}")

Train: (112560, 19), Validation: (28140, 19)


### Pipeline

In [11]:
# Columns removed from the feature set before encoding.
drop_cols = [
    "Name", "City", "Profession",              # not used as model features
    "Academic Pressure", "Work Pressure",      # superseded by the engineered "Pressure"
    "Study Satisfaction", "Job Satisfaction",  # superseded by the engineered "Satisfaction"
    "Degree",
]

# Columns that are one-hot encoded; every other surviving column is passed through as is.
categorical_cols = [
    "Gender",
    "Working Professional or Student",
    "Sleep Duration",
    "Dietary Habits",
    "Have you ever had suicidal thoughts ?",
    "Family History of Mental Illness",
]

# One-hot encode the categorical columns. handle_unknown="ignore" encodes a
# category that was not seen during fit as all zeros instead of raising.
# remainder="passthrough" appends the non-encoded columns after the one-hot block.
categorical_transformer = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="passthrough",
)

# Full preprocessing pipeline. Order matters: the engineered columns are created
# first, imputed next, and the raw columns are dropped only just before encoding.
# The helper functions and GroupImputer come from src/preprocess.py.
preprocessor = Pipeline(steps=[
    ("pressure", FunctionTransformer(assign_pressure)),          # creates "Pressure"
    ("satisfaction", FunctionTransformer(assign_satisfaction)),  # creates "Satisfaction"
    ("impute", GroupImputer(["Pressure", "Satisfaction", "Financial Stress"])),
    ("pressure_ratio", FunctionTransformer(add_pressure_ratio)),
    ("diet_clean", FunctionTransformer(replace_diet_habits)),
    ("sleep_clean", FunctionTransformer(replace_sleep_duration)),
    # DataFrame.drop + kw_args instead of a lambda: same result, but the fitted
    # pipeline stays picklable (joblib.dump, multi-process cross-validation).
    ("drop_cols", FunctionTransformer(pd.DataFrame.drop, kw_args={"columns": drop_cols})),
    ("encode", categorical_transformer),                         # one-hot encoding
])

In [12]:
# Fit every preprocessing step on the training split only, then apply the
# fitted pipeline unchanged to the validation and test data.
preprocessor.fit(X_train)

X_train_transformed = preprocessor.transform(X_train)
X_val_transformed = preprocessor.transform(X_val)
test_transformed = preprocessor.transform(test)

fitted_encoder = preprocessor.named_steps["encode"]

# Names of the one-hot columns on their own.
cat_features = fitted_encoder.named_transformers_["onehot"] \
                 .get_feature_names_out(categorical_cols)

# Take the names of ALL output columns from the fitted ColumnTransformer, so
# they follow the real output order instead of a hand-maintained list.
# It prefixes each name with "<transformer>__" (e.g. "onehot__", "remainder__");
# the prefix is stripped to keep the plain column names.
all_features = np.array(
    [name.partition("__")[2] or name for name in fitted_encoder.get_feature_names_out()],
    dtype=object,
)
assert len(all_features) == X_train_transformed.shape[1], \
    "number of column names does not match the transformed data"


def _to_frame(matrix):
    """Wrap a transformed matrix in a DataFrame labelled with `all_features`.

    ColumnTransformer may return a scipy sparse matrix when the stacked output
    is sparse enough; such a matrix is converted to a dense array first.
    """
    dense = matrix.toarray() if hasattr(matrix, "toarray") else matrix
    return pd.DataFrame(dense, columns=all_features)


final_train = _to_frame(X_train_transformed)
final_val = _to_frame(X_val_transformed)
final_test = _to_frame(test_transformed)

### Validation of new feature

In [13]:
# Does the "pressure_ratio" step help? Compare the 3-fold CV accuracy of XGBoost
# on features produced with and without that step.


def cv_accuracy(pipeline):
    """Return per-fold CV accuracies of XGBoost on `pipeline`'s output for X_train.

    The pipeline is deep-copied before fitting, so the object passed in, and any
    step instances it shares with other pipelines, keeps its current fitted state.
    """
    features = copy.deepcopy(pipeline).fit_transform(X_train)
    return cross_val_score(
        XGBClassifier(random_state=0),
        features,
        y_train,
        cv=3,
        scoring='accuracy',
    )


# Two variants of the preprocessing. Pipeline() keeps the step objects it is given
# without copying them, so pipe_baseline shares its transformers with `preprocessor`;
# fitting it directly would refit the imputer and encoder inside `preprocessor`.
pipe_baseline = Pipeline([step for step in preprocessor.steps if step[0] != "pressure_ratio"])
pipe_with_ratio = preprocessor  # includes the new feature

# NOTE(review): the preprocessing is fitted on all of X_train before the CV split,
# so imputation values and encoder categories are shared across folds. Both
# variants are affected alike, but the absolute accuracies may be slightly optimistic.
baseline_scores = cv_accuracy(pipe_baseline)
ratio_scores = cv_accuracy(pipe_with_ratio)

print(f"Baseline CV Accuracy: {baseline_scores.mean():.5f} ± {baseline_scores.std():.5f}")
print(f"With Ratio CV Accuracy: {ratio_scores.mean():.5f} ± {ratio_scores.std():.5f}")

Baseline CV Accuracy: 0.93571 ± 0.00152
With Ratio CV Accuracy: 0.93536 ± 0.00092


### Save processed data

In [14]:
# Write the processed splits to disk. The target column is re-attached to the
# train and validation frames; the test frame is saved with features only.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing folders


def _attach_target(features, target):
    """Return `features` with `target` appended as a "Depression" column.

    Both inputs get a fresh 0..n-1 index first, so concat pairs the rows by
    position rather than by the index labels the target kept from the split.
    """
    return pd.concat(
        [features.reset_index(drop=True),
         target.reset_index(drop=True).rename("Depression")],
        axis=1,
    )


processed_train = _attach_target(final_train, y_train)
processed_train.to_csv(processed_dir / "processed_train.csv", index=False)

processed_val = _attach_target(final_val, y_val)
processed_val.to_csv(processed_dir / "processed_val.csv", index=False)

final_test.to_csv(processed_dir / "processed_test.csv", index=False)

## Preprocessing Summary

### Key Transformations:
1. **Engineered Features**:
   - **Pressure**: Unified academic/work pressure
   - **Satisfaction**: Unified study/job satisfaction
   - **Pressure_Satisfaction_Ratio**: Ratio of Pressure to Satisfaction

2. **Handled Missing Data**:
   - Group-specific median imputation for **Pressure**, **Satisfaction**, **Financial Stress**

3. **Categorical Encoding**:
   - One-hot encoding for categorical features (Gender, Working Professional or Student, etc.)
