# Preprocessing

In [17]:
import pandas as pd 
import numpy as np 
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split

import sys
sys.path.append("../src") 
from preprocess import *

In [26]:
train_path = "../data/raw/train.csv"
test_path = "../data/raw/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print(f"Train shape: {train.shape}, Test shape: {test.shape}")
train.head()

Train shape: (140700, 20), Test shape: (93800, 19)


Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [19]:
# preprocessed_train = train.copy()
# preprocessed_train.drop(columns=drop, inplace=True)
# preprocessed_train.dropna(subset=["Financial Stress", "Pressure", "Satisfaction"], axis=0, inplace=True)
# print(preprocessed_train.shape)
# preprocessed_train.isnull().sum()

In [27]:
test.isnull().sum()

id                                           0
Name                                         0
Gender                                       0
Age                                          0
City                                         0
Working Professional or Student              0
Profession                               24632
Academic Pressure                        75033
Work Pressure                            18778
CGPA                                     75034
Study Satisfaction                       75033
Job Satisfaction                         18774
Sleep Duration                               0
Dietary Habits                               5
Degree                                       2
Have you ever had suicidal thoughts ?        0
Work/Study Hours                             0
Financial Stress                             0
Family History of Mental Illness             0
dtype: int64

In [20]:
y = train.pop("Depression")
X_train, X_val, y_train, y_val = train_test_split(train, y, train_size=0.8, test_size=0.2, random_state=0)

In [21]:
drop = ["Name", "City", "Profession", "Academic Pressure", "Work Pressure", "Study Satisfaction", "Job Satisfaction", "Degree"]
oh = ["Gender", "Working Professional or Student", "Sleep Duration", "Dietary Habits", "Have you ever had suicidal thoughts ?", "Family History of Mental Illness" ]

categorical_transformer = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), oh)
    ],
    remainder="passthrough"  # Keeps non-encoded columns
)

preprocessor = Pipeline(steps=[
    ("pressure", FunctionTransformer(assign_pressure)),       # Creates "Pressure"
    ("satisfaction", FunctionTransformer(assign_satisfaction)), # Creates "Satisfaction"
    #("profession_fill", FunctionTransformer(fill_profession)),
    ("diet_replace", FunctionTransformer(replace_diet_habits)),
    ("sleep_replace", FunctionTransformer(replace_sleep_duration)),
    ("drop_columns", FunctionTransformer(lambda X: X.drop(columns = drop))),
    #("drop_na_rows", FunctionTransformer(lambda X: X.dropna(subset=["Financial Stress", "Pressure", "Satisfaction"]))),
    ("drop_na_rows", DropNaRows(subset=["Financial Stress", "Pressure", "Satisfaction"])),
    ("cat_transformer", categorical_transformer)    # One-hot encoding
])

In [22]:
# Fit the preprocessor on training data
preprocessor.fit(X_train)

# Transform training data and filter targets
X_train_transformed = preprocessor.transform(X_train)
y_train_filtered = y_train[preprocessor.named_steps["drop_na_rows"].kept_train_indices_]

# Transform validation data 
X_val_transformed = preprocessor.transform(X_val)

# Get indices from the PRE-one-hot data:
val_filtered = preprocessor[:6].transform(X_val)  # Apply steps BEFORE one-hot
y_val_filtered = y_val.loc[val_filtered.index]    # Align using correct indices

test_transformed = preprocessor.transform(test)

# Get feature names after one-hot encoding
cat_features = preprocessor.named_steps["cat_transformer"] \
                 .named_transformers_["cat"] \
                 .get_feature_names_out(oh) 

all_features = np.concatenate([
    cat_features,
    [col for col in train.columns if col not in drop and col not in oh],
    ["Pressure", "Satisfaction"]
])
#cat_features = preprocessor.named_transformers_["cat"].get_feature_names_out(oh)
#all_features = np.concatenate([cat_features, preprocessed_train.columns.drop(oh)])

final_train = pd.DataFrame(X_train_transformed, columns=all_features)
final_val = pd.DataFrame(X_val_transformed, columns=all_features)
final_test = pd.DataFrame(test_transformed, columns=all_features)

In [23]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

model = XGBClassifier(random_state = 0)
model.fit(final_train, y_train_filtered)

prediction = model.predict(final_val)
#binary_prediction = (prediction >= 0.5).astype(int)

accuracy_score(y_val_filtered, prediction)

0.937548873249449

In [24]:
test_preds = model.predict(final_test)

output = pd.DataFrame({"Id": test.id,
                       "SalePrice": test_preds})
output.to_csv("../submissions/submission.csv", index=False)

ValueError: array length 93778 does not match index length 93800