# Preprocessing

In [23]:
import pandas as pd 
import numpy as np 
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split

import sys
sys.path.append("../src") 
from preprocess import *

In [24]:
train_path = "../data/raw/train.csv"
test_path = "../data/raw/test.csv"

train = pd.read_csv(train_path, index_col = "id")
test = pd.read_csv(test_path, index_col = "id")

print(f"Train shape: {train.shape}, Test shape: {test.shape}")
train.head()

Train shape: (140700, 19), Test shape: (93800, 18)


Unnamed: 0_level_0,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [25]:
# preprocessed_train = train.copy()
# preprocessed_train.drop(columns=drop, inplace=True)
# preprocessed_train.dropna(subset=["Financial Stress", "Pressure", "Satisfaction"], axis=0, inplace=True)
# print(preprocessed_train.shape)
# preprocessed_train.isnull().sum()

In [26]:
y = train.pop("Depression")
X_train, X_val, y_train, y_val = train_test_split(train, y, train_size=0.8, test_size=0.2, random_state=0)

In [27]:
drop = ["Name", "City", "Profession", "Academic Pressure", "Work Pressure", "Study Satisfaction", "Job Satisfaction", "Degree"]
oh = ["Gender", "Working Professional or Student", "Sleep Duration", "Dietary Habits", "Have you ever had suicidal thoughts ?", "Family History of Mental Illness" ]

categorical_transformer = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), oh)
    ],
    remainder="passthrough"  # Keeps non-encoded columns
)

preprocessor = Pipeline(steps=[
    ("pressure", FunctionTransformer(assign_pressure)),
    ("satisfaction", FunctionTransformer(assign_satisfaction)),
    #("profession_fill", FunctionTransformer(fill_profession)),
    ("diet_replace", FunctionTransformer(replace_diet_habits)),
    ("sleep_replace", FunctionTransformer(replace_sleep_duration)),
    ("drop_columns", FunctionTransformer(lambda X: X.drop(columns = drop))),
    ("drop_na_rows", FunctionTransformer(lambda X: X.dropna(subset=["Financial Stress", "Pressure", "Satisfaction"]))),
    ("cat_transformer", categorical_transformer)
])

# Fit the preprocessor on training data
preprocessor.fit(X_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [30]:
# Transform both train and test data
train_transformed = preprocessor.transform(X_train)
val_transformed = preprocessor.transform(X_val)
test_transformed = preprocessor.transform(test)

# Get feature names after one-hot encoding
cat_features = preprocessor.named_steps["cat_transformer"] \
                 .named_transformers_["cat"] \
                 .get_feature_names_out(oh) 

all_features = np.concatenate([
    cat_features,
    [col for col in train.columns if col not in drop and col not in oh],
    ["Pressure", "Satisfaction"]
])
#cat_features = preprocessor.named_transformers_["cat"].get_feature_names_out(oh)
#all_features = np.concatenate([cat_features, preprocessed_train.columns.drop(oh)])

final_train = pd.DataFrame(train_transformed, columns=all_features)
final_val = pd.DataFrame(val_transformed, columns=all_features)
final_test = pd.DataFrame(test_transformed, columns=all_features)


In [31]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

model = XGBClassifier(random_state = 0)
model.fit(final_train, y_train)

prediction = model.predict(final_val)
#binary_prediction = (prediction >= 0.5).astype(int)

accuracy_score(y_val, prediction)

XGBoostError: [17:56:37] C:\actions-runner\_work\xgboost\xgboost\src\data\data.cc:542: Check failed: this->labels.Size() % this->num_row_ == 0 (35 vs. 0) : Incorrect size for labels: (112560,1) v.s. 112525

In [None]:
output = pd.DataFrame({"Id": test_data.Id,
                       "SalePrice": test_preds})
output.to_csv("submission.csv", index=False)