In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import xgboost as xgb
import catboost as cb
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

# Read train data
train = pd.read_csv("train_new.csv")

# Drop unnecessary columns
train = train.drop(['id', 'Name', 'City'], axis=1)

# Detect categorical columns
categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", categorical_columns)

# Label Encoding for categorical columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))  # Convert to string to handle any unexpected types
    label_encoders[col] = le  # Save the encoder for potential inverse transformation

# Split data based on 'Working Professional or Student' column
working_professional_data = train[train['Working Professional or Student'] == 1]  # Assuming 1 = Working Professional
student_data = train[train['Working Professional or Student'] == 0]  # Assuming 0 = Student

working_professional_data.fillna(working_professional_data.mean(), inplace=True)
student_data.fillna(student_data.mean(), inplace=True)

# Drop the splitting column from feature sets
X_wp = working_professional_data.drop(['Depression', 'Working Professional or Student','Academic Pressure','CGPA','Study Satisfaction'], axis=1)
y_wp = working_professional_data['Depression']

X_student = student_data.drop(['Depression', 'Working Professional or Student','Profession','Work Pressure','Job Satisfaction'], axis=1)
y_student = student_data['Depression']

# Split train and test for each group
X_train_wp, X_test_wp, y_train_wp, y_test_wp = train_test_split(X_wp, y_wp, test_size=0.2, random_state=42, stratify=y_wp)
X_train_student, X_test_student, y_train_student, y_test_student = train_test_split(X_student, y_student, test_size=0.2, random_state=42, stratify=y_student)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_student = scaler.fit_transform(X_train_student)
X_test_student = scaler.transform(X_test_student)

Categorical columns: ['Gender', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'city_type']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  working_professional_data.fillna(working_professional_data.mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  student_data.fillna(student_data.mean(), inplace=True)


In [12]:
len(X_train_wp), len(X_test_wp)

(90080, 22520)

In [15]:
# AdaBoost with Decision Tree
dt = DecisionTreeClassifier(
    max_depth=2,
    min_samples_split=13,
    min_samples_leaf=4,
    class_weight='balanced',
    random_state=40
)

ada_model = AdaBoostClassifier(
    estimator=dt,
    n_estimators=485,
    learning_rate=0.026178,
    algorithm='SAMME.R',
    random_state=40
)

rf_model = RandomForestClassifier(
    bootstrap=True,
    class_weight='balanced',
    criterion='gini',
    max_depth=18,
    max_features='sqrt',
    max_leaf_nodes=170,
    min_impurity_decrease=0.00055221171236024,
    min_samples_leaf=3,
    min_samples_split=18,
    n_estimators=891,
    n_jobs=-1,
    random_state=40
)
# XGBoost parameters
xgb_model = xgb.XGBClassifier(
    colsample_bytree=0.6915192661966489,
    gamma=0.038489954914396496,
    learning_rate=0.0969254358741304,
    max_bin=285,
    max_depth=6,
    min_child_weight=2,
    n_estimators=781,
    reg_alpha=1.616240759128834,
    reg_lambda=1.266807513020847,
    scale_pos_weight=0.061946902654867256,
    subsample=0.9485842360750871,
    tree_method='hist',
    use_label_encoder=False,
)

# CatBoost parameters
cat_model = cb.CatBoostClassifier(
    subsample=0.6,
    scale_pos_weight=0.061946902654867256,
    random_strength=0.1,
    min_child_samples=5,
    learning_rate=0.03,
    l2_leaf_reg=5,
    iterations=300,
    grow_policy="SymmetricTree",
    depth=6,
    colsample_bylevel=1.0,
    border_count=32,
    bagging_temperature=0.8,
    random_state=40,
    verbose=False
)

et_model = ExtraTreesClassifier(
    n_estimators=700,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    random_state=40,
    n_jobs=-1
)

optimized_gb_model = GradientBoostingClassifier(
    learning_rate=0.04293117062858835,
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=2,
    n_estimators=918,
    subsample=0.7077649335194086,
    random_state=40
)

lgb_model = lgb.LGBMClassifier(
    subsample=0.8,
    reg_lambda=0,
    reg_alpha=0.1,
    num_leaves=31,
    n_estimators=500,
    min_child_samples=20,
    max_depth=15,
    learning_rate=0.15,
    colsample_bytree=0.6,
    class_weight='balanced',
    random_state=40
)

# Create first-level ensemble
base_estimators = [
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('ada', ada_model),
    ('cat', cat_model),
    ('et', et_model),
    ('gb', optimized_gb_model),
    ('lgb', lgb_model),
]

# Create stacking classifier
stacking_wp = StackingClassifier(
    estimators=base_estimators,
    cv=5,
    n_jobs=-1
)

stacking_student = StackingClassifier(
    estimators=base_estimators,
    cv=5,
    n_jobs=-1
)

from sklearn.metrics import accuracy_score

print("\nTraining stacking ensemble for WP...")
stacking_wp.fit(X_train_wp, y_train_wp)
y_pred = stacking_wp.predict(X_test_wp)
accuracy = accuracy_score(y_test_wp, y_pred)
print(f"Stacking Ensemble Accuracy for WP: {accuracy:.4f}")

print("\nTraining stacking ensemble for Student...")
stacking_student.fit(X_train_student, y_train_student)
y_pred = stacking_student.predict(X_test_student)
accuracy = accuracy_score(y_test_student, y_pred)
print(f"Stacking Ensemble Accuracy for student: {accuracy:.4f}")


Training stacking ensemble for WP...


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 7374, number of negative: 82706
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 90080, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 5899, number of negative: 66165
[LightGBM] [Info] Number of positive: 5900, number of negative: 66164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003782 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 72064, number of used features: 14
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 72064, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[Light

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Stacking Ensemble Accuracy for WP: 0.9594

Training stacking ensemble for Student...


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 13040, number of negative: 9228
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001865 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 383
[LightGBM] [Info] Number of data points in the train set: 22268, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




[LightGBM] [Info] Number of positive: 10432, number of negative: 7382
[LightGBM] [Info] Number of positive: 10432, number of negative: 7382
[LightGBM] [Info] Number of positive: 10432, number of negative: 7382
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 381
[LightGBM] [Info] Number of data points in the train set: 17814, number of used features: 14
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 382
[LightGBM] [Info] Number of positive: 10432, number of negative: 7383
[LightGBM] [Info] Number of data points in the train set: 17814, number of used features: 14
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testin

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Stacking Ensemble Accuracy for student: 0.8452


In [16]:
# predict test
test = pd.read_csv("test_new.csv")
test = test.drop(['Name', 'City'], axis=1)
for col, le in label_encoders.items():
    if col == 'id':
        continue
    test[col] = le.transform(test[col].astype(str))

test_wp = test[test['Working Professional or Student'] == 1]
test_student = test[test['Working Professional or Student'] == 0]

test_wp_id = test_wp['id']
test_student_id = test_student['id']

test_wp.fillna(working_professional_data.mean(), inplace=True)
test_student.fillna(student_data.mean(), inplace=True)

X_test_wp = test_wp.drop(['id','Working Professional or Student','Academic Pressure','CGPA','Study Satisfaction'], axis=1)
X_test_student = test_student.drop(['id','Working Professional or Student','Profession','Work Pressure','Job Satisfaction'], axis=1)


y_pred_wp = stacking_wp.predict(X_test_wp)
y_pred_student = stacking_student.predict(X_test_student)

# join id and prediction using test_wp_id
result_wp = pd.DataFrame()
result_wp['id'] = test_wp_id
result_wp['Depression'] = y_pred_wp

result_student = pd.DataFrame()
result_student['id'] = test_student_id
result_student['Depression'] = y_pred_student

result = pd.concat([result_wp, result_student])
# sort by id
result = result.sort_values(by='id')
result.to_csv('submission_preprocess_stacking_separate.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_wp.fillna(working_professional_data.mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_student.fillna(student_data.mean(), inplace=True)
