In [2]:
# Colab-specific helper module; this cell only works inside a Google Colab runtime.
from google.colab import files
# Prompts for a file upload. The saved cell output shows Train_Data.csv and
# Test_Data.csv being written to the working directory, which the next cell
# reads by those exact names. The return value is kept in `uploaded`, but
# nothing else in the visible notebook reads it.
uploaded = files.upload()

Saving Train_Data.csv to Train_Data.csv
Saving Test_Data.csv to Test_Data.csv


In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Read the two uploaded CSV files from the working directory.
train = pd.read_csv('Train_Data.csv')
test = pd.read_csv('Test_Data.csv')

# Report (rows, columns) for each frame as a quick sanity check.
for label, frame in (("Train", train), ("Test", test)):
    print(f"{label} shape: {frame.shape}")


# Rows without a target label cannot be used for supervised training.
train = train.dropna(subset=['age_group'])

# Every column other than the target is treated as a model feature.
features = [col for col in train.columns if col != 'age_group']

# Impute missing feature values. Fill values are computed from `train` only
# and then reused for `test`, so both frames are filled consistently.
#   * numeric columns     -> training median
#   * non-numeric columns -> most frequent training value
# The numeric check uses is_numeric_dtype rather than `dtype != 'object'`,
# because categorical / dedicated string dtypes are not 'object' either and
# calling .median() on them fails.
# NOTE(review): the fill values are computed before the train/validation
# split further down, so validation rows contribute to them.
for col in features:
    if pd.api.types.is_numeric_dtype(train[col]):
        fill_val = train[col].median()
    else:
        modes = train[col].mode()
        if modes.empty:
            # Column is entirely missing in train: there is no value to fill with.
            continue
        fill_val = modes.iloc[0]

    train[col] = train[col].fillna(fill_val)
    if col in test.columns:
        test[col] = test[col].fillna(fill_val)


# Encode the target as integers: 'Adult' -> 0, 'Senior' -> 1.
# Series.map turns any label missing from the mapping into NaN; fail loudly
# here instead of letting NaN targets reach the split / resampling steps.
label_map = {'Adult': 0, 'Senior': 1}
encoded_target = train['age_group'].map(label_map)
unmapped = encoded_target.isna()
if unmapped.any():
    bad_labels = sorted(train.loc[unmapped, 'age_group'].astype(str).unique())
    raise ValueError(
        f"Unexpected age_group labels (expected {list(label_map)}): {bad_labels}"
    )
train['age_group'] = encoded_target


# Feature matrix / target vector, then an 80/20 hold-out split that keeps the
# class ratio the same in both parts (stratify) and is reproducible (seed 42).
X = train[features]
y = train['age_group']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


# Oversample the training split only; the validation split is left untouched
# so it is evaluated at its original class balance.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


# The three ensemble members, all seeded with the same random_state.
rf = RandomForestClassifier(random_state=42)
lgbm = LGBMClassifier(random_state=42)
xgb = XGBClassifier(random_state=42, eval_metric='logloss')


# Fit each member on the resampled training data (same order: rf, lgbm, xgb).
for model in (rf, lgbm, xgb):
    model.fit(X_train_res, y_train_res)


# Class-1 probability from each fitted model on the hold-out split.
rf_prob, lgbm_prob, xgb_prob = (
    clf.predict_proba(X_val)[:, 1] for clf in (rf, lgbm, xgb)
)


# Soft-voting ensemble: unweighted mean of the three probabilities.
avg_prob = (rf_prob + lgbm_prob + xgb_prob) / 3


# Decision cut-off applied to the averaged probability; anything at or above
# it is predicted as class 1. The same value is reused for the test set.
threshold = 0.4
above_cutoff = avg_prob >= threshold
ensemble_pred = above_cutoff.astype(int)


# Hold-out metrics for the thresholded ensemble.
val_accuracy = accuracy_score(y_val, ensemble_pred)
val_report = classification_report(y_val, ensemble_pred)
val_confusion = confusion_matrix(y_val, ensemble_pred)
print("✅ Ensemble Accuracy:", val_accuracy)
print(val_report)
print("Confusion Matrix:\n", val_confusion)


# Refit every ensemble member on the full (resampled) labelled data so the
# final models also see the rows that were held out for validation.
X_full_res, y_full_res = sm.fit_resample(X, y)
rf.fit(X_full_res, y_full_res)
lgbm.fit(X_full_res, y_full_res)
xgb.fit(X_full_res, y_full_res)


# Select exactly the training feature columns, in training order. Passing the
# raw `test` frame only works if the CSV happens to contain the same columns
# in the same order; an extra or reordered column would break prediction.
# A missing feature column raises a KeyError here that names it.
X_test = test[features]

rf_test_prob = rf.predict_proba(X_test)[:, 1]
lgbm_test_prob = lgbm.predict_proba(X_test)[:, 1]
xgb_test_prob = xgb.predict_proba(X_test)[:, 1]

# Same soft-voting rule and cut-off as used on the validation split.
avg_test_prob = (rf_test_prob + lgbm_test_prob + xgb_test_prob) / 3
ensemble_test = (avg_test_prob >= threshold).astype(int)


# One row per test record, holding the encoded prediction (0 or 1).
# NOTE(review): the file carries no id column and uses the integer encoding
# rather than the original 'Adult'/'Senior' strings — confirm the format the
# consumer of submission.csv expects.
submission = pd.DataFrame({
    'age_group': ensemble_test
})

submission.to_csv('submission.csv', index=False)
print("✅ Submission file saved as submission.csv")
# Trigger a browser download of the file from the Colab runtime.
files.download('submission.csv')

Train shape: (1966, 9)
Test shape: (312, 8)
[LightGBM] [Info] Number of positive: 1310, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1643
[LightGBM] [Info] Number of data points in the train set: 2620, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
✅ Ensemble Accuracy: 0.731457800511509
              precision    recall  f1-score   support

           0       0.86      0.81      0.84       328
           1       0.24      0.32      0.28        63

    accuracy                           0.73       391
   macro avg       0.55      0.56      0.56       391
weighted avg       0.76      0.73      0.75       391

Confusion Matrix:
 [[266  62]
 [ 43  20]]
[LightGBM] [Info] Number of positive: 1638, numbe

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>