In [None]:
# `files` is the Colab helper used in this notebook for both the upload here
# and the `files.download(...)` call at the end of the modelling cell.
from google.colab import files
# Starts the upload step; the saved cell output shows Train_Data.csv and
# Test_Data.csv being stored under those same names, which is what the
# later `pd.read_csv` calls expect. `uploaded` keeps the call's return value
# and is not referenced again in the visible code.
uploaded = files.upload()

Saving Train_Data.csv to Train_Data.csv
Saving Test_Data.csv to Test_Data.csv


In [6]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


# Load the two uploaded CSV files (training set first, then test set).
train, test = (
    pd.read_csv(csv_name) for csv_name in ('Train_Data.csv', 'Test_Data.csv')
)

# Report (rows, columns) for both frames as a quick sanity check.
print(
    f"Train shape: {train.shape}",
    f"Test shape: {test.shape}",
    sep="\n",
)

# --- Preprocessing ----------------------------------------------------------
# Rows without a target label cannot be used for supervised training.
train = train.dropna(subset=['age_group'])

# Every column except the target is treated as a model input.
features = [col for col in train.columns if col != 'age_group']

# Impute missing values in one pass. The fill statistic is always computed on
# the training frame and reused for the test frame so both are filled with the
# same value.
for col in features:
    if pd.api.types.is_numeric_dtype(train[col]):
        # Numeric column -> median of the training data.
        fill_val = train[col].median()
    else:
        # Any non-numeric column (object, category, string, ...) -> most
        # frequent training value. `mode()` is empty when the column holds
        # no non-missing values, in which case there is nothing to fill with.
        modes = train[col].mode()
        if modes.empty:
            continue
        fill_val = modes[0]

    train[col] = train[col].fillna(fill_val)
    if col in test.columns:
        test[col] = test[col].fillna(fill_val)

# Encode the target as 0/1. `.map` turns any label missing from the mapping
# into NaN, so fail loudly here instead of passing NaN targets downstream.
label_map = {'Adult': 0, 'Senior': 1}
encoded_target = train['age_group'].map(label_map)
unmapped = encoded_target.isna()
if unmapped.any():
    unexpected = sorted(train.loc[unmapped, 'age_group'].astype(str).unique())
    raise ValueError(
        f"Unexpected age_group labels (expected {list(label_map)}): {unexpected}"
    )
train['age_group'] = encoded_target


# --- Hold-out validation of the three classifiers ---------------------------
# Target vector and feature matrix.
y = train['age_group']
X = train[features]

# Stratified 80/20 split so both parts keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# XGBoost has no class_weight option here; it is given the
# negative-to-positive ratio of the training fold instead.
pos = y_train.eq(1).sum()
neg = y_train.eq(0).sum()
scale_pos_weight = neg / pos

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
lgbm = LGBMClassifier(class_weight='balanced', random_state=42)
xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
)

# Fit each model on the training fold, predict the validation fold and
# report its accuracy, in the order RF -> LightGBM -> XGBoost.
val_predictions = []
for banner, clf in (
    ("Random Forest Accuracy:", rf),
    ("LightGBM Accuracy:", lgbm),
    ("XGBoost Accuracy:", xgb),
):
    clf.fit(X_train, y_train)
    fold_pred = clf.predict(X_val)
    val_predictions.append(fold_pred)
    print(banner, accuracy_score(y_val, fold_pred))

rf_pred, lgbm_pred, xgb_pred = val_predictions

# Majority vote: predict class 1 when at least two of the three models do.
vote_total = rf_pred + lgbm_pred + xgb_pred
ensemble_pred = (vote_total >= 2).astype(int)
print("Ensemble Accuracy:", accuracy_score(y_val, ensemble_pred))
print(classification_report(y_val, ensemble_pred))


# --- Final fit on all labelled rows and submission --------------------------
# Refit every model on the complete training data before scoring the test set.
for model in (rf, lgbm, xgb):
    model.fit(X, y)

# Select exactly the columns the models were trained on, in the same order.
# Passing the raw `test` frame would break (or silently misalign) as soon as
# it carries an extra column or a different column order than `features`.
X_test = test[features]

rf_test = rf.predict(X_test)
lgbm_test = lgbm.predict(X_test)
xgb_test = xgb.predict(X_test)

# Majority vote: class 1 when at least two of the three models predict it.
ensemble_test = ((rf_test + lgbm_test + xgb_test) >= 2).astype(int)

submission = pd.DataFrame({
    'age_group': ensemble_test
})

# Write and download through one path so the download cannot point at a
# different file than the one just written when the working directory is
# not /content.
submission_path = '/content/submission.csv'
submission.to_csv(submission_path, index=False)
print("✅ Submission file saved as submission.csv")
files.download(submission_path)

Train shape: (1966, 9)
Test shape: (312, 8)
Random Forest Accuracy: 0.8312020460358056
[LightGBM] [Info] Number of positive: 251, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1000
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LightGBM Accuracy: 0.7544757033248082
XGBoost Accuracy: 0.7519181585677749
Ensemble Accuracy: 0.7749360613810742
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       328
           1       0.26      0.22      0.24        63

    accuracy                           0.77       391
   macro avg       0.56      

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>