In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from imblearn.combine import SMOTEENN
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings('ignore')

# Load the raw training and test tables.
train_data = pd.read_csv('Train_Data.csv')
test_data = pd.read_csv('Test_Data.csv')

# Encode the target as 0/1; labels that are missing or not in the mapping
# become <NA> and are dropped after imputation below.
label_codes = {'Adult': 0, 'Senior': 1}
train_data['age_group'] = train_data['age_group'].map(label_codes).astype('Int64')

cols = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']


def _fill_missing(target, reference, columns):
    """Fill NaNs in ``target[columns]`` in place using statistics of ``reference``.

    A column whose dtype in ``target`` is float64/int64 is filled with the
    median of the same column in ``reference``; any other column is filled
    with the first mode of that column in ``reference``.
    """
    for column in columns:
        is_numeric = target[column].dtype in ['float64', 'int64']
        if is_numeric:
            replacement = reference[column].median()
        else:
            replacement = reference[column].mode()[0]
        target[column] = target[column].fillna(replacement)


# Training rows are imputed from their own statistics first ...
_fill_missing(train_data, train_data, cols)
# ... then rows without a usable label are discarded ...
train_data = train_data.dropna(subset=['age_group'])
# ... and the test set is imputed from the cleaned training data, so no
# test-set statistics are used.
_fill_missing(test_data, train_data, cols)

# Columns passed through unchanged vs. columns expanded into pairwise
# interaction terms.
base_features = ['RIAGENDR', 'PAQ605', 'DIQ010']
num_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']

# degree=2 with interaction_only=True keeps the original columns and adds the
# pairwise products (no squared terms); include_bias=False omits the constant.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features_train = poly.fit_transform(train_data[num_cols])
poly_features_test = poly.transform(test_data[num_cols])
poly_feature_names = poly.get_feature_names_out(num_cols)

# Column names in the same order as the columns of the stacked matrices below.
feature_cols = [*base_features, *poly_feature_names]

# Base columns first, polynomial/interaction columns after.
X_train_full = np.hstack((train_data[base_features].values, poly_features_train))
X_test_full = np.hstack((test_data[base_features].values, poly_features_test))

# Integer target; rows with a missing label were dropped earlier.
y = train_data['age_group'].astype(int)

# Split BEFORE scaling. Fitting the scaler on all training rows and splitting
# afterwards lets the validation rows influence the mean/variance used to
# transform the training rows, which makes validation scores optimistic.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_train_full, y, test_size=0.2, random_state=42, stratify=y
)

# Standardisation statistics come from the training split only; every other
# matrix is transformed with those same statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test_full_scaled = scaler.transform(X_test_full)
# Kept for backward compatibility with code that expects the scaled full
# training matrix.
X_train_full_scaled = scaler.transform(X_train_full)

# Rebalance the training split only; the validation split is left untouched.
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X_train, y_train)

# A random forest fitted on the resampled data supplies the importances that
# SelectFromModel thresholds against.
rf_selector = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_selector.fit(X_res, y_res)

thresholds = ['mean', 'median', 0.01, 0.02, 0.03]
# Start below any attainable F1 so the first usable threshold is always
# recorded. Starting at 0 with a strict '>' left the best_* names undefined
# (NameError further down) whenever every candidate scored F1 == 0.
best_fs_f1 = -1.0
best_selector = None
for thresh in thresholds:
    selector = SelectFromModel(rf_selector, prefit=True, threshold=thresh)
    support_mask = selector.get_support()
    if not support_mask.any():
        # Nothing survives this threshold; a model cannot be fitted on zero columns.
        continue
    X_res_sel = selector.transform(X_res)
    X_val_sel = selector.transform(X_val)
    # Quick check with simple logistic regression
    lr_fs = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
    lr_fs.fit(X_res_sel, y_res)
    y_pred = lr_fs.predict(X_val_sel)
    f1 = f1_score(y_val, y_pred, pos_label=1)
    if f1 > best_fs_f1:
        best_fs_f1 = f1
        best_selector = selector
        best_X_res_sel = X_res_sel
        best_X_val_sel = X_val_sel
        best_X_test_sel = selector.transform(X_test_full_scaled)
        best_selected_features = np.array(feature_cols)[support_mask]

if best_selector is None:
    raise ValueError("Feature selection kept no features for any candidate threshold")

# Expose the winning selection under the names used by the rest of the script.
X_res_sel = best_X_res_sel
X_val_sel = best_X_val_sel
X_test_sel = best_X_test_sel
selected_features = best_selected_features

# Shared scoring (F1 of the positive class, label 1) and CV splitter for
# every hyperparameter search.
scorer = make_scorer(f1_score, pos_label=1)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Settings common to all randomized searches in this section.
search_kwargs = dict(n_iter=5, scoring=scorer, cv=cv, random_state=42)

# --- Logistic regression -------------------------------------------------
lr_params = {
    'C': np.logspace(-2, 2, 10),
    'solver': ['lbfgs', 'liblinear'],
}
lr_base = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
lr_search = RandomizedSearchCV(lr_base, lr_params, **search_kwargs)
lr_search.fit(X_res_sel, y_res)
lr = CalibratedClassifierCV(lr_search.best_estimator_, method='sigmoid', cv=3)

# --- Random forest -------------------------------------------------------
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
}
rf_base = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_search = RandomizedSearchCV(rf_base, rf_params, **search_kwargs)
rf_search.fit(X_res_sel, y_res)
rf = CalibratedClassifierCV(rf_search.best_estimator_, method='isotonic', cv=3)

# --- XGBoost -------------------------------------------------------------
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}
# Ratio of negative to positive samples in the resampled training data.
n_negative = (y_res == 0).sum()
n_positive = (y_res == 1).sum()
scale_pos_weight = n_negative / n_positive
xgb_base = XGBClassifier(eval_metric='logloss', scale_pos_weight=scale_pos_weight, random_state=42)
xgb_search = RandomizedSearchCV(xgb_base, xgb_params, **search_kwargs)
xgb_search.fit(X_res_sel, y_res)
xgb = CalibratedClassifierCV(xgb_search.best_estimator_, method='sigmoid', cv=3)

# Fit the calibrated wrappers around each tuned estimator.
for calibrated_model in (lr, rf, xgb):
    calibrated_model.fit(X_res_sel, y_res)

stack_estimators = [
    ('lr', lr),
    ('rf', rf),
    ('xgb', xgb),
]

# The final estimator of a StackingClassifier is trained on the base models'
# cross-validated predictions, not on the raw feature matrix. Its
# hyperparameters are therefore tuned *through* the stack (final_estimator__*)
# on the resampled training data. Tuning a stand-alone LogisticRegression on
# the raw validation features would optimise for the wrong input space and
# would reuse the validation set that later picks thresholds and the winner.
# NOTE: each candidate refits the whole stack, so this search costs noticeably
# more than a single stack fit.
meta_params = {
    'final_estimator__C': np.logspace(-2, 2, 10),
    'final_estimator__solver': ['lbfgs', 'liblinear'],
}
meta_search = RandomizedSearchCV(
    StackingClassifier(
        estimators=stack_estimators,
        final_estimator=LogisticRegression(max_iter=1000),
        cv=cv,
        n_jobs=-1,
    ),
    meta_params, n_iter=5, scoring=scorer, cv=cv, random_state=42,
)
meta_search.fit(X_res_sel, y_res)

# With the default refit=True, best_estimator_ is the stack refitted on all of
# (X_res_sel, y_res) using the best meta-learner settings.
stack = meta_search.best_estimator_

def best_threshold(model, X_val, y_val):
    """Find the probability cut-off that maximises F1 for the positive class.

    Candidate cut-offs run from 0.05 up to (but excluding) 0.91 in steps of
    0.01. A sample is predicted positive when its class-1 probability is
    greater than or equal to the cut-off.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_val: Feature matrix to score.
        y_val: True labels for ``X_val``.

    Returns:
        Tuple ``(threshold, f1)``. The first cut-off reaching the highest F1
        wins ties. If no cut-off achieves an F1 above 0, ``(0.5, 0)`` is
        returned.
    """
    positive_proba = model.predict_proba(X_val)[:, 1]

    chosen_cutoff, top_f1 = 0.5, 0
    for cutoff in np.arange(0.05, 0.91, 0.01):
        predicted = (positive_proba >= cutoff).astype(int)
        score = f1_score(y_val, predicted, pos_label=1)
        # Strict comparison: only a genuine improvement replaces the incumbent.
        if score > top_f1:
            chosen_cutoff, top_f1 = cutoff, score
    return chosen_cutoff, top_f1

# Candidate models, keyed by the display name used in the printed reports.
models = {
    'Logistic Regression': lr,
    'Random Forest': rf,
    'XGBoost': xgb,
    'Stacking': stack,
}

# name -> (best validation F1, threshold that achieved it)
results = {}
for name, model in models.items():
    thresh, f1 = best_threshold(model, X_val_sel, y_val)
    # Re-apply the chosen threshold to get hard labels for the report.
    val_proba = model.predict_proba(X_val_sel)[:, 1]
    val_pred = (val_proba >= thresh).astype(int)
    print(f"\n{name} (best threshold: {thresh:.2f}):")
    print(classification_report(y_val, val_pred, target_names=['Adult', 'Senior']))
    results[name] = (f1, thresh)

# Pick the model with the highest validation F1 (first one wins on ties).
best_model_name, best_entry = max(results.items(), key=lambda item: item[1][0])
best_model = models[best_model_name]
best_thresh = best_entry[1]
print(f"\nSelected model: {best_model_name} with threshold {best_thresh:.2f}")

# Average the random-forest importances over the estimators held by each
# calibrated classifier inside the calibrated RF wrapper.
per_fold_importances = [
    calibrated.estimator.feature_importances_
    for calibrated in rf.calibrated_classifiers_
]
importances = np.mean(per_fold_importances, axis=0)

# Label the importances with the selected feature names, largest first.
feat_imp = pd.Series(importances, index=selected_features)
feat_imp = feat_imp.sort_values(ascending=False)
print("\nTop 10 selected feature importances:\n", feat_imp.head(10))


# Score the test set with the selected model and apply its tuned threshold.
y_test_proba = best_model.predict_proba(X_test_sel)[:, 1]
is_senior = y_test_proba >= best_thresh
y_test_pred = is_senior.astype(int)

# Single-column submission file holding the predicted 0/1 labels.
submission = pd.Series(y_test_pred.astype(int), name='age_group').to_frame()
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")



Logistic Regression (best threshold: 0.67):
              precision    recall  f1-score   support

       Adult       0.91      0.76      0.83       328
      Senior       0.32      0.60      0.42        63

    accuracy                           0.73       391
   macro avg       0.62      0.68      0.62       391
weighted avg       0.81      0.73      0.76       391


Random Forest (best threshold: 0.35):
              precision    recall  f1-score   support

       Adult       0.89      0.69      0.77       328
      Senior       0.25      0.56      0.35        63

    accuracy                           0.66       391
   macro avg       0.57      0.62      0.56       391
weighted avg       0.79      0.66      0.71       391


XGBoost (best threshold: 0.26):
              precision    recall  f1-score   support

       Adult       0.89      0.62      0.73       328
      Senior       0.24      0.62      0.34        63

    accuracy                           0.62       391
   macro av