In [2]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm as lgb
from imblearn.over_sampling import BorderlineSMOTE


In [3]:

# Load data
train = pd.read_csv('data/Train_Data.csv')
test = pd.read_csv('data/Test_Data.csv')


In [4]:

# Clean target variable
train = train.dropna(subset=['age_group'])
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1}).astype(int)


In [5]:

# Define features
features = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']


In [6]:

# Advanced clinical feature engineering
def create_advanced_features(df):
    df = df.copy()
    # Metabolic interactions
    df['BMI_Glucose_Interaction'] = df['BMXBMI'] * df['LBXGLU']
    df['Insulin_Glucose_Ratio'] = df['LBXIN'] / (df['LBXGLU'] + 1e-6)
    df['GTT_Glucose_Ratio'] = df['LBXGLT'] / (df['LBXGLU'] + 1e-6)
    # Clinical risk flags
    df['Obesity_Flag'] = (df['BMXBMI'] >= 30).astype(int)
    df['Prediabetes_Flag'] = (df['LBXGLU'] >= 100).astype(int)
    df['High_Insulin_Flag'] = (df['LBXIN'] >= 15).astype(int)
    df['Impaired_GTT_Flag'] = (df['LBXGLT'] >= 140).astype(int)
    # Composite risk score
    df['Metabolic_Risk_Score'] = (df['Obesity_Flag'] + df['Prediabetes_Flag'] + 
                                  df['High_Insulin_Flag'] + df['Impaired_GTT_Flag'])
    return df


In [7]:

# Apply feature engineering
train_eng = create_advanced_features(train[features])
test_eng = create_advanced_features(test[features])
all_features = (features + ['BMI_Glucose_Interaction', 'Insulin_Glucose_Ratio', 
                            'GTT_Glucose_Ratio', 'Obesity_Flag', 'Prediabetes_Flag', 
                            'High_Insulin_Flag', 'Impaired_GTT_Flag', 'Metabolic_Risk_Score'])


In [8]:

# Impute missing values using MICE
imputer = IterativeImputer(max_iter=20, random_state=42)
train_imp = pd.DataFrame(imputer.fit_transform(train_eng), columns=all_features, index=train_eng.index)
test_imp = pd.DataFrame(imputer.transform(test_eng), columns=all_features, index=test_eng.index)




In [9]:

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(train_imp)
X_test = scaler.transform(test_imp)
y_train = train['age_group'].values


In [10]:

# Handle class imbalance with BorderlineSMOTE
smote = BorderlineSMOTE(random_state=42, k_neighbors=5)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(f"Original class distribution: {np.bincount(y_train)}")
print(f"Resampled class distribution: {np.bincount(y_resampled)}")


Original class distribution: [1638  314]
Resampled class distribution: [1638 1638]


In [11]:

# Define ensemble models
models = {
    'rf': RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=10, 
                                 class_weight='balanced', random_state=42),
    'gb': GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, 
                                     subsample=0.8, random_state=42),
    'lgb': lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=31, max_depth=8, 
                              subsample=0.8, colsample_bytree=0.8, class_weight='balanced', 
                              random_state=42, verbosity=-1)
}


In [12]:

# Cross-validation and ensemble prediction
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
ensemble_preds = np.zeros(len(X_test))
oof_preds = np.zeros(len(X_resampled))

for fold, (train_idx, val_idx) in enumerate(cv.split(X_resampled, y_resampled)):
    X_tr, X_val = X_resampled[train_idx], X_resampled[val_idx]
    y_tr, y_val = y_resampled[train_idx], y_resampled[val_idx]
    
    fold_preds = []
    for name, model in models.items():
        model.fit(X_tr, y_tr)
        val_pred = model.predict_proba(X_val)[:, 1]
        test_pred = model.predict_proba(X_test)[:, 1]
        fold_preds.append(test_pred)
    
    # Average predictions for the fold
    ensemble_pred = np.mean(fold_preds, axis=0)
    ensemble_preds += ensemble_pred / cv.n_splits
    
    # Out-of-fold predictions for threshold optimization
    val_ensemble = np.mean([model.predict_proba(X_val)[:, 1] for model in models.values()], axis=0)
    oof_preds[val_idx] = val_ensemble
    print(f"Fold {fold+1} completed")




Fold 1 completed




Fold 2 completed




Fold 3 completed




Fold 4 completed
Fold 5 completed




In [13]:

# Optimize threshold for accuracy
precision, recall, thresholds = precision_recall_curve(y_resampled, oof_preds)
# Since we're targeting accuracy, we'll also check accuracy directly
oof_binary_preds = (oof_preds >= 0.5).astype(int)
base_accuracy = accuracy_score(y_resampled, oof_binary_preds)
print(f"Baseline accuracy (threshold=0.5): {base_accuracy:.4f}")


Baseline accuracy (threshold=0.5): 0.8730


In [14]:

# Test different thresholds for maximum accuracy
best_accuracy = base_accuracy
best_threshold = 0.5
for thresh in thresholds:
    preds = (oof_preds >= thresh).astype(int)
    acc = accuracy_score(y_resampled, preds)
    if acc > best_accuracy:
        best_accuracy = acc
        best_threshold = thresh

print(f"Optimal threshold: {best_threshold:.4f}")
print(f"Best accuracy on resampled data: {best_accuracy:.4f}")


Optimal threshold: 0.5926
Best accuracy on resampled data: 0.8806


In [15]:

# Final predictions with optimized threshold
final_preds = (ensemble_preds >= best_threshold).astype(int)


In [16]:

# Create submission file
submission = pd.DataFrame({'age_group': final_preds})
submission.to_csv('submission.csv', index=False)
print("submission.csv saved. Prediction distribution:")
print(f"Adults (0): {np.sum(final_preds == 0)} ({np.mean(final_preds == 0)*100:.1f}%)")
print(f"Seniors (1): {np.sum(final_preds == 1)} ({np.mean(final_preds == 1)*100:.1f}%)")


submission.csv saved. Prediction distribution:
Adults (0): 267 (85.6%)
Seniors (1): 45 (14.4%)
