In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# 1. Load and clean the data
train = pd.read_csv('Train_Data.csv')
test = pd.read_csv('Test_Data.csv')

# Drop rows where age_group is missing, empty, or invalid
train = train.dropna(subset=['age_group'])
train['age_group'] = train['age_group'].astype(str).str.strip()
train = train[train['age_group'].isin(['Adult', 'Senior'])]
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1})
assert train['age_group'].isnull().sum() == 0, "There are still NaNs in the target!"

# Advanced Feature Engineering
def feature_engineering(df):
    """
    Create sophisticated features based on medical domain knowledge
    """
    df = df.copy()

    # Handle missing values strategically
    # Use KNN imputation for better missing value handling
    numeric_cols = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']
    imputer = KNNImputer(n_neighbors=5)
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    # Create interaction features based on medical knowledge
    df['glucose_insulin_ratio'] = df['LBXGLU'] / (df['LBXIN'] + 0.01)  # Insulin sensitivity indicator
    df['glucose_tolerance_score'] = df['LBXGLT'] / (df['LBXGLU'] + 0.01)  # Glucose tolerance
    df['metabolic_risk_score'] = df['BMXBMI'] * df['LBXGLU'] / 100  # Combined metabolic risk
    # BMI categories (medical standard)
    df['bmi_category'] = pd.cut(df['BMXBMI'], bins=[0, 18.5, 25, 30, 50], labels=[0, 1, 2, 3]).astype(float)
                               #
    #df['bmi_category'] = df['bmi_category'].astype(float)
    # Glucose risk categories
    df['glucose_risk'] = pd.cut(df['LBXGLU'], bins=[0, 100, 126, 500], labels=[0, 1, 2]).astype(float)
                               #
    #df['glucose_risk'] = df['glucose_risk'].astype(float)
    # Physical activity and health interaction
    df['activity_bmi_interaction'] = df['PAQ605'] * df['BMXBMI']
    df['diabetes_glucose_interaction'] = df['DIQ010'] * df['LBXGLU']

    # Statistical transformations
    df['log_insulin'] = np.log1p(df['LBXIN'])
    df['sqrt_glucose'] = np.sqrt(df['LBXGLU'])
    df['bmi_squared'] = df['BMXBMI'] ** 2

    # Age-related compound features (even though we're predicting age group)
    df['health_composite_score'] = (
        df['BMXBMI'] * 0.3 +
        df['LBXGLU'] * 0.3 +
        df['LBXIN'] * 0.2 +
        df['LBXGLT'] * 0.2
    )

    # Remove SEQN as it's just an identifier
    if 'SEQN' in df.columns:
        df = df.drop('SEQN', axis=1)

    return df
#
X_train = feature_engineering(train.drop(['age_group'], axis=1))
X_test = feature_engineering(test.copy())

# Impute again in case new features have NaNs
imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Robust scaling
scaler = RobustScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
y_train = train['age_group'].values

# 3. Train/Validation Split for Threshold Optimization
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# 4. LightGBM Hyperparameter Tuning
params = {
    'n_estimators': [200, 300, 400],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 31, 63],
    'class_weight': ['balanced']
}
lgbm = lgb.LGBMClassifier(random_state=42)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs = GridSearchCV(lgbm, params, cv=skf, scoring='f1', n_jobs=-1)
gs.fit(X_tr, y_tr)
print("Best F1 (CV):", gs.best_score_)
print("Best Params:", gs.best_params_)

# 5. Train Final Model
best_lgbm = gs.best_estimator_
best_lgbm.fit(X_tr, y_tr)

# 6. Threshold Optimization
val_probs = best_lgbm.predict_proba(X_val)[:, 1]
thresholds = np.arange(0.1, 0.91, 0.01)
f1_scores = [f1_score(y_val, (val_probs >= thr).astype(int)) for thr in thresholds]
best_thr = thresholds[np.argmax(f1_scores)]
print(f"Best threshold on validation: {best_thr:.2f} (F1: {np.max(f1_scores):.4f})")

# 7. Retrain on Full Data
final_lgbm = lgb.LGBMClassifier(**gs.best_params_, random_state=42)
final_lgbm.fit(X_train_scaled, y_train)

# 8. Predict on Test Set
test_probs = final_lgbm.predict_proba(X_test_scaled)[:, 1]
test_preds = (test_probs >= best_thr).astype(int)

# 9. Submission
submission = pd.DataFrame({'age_group': test_preds.astype(int)})
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created.")

[LightGBM] [Info] Number of positive: 251, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2689
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Best F1 (CV): 0.43242140341722496
Best Params: {'class_weight': 'balanced', 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 400, 'num_leaves': 15}
[LightGBM] [Info] Number of positive: 251, number of negative: 1310
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`