#  Enhanced Age Group Classification Notebook
This notebook breaks down `training.py` into parts with explanations.

###  1. Robust Data Loading
This section attempts to load training and test datasets from various possible file names. It ensures that if one naming convention fails, others are tried.

In [None]:
# 1. ROBUST DATA LOADING with multiple filename attempts
def load_data_robust(file_combinations=None):
    """Load the training and test CSV files, trying several filename pairs.

    Each ``(train_file, test_file)`` pair is tried in order and the first
    pair for which both files can be read is returned. A pair is skipped
    when either of its files is missing.

    Args:
        file_combinations: Optional iterable of ``(train_file, test_file)``
            filename pairs. When omitted, the three naming conventions
            hard-wired into this notebook are tried.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.

    Raises:
        FileNotFoundError: If no pair could be loaded. The message lists
            every pair that was attempted.
    """
    if file_combinations is None:
        file_combinations = [
            ('train.csv', 'test.csv'),
            ('Train_Data.csv', 'Test_Data.csv'),
            ('training_data.csv', 'testing_data.csv'),
        ]
    else:
        # Materialise once: the pairs are iterated below and then reported
        # in the error message, which a one-shot iterator would not survive.
        file_combinations = list(file_combinations)

    for train_file, test_file in file_combinations:
        # Keep the try body to the two reads so that only a missing file
        # (and not an unrelated error) moves on to the next pair.
        try:
            train_df = pd.read_csv(train_file)
            test_df = pd.read_csv(test_file)
        except FileNotFoundError:
            continue

        print(f" Data loaded: {train_file}, {test_file}")
        print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
        return train_df, test_df

    tried = ", ".join(f"({train!r}, {test!r})" for train, test in file_combinations)
    raise FileNotFoundError(
        f" Could not find data files. Please check filenames. Tried: {tried}"
    )

# Load both frames once; the preprocessing cell below reads train_df and test_df.
train_df, test_df = load_data_robust()

###  2. Advanced Data Preprocessing
This part cleans the target column `age_group` (dropping rows where it is missing and encoding it as integers), then builds the feature list and the `X`, `y`, and `X_test` frames used by the later steps.

In [None]:
# 2. ADVANCED DATA PREPROCESSING
print("\n DATA ANALYSIS & PREPROCESSING")
print("-" * 40)

# Handle target variable encoding
if 'age_group' in train_df.columns:
    initial_rows = len(train_df)
    # .copy() so the column assignments below write to an independent frame.
    train_df = train_df.dropna(subset=['age_group']).copy()
    print(f"Dropped {initial_rows - len(train_df)} rows with missing target")

    # Robust target encoding: anything that is not already numeric (object,
    # string or categorical dtype) goes through label/numeric conversion.
    if not pd.api.types.is_numeric_dtype(train_df['age_group']):
        unique_vals = train_df['age_group'].unique()
        print(f"Target values found: {unique_vals}")

        # Match labels ignoring case and surrounding whitespace so that
        # 'Adult', 'adult' and ' ADULT ' all encode the same way.
        labels = train_df['age_group'].astype(str).str.strip().str.lower()
        if {'adult', 'senior'}.issubset(set(labels)):
            encoded = labels.map({'adult': 0, 'senior': 1})
        else:
            # Try numeric conversion
            encoded = pd.to_numeric(train_df['age_group'], errors='coerce')

        # Values that could not be encoded come back as NaN. Drop those rows
        # here, because astype(int) below cannot convert NaN.
        unencodable = encoded.isna()
        if unencodable.any():
            print(f"Dropped {int(unencodable.sum())} rows with unrecognised target values")
            train_df = train_df.loc[~unencodable].copy()
            encoded = encoded.loc[~unencodable]
        train_df['age_group'] = encoded

    # Store the target as plain integers
    train_df['age_group'] = train_df['age_group'].astype(int)
    target_dist = train_df['age_group'].value_counts(normalize=True)
    print(f"Target distribution: Adult(0): {target_dist.get(0, 0):.3f}, Senior(1): {target_dist.get(1, 0):.3f}")

# Define features: every training column except the target and, when
# present, the SEQN identifier column.
TARGET = 'age_group'
ID_COL = 'SEQN' if 'SEQN' in train_df.columns else None
exclude_cols = [TARGET] + ([ID_COL] if ID_COL else [])
features = [c for c in train_df.columns if c not in exclude_cols]

print(f"Features to use: {features}")
print(f"Number of features: {len(features)}")

# Copies keep later feature engineering from writing back into the raw frames.
X = train_df[features].copy()
y = train_df[TARGET].copy()
X_test = test_df[features].copy()

###  3. Domain-Specific Feature Engineering
Adds medically-informed and interaction-based features that may help predict the `age_group` effectively.

In [None]:
# 3. DOMAIN-SPECIFIC FEATURE ENGINEERING
def create_medical_features(df):
    """Create medically-informed features for age prediction.

    Works on a copy of ``df`` and appends derived columns; the input frame
    is not modified. Each feature group is only added when its source column
    (``RIAGENDR``, ``BMXBMI``, ``LBXGLU``, ``LBXGLT``, ``LBXIN``, ``PAQ605``,
    ``DIQ010``) is present.

    Missing measurements are replaced by the median of that column *within
    ``df``* before thresholds and transforms are applied. The gender/BMI
    interaction flags and the risk score compare the raw columns instead, so
    a missing value contributes 0 there.

    A ``<col>_is_missing`` indicator is added only for columns that contain
    at least one NaN in ``df``, so two frames passed separately can come back
    with different indicator columns.

    Args:
        df: pandas DataFrame of raw features.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
        ``total_health_risk`` and ``high_risk_senior`` are always present.
    """
    df_new = df.copy()

    def median_filled(col):
        """Return df[col] with NaNs replaced by that column's median."""
        return df[col].fillna(df[col].median())

    # Missing value indicators (important for medical data)
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df_new[f'{col}_is_missing'] = missing.astype(int)

    has_gender = 'RIAGENDR' in df.columns
    has_bmi = 'BMXBMI' in df.columns
    has_glucose = 'LBXGLU' in df.columns
    has_insulin = 'LBXIN' in df.columns

    # Fill each measurement once; the single-variable features and the
    # interaction features below share these series.
    bmi = median_filled('BMXBMI') if has_bmi else None
    glucose = median_filled('LBXGLU') if has_glucose else None
    insulin = median_filled('LBXIN') if has_insulin else None

    # Gender-specific features
    if has_gender:
        df_new['is_male'] = (df['RIAGENDR'] == 1).astype(int)
        df_new['is_female'] = (df['RIAGENDR'] == 2).astype(int)

    # BMI-based health indicators (crucial for age prediction)
    if has_bmi:
        # Clinical BMI categories
        df_new['bmi_underweight'] = (bmi < 18.5).astype(int)
        df_new['bmi_normal'] = ((bmi >= 18.5) & (bmi < 25)).astype(int)
        df_new['bmi_overweight'] = ((bmi >= 25) & (bmi < 30)).astype(int)
        df_new['bmi_obese_1'] = ((bmi >= 30) & (bmi < 35)).astype(int)
        df_new['bmi_obese_2'] = (bmi >= 35).astype(int)

        # Age-related BMI patterns
        df_new['bmi_senior_risk'] = ((bmi < 20) | (bmi > 35)).astype(int)
        df_new['bmi_optimal_range'] = ((bmi >= 20) & (bmi <= 27)).astype(int)

        # BMI transformations
        df_new['bmi_squared'] = bmi ** 2
        df_new['bmi_log'] = np.log1p(bmi)
        df_new['bmi_reciprocal'] = 1 / (bmi + 1)

    # Glucose metabolism (key age predictor)
    if has_glucose:
        # Clinical glucose thresholds
        df_new['glucose_hypoglycemic'] = (glucose < 70).astype(int)
        df_new['glucose_normal'] = ((glucose >= 70) & (glucose < 100)).astype(int)
        df_new['glucose_prediabetic'] = ((glucose >= 100) & (glucose < 126)).astype(int)
        df_new['glucose_diabetic'] = (glucose >= 126).astype(int)
        df_new['glucose_severely_high'] = (glucose >= 200).astype(int)

        # Ordinal risk level: 3 for >= 126, 2 for >= 100, 1 for < 70, else 0.
        df_new['glucose_age_risk'] = np.where(glucose >= 126, 3,
                                     np.where(glucose >= 100, 2,
                                     np.where(glucose < 70, 1, 0)))

        # Glucose transformations
        df_new['glucose_log'] = np.log1p(glucose)
        df_new['glucose_sqrt'] = np.sqrt(glucose)

    # Glucose tolerance test (OGTT)
    if 'LBXGLT' in df.columns:
        glt = median_filled('LBXGLT')
        df_new['glt_normal'] = (glt < 140).astype(int)
        df_new['glt_impaired'] = ((glt >= 140) & (glt < 200)).astype(int)
        df_new['glt_diabetic'] = (glt >= 200).astype(int)
        df_new['glt_log'] = np.log1p(glt)

    # Insulin levels (metabolic health)
    if has_insulin:
        df_new['insulin_low'] = (insulin < 5).astype(int)
        df_new['insulin_normal'] = ((insulin >= 5) & (insulin < 25)).astype(int)
        df_new['insulin_high'] = ((insulin >= 25) & (insulin < 50)).astype(int)
        df_new['insulin_very_high'] = (insulin >= 50).astype(int)
        df_new['insulin_log'] = np.log1p(insulin)

    # Physical activity (lifestyle factor)
    if 'PAQ605' in df.columns:
        df_new['is_active'] = (df['PAQ605'] == 1).astype(int)
        df_new['is_inactive'] = (df['PAQ605'] == 2).astype(int)

    # Diabetes diagnosis
    if 'DIQ010' in df.columns:
        df_new['has_diabetes'] = (df['DIQ010'] == 1).astype(int)
        df_new['no_diabetes'] = (df['DIQ010'] == 2).astype(int)

    # CRITICAL INTERACTION FEATURES (often the key to high performance)
    if has_bmi and has_glucose:
        df_new['bmi_glucose_product'] = bmi * glucose
        # +1 in the denominators avoids division by zero.
        df_new['bmi_glucose_ratio'] = bmi / (glucose + 1)
        df_new['glucose_bmi_ratio'] = glucose / (bmi + 1)
        df_new['metabolic_syndrome_risk'] = ((bmi > 30) & (glucose > 100)).astype(int)

    if has_glucose and has_insulin:
        df_new['glucose_insulin_ratio'] = glucose / (insulin + 1)
        df_new['insulin_resistance_index'] = glucose * insulin
        df_new['homa_ir_approx'] = (glucose * insulin) / 405  # Simplified HOMA-IR

    if has_gender and has_bmi:
        # Raw BMI on purpose: a missing BMI compares False and yields 0.
        df_new['male_high_bmi'] = ((df['RIAGENDR'] == 1) & (df['BMXBMI'] > 30)).astype(int)
        df_new['female_low_bmi'] = ((df['RIAGENDR'] == 2) & (df['BMXBMI'] < 20)).astype(int)

    # Comprehensive health risk score.
    # Start from a Series rather than the scalar 0 so that the score is still
    # a column (all zeros) when none of the source columns exist; a scalar
    # would make the `.astype(int)` below fail.
    risk_score = pd.Series(0, index=df.index, dtype=int)
    if 'DIQ010' in df.columns:
        risk_score += (df['DIQ010'] == 1).astype(int) * 3  # Diabetes major risk
    if has_bmi:
        risk_score += (df['BMXBMI'] > 35).astype(int) * 2  # Severe obesity
        risk_score += ((df['BMXBMI'] > 30) & (df['BMXBMI'] <= 35)).astype(int) * 1
    if has_glucose:
        risk_score += (df['LBXGLU'] > 126).astype(int) * 2  # Diabetes range
        risk_score += ((df['LBXGLU'] > 100) & (df['LBXGLU'] <= 126)).astype(int) * 1
    if 'PAQ605' in df.columns:
        risk_score += (df['PAQ605'] == 2).astype(int) * 1  # Inactivity

    df_new['total_health_risk'] = risk_score
    df_new['high_risk_senior'] = (risk_score >= 4).astype(int)

    return df_new

# Apply enhanced feature engineering
print("\n🔧 FEATURE ENGINEERING")
print("-" * 30)
X_enhanced = create_medical_features(X)
X_test_enhanced = create_medical_features(X_test)

# create_medical_features adds a '<col>_is_missing' indicator only for columns
# that contain NaNs in the frame it receives, so the train and test frames can
# come back with different column sets. Align test to the training columns:
# an indicator that is absent from test means nothing was missing there, hence
# fill_value=0; indicators that exist only in test are dropped because no
# model will have been trained on them.
X_test_enhanced = X_test_enhanced.reindex(columns=X_enhanced.columns, fill_value=0)

print(f"Original features: {X.shape[1]}")
print(f"Enhanced features: {X_enhanced.shape[1]}")
print(f"Added features: {X_enhanced.shape[1] - X.shape[1]}")

###  4. Advanced Missing Value Handling
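Fills missing numeric values with iterative imputation fitted on the training data, and fills non-numeric columns with the most frequent training value. (KNN and median imputation are not applied.)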

In [None]:
# 4. ADVANCED MISSING VALUE HANDLING
def advanced_imputation(X_train, X_test):
    """Impute missing values, fitting only on the training data.

    Numeric columns are filled by an ``IterativeImputer`` whose per-feature
    model is a random forest regressor; it is fitted on ``X_train`` and then
    applied to both frames. Non-numeric columns are filled with the most
    frequent value of the training column (0 when the column has no mode).

    Args:
        X_train: Training features (pandas DataFrame).
        X_test: Test features with the same columns as ``X_train``.

    Returns:
        Tuple ``(X_train_imputed, X_test_imputed)`` of DataFrames that keep
        the original indexes. Numeric columns come first, followed by the
        non-numeric columns.
    """
    # Local import: IterativeImputer predicts each feature from the others,
    # so its estimator must be a regressor. A classifier rejects continuous
    # targets.
    from sklearn.ensemble import RandomForestRegressor

    # Identify feature types
    numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
    numeric_lookup = set(numeric_features)

    if numeric_features:
        iterative_imputer = IterativeImputer(
            estimator=RandomForestRegressor(n_estimators=50, random_state=42),
            max_iter=10,
            random_state=42
        )

        X_train_imputed = pd.DataFrame(
            iterative_imputer.fit_transform(X_train[numeric_features]),
            columns=numeric_features,
            index=X_train.index
        )

        # transform only: the test set must not influence the fitted imputer.
        X_test_imputed = pd.DataFrame(
            iterative_imputer.transform(X_test[numeric_features]),
            columns=numeric_features,
            index=X_test.index
        )
    else:
        # Nothing numeric to impute; start from empty frames so the
        # non-numeric columns can still be attached below.
        X_train_imputed = pd.DataFrame(index=X_train.index)
        X_test_imputed = pd.DataFrame(index=X_test.index)

    # Add non-numeric columns back
    for col in X_train.columns:
        if col in numeric_lookup:
            continue
        modes = X_train[col].mode()
        # The training mode is used for both frames to avoid test leakage.
        fill_value = modes.iloc[0] if not modes.empty else 0
        X_train_imputed[col] = X_train[col].fillna(fill_value)
        X_test_imputed[col] = X_test[col].fillna(fill_value)

    return X_train_imputed, X_test_imputed

# Rebind both frames to their imputed versions; later cells read these names.
X_enhanced, X_test_enhanced = advanced_imputation(X_enhanced, X_test_enhanced)
print(" Advanced imputation completed")

###  5. Feature Selection
Selects the top features using statistical tests, mutual information, and random forest importance.

In [None]:
# 5. FEATURE SELECTION with multiple methods
def intelligent_feature_selection(X, y, k=None):
    """Select features by combining three ranking methods.

    The top ``k`` features are taken from each of an ANOVA F-test, mutual
    information, and random forest importances; the result is the union of
    the three lists.

    Args:
        X: Feature DataFrame. All columns must be usable by the scorers.
        y: Target labels aligned with ``X``.
        k: Number of top features to take per method. Defaults to 50, and is
            capped at the number of columns in ``X``.

    Returns:
        List of selected column names in the same order as ``X.columns``,
        so the result is reproducible from run to run.
    """
    # Local import keeps this function self-contained.
    from functools import partial

    n_features = X.shape[1]
    # Cap k: SelectKBest rejects a k larger than the number of features.
    k = min(50, n_features) if k is None else min(k, n_features)

    # Method 1: Statistical tests
    selector_stats = SelectKBest(score_func=f_classif, k=k).fit(X, y)
    stats_features = X.columns[selector_stats.get_support()].tolist()

    # Method 2: Mutual information. The estimator is randomised, so the seed
    # is pinned to make the selected set repeatable.
    seeded_mi = partial(mutual_info_classif, random_state=42)
    selector_mi = SelectKBest(score_func=seeded_mi, k=k).fit(X, y)
    mi_features = X.columns[selector_mi.get_support()].tolist()

    # Method 3: Random Forest importance
    rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_selector.fit(X, y)
    importances = rf_selector.feature_importances_
    # Descending order then [:k]; unlike [-k:], this yields nothing for k == 0.
    top_rf_features = X.columns[np.argsort(importances)[::-1][:k]].tolist()

    # Combine methods (union of top features), keeping X's column order.
    # Iterating a set of strings directly would give a run-dependent order.
    chosen = set(stats_features) | set(mi_features) | set(top_rf_features)
    combined_features = [col for col in X.columns if col in chosen]

    print(f"Feature selection: {len(combined_features)} features selected from {X.shape[1]}")
    return combined_features

# Select features on the full training frame, then restrict train and test
# to the same columns.
# NOTE(review): selection runs before the train/validation split in the next
# cell, so validation rows take part in choosing the features — confirm that
# this is intended.
selected_features = intelligent_feature_selection(X_enhanced, y)
X_selected = X_enhanced[selected_features]
X_test_selected = X_test_enhanced[selected_features]

###  6. Train/Validation Split
Splits the dataset into training and validation sets using stratified sampling.

In [None]:
# 6. TRAIN/VALIDATION SPLIT
# Hold out 20% of the rows for validation; stratify=y asks for a split that
# preserves the class proportions of y, and random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\n DATA SPLIT")
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
# Class proportions in the training portion, rounded to three decimals.
print(f"Training target distribution: {y_train.value_counts(normalize=True).round(3).to_dict()}")

###  7. Optimized Preprocessing Pipeline
Uses transformers like QuantileTransformer to preprocess numeric data.

In [None]:
# 7. OPTIMIZED PREPROCESSING PIPELINE
def create_preprocessing_pipeline():
    """Build the unfitted column preprocessor used by every model pipeline.

    The numeric columns are read from the module-level ``X_train`` frame and
    routed through a ``QuantileTransformer``; all remaining columns are
    passed through unchanged.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

    # QuantileTransformer is used for its handling of skewed distributions.
    quantile_step = ('scaler', QuantileTransformer(n_quantiles=100, random_state=42))

    return ColumnTransformer(
        transformers=[('num', Pipeline(steps=[quantile_step]), numeric_cols)],
        remainder='passthrough',
    )

# Module-level preprocessor; get_optimized_models() places it in each pipeline.
preprocessor = create_preprocessing_pipeline()

###  8. Advanced Resampling Strategy
Handles class imbalance using techniques like SMOTE, BorderlineSMOTE, and SMOTETomek.

In [None]:
# 8. ADVANCED RESAMPLING STRATEGY
def get_optimal_resampling_strategy(X, y):
    """Choose an imbalanced-learn resampler from the class balance of ``y``.

    The imbalance ratio is majority count / minority count, so it does not
    matter which label is the larger class. A single-class ``y`` is treated
    as infinitely imbalanced.

    Args:
        X: Feature matrix. Not used; kept so the signature is unchanged.
        y: pandas Series of class labels.

    Returns:
        An unfitted resampler:
        ``SMOTETomek(0.7)`` when the ratio is above 3,
        ``BorderlineSMOTE(0.8)`` when it is above 2, otherwise ``SMOTE``
        with a 0.9 target (or ``'auto'`` when the classes are already
        closer than that).
    """
    class_counts = y.value_counts()

    # .max()/.min() instead of class_counts[0] / class_counts[1]: the indexed
    # form raises KeyError when a class is absent and assumes class 0 is the
    # majority.
    if len(class_counts) < 2 or class_counts.min() == 0:
        n_majority = n_minority = None
        imbalance_ratio = float('inf')
    else:
        n_majority = int(class_counts.max())
        n_minority = int(class_counts.min())
        imbalance_ratio = n_majority / n_minority

    print(f"Class imbalance ratio: {imbalance_ratio:.2f}")

    if imbalance_ratio > 3:
        # Severe imbalance - use SMOTE + undersampling
        return SMOTETomek(sampling_strategy=0.7, random_state=42)
    if imbalance_ratio > 2:
        # Moderate imbalance - use BorderlineSMOTE
        return BorderlineSMOTE(sampling_strategy=0.8, random_state=42)

    # Mild imbalance - use standard SMOTE.
    # A float sampling_strategy is the minority/majority ratio wanted after
    # resampling. imbalanced-learn raises ValueError when that target would
    # add no minority samples, so for classes that are already closer than
    # the target fall back to 'auto', which simply balances the classes.
    mild_target = 0.9
    adds_samples = int(n_majority * mild_target - n_minority) > 0
    strategy = mild_target if adds_samples else 'auto'
    return SMOTE(sampling_strategy=strategy, random_state=42)

# Chosen from the training split only; get_optimized_models() places this
# resampler in each pipeline.
resampler = get_optimal_resampling_strategy(X_train, y_train)

###  9. Optimized Model Configurations
Defines pipelines for XGBoost, LightGBM, and RandomForest with tuned hyperparameters.

In [None]:
# 9. OPTIMIZED MODEL CONFIGURATIONS
def get_optimized_models():
    """Return the candidate pipelines and their hyperparameter search spaces.

    Every pipeline chains the module-level ``preprocessor`` and ``resampler``
    with one classifier. Search-space keys carry the ``classifier__`` prefix
    so that they address the final pipeline step.

    Returns:
        Dict mapping ``'xgb'``, ``'lgb'`` and ``'rf'`` to a
        ``(pipeline, param_grid)`` tuple.
    """

    def _assemble(classifier):
        """Wrap a classifier in the shared preprocess -> resample -> fit chain."""
        return ImbPipeline([
            ('preprocessor', preprocessor),
            ('resampler', resampler),
            ('classifier', classifier),
        ])

    # XGBoost - optimized for F1 score
    xgb_classifier = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1,
    )
    xgb_space = {
        'classifier__n_estimators': [200, 300, 400],
        'classifier__max_depth': [4, 6, 8],
        'classifier__learning_rate': [0.05, 0.1, 0.15],
        'classifier__subsample': [0.8, 0.9],
        'classifier__colsample_bytree': [0.8, 0.9],
        'classifier__scale_pos_weight': [2, 3, 4, 5],
    }

    # LightGBM - often excellent for tabular data
    lgb_classifier = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        verbose=-1,
        random_state=42,
        n_jobs=-1,
    )
    lgb_space = {
        'classifier__n_estimators': [200, 300, 400],
        'classifier__max_depth': [4, 6, 8],
        'classifier__learning_rate': [0.05, 0.1, 0.15],
        'classifier__num_leaves': [31, 50, 70],
        'classifier__class_weight': ['balanced'],
    }

    # Random Forest with balanced class weights
    rf_classifier = RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight='balanced',
    )
    rf_space = {
        'classifier__n_estimators': [300, 500],
        'classifier__max_depth': [8, 12, 16],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2],
        'classifier__max_features': ['sqrt', 'log2'],
    }

    # Insertion order (xgb, lgb, rf) is the order in which models get tuned.
    return {
        'xgb': (_assemble(xgb_classifier), xgb_space),
        'lgb': (_assemble(lgb_classifier), lgb_space),
        'rf': (_assemble(rf_classifier), rf_space),
    }

###  10. Efficient Hyperparameter Tuning
Uses RandomizedSearchCV to efficiently find the best hyperparameters for each model.

In [None]:
# 10. EFFICIENT HYPERPARAMETER TUNING
print("\n HYPERPARAMETER OPTIMIZATION")
print("-" * 35)

models = get_optimized_models()
# Filled with the refitted best pipeline of each model that tunes successfully.
best_models = {}
# One shared, seeded 5-fold stratified splitter so every model is scored on
# the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, (pipeline, param_grid) in models.items():
    print(f"\nTuning {name.upper()}...")
    
    # Use RandomizedSearchCV for efficiency: it samples n_iter candidates
    # from param_grid and scores each by cross-validated F1.
    search = RandomizedSearchCV(
        pipeline, 
        param_grid,
        n_iter=20,  # Reduced for efficiency
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    
    # Best-effort: a model that fails to tune is reported (message truncated
    # to 100 characters) and left out of best_models instead of aborting the
    # whole run.
    try:
        search.fit(X_train, y_train)
        best_models[name] = search.best_estimator_
        print(f" {name} - Best F1: {search.best_score_:.4f}")
        
    except Exception as e:
        print(f" {name} failed: {str(e)[:100]}...")

###  11. Advanced Threshold Optimization
Finds the threshold that maximizes the F1 score on validation data.

In [None]:
# 11. ADVANCED THRESHOLD OPTIMIZATION
def optimize_threshold_advanced(model, X_val, y_val, thresholds=None):
    """Find the decision threshold that maximises F1 on the given data.

    Scores come from ``predict_proba`` (positive-class column) when the
    model has it, otherwise from ``decision_function`` min-max scaled to
    [0, 1].

    Note that the returned F1 is measured on the same data that was used to
    pick the threshold, so it is an optimistic estimate.

    Args:
        model: Fitted estimator exposing ``predict_proba`` or
            ``decision_function``.
        X_val: Validation features.
        y_val: Validation labels (binary, positive class = 1).
        thresholds: Optional iterable of candidate thresholds. Defaults to
            81 evenly spaced values from 0.1 to 0.9.

    Returns:
        Tuple ``(best_threshold, best_f1)``. ``(0.5, 0.0)`` is returned when
        the model cannot be scored or no threshold achieves a positive F1.
    """
    try:
        if hasattr(model, 'predict_proba'):
            probs = model.predict_proba(X_val)[:, 1]
        else:
            scores = model.decision_function(X_val)
            low = scores.min()
            span = scores.max() - low
            if span == 0:
                # Constant scores cannot be scaled (division by zero) and
                # cannot separate the classes at any threshold.
                return 0.5, 0.0
            probs = (scores - low) / span
    except Exception:
        # Best-effort: an unscoreable model gets the neutral default.
        # `Exception` rather than a bare except so that KeyboardInterrupt
        # and SystemExit still propagate.
        return 0.5, 0.0

    # Fine-grained threshold search
    if thresholds is None:
        thresholds = np.linspace(0.1, 0.9, 81)

    best_threshold = 0.5
    best_f1 = 0.0

    for threshold in thresholds:
        y_pred = (probs >= threshold).astype(int)
        f1 = f1_score(y_val, y_pred)

        # Strict '>' keeps the lowest threshold among ties.
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    return best_threshold, best_f1

###  12. Model Evaluation and Selection
Evaluates each trained model and selects the best based on F1 score.

In [None]:
# 12. MODEL EVALUATION AND SELECTION
print("\n MODEL EVALUATION")
print("-" * 25)

# name -> {'model', 'f1_default', 'f1_optimized', 'threshold'}
model_results = {}

for name, model in best_models.items():
    # Best-effort: a model that cannot be evaluated is reported (message
    # truncated to 50 characters) and left out of model_results.
    try:
        # Get predictions using the model's own predict()
        y_pred_default = model.predict(X_val)
        f1_default = f1_score(y_val, y_pred_default)
        
        # Optimize threshold on the same validation split
        best_threshold, best_f1 = optimize_threshold_advanced(model, X_val, y_val)
        
        model_results[name] = {
            'model': model,
            'f1_default': f1_default,
            'f1_optimized': best_f1,
            'threshold': best_threshold
        }
        
        print(f"\n{name.upper()}:")
        print(f"  F1 (default): {f1_default:.4f}")
        print(f"  F1 (optimized): {best_f1:.4f}")
        print(f"  Best threshold: {best_threshold:.3f}")
        
    except Exception as e:
        print(f" Error evaluating {name}: {str(e)[:50]}...")

###  13. Create Ensemble if Beneficial
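Builds a soft-voting ensemble from the top-scoring models (up to three) whenever at least two models were evaluated successfully. The ensemble then competes with the individual models in the final selection.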

In [None]:
# 13. CREATE ENSEMBLE IF BENEFICIAL
# An ensemble needs at least two evaluated models to combine.
if len(model_results) >= 2:
    print("\n ENSEMBLE CREATION")
    print("-" * 25)

    # Sort models by threshold-optimized F1 score, best first
    sorted_models = sorted(model_results.items(),
                          key=lambda x: x[1]['f1_optimized'],
                          reverse=True)

    # Take top 2-3 models for ensemble
    top_models = [(name, result['model']) for name, result in sorted_models[:3]]

    # Create voting ensemble; 'soft' averages the members' predicted
    # probabilities.
    ensemble = VotingClassifier(
        estimators=top_models,
        voting='soft'
    )

    # Best-effort: if the ensemble cannot be fitted or scored, report it
    # (message truncated to 50 characters) and carry on with the single models.
    try:
        ensemble.fit(X_train, y_train)
        # Record the default-threshold F1 as well, so the ensemble entry has
        # the same keys as the single-model entries in model_results.
        ensemble_f1_default = f1_score(y_val, ensemble.predict(X_val))
        ensemble_threshold, ensemble_f1 = optimize_threshold_advanced(ensemble, X_val, y_val)

        model_results['ensemble'] = {
            'model': ensemble,
            'f1_default': ensemble_f1_default,
            'f1_optimized': ensemble_f1,
            'threshold': ensemble_threshold
        }

        print(f"Ensemble F1: {ensemble_f1:.4f} (threshold: {ensemble_threshold:.3f})")

    except Exception as e:
        print(f"Ensemble creation failed: {str(e)[:50]}...")

###  14. Final Model Selection and Prediction
Retrains the best model on full data and generates predictions for the test set.

In [None]:
# 14. FINAL MODEL SELECTION AND PREDICTION
print("\n FINAL MODEL SELECTION")
print("-" * 30)

if model_results:
    # Select best model: highest threshold-optimized F1 on the validation split
    best_model_name = max(model_results.keys(), 
                         key=lambda k: model_results[k]['f1_optimized'])
    
    final_model = model_results[best_model_name]['model']
    final_threshold = model_results[best_model_name]['threshold']
    final_f1 = model_results[best_model_name]['f1_optimized']
    
    print(f"Selected: {best_model_name.upper()}")
    print(f"Expected F1: {final_f1:.4f}")
    print(f"Threshold: {final_threshold:.3f}")
    
    # Retrain on full dataset (training + validation rows)
    # NOTE(review): final_threshold was tuned on the model fitted to X_train
    # only and is reused unchanged after this refit.
    print("\n Retraining on full dataset...")
    final_model.fit(X_selected, y)
    
    # Generate final predictions
    print(" Generating final predictions...")
    
    try:
        if hasattr(final_model, 'predict_proba'):
            # Apply the tuned threshold to the positive-class probability.
            test_probs = final_model.predict_proba(X_test_selected)[:, 1]
            final_predictions = (test_probs >= final_threshold).astype(int)
        else:
            final_predictions = final_model.predict(X_test_selected)
    except Exception as e:
        # Fall back to the model's own predict() if the thresholded path fails.
        print(f"Prediction error: {e}")
        final_predictions = final_model.predict(X_test_selected)
    
    # Create submission: a single 'age_group' column, one row per test row.
    # NOTE(review): no identifier column is written — confirm the expected
    # submission format.
    submission = pd.DataFrame({
        'age_group': final_predictions
    })
    
    submission.to_csv('optimized_submission.csv', index=False)
    
    print(f"\n SUBMISSION READY!")
    print("=" * 40)
    print(f"File: optimized_submission.csv")
    print(f"Total predictions: {len(final_predictions)}")
    
    # Predicted class counts and their share of all predictions.
    pred_counts = pd.Series(final_predictions).value_counts()
    print(f"Adult (0): {pred_counts.get(0, 0)} ({pred_counts.get(0, 0)/len(final_predictions)*100:.1f}%)")
    print(f"Senior (1): {pred_counts.get(1, 0)} ({pred_counts.get(1, 0)/len(final_predictions)*100:.1f}%)")
    
    print(f"\nModel: {best_model_name.upper()}")
    print(f"Expected F1 Score: {final_f1:.4f}")
    
    # Verdict tiers based on the validation F1: > 0.6, > 0.5, otherwise.
    if final_f1 > 0.6:
        print(" EXCELLENT! This should achieve 50+ F1 score!")
    elif final_f1 > 0.5:
        print(" GOOD performance expected!")
    else:
        print(" May need further optimization")

else:
    # Reached when no model made it into model_results.
    print(" No models were successfully trained")

print("\n PROCESS COMPLETE!")
print("=" * 40)