# AIDAMS ML Competition Solution

## Competition Goal
This notebook provides a comprehensive machine learning pipeline for the AIDAMS ML competition on Kaggle.

**Objective:** Predict the academic risk of students in higher education

**Target Variable:** Student outcome (Dropout, Enrolled, Graduate)

**Approach:** We will compare three powerful gradient boosting models:
- XGBoost
- LightGBM
- CatBoost

---

## Section 1: Configuration & Setup

In [None]:
# ------------------------------------------------------------
# PIPELINE SETTINGS - edit here, then re-run the notebook
# ------------------------------------------------------------

# Model used for cross-validation, tuning, final training and the
# submission file: 'xgboost', 'lightgbm', or 'catboost'
SELECTED_MODEL = 'xgboost'

# Feature toggles
ENABLE_HYPERPARAMETER_TUNING = False  # randomized search; slow when enabled
ENABLE_VISUALIZATIONS = True          # draw and save the EDA plots
ENABLE_FEATURE_ENGINEERING = False    # run engineer_features() on the features

# Evaluation protocol
RANDOM_SEED = 42  # seed handed to the splitters and the models
TEST_SIZE = 0.2   # fraction of rows held out by train_test_split
CV_FOLDS = 5      # number of cross-validation folds

# Input CSV locations
TRAIN_DATA_PATH = 'data.csv'
TEST_DATA_PATH = 'test.csv'

# Echo the active settings so they are visible in the cell output
print(
    "‚úì Configuration loaded successfully",
    f"  Selected Model: {SELECTED_MODEL}",
    f"  Hyperparameter Tuning: {ENABLE_HYPERPARAMETER_TUNING}",
    f"  Visualizations: {ENABLE_VISUALIZATIONS}",
    f"  Feature Engineering: {ENABLE_FEATURE_ENGINEERING}",
    sep="\n",
)

In [None]:
# ============================================================
# IMPORT LIBRARIES
# ============================================================

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV

# Models
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Model persistence
import joblib

# Utilities
import json
import warnings
from pathlib import Path

# Silence library warnings so the cell output stays readable
warnings.filterwarnings('ignore')

# Default plot appearance: grid background and 12x6 inch figures
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_style('whitegrid')

print("‚úì All libraries imported successfully")

## Section 2: Data Loading & Exploratory Data Analysis

In [None]:
# ------------------------------------------------------------
# READ THE TRAINING CSV
# ------------------------------------------------------------

# sep=None makes the python engine detect the delimiter itself
# (the file may be comma- or semicolon-separated)
df = pd.read_csv(TRAIN_DATA_PATH, engine='python', sep=None)

# Normalise the headers: tabs become spaces, then outer whitespace is trimmed
tab_free_columns = df.columns.str.replace('\t', ' ')
df.columns = tab_free_columns.str.strip()

# One column is the target, hence the "- 1" in the feature count
print(
    "‚úì Data loaded successfully",
    f"  Dataset shape: {df.shape}",
    f"  Number of samples: {df.shape[0]}",
    f"  Number of features: {df.shape[1] - 1}",
    sep="\n",
)

In [None]:
# ------------------------------------------------------------
# COLUMN TYPES AND NON-NULL COUNTS
# ------------------------------------------------------------

print("\nüìä Dataset Information:", "=" * 60, sep="\n")
# DataFrame.info() prints its report itself
df.info()

In [None]:
# ------------------------------------------------------------
# PREVIEW OF THE FIRST ROWS
# ------------------------------------------------------------

print("\nüìã First 5 rows of the dataset:", "=" * 60, sep="\n")
# Last expression of the cell, so the notebook renders it as a table
df.head(5)

In [None]:
# ------------------------------------------------------------
# MISSING-VALUE REPORT
# ------------------------------------------------------------

# Per-column NaN count and the same figure as a share of all rows
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': missing_percent,
})

# Keep only the columns that actually have gaps, worst first
has_missing = missing_df['Missing Count'] > 0
missing_df = missing_df[has_missing].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("\n‚úì No missing values found in the dataset")
else:
    print("\n‚ö†Ô∏è Missing Values Summary:")
    print("=" * 60)
    print(missing_df)

In [None]:
# ------------------------------------------------------------
# CLASS BALANCE OF THE TARGET
# ------------------------------------------------------------

print("\nüéØ Target Variable Distribution:", "=" * 60, sep="\n")

# Rows per class, and the same as a percentage of all rows
target_counts = df['Target'].value_counts()
target_percent = (target_counts / len(df)) * 100

target_summary = pd.DataFrame({
    'Count': target_counts,
    'Percentage': target_percent,
})
print(target_summary)

# value_counts() has one entry per distinct non-null label
print(f"\nNumber of classes: {len(target_counts)}")

In [None]:
# ------------------------------------------------------------
# DESCRIPTIVE STATISTICS
# ------------------------------------------------------------

print("\nüìà Basic Statistical Summary:", "=" * 60, sep="\n")
# Last expression of the cell, so the notebook renders the summary table
df.describe()

## Section 3: Data Visualization

In [None]:
# ------------------------------------------------------------
# BAR CHART OF THE TARGET CLASSES
# ------------------------------------------------------------

if not ENABLE_VISUALIZATIONS:
    print("‚äó Visualizations disabled")
else:
    target_counts = df['Target'].value_counts()
    colors = ['#3498db', '#e74c3c', '#2ecc71']

    plt.figure(figsize=(10, 6))

    # One bar per class; the palette is trimmed to the number of classes
    plt.bar(target_counts.index, target_counts.values, color=colors[:len(target_counts)])
    plt.title('Distribution of Target Variable', fontsize=14, fontweight='bold')
    plt.xlabel('Target Class', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(rotation=45, ha='right')

    # Write each count just above its bar
    for position, (_, count) in enumerate(target_counts.items()):
        plt.text(position, count, str(count), ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.savefig('target_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úì Target distribution plot saved as 'target_distribution.png'")

In [None]:
# ------------------------------------------------------------
# CORRELATION HEATMAP OF THE NUMERICAL FEATURES
# ------------------------------------------------------------

if not ENABLE_VISUALIZATIONS:
    print("‚äó Visualizations disabled")
else:
    # Numeric columns only, leaving the target out if it is numeric
    numerical_cols = [
        name
        for name in df.select_dtypes(include=[np.number]).columns
        if name != 'Target'
    ]

    # Keep the plot legible: at most the first 20 columns
    numerical_cols = numerical_cols[:20]

    if not numerical_cols:
        print("‚ö†Ô∏è No numerical features found for correlation analysis")
    else:
        correlation_matrix = df[numerical_cols].corr()

        plt.figure(figsize=(14, 10))
        sns.heatmap(
            correlation_matrix,
            cmap='coolwarm',
            center=0,
            annot=False,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.8},
        )

        plt.title('Correlation Heatmap of Numerical Features', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
        plt.show()

        print("‚úì Correlation heatmap saved as 'correlation_heatmap.png'")

## Section 4: Data Preprocessing

In [None]:
# ============================================================
# DATA PREPROCESSING FUNCTION
# ============================================================

def preprocess_data(data, target_encoder=None, feature_encoders=None, is_training=True):
    """
    Turn a raw DataFrame into model-ready features (and labels).

    Steps, in order:
    - Split off the 'Target' column (training only)
    - Drop the 'id' column if present
    - Fill missing numerical values with the column median
    - Fill missing categorical values with the column mode
    - Label-encode categorical (object dtype) columns
    - Label-encode the target (training only)

    The medians and modes are computed on the frame passed in, so test data
    is imputed with its own statistics, not with the training ones.

    Args:
        data: Input DataFrame. It is copied and never modified.
        target_encoder: LabelEncoder for the target. When None during
            training, a new one is fitted; otherwise it is only applied.
        feature_encoders: Dictionary mapping column name to a fitted
            LabelEncoder. During training it is filled (in place when a
            dictionary is passed); during prediction it is only read.
        is_training: Whether this is training data (i.e. has a 'Target').

    Returns:
        X: Processed features
        y: Encoded labels (None if not training)
        target_encoder: LabelEncoder for target
        feature_encoders: Dictionary of LabelEncoders for features
        feature_names: List of feature names

    Raises:
        ValueError: If is_training is True and 'Target' is not a column.
    """

    frame = data.copy()

    # Separate target and features
    if is_training:
        if 'Target' not in frame.columns:
            raise ValueError("Target column not found in training data")
        y = frame['Target'].copy()
        X = frame.drop(columns=['Target'])
    else:
        y = None
        X = frame

    # The identifier carries no signal; callers that need it read it from
    # their own frame before calling this function
    if 'id' in X.columns:
        X = X.drop(columns=['id'])

    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

    print(f"\nüîç Preprocessing {'training' if is_training else 'test'} data...")
    print(f"  Numerical columns: {len(numerical_cols)}")
    print(f"  Categorical columns: {len(categorical_cols)}")

    # Handle missing values in numerical columns (fill with median).
    # The gap count is taken BEFORE filling so the message reports how many
    # values were replaced, and the result is assigned back instead of using
    # a chained in-place fill, which pandas copy-on-write silently ignores.
    for col in numerical_cols:
        n_missing = int(X[col].isnull().sum())
        if n_missing > 0:
            median_value = X[col].median()
            X[col] = X[col].fillna(median_value)
            print(f"  Filled {n_missing} missing values in '{col}' with median: {median_value:.2f}")

    # Handle missing values in categorical columns (fill with mode)
    for col in categorical_cols:
        n_missing = int(X[col].isnull().sum())
        if n_missing > 0:
            modes = X[col].mode()
            # mode() is empty when the column holds nothing but NaN
            mode_value = modes.iloc[0] if len(modes) > 0 else 'Unknown'
            X[col] = X[col].fillna(mode_value)
            print(f"  Filled {n_missing} missing values in '{col}' with mode: {mode_value}")

    # Encode categorical variables
    if feature_encoders is None:
        feature_encoders = {}

    for col in categorical_cols:
        if is_training:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
            feature_encoders[col] = le
        elif col in feature_encoders:
            le = feature_encoders[col]
            # Categories never seen in training are mapped to the encoder's
            # first class so that transform() does not raise
            known_labels = set(le.classes_)
            fallback_label = le.classes_[0]
            as_text = X[col].astype(str)
            X[col] = le.transform(as_text.where(as_text.isin(known_labels), fallback_label))
        else:
            print(f"  Warning: No encoder found for '{col}', using default encoding")
            X[col] = 0

    # Encode target variable
    if is_training:
        if target_encoder is None:
            target_encoder = LabelEncoder()
            y = target_encoder.fit_transform(y.astype(str))
        else:
            y = target_encoder.transform(y.astype(str))

    feature_names = X.columns.tolist()

    print(f"‚úì Preprocessing complete")
    print(f"  Final feature count: {len(feature_names)}")
    print(f"  Final sample count: {len(X)}")

    return X, y, target_encoder, feature_encoders, feature_names

In [None]:
# ------------------------------------------------------------
# RUN THE PREPROCESSING ON THE TRAINING FRAME
# ------------------------------------------------------------

# Fresh encoders are fitted here; later cells reuse them for the test data
(X, y, target_encoder, feature_encoders, feature_names) = preprocess_data(df, is_training=True)

print(
    "\n‚úì Data preprocessing completed",
    f"  Features shape: {X.shape}",
    f"  Target shape: {y.shape}",
    f"  Number of classes: {len(np.unique(y))}",
    f"  Class labels: {target_encoder.classes_}",
    sep="\n",
)

## Section 5: Feature Engineering (Optional)

In [None]:
# ============================================================
# FEATURE ENGINEERING FUNCTION
# ============================================================

def engineer_features(X_data, feature_names):
    """
    Derive extra features from the existing ones.

    As shipped this is a template: it copies the input and adds nothing.
    Typical additions are interaction terms, polynomial terms, aggregations
    and domain-specific features; two commented-out examples are kept below.

    Args:
        X_data: Feature matrix (copied, never modified)
        feature_names: List of feature names matching X_data

    Returns:
        X_engineered: Copy of the feature matrix plus any added columns
        new_feature_names: feature_names followed by the added column names
    """

    engineered = X_data.copy()
    added = []

    # Example: interaction feature (disabled)
    # if 'Age at enrollment' in feature_names and 'Admission grade' in feature_names:
    #     age_idx = feature_names.index('Age at enrollment')
    #     grade_idx = feature_names.index('Admission grade')
    #     engineered['Age_x_Grade'] = X_data.iloc[:, age_idx] * X_data.iloc[:, grade_idx]
    #     added.append('Age_x_Grade')

    # Example: polynomial feature (disabled)
    # if 'Admission grade' in feature_names:
    #     grade_idx = feature_names.index('Admission grade')
    #     engineered['Admission_grade_squared'] = X_data.iloc[:, grade_idx] ** 2
    #     added.append('Admission_grade_squared')

    if added:
        print(f"‚úì Created {len(added)} new features")
        print(f"  New features: {added}")
    else:
        print("‚ÑπÔ∏è No feature engineering applied (placeholder function)")

    # Original names first, new names appended in creation order
    return engineered, [*feature_names, *added]

In [None]:
# ------------------------------------------------------------
# OPTIONAL FEATURE ENGINEERING STEP
# ------------------------------------------------------------

if not ENABLE_FEATURE_ENGINEERING:
    print("\n‚äó Feature engineering disabled")
else:
    # Replaces both the feature matrix and the list of column names
    X, feature_names = engineer_features(X, feature_names)
    print(
        "\n‚úì Feature engineering completed",
        f"  Total features: {len(feature_names)}",
        sep="\n",
    )

## Section 6: Train-Validation Split

In [None]:
# ------------------------------------------------------------
# HOLD OUT A VALIDATION SET
# ------------------------------------------------------------

# stratify=y keeps the class proportions the same in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

print(
    "\n‚úì Data split completed",
    f"  Training set size: {X_train.shape[0]} samples ({(1-TEST_SIZE)*100:.0f}%)",
    f"  Validation set size: {X_val.shape[0]} samples ({TEST_SIZE*100:.0f}%)",
    f"  Number of features: {X_train.shape[1]}",
    sep="\n",
)

# Share of each encoded class in the two parts, to confirm the stratification
train_dist = np.bincount(y_train) / len(y_train)
val_dist = np.bincount(y_val) / len(y_val)

print("\n  Class distribution:")
for class_name, train_share, val_share in zip(target_encoder.classes_, train_dist, val_dist):
    print(f"    {class_name}: Train={train_share:.2%}, Val={val_share:.2%}")

## Section 7: Model Training - XGBoost

In [None]:
# ------------------------------------------------------------
# FIT THE XGBOOST BASELINE
# ------------------------------------------------------------

print("\nüöÄ Training XGBoost model...", "=" * 60, sep="\n")

# The number of distinct labels selects the binary or the multiclass setup
n_classes = len(np.unique(y))

objective, eval_metric = (
    ('binary:logistic', 'logloss') if n_classes == 2 else ('multi:softmax', 'mlogloss')
)

xgb_model = xgb.XGBClassifier(
    objective=objective,
    eval_metric=eval_metric,
    # num_class is only passed for the multiclass objective
    num_class=n_classes if n_classes > 2 else None,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

# Both splits are passed as eval sets; per-round logging is switched off
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False,
)

# Score the fitted model on both splits
y_train_pred_xgb = xgb_model.predict(X_train)
y_val_pred_xgb = xgb_model.predict(X_val)

train_acc_xgb = accuracy_score(y_train, y_train_pred_xgb)
val_acc_xgb = accuracy_score(y_val, y_val_pred_xgb)

print(
    "‚úì XGBoost training completed",
    f"  Training accuracy: {train_acc_xgb:.4f}",
    f"  Validation accuracy: {val_acc_xgb:.4f}",
    sep="\n",
)

## Section 8: Model Training - LightGBM

In [None]:
# ============================================================
# TRAIN LIGHTGBM MODEL
# ============================================================

print("\nüöÄ Training LightGBM model...")
print("=" * 60)

# Determine if binary or multiclass (n_classes comes from the XGBoost cell)
if n_classes == 2:
    objective = 'binary'
else:
    objective = 'multiclass'

lgb_model = lgb.LGBMClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=500,
    subsample=0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; with the default of 0 the subsample=0.8 above is ignored
    subsample_freq=1,
    colsample_bytree=0.8,
    objective=objective,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    num_class=n_classes if n_classes > 2 else None,
    verbose=-1
)

# Train the model with early stopping: training ends once the evaluation
# metric has not improved for 50 consecutive rounds
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

# Make predictions
y_train_pred_lgb = lgb_model.predict(X_train)
y_val_pred_lgb = lgb_model.predict(X_val)

# Calculate accuracy
train_acc_lgb = accuracy_score(y_train, y_train_pred_lgb)
val_acc_lgb = accuracy_score(y_val, y_val_pred_lgb)

print(f"‚úì LightGBM training completed")
print(f"  Training accuracy: {train_acc_lgb:.4f}")
print(f"  Validation accuracy: {val_acc_lgb:.4f}")

## Section 9: Model Training - CatBoost

In [None]:
# ============================================================
# TRAIN CATBOOST MODEL
# ============================================================

print("\nüöÄ Training CatBoost model...")
print("=" * 60)

# Automatically detect categorical features: a column counts as categorical
# when it has few distinct values (at most 50, or at most 5% of the row
# count when that is larger).
# Float columns are never selected: CatBoost only accepts integer or string
# values in categorical columns and raises on real-valued ones.
cardinality_limit = max(50, int(0.05 * len(X)))
cat_features = [
    position
    for position, col in enumerate(feature_names)
    if not pd.api.types.is_float_dtype(X[col])
    and X[col].nunique() <= cardinality_limit
]

print(f"  Detected {len(cat_features)} categorical features")

# Determine loss function (n_classes comes from the XGBoost cell)
if n_classes == 2:
    loss_function = 'Logloss'
else:
    loss_function = 'MultiClass'

catboost_model = CatBoostClassifier(
    depth=6,
    learning_rate=0.1,
    iterations=500,
    # `subsample` is only valid with the Poisson, Bernoulli or MVS bootstrap.
    # For MultiClass the default bootstrap is Bayesian, which rejects it, so
    # the bootstrap type is set explicitly.
    bootstrap_type='Bernoulli',
    subsample=0.8,
    colsample_bylevel=0.8,
    loss_function=loss_function,
    eval_metric='Accuracy',
    random_seed=RANDOM_SEED,
    verbose=False,
    early_stopping_rounds=50
)

# Train the model; the validation split doubles as the early-stopping set
catboost_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    verbose=False
)

# Make predictions (ravel() flattens CatBoost's column-vector output)
y_train_pred_cat = catboost_model.predict(X_train).ravel()
y_val_pred_cat = catboost_model.predict(X_val).ravel()

# Calculate accuracy
train_acc_cat = accuracy_score(y_train, y_train_pred_cat)
val_acc_cat = accuracy_score(y_val, y_val_pred_cat)

print(f"‚úì CatBoost training completed")
print(f"  Training accuracy: {train_acc_cat:.4f}")
print(f"  Validation accuracy: {val_acc_cat:.4f}")

## Section 10: Model Comparison

In [None]:
# ------------------------------------------------------------
# SIDE-BY-SIDE COMPARISON OF THE THREE MODELS
# ------------------------------------------------------------

print("\nüìä Model Comparison", "=" * 60, sep="\n")

# One row per model: name, training accuracy, validation accuracy
comparison_df = pd.DataFrame(
    [
        ('XGBoost', train_acc_xgb, val_acc_xgb),
        ('LightGBM', train_acc_lgb, val_acc_lgb),
        ('CatBoost', train_acc_cat, val_acc_cat),
    ],
    columns=['Model', 'Training Accuracy', 'Validation Accuracy'],
)

# Train-minus-validation gap: the larger it is, the more the model overfits
comparison_df['Overfitting'] = (
    comparison_df['Training Accuracy'] - comparison_df['Validation Accuracy']
)

# Best validation accuracy first
comparison_df = comparison_df.sort_values(by='Validation Accuracy', ascending=False)

print(comparison_df.to_string(index=False))

# After sorting, the winner is the first row
top_row = comparison_df.iloc[0]
best_model_name = top_row['Model']
best_val_acc = top_row['Validation Accuracy']

print(f"\nüèÜ Best model: {best_model_name} (Validation Accuracy: {best_val_acc:.4f})")

## Section 11: Cross-Validation

In [None]:
# ------------------------------------------------------------
# K-FOLD CROSS-VALIDATION OF THE SELECTED MODEL
# ------------------------------------------------------------

print(f"\nüîç Performing {CV_FOLDS}-fold cross-validation on {SELECTED_MODEL.upper()}...")
print("=" * 60)

# Map the configured name to the estimator fitted in the cells above
models_by_name = {
    'xgboost': xgb_model,
    'lightgbm': lgb_model,
    'catboost': catboost_model,
}

# An unknown name falls back to XGBoost, and the setting is corrected so the
# later cells agree with what was evaluated here
if SELECTED_MODEL not in models_by_name:
    print(f"‚ö†Ô∏è Invalid model selection: {SELECTED_MODEL}. Using XGBoost.")
    SELECTED_MODEL = 'xgboost'
cv_model = models_by_name[SELECTED_MODEL]

# Shuffled, stratified folds over the full training data
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_SEED)
cv_scores = cross_val_score(cv_model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

formatted_scores = [f'{score:.4f}' for score in cv_scores]
print(
    "\n‚úì Cross-validation completed",
    f"  CV Scores: {formatted_scores}",
    f"  Mean CV Accuracy: {cv_scores.mean():.4f}",
    f"  Std CV Accuracy: {cv_scores.std():.4f}",
    f"  Mean ¬± Std: {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}",
    sep="\n",
)

## Section 12: Hyperparameter Tuning (Optional)

In [None]:
# ============================================================
# HYPERPARAMETER TUNING FOR XGBOOST / LIGHTGBM
# ============================================================
#
# Runs a randomized search (20 candidates, 3-fold CV, accuracy) on the
# training split and replaces xgb_model or lgb_model with the best estimator.
# NOTE: the accuracies and cv_scores computed in earlier cells are not
# recomputed here, so they still describe the untuned model.

if ENABLE_HYPERPARAMETER_TUNING and SELECTED_MODEL in ('xgboost', 'lightgbm'):
    # The objective is derived here instead of reusing the notebook-level
    # `objective` variable: the LightGBM training cell overwrites it with a
    # LightGBM name ('binary'/'multiclass'), which XGBoost does not accept.
    if SELECTED_MODEL == 'xgboost':
        tuned_model_label = 'XGBoost'

        # Define parameter grid
        param_grid = {
            'max_depth': [3, 5, 7, 9],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'n_estimators': [100, 200, 300, 500],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'gamma': [0, 0.1, 0.5]
        }

        # Create base model
        base_model = xgb.XGBClassifier(
            objective='binary:logistic' if n_classes == 2 else 'multi:softmax',
            eval_metric='logloss' if n_classes == 2 else 'mlogloss',
            random_state=RANDOM_SEED,
            n_jobs=-1,
            num_class=n_classes if n_classes > 2 else None
        )
    else:
        tuned_model_label = 'LightGBM'

        # Define parameter grid
        param_grid = {
            'max_depth': [3, 5, 7, 9],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'n_estimators': [100, 200, 300, 500],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'num_leaves': [31, 50, 100]
        }

        # Create base model
        base_model = lgb.LGBMClassifier(
            objective='binary' if n_classes == 2 else 'multiclass',
            # Without a non-zero bagging frequency LightGBM ignores
            # `subsample`, which would make that grid dimension a no-op
            subsample_freq=1,
            random_state=RANDOM_SEED,
            n_jobs=-1,
            num_class=n_classes if n_classes > 2 else None,
            verbose=-1
        )

    print(f"\nüîß Performing hyperparameter tuning for {tuned_model_label}...")
    print("=" * 60)
    print("‚ö†Ô∏è This may take several minutes...")

    # Perform randomized search
    random_search = RandomizedSearchCV(
        base_model,
        param_distributions=param_grid,
        n_iter=20,
        scoring='accuracy',
        cv=3,
        verbose=1,
        random_state=RANDOM_SEED,
        n_jobs=-1
    )

    random_search.fit(X_train, y_train)

    print(f"\n‚úì Hyperparameter tuning completed")
    print(f"  Best parameters: {random_search.best_params_}")
    print(f"  Best CV score: {random_search.best_score_:.4f}")

    # Update model with best parameters
    if SELECTED_MODEL == 'xgboost':
        xgb_model = random_search.best_estimator_
    else:
        lgb_model = random_search.best_estimator_

else:
    print("\n‚äó Hyperparameter tuning disabled")
    if ENABLE_HYPERPARAMETER_TUNING:
        print(f"  Note: Hyperparameter tuning is only implemented for XGBoost and LightGBM")
        print(f"  Current model: {SELECTED_MODEL}")

## Section 13: Final Model Training

In [None]:
# ------------------------------------------------------------
# REFIT THE SELECTED MODEL ON ALL TRAINING ROWS AND PERSIST IT
# ------------------------------------------------------------

print(f"\nüéØ Training final {SELECTED_MODEL.upper()} model on full dataset...")
print("=" * 60)

# Each entry pairs an estimator with the keyword arguments its fit() needs
final_fit_plans = {
    'xgboost': (xgb_model, {'verbose': False}),
    'lightgbm': (lgb_model, {'callbacks': [lgb.log_evaluation(period=0)]}),
    'catboost': (catboost_model, {'cat_features': cat_features, 'verbose': False}),
}

if SELECTED_MODEL in final_fit_plans:
    final_model, final_fit_kwargs = final_fit_plans[SELECTED_MODEL]
    final_model.fit(X, y, **final_fit_kwargs)
else:
    # Unknown name: keep the XGBoost model as it is, without refitting
    print(f"‚ö†Ô∏è Invalid model selection: {SELECTED_MODEL}")
    final_model = xgb_model

print("‚úì Final model training completed")

# Persist the fitted estimator
model_filename = f'model_{SELECTED_MODEL}.pkl'
joblib.dump(final_model, model_filename)
print(f"‚úì Model saved as '{model_filename}'")

# Persist the label encoders needed to prepare new data and decode predictions
encoders_filename = f'encoders_{SELECTED_MODEL}.pkl'
encoder_bundle = {
    'target_encoder': target_encoder,
    'feature_encoders': feature_encoders,
}
joblib.dump(encoder_bundle, encoders_filename)
print(f"‚úì Encoders saved as '{encoders_filename}'")

# Persist a JSON description of the feature layout; the categorical indices
# are only recorded when CatBoost is the selected model
feature_info = {
    'feature_names': feature_names,
    'n_features': len(feature_names),
    'n_classes': n_classes,
    'class_names': target_encoder.classes_.tolist(),
    'cat_features': cat_features if SELECTED_MODEL == 'catboost' else [],
}

feature_info_filename = f'feature_info_{SELECTED_MODEL}.json'
with open(feature_info_filename, 'w') as f:
    f.write(json.dumps(feature_info, indent=2))
print(f"‚úì Feature info saved as '{feature_info_filename}'")

## Section 14: Generate Submission

In [None]:
# ============================================================
# GENERATE SUBMISSION FILE
# ============================================================
#
# Loads the test CSV, pushes it through the same preparation steps as the
# training data, predicts with final_model and writes 'id'/'Target' rows to
# submission_<model>.csv. Nothing is written when the test file is absent.

print("\nüì§ Generating submission file...")
print("=" * 60)

# Check if test data exists
if Path(TEST_DATA_PATH).exists():
    # Load test data (same delimiter sniffing and header clean-up as training)
    test_df = pd.read_csv(TEST_DATA_PATH, sep=None, engine='python')
    test_df.columns = test_df.columns.str.replace('\t', ' ').str.strip()

    print(f"‚úì Test data loaded: {test_df.shape}")

    # Save test IDs if present; otherwise number the rows from 0
    if 'id' in test_df.columns:
        test_ids = test_df['id'].copy()
    else:
        test_ids = pd.Series(range(len(test_df)), name='id')

    # Preprocess test data (without target), reusing the fitted encoders
    X_test, _, _, _, test_feature_names = preprocess_data(
        test_df,
        target_encoder=target_encoder,
        feature_encoders=feature_encoders,
        is_training=False
    )

    # The training features went through engineer_features() when the switch
    # is on, so the test features must go through it as well
    if ENABLE_FEATURE_ENGINEERING:
        X_test, test_feature_names = engineer_features(X_test, test_feature_names)

    # Give the model exactly the training columns, in the training order
    missing_features = [name for name in feature_names if name not in X_test.columns]
    if missing_features:
        raise ValueError(
            f"Test data is missing features used in training: {missing_features}"
        )
    X_test = X_test[feature_names]

    # Make predictions; ravel() flattens CatBoost's column-vector output and
    # leaves the already flat XGBoost/LightGBM output unchanged
    predictions = np.asarray(final_model.predict(X_test)).ravel()

    # Decode predictions back to original labels
    predictions_decoded = target_encoder.inverse_transform(predictions.astype(int))

    # Create submission dataframe
    submission_df = pd.DataFrame({
        'id': test_ids,
        'Target': predictions_decoded
    })

    # Save submission file
    submission_filename = f'submission_{SELECTED_MODEL}.csv'
    submission_df.to_csv(submission_filename, index=False)

    print(f"\n‚úì Submission file generated: '{submission_filename}'")
    print(f"  Number of predictions: {len(submission_df)}")

    # Display first 10 predictions
    print("\n  First 10 predictions:")
    print(submission_df.head(10).to_string(index=False))

    # Show prediction distribution
    print("\n  Prediction distribution:")
    pred_dist = submission_df['Target'].value_counts()
    for label, count in pred_dist.items():
        print(f"    {label}: {count} ({count/len(submission_df)*100:.1f}%)")

else:
    print(f"‚ö†Ô∏è Test data file not found: {TEST_DATA_PATH}")
    print("  Skipping submission generation")
    print("  To generate submission, place test.csv in the current directory and re-run this cell")

## Section 15: Summary

In [None]:
# ------------------------------------------------------------
# END-OF-RUN REPORT
# ------------------------------------------------------------

print("\n" + "=" * 60)
print("üìä FINAL MODEL SUMMARY")
print("=" * 60)

print(
    "\nüèÜ Model Details:",
    f"  Selected Model: {SELECTED_MODEL.upper()}",
    f"  Number of Features: {len(feature_names)}",
    f"  Number of Classes: {n_classes}",
    f"  Class Names: {', '.join(target_encoder.classes_)}",
    sep="\n",
)

print("\nüìà Performance Metrics:")
# Hold-out accuracies recorded by the three training cells, keyed by model
holdout_accuracy = {
    'xgboost': (train_acc_xgb, val_acc_xgb),
    'lightgbm': (train_acc_lgb, val_acc_lgb),
    'catboost': (train_acc_cat, val_acc_cat),
}
if SELECTED_MODEL in holdout_accuracy:
    selected_train_acc, selected_val_acc = holdout_accuracy[SELECTED_MODEL]
    print(f"  Training Accuracy: {selected_train_acc:.4f}")
    print(f"  Validation Accuracy: {selected_val_acc:.4f}")

print(f"  Mean CV Accuracy: {cv_scores.mean():.4f} ¬± {cv_scores.std():.4f}")

print(
    "\nüíæ Saved Files:",
    f"  Model: {model_filename}",
    f"  Encoders: {encoders_filename}",
    f"  Feature Info: {feature_info_filename}",
    sep="\n",
)
# The submission is only listed when the test file exists
if Path(TEST_DATA_PATH).exists():
    print(f"  Submission: submission_{SELECTED_MODEL}.csv")

print(
    "\n‚öôÔ∏è Configuration Used:",
    f"  Hyperparameter Tuning: {ENABLE_HYPERPARAMETER_TUNING}",
    f"  Feature Engineering: {ENABLE_FEATURE_ENGINEERING}",
    f"  Visualizations: {ENABLE_VISUALIZATIONS}",
    f"  Random Seed: {RANDOM_SEED}",
    f"  CV Folds: {CV_FOLDS}",
    sep="\n",
)

print("\n" + "=" * 60)
print("‚úì Pipeline execution completed successfully!")
print("=" * 60)

# Closing advice for improving the score
print(
    "\nüí° Tips for Better Performance:",
    "  1. Try all three models and compare their CV scores",
    "  2. Enable hyperparameter tuning for fine-tuning",
    "  3. Experiment with feature engineering",
    "  4. Consider ensemble methods (averaging predictions)",
    "  5. Always trust cross-validation scores over validation scores",
    sep="\n",
)
print("\nGood luck with your submission! üöÄ")