# Kaggle Playground Series S6E1: Student Exam Score Prediction

**Approach:** XGBoost with Ridge Regression meta-feature stacking

---

## Pipeline Overview

1. Load competition and original datasets
2. Engineer 34 numeric features (polynomials, logs, interactions, bins)
3. Train Ridge regression with target encoding → generate OOF (out-of-fold) meta-feature
4. Train XGBoost with native categorical support + Ridge predictions as feature
5. Generate submission via fold averaging

In [1]:
"""
Student Test Score Prediction - XGBoost with Ridge Regression
Modified for local execution
"""

import os
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import TargetEncoder

# Silence library warnings so the fold-by-fold training log stays readable.
warnings.filterwarnings("ignore")
# Seed NumPy's global random generator.
np.random.seed(42)

# ============================================================================
# FILE PATHS - UPDATE DATA_DIR TO MATCH YOUR LOCAL SYSTEM
# ============================================================================
# Single source of truth for the competition folder: every path below is
# derived from it, so only this one line needs editing on another machine.
DATA_DIR = "/Users/badalkr.sharma/Documents/Kaggle Competitions/Predicting student test score/playground-series-s6e1"

TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
TEST_PATH = os.path.join(DATA_DIR, "test.csv")
SUBMISSION_PATH = os.path.join(DATA_DIR, "sample_submission.csv")

# Original dataset path - optional.
# If this file is missing, the script continues without it (see below).
ORIGINAL_PATH = os.path.join(DATA_DIR, "Exam_Score_Prediction.csv")

# Output directory - files are saved in the same location as the input files.
# Joining with "" keeps the trailing separator the path has always had.
OUTPUT_DIR = os.path.join(DATA_DIR, "")

# ============================================================================
# CONFIGURATION
# ============================================================================
TARGET = "exam_score"
ID_COL = "id"
N_FOLDS = 10
RANDOM_STATE = 1003

# ============================================================================
# LOAD DATA
# ============================================================================
print("Loading data...")
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
submission_df = pd.read_csv(SUBMISSION_PATH)

# The original dataset is optional. Only the read itself is guarded, so a
# FileNotFoundError raised anywhere else is never mistaken for a missing file.
try:
    original_df = pd.read_csv(ORIGINAL_PATH)
except FileNotFoundError:
    original_df = None
    print(f"Warning: Original dataset not found at {ORIGINAL_PATH}")
    print("Continuing without original dataset (this may slightly reduce performance)")
use_original = original_df is not None

print(f"Train:    {train_df.shape}")
print(f"Test:     {test_df.shape}")
if use_original:
    print(f"Original: {original_df.shape}")

# Every training column except the target and the row id is a model input.
base_features = [c for c in train_df.columns if c not in [TARGET, ID_COL]]
# Columns pandas loaded with object dtype are the ones target-encoded for Ridge.
cat_features = train_df.select_dtypes("object").columns.tolist()

print(f"\nBase features: {len(base_features)}")
print(f"Categorical:   {cat_features}")

# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
def engineer_features(df, base_cols):
    """Build the engineered feature set on top of the raw columns.

    Args:
        df: Frame holding ``study_hours``, ``class_attendance``,
            ``sleep_hours``, ``age``, ``sleep_quality``, ``facility_rating``
            and ``exam_difficulty`` (plus whatever is named in ``base_cols``).
            It is copied, never modified.
        base_cols: List of raw column names to carry through to the result.

    Returns:
        A ``(features, engineered_cols)`` tuple. ``features`` contains
        ``base_cols`` followed by the engineered columns, and
        ``engineered_cols`` lists the engineered column names in that order.
    """
    frame = df.copy()
    created = []

    def add(name, values):
        # Store the column and remember its name, in creation order.
        frame[name] = values
        created.append(name)

    tiny = 1e-5  # keeps ratio denominators away from zero
    hours = frame['study_hours']
    attendance = frame['class_attendance']
    rest = frame['sleep_hours']
    age = frame['age']

    # Negative inputs are floored at zero before log / sqrt.
    floored = {
        'study_hours': hours.clip(lower=0),
        'class_attendance': attendance.clip(lower=0),
        'sleep_hours': rest.clip(lower=0),
    }

    # Squares of the four numeric inputs
    for col in ('study_hours', 'class_attendance', 'sleep_hours', 'age'):
        add(f'{col}_squared', frame[col].pow(2))

    # log(1 + x) of the floored inputs
    for col, values in floored.items():
        add(f'log_{col}', np.log1p(values))

    # Square roots (sleep_hours intentionally has none)
    for col in ('study_hours', 'class_attendance'):
        add(f'sqrt_{col}', np.sqrt(floored[col]))

    # Pairwise products
    add('study_hours_times_attendance', hours.mul(attendance))
    add('study_hours_times_sleep', hours.mul(rest))
    add('attendance_times_sleep', attendance.mul(rest))
    add('age_times_study_hours', age.mul(hours))

    # Quotients
    rest_safe = rest + tiny
    add('study_hours_over_sleep', hours.div(rest_safe))
    add('attendance_over_sleep', attendance.div(rest_safe))
    add('attendance_over_study', attendance.div(hours + tiny))

    # Ordered categories -> 0/1/2; unmapped or missing labels get the middle rank 1
    ordered_levels = {
        'sleep_quality': ('poor', 'average', 'good'),
        'facility_rating': ('low', 'medium', 'high'),
        'exam_difficulty': ('easy', 'moderate', 'hard'),
    }
    for col, levels in ordered_levels.items():
        rank_of = {label: rank for rank, label in enumerate(levels)}
        add(f'{col}_numeric', frame[col].map(rank_of).fillna(1).astype(int))

    sleep_rank = frame['sleep_quality_numeric']
    facility_rank = frame['facility_rating_numeric']
    difficulty_rank = frame['exam_difficulty_numeric']

    # Products involving the ordinal ranks
    add('study_hours_times_sleep_quality', hours.mul(sleep_rank))
    add('attendance_times_facility', attendance.mul(facility_rank))
    add('sleep_hours_times_difficulty', rest.mul(difficulty_rank))
    add('facility_x_sleepq', facility_rank.mul(sleep_rank))
    add('difficulty_x_facility', difficulty_rank.mul(facility_rank))

    # Threshold indicators (0/1)
    add('high_att_high_study', (attendance.ge(90) & hours.ge(6)).astype(int))
    add('ideal_sleep_flag', rest.between(7, 9).astype(int))
    add('high_study_flag', hours.ge(7).astype(int))

    # Study x attendance per (sleep hour + 1)
    add('efficiency', frame['study_hours_times_attendance'].div(rest + 1))

    # Absolute distance from a reference value
    add('sleep_gap_8', rest.sub(8.0).abs())
    add('attendance_gap_100', attendance.sub(100.0).abs())

    # Five equal-width bins per column.
    # NOTE: with an integer `bins`, pd.cut derives the edges from the range of
    # the frame passed in, so separately processed frames may get different edges.
    bin_sources = (
        ('study', 'study_hours'),
        ('attendance', 'class_attendance'),
        ('sleep', 'sleep_hours'),
        ('age', 'age'),
    )
    for prefix, col in bin_sources:
        add(f'{prefix}_bin_num', pd.cut(frame[col], bins=5, labels=False).astype(int))

    return frame[base_cols + created], created

# Apply feature engineering
# Each frame is processed independently; the engineered column list is the same
# for every call, so it is taken from the training call only.
print("\nApplying feature engineering...")
X_train, engineered_cols = engineer_features(train_df, base_features)
X_test, _ = engineer_features(test_df, base_features)

# Positional index so the target lines up with the .iloc-based fold slicing.
y_train = train_df[TARGET].reset_index(drop=True)

# Handle original data if available
if use_original:
    X_orig, _ = engineer_features(original_df, base_features)
    y_orig = original_df[TARGET].reset_index(drop=True)
    full_data = pd.concat([X_train, X_test, X_orig], axis=0, ignore_index=True)
else:
    X_orig = None
    y_orig = None
    full_data = pd.concat([X_train, X_test], axis=0, ignore_index=True)

# Cast every engineered column to float on the stacked frame so train, test and
# original rows share one dtype per column.
for col in engineered_cols:
    full_data[col] = full_data[col].astype(float)

# Split the stacked frame back into its parts by row position:
# [0, n_train) -> train, [n_train, n_train + n_test) -> test, remainder -> original.
n_train, n_test = len(train_df), len(test_df)
X = full_data.iloc[:n_train].copy()
X_test = full_data.iloc[n_train:n_train + n_test].copy()

if use_original:
    X_original = full_data.iloc[n_train + n_test:].copy()
else:
    X_original = None

print(f"Engineered features: {len(engineered_cols)}")
# Report the real base-feature count instead of a hard-coded number.
print(f"Total features:      {X.shape[1]} ({len(base_features)} base + {len(engineered_cols)} engineered)")

# ============================================================================
# RIDGE REGRESSION WITH CROSS-VALIDATION
# ============================================================================
# This stage produces three Ridge prediction vectors that later become the
# `ridge_pred` meta-feature: out-of-fold predictions for the training rows,
# per-fold predictions for the test rows, and fold-averaged predictions for the
# original-dataset rows (when that dataset is loaded).
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

oof_ridge = np.zeros(len(X))
test_preds_ridge = np.zeros((len(X_test), N_FOLDS))  # one column per fold
# Defined in both cases, like the other optional original-dataset variables.
orig_preds_ridge = np.zeros(len(X_original)) if use_original else None

# Candidate regularisation strengths searched by RidgeCV: 20 values, 1e-3 .. 1e3.
ridge_alphas = np.logspace(-3, 3, 20)

print("\n" + "="*50)
print("TRAINING RIDGE REGRESSION")
print("="*50)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_train), 1):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Augment with original data if available. Original rows are appended
    # AFTER the fold's training rows; the slice further down relies on that.
    if use_original:
        X_tr_aug = pd.concat([X_tr, X_original], axis=0)
        y_tr_aug = pd.concat([y_tr, y_orig], axis=0)
    else:
        X_tr_aug = X_tr
        y_tr_aug = y_tr

    # Target encode categoricals. The encoder is fitted on this fold's training
    # rows only; validation and test rows are encoded with that fitted encoder.
    encoder = TargetEncoder(smooth='auto', target_type='continuous')
    X_tr_enc = X_tr_aug.copy()
    X_val_enc = X_val.copy()
    X_test_enc = X_test.copy()

    X_tr_enc[cat_features] = encoder.fit_transform(X_tr_aug[cat_features], y_tr_aug)
    X_val_enc[cat_features] = encoder.transform(X_val[cat_features])
    X_test_enc[cat_features] = encoder.transform(X_test[cat_features])

    # Fit Ridge; alpha is chosen by an inner 5-fold CV on the training rows.
    ridge = RidgeCV(alphas=ridge_alphas, cv=5, scoring='neg_root_mean_squared_error')
    ridge.fit(X_tr_enc, y_tr_aug.values.ravel())

    # Predictions (clipped to valid range)
    oof_ridge[val_idx] = np.clip(ridge.predict(X_val_enc), 0, 100)
    test_preds_ridge[:, fold - 1] = np.clip(ridge.predict(X_test_enc), 0, 100)

    # Original rows sit after the fold's own training rows. Slicing from
    # len(X_tr) (instead of a negative length, which would select the WHOLE
    # frame when the original dataset has zero rows) keeps the shapes aligned;
    # an empty original dataset is skipped because there is nothing to predict.
    # NOTE: these rows were part of this fold's fit, so the values accumulated
    # here are in-sample predictions averaged over folds, not out-of-fold ones.
    if use_original and len(X_original) > 0:
        orig_rows_enc = X_tr_enc.iloc[len(X_tr):]
        orig_preds_ridge += np.clip(ridge.predict(orig_rows_enc), 0, 100) / N_FOLDS

    rmse = np.sqrt(mean_squared_error(y_val, oof_ridge[val_idx]))
    print(f"Fold {fold:2d} | RMSE: {rmse:.6f}")

ridge_oof_rmse = np.sqrt(mean_squared_error(y_train, oof_ridge))
print(f"\nRidge OOF RMSE: {ridge_oof_rmse:.6f}")

# ============================================================================
# XGBOOST WITH RIDGE META-FEATURE
# ============================================================================
print(f"\n{'=' * 50}")
print("PREPARING DATA FOR XGBOOST")
print("=" * 50)

# Re-stack the engineered frames so the category conversion below is done once
# on all rows; every slice taken afterwards then shares the same categories.
stacked_parts = [X_train, X_test, X_orig] if use_original else [X_train, X_test]
full_data = pd.concat(stacked_parts, axis=0, ignore_index=True)

# Every base feature - the numeric ones included - goes through str and then
# becomes a pandas category for XGBoost's native categorical handling;
# engineered columns are plain floats.
full_data = full_data.astype({col: str for col in base_features})
full_data = full_data.astype({
    **{col: "category" for col in base_features},
    **{col: float for col in engineered_cols},
})

# Cut the stacked frame back into train / test / original by row position.
X_xgb = full_data.iloc[:n_train].copy()
X_test_xgb = full_data.iloc[n_train:n_train + n_test].copy()
X_orig_xgb = full_data.iloc[n_train + n_test:].copy() if use_original else None

# Ridge meta-feature: out-of-fold values for train, the mean over folds for
# test, and the accumulated Ridge predictions for the original rows.
X_xgb['ridge_pred'] = oof_ridge
X_test_xgb['ridge_pred'] = test_preds_ridge.mean(axis=1)
if use_original:
    X_orig_xgb['ridge_pred'] = orig_preds_ridge

print(f"Final feature count: {X_xgb.shape[1]} (including Ridge meta-feature)")

# XGBoost parameters
# Note: device is set to 'cpu' for compatibility; switch to 'cuda' if a GPU
# is available.
xgb_params = dict(
    n_estimators=20000,
    learning_rate=0.004,
    max_depth=9,
    subsample=0.78,
    colsample_bytree=0.55,
    colsample_bynode=0.65,
    reg_lambda=6,
    reg_alpha=0.15,
    min_child_weight=6,
    tree_method="hist",
    enable_categorical=True,
    eval_metric="rmse",
    early_stopping_rounds=100,
    random_state=42,
    device="cpu",  # Change to "cuda" if you have a GPU
)

test_preds_xgb = []  # one test-prediction array per fold
oof_xgb = np.zeros(len(X_xgb))

print(f"\n{'=' * 50}")
print("TRAINING XGBOOST")
print("=" * 50)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_xgb, y_train), start=1):
    print(f"\nFold {fold}/{N_FOLDS}")

    X_tr, y_tr = X_xgb.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_xgb.iloc[val_idx], y_train.iloc[val_idx]

    # Original rows (when loaded) are added to the training part only; the
    # validation fold always consists of competition rows.
    X_tr_aug = pd.concat([X_tr, X_orig_xgb], axis=0) if use_original else X_tr
    y_tr_aug = pd.concat([y_tr, y_orig], axis=0) if use_original else y_tr

    # The validation fold doubles as the early-stopping evaluation set.
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_tr_aug, y_tr_aug, eval_set=[(X_val, y_val)], verbose=1000)

    oof_xgb[val_idx] = model.predict(X_val)
    test_preds_xgb.append(model.predict(X_test_xgb))

    rmse = np.sqrt(mean_squared_error(y_val, oof_xgb[val_idx]))
    print(f"Validation RMSE: {rmse:.5f}")

xgb_oof_rmse = np.sqrt(mean_squared_error(y_train, oof_xgb))

# ============================================================================
# RESULTS SUMMARY
# ============================================================================
print(f"\n{'=' * 50}")
print("MODEL PERFORMANCE SUMMARY")
print("=" * 50)
print(f"Ridge OOF RMSE:   {ridge_oof_rmse:.6f}")
print(f"XGBoost OOF RMSE: {xgb_oof_rmse:.5f}")

print("\nFeature Summary")
print("-" * 40)
print(f"Base features:       {len(base_features)}")
print(f"Engineered features: {len(engineered_cols)}")
print("Meta-feature:        1")
print(f"Total:               {X_xgb.shape[1]}")

# ============================================================================
# SAVE RESULTS
# ============================================================================
import os

# Out-of-fold XGBoost predictions, keyed by the training row ids.
oof_path = os.path.join(OUTPUT_DIR, "xgb_oof_optimized.csv")
oof_df = pd.DataFrame({ID_COL: train_df[ID_COL], TARGET: oof_xgb})
oof_df.to_csv(oof_path, index=False)

# Submission: the sample-submission frame with the target replaced by the
# element-wise mean of the per-fold test predictions.
submission_path = os.path.join(OUTPUT_DIR, "submission_optimized.csv")
submission = submission_df.assign(**{TARGET: np.asarray(test_preds_xgb).mean(axis=0)})
submission.to_csv(submission_path, index=False)

# Feature importance (gain).
# NOTE: `model` is whatever the training loop assigned last, i.e. the model of
# the final fold only - the scores are not averaged across folds.
importance_scores = model.get_booster().get_score(importance_type="gain")

importance_df = pd.DataFrame(
    list(importance_scores.items()),
    columns=["feature", "importance"],
).sort_values("importance", ascending=False)

# Express each gain as a percentage of the summed gain.
total_gain = importance_df['importance'].sum()
importance_df['importance_pct'] = importance_df['importance'].mul(100).div(total_gain)
importance_path = os.path.join(OUTPUT_DIR, "feature_importance_optimized.csv")
importance_df.to_csv(importance_path, index=False)

print(f"\n{'=' * 50}")
print("FILES SAVED")
print("=" * 50)
for saved_path in (submission_path, oof_path, importance_path):
    print(f"  {saved_path}")

print("\nTop 10 Features by Gain")
print("-" * 50)
print(importance_df[['feature', 'importance_pct']].head(10).to_string(index=False))

print(f"\n{'=' * 50}")
print("EXECUTION COMPLETE!")
print("=" * 50)

Loading data...
Continuing without original dataset (this may slightly reduce performance)
Train:    (630000, 13)
Test:     (270000, 12)

Base features: 11
Categorical:   ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']

Applying feature engineering...
Engineered features: 34
Total features:      45 (11 base + 34 engineered)

TRAINING RIDGE REGRESSION
Fold  1 | RMSE: 8.870377
Fold  2 | RMSE: 8.962600
Fold  3 | RMSE: 8.867122
Fold  4 | RMSE: 8.877534
Fold  5 | RMSE: 8.889717
Fold  6 | RMSE: 8.903042
Fold  7 | RMSE: 8.889881
Fold  8 | RMSE: 8.867915
Fold  9 | RMSE: 8.917761
Fold 10 | RMSE: 8.889603

Ridge OOF RMSE: 8.893598

PREPARING DATA FOR XGBOOST
Final feature count: 46 (including Ridge meta-feature)

TRAINING XGBOOST

Fold 1/10
[0]	validation_0-rmse:18.86022
[1000]	validation_0-rmse:8.61629
[1947]	validation_0-rmse:8.59355
Validation RMSE: 8.59336

Fold 2/10
[0]	validation_0-rmse:18.90668
[1000]	validation_0-rmse:8.70249