### Overview
Welcome to the 2026 Kaggle Playground Series! We plan to continue in the spirit of previous playgrounds, providing interesting and approachable datasets for our community to practice their machine learning skills, and anticipate a competition each month.

**Your Goal:** Predict students' test scores.

In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.optimize import minimize
warnings.filterwarnings('ignore')

# CONSTANTS
RANDOM_STATE = 42
N_FOLDS = 10

# Load the competition data from the working directory.
print("Veri yükleniyor...")
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# FEATURE ENGINEERING
def feature_engineering(df, is_train=True):
    """Return a copy of ``df`` with ordinal encodings and derived numeric features.

    The input frame is not modified.

    Args:
        df: DataFrame containing ``sleep_quality``, ``facility_rating``,
            ``exam_difficulty``, ``internet_access``, ``study_hours``,
            ``sleep_hours`` and ``class_attendance`` columns.
        is_train: Accepted for interface compatibility; not used.

    Returns:
        A new DataFrame with the four categorical columns replaced by numeric
        codes, plus ratio/product/difference features and ``log_*`` /
        ``sqrt_*`` transforms of the three base numeric columns.
    """
    out = df.copy()

    # (column, category -> code, code used for unmapped / missing values)
    encodings = (
        ('sleep_quality', {'poor': 0, 'average': 1, 'good': 2}, 1),
        ('facility_rating', {'low': 0, 'medium': 1, 'high': 2}, 1),
        ('exam_difficulty', {'easy': 0, 'moderate': 1, 'hard': 2}, 1),
        ('internet_access', {'no': 0, 'yes': 1}, 0.5),
    )
    for name, mapping, fallback in encodings:
        out[name] = out[name].map(mapping).fillna(fallback)

    study = out['study_hours']
    sleep = out['sleep_hours']
    attendance = out['class_attendance']

    # The 0.1 offset keeps the ratio finite when sleep_hours is 0.
    out['study_per_sleep'] = study / (sleep + 0.1)
    out['attendance_per_study'] = attendance * study
    out['sleep_quality_x_hours'] = out['sleep_quality'] * sleep
    out['study_sleep_diff'] = study - sleep

    for name in ('study_hours', 'sleep_hours', 'class_attendance'):
        if name not in out.columns:
            continue
        # Clip negatives to 0 so log1p and sqrt stay defined.
        non_negative = out[name].clip(0)
        out[f'log_{name}'] = np.log1p(non_negative)
        out[f'sqrt_{name}'] = np.sqrt(non_negative)

    return out

train, test = feature_engineering(train), feature_engineering(test)

# Categorical -> numeric (label encoding)
cat_cols = ['gender', 'course', 'study_method']

for col in cat_cols:
    if col not in train.columns:
        continue
    train_as_str = train[col].astype(str)
    test_as_str = test[col].astype(str)
    # Fit on the union of train and test values so transforming test
    # never hits a label the encoder has not seen.
    le = LabelEncoder().fit(pd.concat([train_as_str, test_as_str]))
    train[col] = le.transform(train_as_str)
    test[col] = le.transform(test_as_str)

# CLUSTERING — accelerated version
cluster_features = ['study_hours', 'sleep_hours', 'class_attendance']

# Keep only the clustering inputs that are actually present in train.
valid_cluster_cols = [name for name in cluster_features if name in train.columns]

if valid_cluster_cols:
    print("Clustering başlıyor...")
    # Scale statistics come from train only; test is transformed with them.
    scaler = StandardScaler().fit(train[valid_cluster_cols].fillna(0))
    cluster_train = scaler.transform(train[valid_cluster_cols].fillna(0))
    cluster_test = scaler.transform(test[valid_cluster_cols].fillna(0))

    for n in (3, 6):
        print(f"  KMeans n_clusters={n} çalışıyor...")
        kmeans = KMeans(n_clusters=n, random_state=RANDOM_STATE, n_init=10).fit(cluster_train)
        # labels_ holds the training assignments produced by fit().
        train[f'cluster{n}'] = kmeans.labels_
        test[f'cluster{n}'] = kmeans.predict(cluster_test)
    print("Clustering tamamlandı.")

# Target encoding (manual, smoothed).
#
# Train rows are encoded out-of-fold: each row's encoding is computed from the
# targets of the *other* folds only. Encoding a row with a category mean that
# includes its own target leaks the label into the feature and makes every
# downstream cross-validation score optimistic. Test rows have no target, so
# they use the mapping fitted on the full training set.
te_cols = ['course', 'study_method']

global_mean = train['exam_score'].mean()
smoothing = 5.0
te_kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


def _smoothed_target_means(frame, col, prior, weight):
    """Return per-category target means of ``frame[col]`` shrunk toward ``prior``."""
    stats = frame.groupby(col)['exam_score'].agg(['mean', 'count'])
    return (stats['mean'] * stats['count'] + prior * weight) / (stats['count'] + weight)


for col in te_cols:
    if col not in train.columns:
        continue

    oof_te = np.full(len(train), global_mean, dtype=float)
    for fit_idx, enc_idx in te_kf.split(train):
        fit_part = train.iloc[fit_idx]
        fold_prior = fit_part['exam_score'].mean()
        fold_means = _smoothed_target_means(fit_part, col, fold_prior, smoothing)
        # Categories absent from the fitting folds fall back to the fold prior.
        oof_te[enc_idx] = (
            train[col].iloc[enc_idx].map(fold_means).fillna(fold_prior).to_numpy()
        )

    full_means = _smoothed_target_means(train, col, global_mean, smoothing)
    train[f'te_{col}'] = oof_te
    test[f'te_{col}'] = test[col].map(full_means).fillna(global_mean)

# Datasets: every column except the identifier and the target is a feature.
exclude = ['id', 'exam_score']
features = train.columns[~train.columns.isin(exclude)].tolist()

y = train['exam_score'].copy()
X = train[features].copy()
X_test = test[features].copy()

print(f"Feature sayısı: {len(features)}")

# Meta Features (Ridge + XGB)
def get_meta_oof(model, X, y, X_test, n_splits=5):
    """Compute out-of-fold and fold-averaged test predictions for ``model``.

    The same ``model`` instance is re-fitted on each fold's training rows.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Training features; indexed positionally via ``.iloc``.
        y: Training target; indexed positionally via ``.iloc``.
        X_test: Test features passed to ``predict`` in every fold.
        n_splits: Number of shuffled K-fold splits.

    Returns:
        Tuple ``(oof, test_pred)``: ``oof`` has one prediction per row of
        ``X``, each made by the fit that held that row out; ``test_pred`` is
        the mean of the per-fold predictions on ``X_test``.
    """
    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), 1):
        print(f"  Meta fold {fold}/{n_splits}...")
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        oof[va_idx] = model.predict(X.iloc[va_idx])
        # Each fold contributes an equal share to the averaged test prediction.
        test_pred += model.predict(X_test) / n_splits

    return oof, test_pred

# First-level Ridge model: its out-of-fold predictions become a feature.
print("Ridge meta-features hesaplanıyor...")
ridge = RidgeCV(alphas=np.logspace(-3, 3, 20), cv=5)
ridge_oof, ridge_test = get_meta_oof(ridge, X, y, X_test)

# First-level XGBoost model, used the same way.
print("XGB meta-features hesaplanıyor...")
xgb_meta = XGBRegressor(
    n_estimators=1200,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
xgb_oof, xgb_test = get_meta_oof(xgb_meta, X, y, X_test)

# assign() returns a new frame, leaving X / X_test untouched.
X_meta = X.assign(ridge_meta=ridge_oof, xgb_meta=xgb_oof)
X_test_meta = X_test.assign(ridge_meta=ridge_test, xgb_meta=xgb_test)

# ENSEMBLE
# For every seed, train XGBoost, LightGBM and CatBoost with N_FOLDS-fold CV on
# the meta-augmented features. Each (seed, model) pair contributes one column
# of out-of-fold predictions and one column of fold-averaged test predictions.
seeds = [42, 43, 44, 1234, 2024, 2025, 777]

oof_all = []
test_all = []

for seed in seeds:
    print(f"\nSeed {seed} için modeller eğitiliyor...")

    params_xgb = {
        'n_estimators': 2800, 'learning_rate': 0.014, 'max_depth': 7,
        'subsample': 0.82, 'colsample_bytree': 0.75, 'reg_alpha': 0.8, 'reg_lambda': 2.5,
        'random_state': seed, 'n_jobs': -1, 'tree_method': 'hist'
    }

    # LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is > 0; with the default of 0 the 0.80
    # row subsampling is silently ignored.
    params_lgbm = {
        'n_estimators': 3200, 'learning_rate': 0.013, 'max_depth': 9, 'num_leaves': 35,
        'subsample': 0.80, 'subsample_freq': 1, 'colsample_bytree': 0.78,
        'reg_alpha': 0.5, 'reg_lambda': 2.0,
        'random_state': seed, 'n_jobs': -1, 'verbosity': -1
    }

    params_cat = {
        'iterations': 2800, 'learning_rate': 0.016, 'depth': 7, 'l2_leaf_reg': 3.0,
        'random_seed': seed, 'verbose': 0
    }

    models = [
        ('xgb', XGBRegressor(**params_xgb)),
        ('lgbm', LGBMRegressor(**params_lgbm)),
        ('cat', CatBoostRegressor(**params_cat))
    ]

    # The fold assignment changes with the seed, so each seed sees different splits.
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)

    for name, model in models:
        print(f"  {name.upper()} modeli fold'lar...")
        oof = np.zeros(len(X_meta))
        test_pred = np.zeros(len(X_test_meta))

        for fold, (tr_idx, va_idx) in enumerate(kf.split(X_meta), 1):
            X_tr = X_meta.iloc[tr_idx]
            y_tr = y.iloc[tr_idx]
            X_va = X_meta.iloc[va_idx]

            model.fit(X_tr, y_tr)
            oof[va_idx] = model.predict(X_va)
            test_pred += model.predict(X_test_meta) / N_FOLDS

        oof_all.append(oof)
        test_all.append(test_pred)

# One column per (seed, model) pair, in training order.
oof_all = np.column_stack(oof_all)
test_all = np.column_stack(test_all)

# Weight optimisation
def ensemble_loss(w):
    """Return the RMSE between ``y`` and the ``w``-weighted blend of ``oof_all`` columns."""
    blended = oof_all @ w
    return np.sqrt(mean_squared_error(y, blended))

n_models = oof_all.shape[1]
init_w = np.ones(n_models) / n_models


def _normalize_weights(raw_w):
    """Scale ``raw_w`` to sum to 1; fall back to equal weights if the sum is ~0."""
    total = raw_w.sum()
    if np.isclose(total, 0.0):
        return init_w
    return raw_w / total


# Optimise the loss of the *normalised* weights. Minimising over raw weights
# and normalising afterwards would apply a different blend from the one the
# optimiser scored, so the reported RMSE would not describe the submission.
print("Ağırlık optimizasyonu (Nelder-Mead) başlıyor...")
res = minimize(lambda raw_w: ensemble_loss(_normalize_weights(raw_w)), init_w,
               method='Nelder-Mead',
               options={'maxiter': 8000, 'fatol': 1e-6})

best_weights = _normalize_weights(res.x)

# res.fun is now the OOF RMSE of exactly the weights used below.
print(f"OOF RMSE: {res.fun:.5f}")

# Final prediction
final_pred = np.dot(test_all, best_weights)
final_pred = np.clip(final_pred, 0, 100)

sub = pd.DataFrame({'id': test['id'], 'exam_score': final_pred})
sub.to_csv('submission_fixed.csv', index=False)
print("→ submission_fixed.csv hazır!")

Veri yükleniyor...
Clustering başlıyor...
  KMeans n_clusters=3 çalışıyor...
  KMeans n_clusters=6 çalışıyor...
Clustering tamamlandı.
Feature sayısı: 25
Ridge meta-features hesaplanıyor...
  Meta fold 1/5...
  Meta fold 2/5...
  Meta fold 3/5...
  Meta fold 4/5...
  Meta fold 5/5...
XGB meta-features hesaplanıyor...
  Meta fold 1/5...
  Meta fold 2/5...
  Meta fold 3/5...
  Meta fold 4/5...
  Meta fold 5/5...

Seed 42 için modeller eğitiliyor...
  XGB modeli fold'lar...
  LGBM modeli fold'lar...
  CAT modeli fold'lar...

Seed 43 için modeller eğitiliyor...
  XGB modeli fold'lar...
  LGBM modeli fold'lar...
  CAT modeli fold'lar...

Seed 44 için modeller eğitiliyor...
  XGB modeli fold'lar...
  LGBM modeli fold'lar...
  CAT modeli fold'lar...

Seed 1234 için modeller eğitiliyor...
  XGB modeli fold'lar...
  LGBM modeli fold'lar...
  CAT modeli fold'lar...

Seed 2024 için modeller eğitiliyor...
  XGB modeli fold'lar...
  LGBM modeli fold'lar...
  CAT modeli fold'lar...

Seed 2025 için m