In [1]:
# Install libraries if running in a local environment that lacks them
# !pip install pandas numpy scikit-learn xgboost lightgbm catboost

import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling Libraries
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Gradient Boosting Libraries
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

# Notebook-wide settings: silence library warnings and never truncate
# the column list when a DataFrame is displayed.
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None

print("Libraries Imported Successfully!")


Libraries Imported Successfully!


In [2]:
# Read the three competition files from the working directory
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

print(f"Train Shape: {train_df.shape}")
print(f"Test Shape: {test_df.shape}")

# 'id' is a row identifier, not a predictor: remove it from both frames,
# but remember the test ids because the submission file needs them.
test_ids = test_df['id']
train_df = train_df.drop(columns=['id'])
test_df = test_df.drop(columns=['id'])

# Preview of the training data (rendered as the cell output)
train_df.head()

Train Shape: (630000, 13)
Test Shape: (270000, 12)


Unnamed: 0,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


In [3]:
def preprocess_data(df):
    """Encode the ordered and yes/no categorical columns of ``df`` as numbers.

    Works on a copy, so the caller's frame is never modified.  The columns
    ``sleep_quality``, ``facility_rating``, ``exam_difficulty`` and
    ``internet_access`` are replaced by integer codes taken from the lookup
    tables below; every other column is passed through untouched.

    The function is safe to call on a frame it has already processed: a
    column that is already numeric is left as-is.  Mapping it a second time
    would turn every value into NaN, because the integer codes are not keys
    of the lookup tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.  Labels that are missing from a lookup table
        become NaN.

    Raises
    ------
    KeyError
        If one of the four expected columns is absent.
    """
    df = df.copy()

    # --- Manual Ordinal Encoding (Order Matters) ---
    encodings = {
        # Sleep Quality: Poor -> Average -> Good
        'sleep_quality': {'poor': 0, 'average': 1, 'good': 2},
        # Facility Rating: Low -> Moderate/Medium -> High
        # Both spellings of the middle level are accepted and share a code.
        'facility_rating': {'low': 0, 'moderate': 1, 'medium': 1, 'high': 2},
        # Exam Difficulty: Easy -> Moderate -> Hard
        'exam_difficulty': {'easy': 0, 'moderate': 1, 'hard': 2},
        # Internet Access: Yes/No -> 1/0
        'internet_access': {'yes': 1, 'no': 0},
    }

    for column, mapping in encodings.items():
        # Already-encoded (numeric) columns must not be mapped again,
        # otherwise every value would be replaced by NaN.
        if pd.api.types.is_numeric_dtype(df[column]):
            continue
        df[column] = df[column].map(mapping)

    return df

# Ordinal / binary encodings first (see preprocess_data)
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# --- One-Hot Encoding of the nominal columns (Gender, Course, Study Method) ---
# Train and test rows are stacked before encoding so that both parts end up
# with exactly the same dummy columns.
n_train = len(train_df)
train_features = train_df.drop(columns='exam_score')
all_data = pd.concat([train_features, test_df], axis=0)

# pd.get_dummies is the quick option here; drop_first removes one level per column
nominal_columns = ['gender', 'course', 'study_method']
all_data = pd.get_dummies(all_data, columns=nominal_columns, drop_first=True)

# Undo the stacking: the first n_train rows are the training rows
X = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]
y = train_df['exam_score']

print("Data Preprocessing Complete.")
print(f"Features used: {X.columns.tolist()}")

Data Preprocessing Complete.
Features used: ['age', 'study_hours', 'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality', 'facility_rating', 'exam_difficulty', 'gender_male', 'gender_other', 'course_b.sc', 'course_b.tech', 'course_ba', 'course_bba', 'course_bca', 'course_diploma', 'study_method_group study', 'study_method_mixed', 'study_method_online videos', 'study_method_self-study']


In [4]:
# Hyper-parameters for the three gradient-boosting libraries.  All of them
# use 1000 boosting rounds, a 0.05 learning rate and seed 42.

# 1. XGBoost
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
    tree_method='hist',  # Faster training
)

# 2. LightGBM
lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
    verbose=-1,
)

# 3. CatBoost (could consume raw categoricals, but the data is already encoded)
cat_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    verbose=0,
    random_seed=42,
)

# One estimator per library, built from the dictionaries above
model_xgb = xgb.XGBRegressor(**xgb_params)
model_lgb = lgb.LGBMRegressor(**lgb_params)
model_cat = CatBoostRegressor(**cat_params)

In [5]:
from sklearn.base import BaseEstimator, RegressorMixin

# --- STEP 1: DEFINE THE WRAPPER TO FIX THE ATTRIBUTE ERROR ---
class SklearnCatBoost(BaseEstimator, RegressorMixin):
    """Thin scikit-learn estimator wrapper around ``CatBoostRegressor``.

    The wrapper stores the CatBoost keyword arguments in ``self.params`` and
    exposes them through ``get_params`` / ``set_params`` so that scikit-learn
    utilities which clone or re-configure estimators (for example
    ``cross_val_score``) can work with it.

    Parameters
    ----------
    **params
        Keyword arguments forwarded unchanged to ``CatBoostRegressor``.

    Attributes
    ----------
    params : dict
        The CatBoost keyword arguments currently in effect.
    model : CatBoostRegressor
        The wrapped regressor; it is rebuilt from ``params`` on every ``fit``.
    """

    def __init__(self, **params):
        self.params = params
        self.model = CatBoostRegressor(**self.params)

    def fit(self, X, y):
        """Train a fresh CatBoost model on ``X`` / ``y`` and return ``self``."""
        # Build the regressor from the *current* parameters so that values
        # changed through set_params() after construction are actually used.
        self.model = CatBoostRegressor(**self.params)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)

    def get_params(self, deep=True):
        """Return a copy of the CatBoost keyword arguments.

        ``deep`` is accepted for scikit-learn compatibility and ignored.
        A copy is returned because callers such as ``sklearn.base.clone``
        modify the returned dictionary in place.
        """
        return dict(self.params)

    def set_params(self, **params):
        """Update the stored keyword arguments and return ``self``.

        The new values take effect on the next call to ``fit``.
        """
        self.params.update(params)
        return self

# --- STEP 2: RE-INITIALIZE MODELS ---
# XGBoost and LightGBM are rebuilt as-is; CatBoost goes through the wrapper.
model_xgb, model_lgb = xgb.XGBRegressor(**xgb_params), lgb.LGBMRegressor(**lgb_params)
model_cat = SklearnCatBoost(**cat_params)

# --- STEP 3: EVALUATE ---
# Shuffled 5-fold split with a fixed seed, shared by every model
kf = KFold(shuffle=True, random_state=42, n_splits=5)

def evaluate_model(model, X, y, name, n_jobs=1):
    """Cross-validate ``model`` and report its RMSE.

    The folds come from the module-level ``kf`` splitter.  The mean and the
    standard deviation of the per-fold RMSE are printed, and the mean is
    returned.

    Parameters
    ----------
    model : estimator
        Estimator accepted by ``cross_val_score``.
    X, y
        Training features and target.
    name : str
        Label used in the printed report.
    n_jobs : int, default 1
        Number of folds evaluated in parallel.  The default runs the folds
        one after another: the XGBoost and LightGBM models in this notebook
        are already configured with ``n_jobs=-1``, so fold-level parallelism
        on top of that would stack two layers of workers.  Pass ``-1`` to
        restore fold-level parallelism.

    Returns
    -------
    float
        Mean RMSE over the folds.
    """
    # scikit-learn maximises scores, so RMSE is exposed with a negative sign
    cv_scores = cross_val_score(model, X, y,
                                scoring='neg_root_mean_squared_error',
                                cv=kf,
                                n_jobs=n_jobs)
    rmse_scores = -cv_scores
    mean_rmse = np.mean(rmse_scores)
    print(f"{name} Average RMSE: {mean_rmse:.4f} (+/- {np.std(rmse_scores):.4f})")
    return mean_rmse

print("--- Evaluating Models ---")
# Evaluate the three models in order; each call prints its own report line
score_xgb, score_lgb, score_cat = (
    evaluate_model(candidate, X, y, label)
    for candidate, label in (
        (model_xgb, "XGBoost"),
        (model_lgb, "LightGBM"),
        (model_cat, "CatBoost"),
    )
)

--- Evaluating Models ---
XGBoost Average RMSE: 8.7582 (+/- 0.0143)
LightGBM Average RMSE: 8.7690 (+/- 0.0135)
CatBoost Average RMSE: 8.7827 (+/- 0.0128)


In [6]:
print("--- Training Final Models on Full Data ---")
# Refit every model on the complete training set, reporting progress as we go
for final_model, done_message in (
    (model_xgb, "XGBoost Trained."),
    (model_lgb, "LightGBM Trained."),
    (model_cat, "CatBoost Trained."),
):
    final_model.fit(X, y)
    print(done_message)

# Test-set predictions of each fitted model
pred_xgb, pred_lgb, pred_cat = (
    fitted.predict(X_test) for fitted in (model_xgb, model_lgb, model_cat)
)

# Weighted Average Ensemble
# The weights are (almost) equal, a safe starting point; shift weight towards
# the model with the best CV score if desired.
w_xgb, w_lgb, w_cat = 0.34, 0.33, 0.33
blended = (w_xgb * pred_xgb) + (w_lgb * pred_lgb) + (w_cat * pred_cat)

# Post-processing: an exam score cannot be below 0 or above 100
final_predictions = np.clip(blended, 0, 100)

print("Prediction Complete.")

--- Training Final Models on Full Data ---
XGBoost Trained.
LightGBM Trained.
CatBoost Trained.
Prediction Complete.


In [7]:
# Assemble the submission table: the saved test ids next to the blended predictions
submission = pd.DataFrame({'id': test_ids}).assign(exam_score=final_predictions)

# Write it without the index column
submission.to_csv('submission.csv', index=False)
print("submission.csv created successfully!")
print(submission.head())

submission.csv created successfully!
       id  exam_score
0  630000   72.068545
1  630001   69.933980
2  630002   87.971088
3  630003   55.996914
4  630004   46.766826
