In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s6e1/sample_submission.csv
/kaggle/input/playground-series-s6e1/train.csv
/kaggle/input/playground-series-s6e1/test.csv


In [2]:
!pip install catboost




In [3]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


In [4]:
# Read the competition's training and test tables into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s6e1/train.csv",
        "/kaggle/input/playground-series-s6e1/test.csv",
    )
)


In [5]:
# Target column to predict.
y = train_df["exam_score"]

# Predictors: everything except the row identifier and the target.
X = train_df.drop(["id", "exam_score"], axis=1)

# The test table has no target column, so only the identifier is removed.
X_test = test_df.drop(["id"], axis=1)


In [6]:
# Columns stored with object dtype are the ones passed to CatBoost as categorical.
object_columns = X.select_dtypes(include=["object"]).columns
categorical_features = list(object_columns)

print("Categorical features:", categorical_features, sep="\n")


Categorical features:
['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


In [7]:
# Cross-validation configuration: shuffled K-fold split with a fixed seed.
N_SPLITS = 5
kf = KFold(shuffle=True, random_state=42, n_splits=N_SPLITS)

# Zero-initialised prediction buffers: one slot per training row for the
# out-of-fold predictions and one slot per test row for the test predictions.
oof_preds, test_preds = np.zeros(len(X)), np.zeros(len(X_test))


In [8]:
# K-fold training: fit one CatBoostRegressor per fold, store its predictions on
# the held-out rows in `oof_preds`, and add its share of the test prediction to
# `test_preds` (each fold contributes 1 / N_SPLITS, giving the fold average).

# Reset the accumulators inside this cell so it is idempotent. `test_preds` is
# built up with `+=`, so re-running the cell without re-running the setup cell
# would otherwise add a second round of predictions onto the stale values.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n--- Fold {fold+1} ---")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostRegressor(
        iterations=3000,
        learning_rate=0.03,
        depth=8,
        loss_function="RMSE",
        eval_metric="RMSE",
        cat_features=categorical_features,
        random_seed=42,
        verbose=200,
        early_stopping_rounds=200
    )

    # NOTE: the validation fold is also the eval_set that drives early stopping
    # and best-iteration selection, so the out-of-fold score computed from
    # these predictions is slightly optimistic.
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True
    )

    # Each training row is predicted exactly once, by the model that did not
    # train on it.
    oof_preds[val_idx] = model.predict(X_val)
    test_preds += model.predict(X_test) / N_SPLITS



--- Fold 1 ---
0:	learn: 18.5512672	test: 18.4806218	best: 18.4806218 (0)	total: 453ms	remaining: 22m 38s
200:	learn: 8.8411323	test: 8.8355849	best: 8.8355849 (200)	total: 1m 6s	remaining: 15m 22s
400:	learn: 8.8172224	test: 8.8194302	best: 8.8194302 (400)	total: 2m 3s	remaining: 13m 23s
600:	learn: 8.7917123	test: 8.8011426	best: 8.8011426 (600)	total: 3m 14s	remaining: 12m 56s
800:	learn: 8.7717349	test: 8.7888181	best: 8.7888181 (800)	total: 4m 26s	remaining: 12m 11s
1000:	learn: 8.7546248	test: 8.7801220	best: 8.7801220 (1000)	total: 5m 38s	remaining: 11m 15s
1200:	learn: 8.7394897	test: 8.7735593	best: 8.7735593 (1200)	total: 6m 48s	remaining: 10m 12s
1400:	learn: 8.7245302	test: 8.7682367	best: 8.7682367 (1400)	total: 8m	remaining: 9m 8s
1600:	learn: 8.7108206	test: 8.7641137	best: 8.7641137 (1600)	total: 9m 11s	remaining: 8m 2s
1800:	learn: 8.6977494	test: 8.7600554	best: 8.7600471 (1799)	total: 10m 23s	remaining: 6m 54s
2000:	learn: 8.6852814	test: 8.7569944	best: 8.7569944 (

In [11]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Overall cross-validation score: RMSE of the out-of-fold predictions against
# the true targets across all training rows.
oof_mse = mean_squared_error(y, oof_preds)
rmse = np.sqrt(oof_mse)
print(f"\nFinal CV RMSE: {rmse:.5f}")




Final CV RMSE: 8.75719


In [12]:
# NOTE: `model` is whatever the training loop assigned last, i.e. the model
# fitted on the final fold, so these importances describe that single fold
# rather than an average over all folds.
feature_importance = model.get_feature_importance(prettified=True)

# End the cell with the bare expression so Jupyter renders the table with its
# rich display instead of the plain-text print() representation.
feature_importance


          Feature Id  Importances
0        study_hours    47.074725
1   class_attendance    18.854873
2      sleep_quality    10.611411
3       study_method     9.637972
4    facility_rating     7.512354
5        sleep_hours     5.147418
6             course     0.411279
7                age     0.384290
8    exam_difficulty     0.212617
9             gender     0.079371
10   internet_access     0.073690


In [13]:
# Build the submission table: the test ids alongside the fold-averaged
# predictions, in the two-column layout "id", "exam_score".
submission = test_df[["id"]].assign(exam_score=test_preds)

# Written to the working directory without the index column.
submission.to_csv("submission.csv", index=False)
print("submission.csv created")


submission.csv created
