In [1]:
import pandas as pd
import numpy as np 
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import joblib

# Load data from ../Dataset

In [2]:
df = pd.read_csv('../Dataset/df_processed.csv')

In [3]:
df.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,School_Bus,WklyStudyHours,Grade
0,female,group C,bachelor's degree,standard,0,married,regularly,1,3,1,Less than 5 hours,0
1,female,group C,some college,standard,0,married,sometimes,1,0,1,Between 5-10 hours,1
2,female,group B,master's degree,standard,0,single,sometimes,1,4,1,Less than 5 hours,1
3,male,group A,associate's degree,free/reduced,0,married,never,0,1,1,Between 5-10 hours,0
4,male,group C,some college,standard,0,married,sometimes,1,0,1,Between 5-10 hours,0


# Split data

In [4]:
# Create label and features
X = df.drop('Grade', axis=1)
y = df['Grade']

In [5]:
# Create train, test, and validation data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a catboost classifier

In [10]:
# Identify categorical features
cat_features = list(range(0, X_train.shape[1]))

# Create the model
model = CatBoostClassifier(
    iterations=1000, 
    depth=7, 
    cat_features=cat_features, 
    verbose=200,
    early_stopping_rounds=50
)

model = CatBoostClassifier(cat_features=cat_features)

# Train the model
model.fit(X_train, y_train, eval_set=(X_val, y_val))

Learning rate set to 0.067571
0:	learn: 0.6636986	test: 0.6633542	best: 0.6633542 (0)	total: 46ms	remaining: 46s
1:	learn: 0.6388933	test: 0.6382687	best: 0.6382687 (1)	total: 65.5ms	remaining: 32.7s
2:	learn: 0.6178962	test: 0.6170721	best: 0.6170721 (2)	total: 77.7ms	remaining: 25.8s
3:	learn: 0.6010173	test: 0.5998312	best: 0.5998312 (3)	total: 84.9ms	remaining: 21.1s
4:	learn: 0.5860938	test: 0.5844137	best: 0.5844137 (4)	total: 95ms	remaining: 18.9s
5:	learn: 0.5733653	test: 0.5715318	best: 0.5715318 (5)	total: 105ms	remaining: 17.4s
6:	learn: 0.5605756	test: 0.5584266	best: 0.5584266 (6)	total: 116ms	remaining: 16.5s
7:	learn: 0.5538425	test: 0.5515368	best: 0.5515368 (7)	total: 123ms	remaining: 15.2s
8:	learn: 0.5459125	test: 0.5435419	best: 0.5435419 (8)	total: 133ms	remaining: 14.6s
9:	learn: 0.5404750	test: 0.5379399	best: 0.5379399 (9)	total: 139ms	remaining: 13.8s
10:	learn: 0.5322171	test: 0.5295962	best: 0.5295962 (10)	total: 149ms	remaining: 13.4s
11:	learn: 0.5251975	te

<catboost.core.CatBoostClassifier at 0x162320640>

In [12]:
test_score = model.score(X_test, y_test)
test_score

0.778116162714814

# Hyperparameter tuning

In [7]:
# Define the hyperparameter space
space = {
    'iterations': hp.quniform('iterations', 500, 1500, 100),
    'depth': hp.quniform('depth', 4, 8, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'l2_leaf_reg': hp.quniform('l2_leaf_reg', 1, 5, 1)
}

In [None]:
def objective(params):
    params = {
        'iterations': int(params['iterations']),
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],
        'l2_leaf_reg': int(params['l2_leaf_reg']),
        'cat_features': cat_features,
        'verbose': 0
    }
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train)
    loss = -model.score(X_val, y_val)  # Minimize negative accuracy
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [None]:
# Run Bayesian Optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

# Evaluate best model

In [None]:
# Train the model with the best hyperparameters
best_params = {
    'iterations': int(best['iterations']),
    'depth': int(best['depth']),
    'learning_rate': best['learning_rate'],
    'l2_leaf_reg': int(best['l2_leaf_reg']),
    'cat_features': cat_features,
    'verbose': 0
}
best_model = CatBoostClassifier(**best_params)
best_model.fit(X_train, y_train)

# Evaluate on the test set
test_score = best_model.score(X_test, y_test)
print(f'Test Accuracy: {test_score * 100:.2f}%')

# Load Model in ../Models

In [None]:
# Save the model to a file
model_filename = '../Models/blackbox_model.pkl'
joblib.dump(best_model, model_filename)

# Load the model from the file
loaded_model = joblib.load(model_filename)