In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load data from ../Dataset

In [None]:
# Read the processed dataset (path is relative to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../Dataset/df_processed.csv')

In [None]:
# Preview the first five rows as a quick sanity check on the load.
df.head(n=5)

# Split data

In [None]:
# Separate the target column ('Grade') from the remaining feature columns.
y = df['Grade']
X = df.drop(columns='Grade')

In [None]:
# Hold out 30% of the rows, then split that hold-out in half to obtain the
# validation and test sets (roughly 70/15/15 overall). Both calls use the same
# fixed seed so the partition is identical on every run.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Train a CatBoost classifier

In [None]:
# Every feature column is declared categorical, referenced by positional index.
# NOTE(review): this assumes df_processed.csv holds no float-valued feature
# columns — confirm against the preprocessing step.
cat_features = list(range(X_train.shape[1]))

# Baseline classifier: up to 1000 trees of depth 7, a progress line every 200
# iterations, and early stopping once the eval-set metric has not improved
# for 50 iterations.
model = CatBoostClassifier(
    cat_features=cat_features,
    iterations=1000,
    depth=7,
    early_stopping_rounds=50,
    verbose=200,
)

# Fit on the training split; the validation split drives early stopping.
model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Hyperparameter tuning

In [None]:
# Search space for the tuner. The quniform entries step through a grid
# (iterations in hundreds, depth and l2_leaf_reg in units); learning_rate is
# drawn log-uniformly between e^-5 (about 0.0067) and e^0 (1.0).
space = dict(
    iterations=hp.quniform('iterations', 500, 1500, 100),
    depth=hp.quniform('depth', 4, 8, 1),
    learning_rate=hp.loguniform('learning_rate', -5, 0),
    l2_leaf_reg=hp.quniform('l2_leaf_reg', 1, 5, 1),
)

In [None]:
def objective(params):
    """Fit one CatBoost candidate and report its validation loss.

    Args:
        params: A single sample from the search space. Must provide the keys
            'iterations', 'depth', 'learning_rate' and 'l2_leaf_reg'.

    Returns:
        dict with three entries: 'loss' (the negated accuracy on the
        validation split, so that minimising it maximises accuracy), 'params'
        (the exact keyword arguments given to CatBoostClassifier) and
        'status' (always STATUS_OK).
    """
    # The integer-valued hyperparameters may arrive as floats, so cast them;
    # the remaining settings are fixed for every trial.
    model_kwargs = dict(
        iterations=int(params['iterations']),
        depth=int(params['depth']),
        learning_rate=params['learning_rate'],
        l2_leaf_reg=int(params['l2_leaf_reg']),
        cat_features=cat_features,
        verbose=0,
    )

    candidate = CatBoostClassifier(**model_kwargs)
    candidate.fit(X_train, y_train)

    val_accuracy = candidate.score(X_val, y_val)
    return {'loss': -val_accuracy, 'params': model_kwargs, 'status': STATUS_OK}

In [None]:
# Run the TPE search: 50 evaluations of `objective`, each recorded in `trials`.
# NOTE(review): no `rstate` is passed, so the search is presumably unseeded —
# confirm whether a fixed seed is wanted for reproducibility.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

# Evaluate best model

In [None]:
# Rebuild the CatBoost arguments from the winning sample, applying the same
# integer casts and fixed settings that `objective` uses.
best_params = dict(
    iterations=int(best['iterations']),
    depth=int(best['depth']),
    learning_rate=best['learning_rate'],
    l2_leaf_reg=int(best['l2_leaf_reg']),
    cat_features=cat_features,
    verbose=0,
)

# Refit on the training split with the selected hyperparameters.
best_model = CatBoostClassifier(**best_params)
best_model.fit(X_train, y_train)

# Report accuracy on the held-out test split.
test_score = best_model.score(X_test, y_test)
print(f'Test Accuracy: {test_score * 100:.2f}%')

# Save and reload the model in ../Models

In [None]:
# Save the tuned model to disk. The target directory is created first so that
# a missing ../Models folder cannot fail the save after the long tuning run.
model_filename = '../Models/blackbox_model.pkl'
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_filename)

# Reload the file that was just written to check that it can be read back.
# joblib.load unpickles its input, so only use it on files you trust.
loaded_model = joblib.load(model_filename)