In [1]:
%%capture
# Execute preprocessing.ipynb inside this kernel; the %%capture cell magic
# above keeps whatever that notebook prints out of this notebook's output.
%run preprocessing.ipynb

In [9]:
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import xgboost as xgb

# Read the feature matrices from the .npy files in the working directory
# (presumably written by preprocessing.ipynb -- TODO confirm).
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')

# Shift the stored labels down by one so that the classes start from 0.
y_train = np.subtract(np.load('y_train.npy'), 1)
y_test = np.subtract(np.load('y_test.npy'), 1)

In [8]:
# Baseline model: a small booster trained through XGBoost's native API.

# Booster hyper-parameters and the number of boosting rounds
param = dict(
    max_depth=3,                 # maximum depth of each tree
    eta=0.3,                     # learning rate
    objective='multi:softprob',  # multiclass loss; predictions are per-class probabilities
    num_class=3,                 # number of target classes
)
num_round = 20

# Wrap the arrays in DMatrix containers; only the training matrix carries labels
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

# Fit the booster on the training matrix
model = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

# Predict class probabilities for the test set and keep, for every sample,
# the index of the most probable class
y_pred = model.predict(dtest).argmax(axis=1)

# Report precision, recall and F1-score per class on the test set
print(metrics.classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.95462   0.97194   0.96320      9977
           1    0.99330   0.99873   0.99601     10244
           2    0.97871   0.95596   0.96720     10196

    accuracy                        0.97561     30417
   macro avg    0.97554   0.97554   0.97547     30417
weighted avg    0.97572   0.97561   0.97559     30417



In [11]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import pandas as pd

# Hyper-parameter search for the scikit-learn XGBoost wrapper.
#
# The grid below spans 8 * 10 * 9 = 720 candidates; with 5-fold
# cross-validation that is 3600 model fits (plus one final refit).
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],  # Maximum depth of the trees
    'learning_rate': [0.001, 0.002, 0.003, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5],  # Learning rate
    'n_estimators': [30, 40, 50, 75, 100, 200, 300, 400, 500],  # Number of trees (number of boosting rounds)
    'objective': ['multi:softprob']   # Multiclass objective that yields per-class probabilities
}

# In XGBoost, each tree is built to correct the mistakes made by the previous ensemble of trees.
# Overfitting is kept in check by regularisation terms in the objective, by shrinkage
# (the learning rate) and by parameters that limit the complexity of the trees
# (maximum depth, gamma, etc.); the differentiable loss is what makes gradient
# boosting possible, it is not itself a guard against overfitting.

# gamma was not tuned here, but the results are very good nonetheless.

# Create a XGBClassifier
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases --
# confirm against the installed version before removing it.
model = xgb.XGBClassifier(use_label_encoder=False)

# Create the grid search object.
# n_jobs=-1 spreads the independent fits over all available cores; the default
# runs them one at a time. It changes the wall-clock time, not which
# candidates and folds are evaluated.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
# NOTE(review): in the recorded run the selected max_depth (10) and
# n_estimators (500) are the largest values in the grid, so the optimum may
# lie outside the searched range -- consider widening the grid.
print(grid_search.best_params_)


# Save the full cross-validation table to a dataframe, and to a csv file
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.to_csv('Results_XGBoost.csv')

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500, 'objective': 'multi:softprob'}
