In [1]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
# Locations of the preprocessed CSV files, one (sample, target) pair per split.
# Each pair is unpacked into the two names that the loading step reads.
train_processed_sample, train_processed_target = (
    'data/processed/train_processed_sample.csv',
    'data/processed/train_processed_target.csv',
)

test_processed_sample, test_processed_target = (
    'data/processed/test_processed_sample.csv',
    'data/processed/test_processed_target.csv',
)

validate_processed_sample, validate_processed_target = (
    'data/processed/validate_processed_sample.csv',
    'data/processed/validate_processed_target.csv',
)

In [3]:
# Read every split from disk as a (features, target) pair of DataFrames.
# `map` consumes the paths left to right, so the files are read in the order
# train, test, validate -- sample first, then target.
X_train, y_train = map(pd.read_csv, (train_processed_sample, train_processed_target))

X_test, y_test = map(pd.read_csv, (test_processed_sample, test_processed_target))

X_val, y_val = map(pd.read_csv, (validate_processed_sample, validate_processed_target))

In [5]:
# Train a multi-class XGBoost booster on the training split, score it on the
# test split with multi-class log loss, print the predicted probabilities and
# save the fitted booster to disk.

# Single source of truth for the class count: it feeds both the booster's
# `num_class` and the label set given to `log_loss`, which must agree.
num_classes = 49

# Create DMatrix for model
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softprob',  # Multi-class classification with probabilities
    'num_class': num_classes,  # Number of classes
    'eval_metric': 'mlogloss',  # Multi-class log loss
    'max_depth': 6,  # Maximum depth of a tree
    'learning_rate': 0.1,  # Learning rate
    'subsample': 0.8,  # Subsample ratio
    'colsample_bytree': 0.8,  # Subsample ratio of columns
    'device': 'cuda',
}

bst = xgb.train(params, dtrain, num_boost_round=100)

# Used below as a 2-D array: one row per test sample, one column per class.
y_pred_prob = bst.predict(dtest)

# Evaluate the model
# `y_test` was loaded with `pd.read_csv`, so it is a DataFrame; flatten it to a
# 1-D label vector for the metric.
# NOTE(review): assumes the target CSV holds a single label column -- confirm.
y_test_labels = y_test.to_numpy().ravel()
# Pass the full label set explicitly. Otherwise `log_loss` infers the classes
# from `y_test` alone and raises when the test split is missing any class,
# because the number of probability columns would not match.
# Assumes labels are encoded as 0 .. num_classes - 1.
logloss = log_loss(y_test_labels, y_pred_prob, labels=list(range(num_classes)))
print(f'Log Loss: {logloss:.4f}')

print('Predicted Probabilities:')
# Column count comes from the prediction array itself (the original referenced
# an undefined name `y`, which raised NameError on a fresh kernel).
print(pd.DataFrame(y_pred_prob, columns=[f'Class_{i}' for i in range(y_pred_prob.shape[1])]))

# Save the model to a file
model_filename = 'model/xgboost_model_1.json'
bst.save_model(model_filename)