## Create an XGBoost Model

### 1) Import Standard Libraries

In [None]:
# import libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import shap

### 2) Load the Dataset

In [None]:
# Load the raw dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the first five rows as a quick sanity check
print(df.head(n=5))

### 3) Convert Categorical Features to Numeric

In [None]:
# encode categorical variables
# Each text (object-dtype) column is replaced in place by integer codes. The
# fitted encoder is kept per column in `label_encoder` so the codes can be
# mapped back to the original labels later.
#
# 'date' is deliberately skipped: it is parsed with pd.to_datetime in the
# feature-engineering step, and label-encoding it first would replace the
# date strings with arbitrary integer codes that to_datetime would then
# interpret as nanosecond offsets from 1970-01-01.
label_encoder = {}
categorical_cols = [
    col for col in df.select_dtypes(include='object').columns if col != 'date'
]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoder[col] = le

### 4) Feature Engineering

In [None]:
# convert date column to datetime
df['date'] = pd.to_datetime(df['date'])

# derive numeric calendar features from the parsed date
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Drop the raw datetime column now that its parts are extracted: every
# non-target column of df is used as a model feature, and XGBoost only accepts
# int, float, bool or category columns, so a datetime64 column would make
# fitting fail.
df = df.drop(columns=['date'])

### 5) Define Features (X) and Target (y)

In [None]:
# Target: the 'category' column
y = df['category']

# Predictors: every remaining column once the target is removed
X = df.drop('category', axis=1)

### Split into Train and Test Sets

In [None]:
# Shuffle the rows and keep 80% for training; the remainder becomes the test
# set. The fixed seed makes the split reproducible between runs.
split_parts = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=42)
train_x, test_x, train_y, test_y = split_parts

### Define Hyperparameter Grid for Tuning

In [None]:
# define hyperparameter grid
# Candidate values for each XGBoost hyperparameter to be tuned. With seven
# parameters and three values each, the full grid has 3**7 = 2,187
# combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 9],
    'subsample': [0.5, 0.8, 1],
    'colsample_bytree': [0.5, 0.8, 1],
    'gamma': [0, 0.1, 0.3],  # the comma here was missing (SyntaxError)
    'min_child_weight': [1, 3, 5],
}

### Perform GridSearch with 5-Fold Cross Validation

In [None]:
# Base multi-class classifier (three classes, fixed seed) whose
# hyperparameters are tuned by the search below
base_model_settings = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'random_state': 42,
}
xgb_clf = xgb.XGBClassifier(**base_model_settings)

# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation; n_jobs=-1 uses all available CPU cores
grid_search = GridSearchCV(
    xgb_clf,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=2,
)

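Output (stale): NameError: name 'xgb' is not defined. `xgb` is imported in section 1, so this error presumably came from running this cell before the import cell; re-run the import cell first.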

In [None]:
# Run the search on the training split. fit() returns the fitted search
# object, so the winning parameter set can be read straight off it.
best_params = grid_search.fit(train_x, train_y).best_params_

# Show the selected hyperparameters
print(best_params)

### Train the XGBoost Classifier with the Best Hyperparameters

In [None]:
# Rebuild the classifier with the tuned hyperparameters layered on the same
# base settings used during the search
# (if accuracy is low, try using softprob next)
optimized_xgb = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    random_state=42,
    **best_params,
)

# Train on the training split
optimized_xgb.fit(train_x, train_y)

# Predict labels for the held-out test split
y_pred = optimized_xgb.predict(test_x)

# Overall accuracy, reported to two decimals
accuracy = accuracy_score(y_true=test_y, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class metrics breakdown
print(classification_report(y_true=test_y, y_pred=y_pred))

### Get Feature Importance from XGBoost

In [None]:
# Pair each training column with the importance score from the fitted model,
# ordered from lowest to highest importance
feature_importances = (
    pd.DataFrame({
        'feature': train_x.columns,
        'importance': optimized_xgb.feature_importances_,
    })
    .sort_values(by='importance', ascending=True)
)

# Horizontal bar chart of the importance scores
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importances, x='importance', y='feature')
plt.title('XGBoost Feature Importances')
plt.show()

### SHAP Values for Advanced Feature Importance

In [None]:
# Build a tree explainer for the tuned model, using the training split as
# background data, and compute SHAP values for the test split
explainer = shap.TreeExplainer(model=optimized_xgb, data=train_x)
shap_values = explainer.shap_values(X=test_x)
# NOTE(review): for a multi-class model the structure of shap_values (per-class
# list vs. single array) may depend on the installed shap version; confirm
# both plots below render as intended.

# Bar summary of the SHAP values
shap.summary_plot(shap_values, features=test_x, plot_type='bar')

# Detailed summary plot with the default plot type
shap.summary_plot(shap_values, features=test_x)