In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import model_shap

from data_pipeline import DataPipeline
from memory_profiler import memory_usage

In [None]:
# Build the data pipeline, pointed at the 'data/upac08' location
data_upac08 = DataPipeline('data/upac08')

# Run the pipeline's `_do` step (called with no arguments) under
# memory_profiler; timestamps=True asks for a timestamp with every sample.
# NOTE(review): `_do` is a private method of DataPipeline -- presumably it is
# what performs the preprocessing; confirm there is no public entry point.
preprocessing_memory_usage = memory_usage(
    proc=(data_upac08._do, ()),
    timestamps=True,
)

In [None]:
# Turn the recorded samples into a dataframe indexed by sample time.
# The raw 'Timestamp' values are interpreted as seconds (unit='s') and become
# the index, leaving 'Memory Usage' as the only column.

mem_usage = (
    pd.DataFrame(preprocessing_memory_usage, columns=['Memory Usage', 'Timestamp'])
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], unit='s'))
    .set_index('Timestamp')
)

mem_usage

In [None]:
# Seed NumPy's global random generator so that repeated runs of the notebook
# draw the same random numbers.

np.random.seed(seed=42)

In [None]:
# Iterative feature elimination with XGBoost + SHAP.
# Every pass builds an XGBShap object on the current feature subset, runs it,
# stores it under a zero-padded key ('It 000', 'It 001', ...) and then keeps
# only the top half of the features ranked by mean absolute SHAP value.
# The loop ends as soon as at most one feature is left.

xgb_dict = {}

FEATURES = data_upac08.train_data[0].columns
COUNTER = 0
while len(FEATURES) > 1:
    # Restrict the train / validation / test inputs to the surviving features
    xgb_ = model_shap.XGBShap(
        data_upac08.train_data[0][FEATURES], data_upac08.train_data[1],
        data_upac08.val_data[0][FEATURES], data_upac08.val_data[1],
        data_upac08.test_data[0][FEATURES], data_upac08.test_data[1],
        scaler=data_upac08.scaler,
        n_trials=100,
        seed=42,
    )
    xgb_.do()

    xgb_dict['It {:03d}'.format(COUNTER)] = xgb_

    # Rank features by mean |SHAP| and keep the best 50% (rounded down)
    mean_abs_shap = xgb_.shap_values.abs().mean()
    n_keep = int(mean_abs_shap.shape[0] * 0.5)
    FEATURES = mean_abs_shap.sort_values(ascending=False)[:n_keep].index

    COUNTER += 1

In [None]:
# Check the dictionary: display the recorded memory usage of the model stored
# under 'It 003' (keys are zero-based, so this is the fourth iteration).
# NOTE(review): raises KeyError if the loop ran fewer than four iterations.

xgb_dict['It 003'].model_memory_usage

In [None]:
# Plot the memory usage of the XGBoost models

fig, ax = plt.subplots(figsize=(5, 5))

# One curve per iteration. Each curve has its own minimum subtracted, so it
# bottoms out at zero; the x axis is simply the sample index.
for key, value in xgb_dict.items():
    usage = value.model_memory_usage
    ax.plot(range(len(usage)), usage - usage.min(), label=key)

# Axis decoration does not depend on the loop variable, so it is applied once
# instead of being redone (and the legend rebuilt) on every pass.
ax.set_xlabel('Time')
ax.set_ylabel('Memory Usage (MB)')
if xgb_dict:
    # Only draw a legend when there is at least one labelled curve
    ax.legend()

plt.show()

In [None]:
# Build a report by stacking the per-model reports row-wise and labelling the
# rows with the iteration keys.
# NOTE(review): assumes each model's `report` contributes exactly one row, so
# the number of rows matches the number of keys -- confirm.

report = pd.concat([model.report for model in xgb_dict.values()], axis=0)
report.index = list(xgb_dict.keys())

# Append the RMSE percentage deviation from the first iteration.
# The baseline is taken with .iloc[0] (explicit positional access): the index
# holds string labels, and integer lookups via [] on such a Series are
# deprecated in recent pandas versions.
for split in ('Train', 'Val', 'Test'):
    rmse = report['RMSE {}'.format(split)]
    baseline = rmse.iloc[0]
    report['RMSE {} %'.format(split)] = (rmse - baseline) / baseline * 100

report

In [None]:
# Plot the mean absolute SHAP values of the first iteration.
# Iteration keys are zero-based: 'It 000' is the first model (built on all
# features), whereas 'It 001' is already the second iteration.

xgb_dict['It 000'].plot_shap(plot_type='bar')

In [None]:
# Persist every stored XGBShap object through its own `save` method, using the
# iteration key as the last component of the target path.

for key, xgb_model in xgb_dict.items():
    xgb_model.save('models/xgboost/{}'.format(key))

In [None]:
# Load a previously pickled object back from disk.
# NOTE(review): 'xgb_overall.pkl' is not written by any code visible in this
# notebook -- confirm where it comes from.
# Unpickling can execute arbitrary code, so only load files you trust.

import pickle

with open('models/xgboost/xgb_overall.pkl', 'rb') as pkl_file:
    xgb_overall = pickle.load(pkl_file)