The previous code showed that, with a learning rate of 0.4, the model's accuracy improves as the number of estimators increases. The model is therefore trained again across that range of estimator counts, this time recording the computation time alongside the accuracy. The goal is to determine whether the model can be simplified, freeing up computing time to run more tests if necessary.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import csv
import pickle
import xgboost as xgb
import plotly.express as px
from itertools import product
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Read the pickled inputs: element 0 holds the normalized feature vectors and
# element 1 the matching labels.
# NOTE: pickle.load executes arbitrary code embedded in the file, so only load
# pickles produced by a trusted source.

with open('data/variables.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

normalized_vector, label = data[0], data[1]

In [3]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    normalized_vector,
    label,
    test_size=0.3,
    random_state=0,
)

# Flatten the data: concatenate the parts of every sample along the first axis
# so that each sample becomes a single row of the resulting array
x_train_flat = np.array(list(map(np.concatenate, x_train)))
x_test_flat = np.array(list(map(np.concatenate, x_test)))

In [6]:
# Sweep the number of boosting rounds at a fixed learning rate of 0.4. For each
# setting a fresh model is trained, the decision threshold giving the highest
# accuracy on the test set is picked from a fixed grid, and both that accuracy
# and the time the whole iteration took are recorded.

# 350, 400, ..., 2050 boosting rounds (35 settings)
n_estimators = list(range(350, 2051, 50))
results = []          # [n_estimators, best accuracy] per setting
iteration_time = []   # elapsed seconds per setting, in the same order as `results`
dtest = xgb.DMatrix(data=x_test_flat)
dtrain = xgb.DMatrix(data=x_train_flat, label=y_train)

# Candidate decision thresholds ("umbral" is Spanish for threshold):
# 80 evenly spaced values from 0.1 to 0.8 inclusive
ummbral = np.linspace(0.1, 0.8, 80)

params = {
    "objective": "binary:logistic",
    "learning_rate": 0.4,
}

for j in n_estimators:
    # perf_counter is a monotonic, high-resolution clock; unlike time.time() it
    # cannot jump when the system clock is adjusted during a long iteration.
    iteration_start_time = time.perf_counter()

    # Train the model from scratch with j boosting rounds
    model = xgb.train(params, dtrain, j)

    y_pred_prob = model.predict(dtest)

    # Accuracy obtained at every candidate threshold
    # NOTE(review): the threshold is chosen on the same test set that the
    # accuracy is reported on, so the reported accuracy is optimistic. Consider
    # selecting the threshold on a separate validation split.
    AC = [
        accuracy_score(y_test, np.where(y_pred_prob >= i, 1, 0))
        for i in ummbral
    ]

    # np.argmax returns the first maximum, matching AC.index(max(AC))
    idx_max = int(np.argmax(AC))
    threshold = ummbral[idx_max]

    # The accuracy at the chosen threshold is already in AC; no need to score again
    accuracy = AC[idx_max]
    y_pred = np.where(y_pred_prob >= threshold, 1, 0)
    results.append([j, accuracy])
    print(j, accuracy)

    # The elapsed time covers training, prediction and the threshold sweep
    iteration_end_time = time.perf_counter()
    elapsed_time = iteration_end_time - iteration_start_time
    iteration_time.append(elapsed_time)
    print(f"Elapsed time: {elapsed_time}")


350 0.6592009587199793
Elapsed time: 337.8184103965759
400 0.6608204181444234
Elapsed time: 355.0808928012848
450 0.6618892613645565
Elapsed time: 408.1656060218811
500 0.6632981910638229
Elapsed time: 449.9480903148651
550 0.6638973910508672
Elapsed time: 483.70644998550415
600 0.6639135856451117
Elapsed time: 526.3194570541382
650 0.6643994234724448
Elapsed time: 564.6280505657196
700 0.6653387099386225
Elapsed time: 603.8046295642853
750 0.6652253477789114
Elapsed time: 642.4625146389008
800 0.6654520720983336
Elapsed time: 685.267758846283
850 0.6659864937084001
Elapsed time: 723.6219205856323
900 0.6665694991012
Elapsed time: 767.1325542926788
950 0.6664237477530001
Elapsed time: 804.4891011714935
1000 0.6668448072033555
Elapsed time: 838.8665161132812
1050 0.6673468396249332
Elapsed time: 884.0506372451782
1100 0.6672172828709777
Elapsed time: 930.5242218971252
1150 0.6674440071903999
Elapsed time: 966.1658220291138
1200 0.6677355098867997
Elapsed time: 989.6624417304993
1250 0.6

In [None]:
# Save the [n_estimators, accuracy] pairs with pickle so the sweep does not
# have to be re-run. The output directory is created first because
# open(..., 'wb') does not create missing parent directories and would
# otherwise raise FileNotFoundError.
# NOTE: only `results` is written; the per-iteration timings are not saved.

os.makedirs('outputs', exist_ok=True)
with open('outputs/accuracy_outputs.pkl', 'wb') as f:
    pickle.dump(results, f)

In [9]:
# Tabulate the sweep: one row per n_estimators setting with its accuracy,
# plus the elapsed time recorded for that same iteration

df = pd.DataFrame(results, columns=['n_estimators', 'accuracy']).assign(
    computation_time=iteration_time
)

In [22]:
# Draw accuracy and computation time against n_estimators as two side-by-side
# plotly line charts

import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2)

# One entry per subplot: (dataframe column, legend name, line colour, subplot column)
trace_specs = [
    ('accuracy', 'Accuracy', 'blue', 1),
    ('computation_time', 'Computation Time', 'red', 2),
]

for column, legend_name, colour, subplot_col in trace_specs:
    trace = go.Scatter(
        x=df['n_estimators'],
        y=df[column],
        mode='lines',
        name=legend_name,
        line={'color': colour},
    )
    fig.add_trace(trace, row=1, col=subplot_col)

fig.update_layout(title='Accuracy and Computation Time vs n_estimators', showlegend=True)

fig.show()