In [15]:
import pandas as pd
import numpy as np
import pickle
from test_evaluate_functions.test_evaluate_functions import calculate_accuracies, test_model, return_variable_importance
from training_functions.training_functions import calculate_binary_variables, split, split_x_y, evaluate_models, train_ada_boost, split_easy_and_sudden_errors
from predictive_models.predictive_models_functions import adboc_predictor, test_adboc_model
from sklearn.exceptions import DataConversionWarning
import warnings

# Silence sklearn's DataConversionWarning for the rest of the notebook.
warnings.filterwarnings("ignore", category=DataConversionWarning)

**Important note:** Before running the notebook, change the data paths in the `pd.read_csv` calls so that they point to the files on your local machine.

# Models for lights
## "adboc" model

### Drop nan columns:

In [5]:
# Load the preprocessed lights dataset (adjust the path to your machine).
dff = pd.read_csv("/home/leibniz/Desktop/IHMAN/preprocessing_results/out_lights.csv")
print("Reading done!")

# Columns left out of this model: every readings column (power*/Active*/Reactive*),
# the week* columns, the coordinates and the identifier-like fields.
dropped_prefixes = ("power", "Active", "Reactive", "week")
dropped_names = {"lat", "lon", "Unnamed: 0", "type", "ebox_id", "location", "id"}
drop_cols = [
    col for col in dff.columns
    if col.startswith(dropped_prefixes) or col in dropped_names
]
df = dff.drop(columns=drop_cols)
print("Drop done!")

# Fill any remaining gaps in the numeric columns with the column mean.
# In this case it should not actually be necessary.
df = df.fillna(df.mean(numeric_only=True))

Reading done!
Drop done!


In [6]:
# Add the binary variables and make sure the week column is a proper datetime.
df = calculate_binary_variables(df)
df["current_week"] = pd.to_datetime(df["current_week"])

# Partition the data into train / validation / test.
train, validation, test = split(df, n_weeks=100)

# Every column except the date, the targets and the future-week hours is a feature.
excluded_cols = ["current_week", "hours_next_four_weeks", "error_next_four_weeks"]
excluded_cols += [f"hours_week+{i}" for i in range(1, 5)]
cols_to_train = df.drop(columns=excluded_cols).columns

x_train, y_train = split_x_y(train, cols_to_train)
x_validation, y_validation = split_x_y(validation, cols_to_train)
x_test, y_test = split_x_y(test, cols_to_train)

0.8648401942667765
0.3588560269454507
0.3295029149558065
0.31164105809874276
2014-10-06 00:00:00
2019-04-29 00:00:00
2019-05-06 00:00:00
2021-03-29 00:00:00
2021-04-05 00:00:00
2023-03-06 00:00:00


### Train model with the whole train dataset:

In [None]:
# Candidate probability thresholds. np.arange with a float step accumulates
# rounding error (values such as 0.47000000000000003, and an end point that may
# or may not be included), so round the grid to the two decimals we mean.
prob_thresholds = np.round(np.arange(0.45, 0.55, 0.01), 2).tolist()

# Grid search over the tree depth, the number of estimators, the learning rate
# and the probability threshold; the result is a table with one row per candidate.
model_results = evaluate_models(
    max_depth_l = [1, 2, 3],
    n_estimators_l = [1, 2, 5, 10, 15, 20, 30],
    lr_l = [0.1, 0.2, 0.5],
    prob_threshold_l = prob_thresholds,
    x_train = x_train,
    y_train = y_train,
    x_validation = x_validation,
    y_validation = y_validation
)
display(model_results)

Once the grid search ends, explore the dataframe `model_results` and find the model that best fits your training and validation data. In our case we will go with the following model:

In [10]:
# Hyper-parameters chosen from the grid-search table.
chosen_params = {"max_depth_tree": 1, "n_estimators": 30, "lr": 0.5}
ada_model = train_ada_boost(x=x_train, y=y_train, **chosen_params)

# Evaluate on the test split with a probability threshold of 0.45.
test_model(model=ada_model, x=x_test, y=y_test, prob_threshold=0.45)

{'total_accuracy': 0.9647697583278902,
 'yes_accuracy': 0.9018666666666667,
 'no_accuracy': 0.9796551985258853}

Variable importance:

In [17]:
# Importance of each training column for ada_model; show only the non-zero ones.
feature_importances = return_variable_importance(
    ada_model=ada_model,
    trained_columns=cols_to_train,
)
has_importance = feature_importances["importance"].ne(0)
display(feature_importances[has_importance])

Unnamed: 0,variable,importance
0,hours_week-4,0.033333
0,hours_week-3,0.033333
0,hours_week-2,0.033333
0,hours_week-1,0.033333
0,hours_current_week,0.1
0,Temp_min_min_week-4,0.033333
0,Dew_avg_std_week-4,0.033333
0,Hum_avg_avg_week-4,0.066667
0,Pres_avg_std_week-4,0.033333
0,Temp_avg_avg_week-3,0.033333


### Sudden errors
We suspect that the model we just trained performs a very simple classification task based on the errors of the last weeks: if the model finds errors in those weeks, it predicts an error in the following four weeks, because recurrent errors are common.

From now on we will refer to these errors as **"easy errors"**. This kind of error is easy to predict; we are more interested in predicting what we will call **"sudden errors"**: errors that appear without a strong history of errors in the past weeks.

For example, a sudden error would be a lamppost that suffers an error even though all of its `hours_week-i` variables are 0, or only one of them is greater than 0.

We will define an error as a sudden error if at most two of the past weeks have errors. We consider the variable current_week to be a past week.

The first step will be to extract this kind of errors from the test data and see how the model we just trained predicts them to verify our theory

In [12]:
# Separate the test split into its "sudden" and "easy" parts, then derive the
# feature matrix and target for each part.
test_sudden, test_easy = split_easy_and_sudden_errors(test)

(x_test_sudden, y_test_sudden), (x_test_easy, y_test_easy) = (
    split_x_y(part, cols_to_train) for part in (test_sudden, test_easy)
)

Let's apply the "ada_model" to the split test dataset to verify our theory:

In [13]:
# Score the general model on the sudden-error rows only.
test_model(
    model=ada_model,
    x=x_test_sudden,
    y=y_test_sudden,
    prob_threshold=0.45,
)

{'total_accuracy': 0.2809917355371901,
 'yes_accuracy': 0.2809917355371901,
 'no_accuracy': None}

In [14]:
# Score the general model on the easy rows only.
test_model(
    model=ada_model,
    x=x_test_easy,
    y=y_test_easy,
    prob_threshold=0.45,
)

{'total_accuracy': 0.9829741311568161,
 'yes_accuracy': 0.9991979764328459,
 'no_accuracy': 0.9796551985258853}

We can see that the model fits the test_easy dataset almost perfectly. On the other hand, it performs very poorly on the test_sudden dataset. We cannot consider ada_model to be the optimal predictor because it has a limited capacity to detect sudden errors. We will have to design a better model for these cases.

To do so, we split the train, validation and test datasets in the same way before training the sub-model, so that both models are trained and evaluated on the same datasets or on subsets of them.

In [18]:
# Split the train and validation data into their "sudden" and "easy" parts.
train_sudden, train_easy = split_easy_and_sudden_errors(train)
validation_sudden, validation_easy = split_easy_and_sudden_errors(validation)

# The dataset train_sudden only has rows with "error_next_four_weeks" = "Yes", so the
# model needs some "No" rows to be able to train. We add a random sample of "No" rows
# of the same length as train_sudden and then shuffle the result.
# Both the sampling and the shuffle are seeded: without a random_state on the
# inner .sample() a different set of "No" rows is drawn on every run, so the
# trained model and its metrics would not be reproducible.
train_sudden = pd.concat(
    [
        train.loc[train["error_next_four_weeks"] == "No"].sample(len(train_sudden), random_state=42),
        train_sudden
    ],
    sort=True
).sample(frac=1.0, random_state=42)

# Same balancing for the validation data.
validation_sudden = pd.concat(
    [
        validation.loc[validation["error_next_four_weeks"] == "No"].sample(len(validation_sudden), random_state=42),
        validation_sudden
    ],
    sort=True
).sample(frac=1.0, random_state=42)

x_train_sudden, y_train_sudden = split_x_y(train_sudden, cols_to_train)
x_validation_sudden, y_validation_sudden = split_x_y(validation_sudden, cols_to_train)

In [None]:
# Candidate probability thresholds. np.arange with a float step accumulates
# rounding error (noisy values, and an end point that may or may not be
# included), so round the grid to the two decimals we mean.
prob_thresholds = np.round(np.arange(0.40, 0.60, 0.01), 2).tolist()

# Grid search for the sudden-error model on the balanced sudden datasets.
model_sudden_results = evaluate_models(
    max_depth_l = [1, 2, 3, 4],
    n_estimators_l = [1, 5, 10, 30, 100, 200, 300],
    lr_l = [0.05, 0.1, 0.2, 0.5, 0.7],
    prob_threshold_l = prob_thresholds,
    x_train = x_train_sudden,
    y_train = y_train_sudden,
    x_validation = x_validation_sudden,
    y_validation = y_validation_sudden
)

# Show only the candidates whose "no" and "yes" accuracies are both within 0.37
# of the best value found for that metric.
no_acc = model_sudden_results["no_accuracy"]
yes_acc = model_sudden_results["yes_accuracy"]
display(model_sudden_results.loc[(no_acc >= no_acc.max()-0.37) & (yes_acc >= yes_acc.max()-0.37)])

We will go with the model max_depth_tree=3, n_estimators=10, lr=0.7

In [20]:
# Hyper-parameters chosen for the sudden-error model.
sudden_params = {"max_depth_tree": 3, "n_estimators": 10, "lr": 0.7}
ada_sudden_model = train_ada_boost(x=x_train_sudden, y=y_train_sudden, **sudden_params)

# Evaluate on the sudden-error rows of the test split (threshold 0.48).
test_model(
    model=ada_sudden_model,
    x=x_test_sudden,
    y=y_test_sudden,
    prob_threshold=0.48,
)

{'total_accuracy': 0.5269578905942542,
 'yes_accuracy': 0.5269578905942542,
 'no_accuracy': None}

We managed to almost double our accuracy on the sudden errors with this model (from about 0.28 to about 0.53). Let's now check the accuracy on the whole test dataset:

In [21]:
# Evaluate the sudden-error model on the complete test split (threshold 0.48).
test_model(model=ada_sudden_model, x=x_test, y=y_test, prob_threshold=0.48)

{'total_accuracy': 0.6450951175702155,
 'yes_accuracy': 0.8564266666666667,
 'no_accuracy': 0.5950854431178535}

In [22]:
# Importance of each training column for the sudden-error model;
# only the columns with a non-zero importance are displayed.
feature_importances = return_variable_importance(
    ada_model=ada_sudden_model,
    trained_columns=cols_to_train,
)
has_importance = feature_importances["importance"].ne(0)
display(feature_importances[has_importance])

Unnamed: 0,variable,importance
0,hours_week-4,0.025177
0,hours_week-3,0.038182
0,hours_week-2,0.027792
0,hours_week-1,0.013506
0,hours_current_week,0.097503
0,Temp_avg_std_week-4,0.018184
0,Temp_min_min_week-4,0.001437
0,Dew_avg_std_week-4,0.028706
0,Dew_min_min_week-4,0.019918
0,Hum_max_max_week-4,8.2e-05


We can see that the model focuses more on all the other variables instead of focusing a lot on the hours

The new accuracies are not as good as the first model but we are sure that we can detect better the sudden errors. The strategy that we will follow to do predictions will use a combination of the two models. Depending on the row that we want to predict we will have two possibilities:
* The first one will be when the row has three or more weeks with errors in the past weeks (remember that we consider current_week as a past week). In this case we will use the model "ada_model"
* The second case will be when the row has two or fewer weeks with errors in the past weeks. In this case we will use the model "ada_sudden_model".

Let's see if this strategy works better:

In [23]:
# Evaluate the combined strategy on the whole test split using both models.
# The last two arguments are presumably the probability thresholds for
# ada_model (0.45) and ada_sudden_model (0.48), the values used when testing
# each model above. TODO confirm against test_adboc_model's signature.
acc, preds = test_adboc_model(
    x_test,
    y_test,
    ada_model,
    ada_sudden_model,
    0.45,
    0.48
)
# Display the metrics so that the conclusion below is backed by a visible output.
acc

We get a slight increase in the accuracy so this is the winner!

### Export the best models:

In [None]:
# Pickle both fitted models, one file per model.
models_to_export = {
    "predictive_models/ada_model.pk1": ada_model,
    "predictive_models/ada_sudden_model.pk1": ada_sudden_model,
}
for export_path, fitted_model in models_to_export.items():
    with open(export_path, "wb") as out_file:
        pickle.dump(fitted_model, out_file)

In [None]:
# Store the probability threshold used with each model next to the pickled models.
prob_thresholds_to_export = {"prob_ada_model": 0.45, "prob_sudden_model": 0.48}
with open("predictive_models/ada_prob.pk1", "wb") as out_file:
    pickle.dump(prob_thresholds_to_export, out_file)

## "default" model


### Drop nan rows:

In [24]:
# Reload the preprocessed lights dataset (adjust the path to your machine).
dff = pd.read_csv("/home/leibniz/Desktop/IHMAN/preprocessing_results/out_lights.csv")
print("Reading done!")

# Keep only the rows that have a value for ActivePeak_current_week.
has_active_peak = dff["ActivePeak_current_week"].notna()
df = dff.loc[has_active_peak]

# Drop the columns that are useless for the model; the readings columns stay.
dropped_names = {"lat", "lon", "Unnamed: 0", "type", "ebox_id", "location", "id"}
drop_cols = [
    col for col in df.columns
    if col.startswith("week") or col in dropped_names
]
df = df.drop(columns=drop_cols)

# Fill the missing values that are left with the column mean (numeric columns only).
df = df.fillna(df.mean(numeric_only=True))

Reading done!


In [25]:
# Add the binary variables and make sure the week column is a proper datetime.
df = calculate_binary_variables(df)
df["current_week"] = pd.to_datetime(df["current_week"])

# Partition the data into train / validation / test.
train, validation, test = split(df, n_weeks=77)

# Every column except the date, the targets and the future-week hours is a feature.
excluded_cols = ["current_week", "hours_next_four_weeks", "error_next_four_weeks"]
excluded_cols += [f"hours_week+{i}" for i in range(1, 5)]
cols_to_train = df.drop(columns=excluded_cols).columns

x_train, y_train = split_x_y(train, cols_to_train)
x_validation, y_validation = split_x_y(validation, cols_to_train)
x_test, y_test = split_x_y(test, cols_to_train)

0.7606851470373555
0.32915929993025256
0.38210248008953623
0.2887382199802112
2017-03-20 00:00:00
2020-03-16 00:00:00
2020-03-23 00:00:00
2021-09-06 00:00:00
2021-09-13 00:00:00
2023-03-06 00:00:00


### Train model with the whole train dataset:

In [None]:
# Candidate probability thresholds. np.arange with a float step accumulates
# rounding error (values such as 0.47000000000000003, and an end point that may
# or may not be included), so round the grid to the two decimals we mean.
prob_thresholds = np.round(np.arange(0.45, 0.55, 0.02), 2).tolist()

# Grid search over the tree depth, the number of estimators, the learning rate
# and the probability threshold.
model_results = evaluate_models(
    max_depth_l = [1, 2, 3],
    n_estimators_l = [1, 2, 5, 10, 15, 20, 30],
    lr_l = [0.1, 0.2, 0.5],
    prob_threshold_l = prob_thresholds,
    x_train = x_train,
    y_train = y_train,
    x_validation = x_validation,
    y_validation = y_validation
)

# Show only the candidates whose "yes" accuracy is within 0.05 of the best one.
yes_acc = model_results["yes_accuracy"]
display(model_results.loc[yes_acc >= yes_acc.max()-0.05])

We are going with the model lr=0.5, n_estimators=30, prob=0.47, depth=1

In [26]:
# Hyper-parameters chosen from the grid-search table.
chosen_params = {"max_depth_tree": 1, "n_estimators": 30, "lr": 0.5}
ada_model = train_ada_boost(x=x_train, y=y_train, **chosen_params)

# Evaluate on the test split with a probability threshold of 0.47.
test_model(model=ada_model, x=x_test, y=y_test, prob_threshold=0.47)

{'total_accuracy': 0.9632043143643616,
 'yes_accuracy': 0.91628145865434,
 'no_accuracy': 0.9808636486643213}

In [27]:
# Importance of each training column for ada_model;
# only the columns with a non-zero importance are displayed.
feature_importances = return_variable_importance(
    ada_model=ada_model,
    trained_columns=cols_to_train,
)
has_importance = feature_importances["importance"].ne(0)
display(feature_importances[has_importance])

Unnamed: 0,variable,importance
0,hours_week-4,0.033333
0,hours_week-2,0.033333
0,hours_week-1,0.033333
0,hours_current_week,0.133333
0,ReactivePeak_week-1,0.033333
0,powerReactive_p2_week-1,0.033333
0,powerReactive_p2_current_week,0.033333
0,powerReactive_p3_current_week,0.066667
0,ActivePeak_week-4,0.033333
0,powerActive_p3_week-4,0.033333


We can see that in this case the model does use the readings for doing predictions

### Sudden errors
We suspect that the model we just trained performs a very simple classification task based on the errors of the last weeks: if the model finds errors in those weeks, it predicts an error in the following four weeks, because recurrent errors are common.

From now on we will refer to these errors as **"easy errors"**. This kind of error is easy to predict; we are more interested in predicting what we will call **"sudden errors"**: errors that appear without a strong history of errors in the past weeks.

For example, a sudden error would be a lamppost that suffers an error even though all of its `hours_week-i` variables are 0, or only one of them is greater than 0.

We will define an error as a sudden error if at most two of the past weeks have errors. We consider the variable current_week to be a past week.

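The first step will be to extract this kind of error from the test data and see how the model we just trained predicts it:

```python
test_sudden, test_easy = split_easy_and_sudden_errors(test)

x_test_sudden, y_test_sudden = split_x_y(test_sudden, cols_to_train)
x_test_easy, y_test_easy = split_x_y(test_easy, cols_to_train)
```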

In [28]:
# Separate the test split into its "sudden" and "easy" parts, then derive the
# feature matrix and target for each part.
test_sudden, test_easy = split_easy_and_sudden_errors(test)

(x_test_sudden, y_test_sudden), (x_test_easy, y_test_easy) = (
    split_x_y(part, cols_to_train) for part in (test_sudden, test_easy)
)

Let's apply the model "ada_model" to the split test dataset to verify our theory:

In [29]:
# Score the general model on the sudden-error rows only.
test_model(
    model=ada_model,
    x=x_test_sudden,
    y=y_test_sudden,
    prob_threshold=0.47,
)

{'total_accuracy': 0.2453748782862707,
 'yes_accuracy': 0.2453748782862707,
 'no_accuracy': None}

In [30]:
# Score the general model on the easy rows only.
test_model(
    model=ada_model,
    x=x_test_easy,
    y=y_test_easy,
    prob_threshold=0.47,
)

{'total_accuracy': 0.984526391901663,
 'yes_accuracy': 0.9954065227377125,
 'no_accuracy': 0.9808636486643213}

We can see that the model fits the test_easy dataset almost perfectly. On the other hand, it performs very poorly on the test_sudden dataset. We cannot consider ada_model to be the optimal predictor because it has a limited capacity to detect sudden errors. We will have to design a better model for these cases.

To do so, we split the train, validation and test datasets in the same way before training the sub-model, so that both models are trained and evaluated on the same datasets or on subsets of them.

In [31]:
# Split the train and validation data into their "sudden" and "easy" parts.
train_sudden, train_easy = split_easy_and_sudden_errors(train)
validation_sudden, validation_easy = split_easy_and_sudden_errors(validation)

# The dataset train_sudden only has rows with "error_next_four_weeks" = "Yes", so the
# model needs some "No" rows to be able to train. We add a random sample of "No" rows
# of the same length as train_sudden and then shuffle the result.
# Both the sampling and the shuffle are seeded: without a random_state on the
# inner .sample() a different set of "No" rows is drawn on every run, so the
# trained model and its metrics would not be reproducible.
train_sudden = pd.concat(
    [
        train.loc[train["error_next_four_weeks"] == "No"].sample(len(train_sudden), random_state=42),
        train_sudden
    ],
    sort=True
).sample(frac=1.0, random_state=42)

# Same balancing for the validation data.
validation_sudden = pd.concat(
    [
        validation.loc[validation["error_next_four_weeks"] == "No"].sample(len(validation_sudden), random_state=42),
        validation_sudden
    ],
    sort=True
).sample(frac=1.0, random_state=42)

x_train_sudden, y_train_sudden = split_x_y(train_sudden, cols_to_train)
x_validation_sudden, y_validation_sudden = split_x_y(validation_sudden, cols_to_train)

In [None]:
# Candidate probability thresholds. np.arange with a float step accumulates
# rounding error (noisy values, and an end point that may or may not be
# included), so round the grid to the two decimals we mean.
prob_thresholds = np.round(np.arange(0.45, 0.55, 0.02), 2).tolist()

# Grid search for the sudden-error model on the balanced sudden datasets.
model_sudden_results = evaluate_models(
    max_depth_l = [1, 2, 3],
    n_estimators_l = [1, 10, 30, 100],
    lr_l = [0.1, 0.2, 0.5],
    prob_threshold_l = prob_thresholds,
    x_train = x_train_sudden,
    y_train = y_train_sudden,
    x_validation = x_validation_sudden,
    y_validation = y_validation_sudden
)

# Show only the candidates whose "no" and "yes" accuracies are both within 0.37
# of the best value found for that metric.
no_acc = model_sudden_results["no_accuracy"]
yes_acc = model_sudden_results["yes_accuracy"]
display(model_sudden_results.loc[(no_acc >= no_acc.max()-0.37) & (yes_acc >= yes_acc.max()-0.37)])

We will go with the model max_depth = 3, n_estimators=100, lr=0.2, prob=0.49

In [32]:
# Hyper-parameters chosen for the sudden-error model.
sudden_params = {"max_depth_tree": 3, "n_estimators": 100, "lr": 0.2}
ada_sudden_model = train_ada_boost(x=x_train_sudden, y=y_train_sudden, **sudden_params)

# Evaluate on the sudden-error rows of the test split (threshold 0.49).
test_model(
    model=ada_sudden_model,
    x=x_test_sudden,
    y=y_test_sudden,
    prob_threshold=0.49,
)

{'total_accuracy': 0.5666991236611489,
 'yes_accuracy': 0.5666991236611489,
 'no_accuracy': None}

We are not getting much better results than when dropping the readings columns. Since this model is based on much less data and more columns, there is no need to develop it further with the adboc approach.

### Export the best model:

In [None]:
# Pickle the general model trained with the readings columns...
with open("predictive_models/ada_model_readings.pk1", "wb") as model_file:
    pickle.dump(ada_model, model_file)

# ...and the probability threshold it is used with.
readings_thresholds = {"prob_ada_model": 0.47}
with open("predictive_models/ada_prob_readings.pk1", "wb") as prob_file:
    pickle.dump(readings_thresholds, prob_file)

# Model for eboxes:

## Drop the readings columns:

In [33]:
# Load the preprocessed eboxes dataset (adjust the path to your machine).
dff = pd.read_csv("/home/leibniz/Desktop/IHMAN/preprocessing_results/out_eboxes.csv")
print("Reading done!")

# Columns left out of this model: every readings column (power*/Active*/Reactive*),
# the week* columns, the coordinates and the identifier-like fields.
dropped_prefixes = ("power", "Active", "Reactive", "week")
dropped_names = {"lat", "lon", "Unnamed: 0", "type", "ebox_id", "location", "id"}
drop_cols = [
    col for col in dff.columns
    if col.startswith(dropped_prefixes) or col in dropped_names
]
df = dff.drop(columns=drop_cols)
print("Drop done!")

# Fill any remaining gaps in the numeric columns with the column mean.
df = df.fillna(df.mean(numeric_only=True))

Reading done!
Drop done!


In [34]:
# Add the binary variables and make sure the week column is a proper datetime.
df = calculate_binary_variables(df)
df["current_week"] = pd.to_datetime(df["current_week"])

# Partition the data into train / validation / test.
train, validation, test = split(df, n_weeks=80)

# Every column except the date, the targets and the future-week hours is a feature.
excluded_cols = ["current_week", "hours_next_four_weeks", "error_next_four_weeks"]
excluded_cols += [f"hours_week+{i}" for i in range(1, 5)]
cols_to_train = df.drop(columns=excluded_cols).columns

x_train, y_train = split_x_y(train, cols_to_train)
x_validation, y_validation = split_x_y(validation, cols_to_train)
x_test, y_test = split_x_y(test, cols_to_train)

0.6905388399496625
0.3274224917057545
0.3386340235670976
0.3339434847271479
2015-06-08 00:00:00
2020-01-20 00:00:00
2020-01-27 00:00:00
2021-08-02 00:00:00
2021-08-09 00:00:00
2023-02-20 00:00:00


### Train model with the whole train dataset:

In [None]:
# Candidate probability thresholds. np.arange with a float step accumulates
# rounding error (noisy values, and an end point that may or may not be
# included), so round the grid to the two decimals we mean.
prob_thresholds = np.round(np.arange(0.46, 0.5, 0.01), 2).tolist()

# Grid search over the number of estimators, the learning rate and the
# probability threshold (the tree depth is fixed to 1 here).
model_results = evaluate_models(
    max_depth_l = [1],
    n_estimators_l = [10, 20],
    lr_l = [0.5, 0.7, 0.8],
    prob_threshold_l = prob_thresholds,
    x_train = x_train,
    y_train = y_train,
    x_validation = x_validation,
    y_validation = y_validation
)

# Show only the candidates whose "yes" and "no" accuracies are both within 0.37
# of the best value found for that metric.
yes_acc = model_results["yes_accuracy"]
no_acc = model_results["no_accuracy"]
display(model_results.loc[(yes_acc >= yes_acc.max()-0.37) & (no_acc >= no_acc.max()-0.37)])

We are going with the model lr=0.5, n_estimators=10, prob_threshold=0.48, tree_depth=1

In [37]:
# Hyper-parameters chosen from the grid-search table.
chosen_params = {"max_depth_tree": 1, "n_estimators": 10, "lr": 0.5}
ada_model = train_ada_boost(x=x_train, y=y_train, **chosen_params)

# Evaluate on the test split with a probability threshold of 0.48.
test_model(model=ada_model, x=x_test, y=y_test, prob_threshold=0.48)

{'total_accuracy': 0.7115450496745461,
 'yes_accuracy': 0.5689381933438986,
 'no_accuracy': 0.7508741258741258}

In [38]:
# Importance of each training column for ada_model;
# only the columns with a non-zero importance are displayed.
feature_importances = return_variable_importance(
    ada_model=ada_model,
    trained_columns=cols_to_train,
)
has_importance = feature_importances["importance"].ne(0)
display(feature_importances[has_importance])

Unnamed: 0,variable,importance
0,hours_week-4,0.1
0,hours_week-3,0.1
0,hours_week-2,0.1
0,hours_week-1,0.1
0,hours_current_week,0.1
0,Pres_min_min_week-4,0.1
0,Wind_max_max_week-3,0.1
0,Wind_avg_avg_week-3,0.1
0,Dew_max_max_current_week,0.1
0,Pres_min_min_current_week,0.1


# Let's test with dropping the NaN rows and keeping the readings

In [39]:
# Keep only the rows that have a value for ActivePeak_current_week.
# `dff` is expected to still hold the eboxes data read earlier in this section.
has_active_peak = dff["ActivePeak_current_week"].notna()
df = dff.loc[has_active_peak]

# Drop the columns that are useless for the model; the readings columns stay.
dropped_names = {"lat", "lon", "Unnamed: 0", "type", "ebox_id", "location", "id"}
drop_cols = [
    col for col in df.columns
    if col.startswith("week") or col in dropped_names
]
df = df.drop(columns=drop_cols)

# Fill the missing values that are left with the column mean (numeric columns only).
df = df.fillna(df.mean(numeric_only=True))

In [42]:
# Add the binary variables and make sure the week column is a proper datetime.
df = calculate_binary_variables(df)
df["current_week"] = pd.to_datetime(df["current_week"])

# Partition the data into train / validation / test.
train, validation, test = split(df, n_weeks=60)

# Every column except the date, the targets and the future-week hours is a feature.
excluded_cols = ["current_week", "hours_next_four_weeks", "error_next_four_weeks"]
excluded_cols += [f"hours_week+{i}" for i in range(1, 5)]
cols_to_train = df.drop(columns=excluded_cols).columns

x_train, y_train = split_x_y(train, cols_to_train)
x_validation, y_validation = split_x_y(validation, cols_to_train)
x_test, y_test = split_x_y(test, cols_to_train)

0.6592943654555029
0.41758820431806215
0.28949447077409163
0.2929173249078462
2015-06-08 00:00:00
2020-06-08 00:00:00
2020-06-15 00:00:00
2021-08-02 00:00:00
2021-08-09 00:00:00
2022-10-03 00:00:00


### Train model with the whole train dataset:

In [None]:
# Candidate probability thresholds. np.arange with a float step accumulates
# rounding error (noisy values, and an end point that may or may not be
# included), so round the grid to the two decimals we mean.
# NOTE(review): starting at 0.40 with a 0.02 step, this grid seems to contain
# only even hundredths, yet the text below picks prob_threshold=0.47. Confirm
# which grid produced that choice.
prob_thresholds = np.round(np.arange(0.40, 0.55, 0.02), 2).tolist()

# Grid search over the tree depth, the number of estimators, the learning rate
# and the probability threshold.
model_results = evaluate_models(
    max_depth_l = [1, 2, 3],
    n_estimators_l = [1, 2, 5, 10, 15],
    lr_l = [0.1, 0.2, 0.5],
    prob_threshold_l = prob_thresholds,
    x_train = x_train,
    y_train = y_train,
    x_validation = x_validation,
    y_validation = y_validation
)

# Show only the candidates whose "yes" and "no" accuracies are both within 0.33
# of the best value found for that metric.
yes_acc = model_results["yes_accuracy"]
no_acc = model_results["no_accuracy"]
display(model_results.loc[(yes_acc >= yes_acc.max()-0.33) & (no_acc >= no_acc.max()-0.33)])

We will go with the model lr=0.5, n_estimators=15, prob_threshold=0.47, tree_depth=1

In [43]:
# Hyper-parameters chosen from the grid-search table. The result is stored under
# a separate name, so `ada_model` is not overwritten by this cell.
readings_params = {"max_depth_tree": 1, "n_estimators": 15, "lr": 0.5}
ada_model_readings = train_ada_boost(x=x_train, y=y_train, **readings_params)

# Evaluate on the test split with a probability threshold of 0.47.
test_model(model=ada_model_readings, x=x_test, y=y_test, prob_threshold=0.47)

{'total_accuracy': 0.29842696629213483,
 'yes_accuracy': 0.8971428571428571,
 'no_accuracy': 0.11352941176470588}

We get worse results so for the sake of simplicity we are going to just use the model that does not use the readings

### Export the best model (Not the one we have trained in the last line)

In [None]:
# Pickle `ada_model` (not `ada_model_readings`, which was trained in the last cell)...
with open("predictive_models/ada_model_eboxes.pk1", "wb") as model_file:
    pickle.dump(ada_model, model_file)

# ...and the probability threshold it is used with.
eboxes_thresholds = {"prob_ada_model": 0.48}
with open("predictive_models/ada_prob_eboxes.pk1", "wb") as prob_file:
    pickle.dump(eboxes_thresholds, prob_file)