In [1]:
import json
import pathlib
from IPython.display import display, Markdown

import toml
import pandas as pd
import mlflow
import mlflow.sklearn
from hyperopt.pyll import scope
from tpot import TPOTClassifier
from imblearn.combine import SMOTETomek
from hyperopt import (fmin, tpe,
                      hp, Trials,
                      STATUS_OK)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix,
                             average_precision_score)

# Read TPOT configuration from toml file
config = toml.load("../config.toml")
paths = config['paths']
mlflow_tracking_uri = pathlib.Path(paths['mlflow']['tracking_uri'])
SEED = config["settings"]["general"]["seed"]

# Load the dataset
df = pd.read_parquet(paths['pre']["data"])

# Display the first few rows to ensure the data is loaded correctly
display(Markdown(f'The dataset has {df.shape[0]:,} rows and {df.shape[1]:,} columns.'))
display(Markdown(f'The dataset has the following columns: {", ".join(df.columns)}.'))
display(df.sample(3))
display(df.describe())

The dataset has 40,000 rows and 22 columns.

The dataset has the following columns: y, job_blue-collar, job_entrepreneur, job_housemaid, job_management, job_retired, job_self-employed, job_services, job_student, job_technician, job_unemployed, marital_married, marital_single, education_secondary, education_tertiary, default_yes, housing_yes, loan_yes, contact_telephone, high_balance, age_group_18-30, age_group_60+.

Unnamed: 0,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,marital_single,education_secondary,education_tertiary,default_yes,housing_yes,loan_yes,contact_telephone,high_balance,age_group_18-30,age_group_60+
37202,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,1,0
9033,0,0,0,0,0,0,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
6079,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0


Unnamed: 0,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,marital_single,education_secondary,education_tertiary,default_yes,housing_yes,loan_yes,contact_telephone,high_balance,age_group_18-30,age_group_60+
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,...,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,0.0724,0.234575,0.035125,0.027175,0.20415,0.035925,0.03535,0.09775,0.0131,0.1713,...,0.272225,0.524825,0.28015,0.020225,0.600775,0.17325,0.058025,0.39795,0.145975,0.00585
std,0.259152,0.423738,0.184098,0.162595,0.403084,0.186106,0.184665,0.29698,0.113704,0.376776,...,0.445111,0.49939,0.449078,0.140771,0.489745,0.378468,0.233794,0.489481,0.353086,0.076262
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [2]:
# Separate features and target variable
y = (pd.read_parquet(paths['post']["data"], columns=['number_calls'])==1).astype(int)
# y = df['y']  # Target variable
X = df.drop('y', axis=1)  # Features

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['uint8']).columns.tolist()  # Treat uint8 as categorical

# Debugging: Check if the columns are being correctly identified
print(f"Categorical columns (uint8): {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Preprocessing pipeline for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical data (if needed, though it may not require any transformation)
# We assume categorical columns are already encoded, so no OneHotEncoder is used here.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))  # Just in case there are any NaNs
])

# Combine preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply preprocessing to both numerical and categorical columns
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Convert the processed data into a DataFrame with correct column names
X_train_processed = pd.DataFrame(X_train_processed, columns=preprocessor.get_feature_names_out())
X_test_processed = pd.DataFrame(X_test_processed, columns=preprocessor.get_feature_names_out())

# Apply SMOTETomek on the training data
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train_processed, y_train)

# Convert back to DataFrame if needed
df_train_balanced = pd.DataFrame(X_train_balanced, columns=preprocessor.get_feature_names_out())

# Now your balanced data is ready for further use
df_train_balanced

Categorical columns (uint8): ['job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'marital_married', 'marital_single', 'education_secondary', 'education_tertiary', 'default_yes', 'housing_yes', 'loan_yes', 'contact_telephone', 'age_group_18-30', 'age_group_60+']
Numerical columns: []


Unnamed: 0,cat__job_blue-collar,cat__job_entrepreneur,cat__job_housemaid,cat__job_management,cat__job_retired,cat__job_self-employed,cat__job_services,cat__job_student,cat__job_technician,cat__job_unemployed,cat__marital_married,cat__marital_single,cat__education_secondary,cat__education_tertiary,cat__default_yes,cat__housing_yes,cat__loan_yes,cat__contact_telephone,cat__age_group_18-30,cat__age_group_60+
0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57249,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
57250,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0
57251,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0
57252,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0


In [3]:
# Define a function to log parameters, metrics, and model to MLflow
def log_results(y_test, y_pred, model, params):
    # Log hyperparameters
    mlflow.log_param('n_estimators', params.get('n_estimators', 'N/A'))  # Handle missing params for TPOT
    mlflow.log_param('max_depth', params.get('max_depth', 'N/A'))  # Handle missing params for TPOT
    mlflow.log_param('min_samples_split', params.get('min_samples_split', 'N/A'))  # Handle missing params for TPOT

    # Log key metrics
    mlflow.log_metric('precision', precision_score(y_test, y_pred))
    mlflow.log_metric('recall', recall_score(y_test, y_pred))
    mlflow.log_metric('f1_score', f1_score(y_test, y_pred))
    mlflow.log_metric('average_precision', average_precision_score(y_test, y_pred))

    # Log classification report as a text artifact
    mlflow.log_text(classification_report(y_test, y_pred), artifact_file="classification_report.txt")

    # Log confusion matrix as a text artifact
    mlflow.log_text(str(confusion_matrix(y_test, y_pred)), artifact_file="confusion_matrix.txt")

    # Log the model
    mlflow.sklearn.log_model(model, "model")

# Define the objective function
def objective(params):
    with mlflow.start_run(nested=True):  # Create a new nested run for each evaluation
        # Extract hyperparameters
        n_estimators = int(params['n_estimators'])
        max_depth = int(params['max_depth'])
        min_samples_split = int(params['min_samples_split'])

        # Define and train the model
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42
        )
        model.fit(X_train, y_train)

        # Make predictions and calculate metrics
        y_pred = model.predict(X_test)
        precision = precision_score(y_test, y_pred)

        # Log results to MLflow
        log_results(y_test, y_pred, model, params)

        # Return a dictionary with status and loss (to minimize)
        return {'loss': -precision, 'status': STATUS_OK}

# Define the search space for Hyperopt
space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 200, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
}

# Create a Trials object to store information about the optimization process 
trials = Trials()

# Start the MLflow run and capture the existing run ID and tracking URI
with mlflow.start_run() as run:
    run_id = run.info.run_id
    mlflow.set_tracking_uri(mlflow_tracking_uri)

    # Run Hyperopt
    best = fmin(
        fn=objective,  # Objective function
        space=space,  # Search space
        algo=tpe.suggest,  # Tree-structured Parzen Estimator
        max_evals=50,  # Number of evaluations (can be adjusted)
        trials=trials  # Store results of each evaluation
    )

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  return fit_method(estimator, *args, **kwargs)




  2%|▏         | 1/50 [00:10<08:20, 10.21s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




  4%|▍         | 2/50 [00:16<06:15,  7.83s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




  6%|▌         | 3/50 [00:23<05:42,  7.28s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




  8%|▊         | 4/50 [00:28<05:00,  6.54s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 10%|█         | 5/50 [00:33<04:32,  6.05s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 12%|█▏        | 6/50 [00:38<04:12,  5.75s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




 14%|█▍        | 7/50 [00:43<03:53,  5.43s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 16%|█▌        | 8/50 [00:50<04:03,  5.79s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 18%|█▊        | 9/50 [00:55<03:49,  5.59s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 20%|██        | 10/50 [01:01<03:52,  5.81s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 22%|██▏       | 11/50 [01:08<04:03,  6.23s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 24%|██▍       | 12/50 [01:13<03:44,  5.92s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 26%|██▌       | 13/50 [01:20<03:47,  6.15s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 28%|██▊       | 14/50 [01:27<03:45,  6.25s/trial, best loss: -0.8942548921224285]

  return fit_method(estimator, *args, **kwargs)




 30%|███       | 15/50 [01:34<03:46,  6.46s/trial, best loss: -0.8942620897018291]

  return fit_method(estimator, *args, **kwargs)




 32%|███▏      | 16/50 [01:39<03:30,  6.20s/trial, best loss: -0.8942620897018291]

  return fit_method(estimator, *args, **kwargs)




 34%|███▍      | 17/50 [01:45<03:24,  6.21s/trial, best loss: -0.8942620897018291]

  return fit_method(estimator, *args, **kwargs)




 36%|███▌      | 18/50 [01:52<03:18,  6.20s/trial, best loss: -0.8942620897018291]

  return fit_method(estimator, *args, **kwargs)




 38%|███▊      | 19/50 [01:57<03:06,  6.01s/trial, best loss: -0.8942620897018291]

  return fit_method(estimator, *args, **kwargs)




 40%|████      | 20/50 [02:02<02:52,  5.74s/trial, best loss: -0.8942620897018291]

  return fit_method(estimator, *args, **kwargs)




 42%|████▏     | 21/50 [02:07<02:41,  5.58s/trial, best loss: -0.8942742340532396]

  return fit_method(estimator, *args, **kwargs)




 44%|████▍     | 22/50 [02:13<02:32,  5.43s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 46%|████▌     | 23/50 [02:18<02:24,  5.36s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 48%|████▊     | 24/50 [02:23<02:17,  5.29s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 50%|█████     | 25/50 [02:28<02:10,  5.22s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 52%|█████▏    | 26/50 [02:33<02:06,  5.29s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 54%|█████▍    | 27/50 [02:39<02:01,  5.30s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 56%|█████▌    | 28/50 [02:44<01:58,  5.40s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 58%|█████▊    | 29/50 [02:49<01:50,  5.27s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 60%|██████    | 30/50 [02:54<01:42,  5.14s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 62%|██████▏   | 31/50 [02:59<01:37,  5.12s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 64%|██████▍   | 32/50 [03:05<01:35,  5.31s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 66%|██████▌   | 33/50 [03:10<01:28,  5.20s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 68%|██████▊   | 34/50 [03:16<01:27,  5.46s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 70%|███████   | 35/50 [03:21<01:20,  5.38s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 72%|███████▏  | 36/50 [03:27<01:16,  5.46s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 74%|███████▍  | 37/50 [03:32<01:08,  5.25s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




 76%|███████▌  | 38/50 [03:36<01:01,  5.12s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 78%|███████▊  | 39/50 [03:42<00:57,  5.27s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




 80%|████████  | 40/50 [03:46<00:50,  5.02s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 82%|████████▏ | 41/50 [03:53<00:48,  5.38s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 84%|████████▍ | 42/50 [03:58<00:43,  5.44s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 86%|████████▌ | 43/50 [04:03<00:37,  5.31s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 88%|████████▊ | 44/50 [04:09<00:33,  5.53s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 90%|█████████ | 45/50 [04:14<00:26,  5.30s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 92%|█████████▏| 46/50 [04:19<00:20,  5.15s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 94%|█████████▍| 47/50 [04:24<00:15,  5.28s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 96%|█████████▌| 48/50 [04:31<00:11,  5.59s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




 98%|█████████▊| 49/50 [04:37<00:05,  5.68s/trial, best loss: -0.8943732730469731]

  return fit_method(estimator, *args, **kwargs)




100%|██████████| 50/50 [04:43<00:00,  5.67s/trial, best loss: -0.8943732730469731]
100%|██████████| 50/50 [04:43<00:00,  5.67s/trial, best loss: -0.8943732730469731]


In [86]:
print(f"Best Hyperparameters: {best}")
# print(f"Existing Run ID: {run_id}")
print(f"MLflow Tracking URI: {mlflow_tracking_uri}")

# Calculate precision, recall, and F1 score
model = RandomForestClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_split=int(best['min_samples_split']),
    random_state=42
)
model.fit(df_train_balanced, y_train_balanced)
y_pred = model.predict(X_test)

Best Hyperparameters: {'max_depth': 27.0, 'min_samples_split': 2.0, 'n_estimators': 81.0}
MLflow Tracking URI: ../mlruns/pre/


  return fit_method(estimator, *args, **kwargs)


In [101]:
# Calculate the True possitive rate
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Display the confusion matrix
display(Markdown("Confusion Matrix _(normalized: True):_"))
display(pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, normalize='true').round(3),
    columns=["0", "1"],
    index=["0", "1"]
    )
)

print(f"F1 Score: {f1_score(y_test, y_pred):.3f}") # Calculate the F1 score
print(f"precision_score: {precision_score(y_test, y_pred):.3f}")  # Calculate the precision
print(f"True Positive Rate: {tp / (tp + fn):.3f}")

# Calculate the False possitive rate
print(f"False Positive Rate: {fp / (fp + tn):.3f}")
display(Markdown("Confusion Matrix _(normalized: pred):_"))
cm_pred = confusion_matrix(y_test, y_pred, normalize='pred')
display(pd.DataFrame(cm_pred, columns=["0", "1"], index=["0", "1"]))

Confusion Matrix _(normalized: True):_

Unnamed: 0,0,1
0,0.475,0.525
1,0.397,0.603


F1 Score: 0.725
precision_score: 0.907
True Positive Rate: 0.603
False Positive Rate: 0.525


Confusion Matrix _(normalized: pred):_

Unnamed: 0,0,1
0,0.124112,0.093468
1,0.875888,0.906532


## TPOT

Exploring different models with TPOT:

1. Config 1 as a 5 generations default TPOT config scoring over f1.

In [22]:
def ensure_path(path, filename):
    path = Path(path).joinpath(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    return path

path_tpot = Path(paths["TPOT"]["base"])

# Define the TPOT configuration
path_checkpoint = ensure_path(path=paths["TPOT"]["periodic_checkpoint_folder"],
            filename="checkpoint_{run_id}")

path_pipeline = ensure_path(path_tpot, filename="pipelines/pipeline_{run_id}.py")
path_log = ensure_path(path_tpot, filename="logs/log_{run_id}.txt")

In [None]:
tpot = TPOTClassifier(**config["TPOT"]["one"],
                      log_file = path_log,
                      periodic_checkpoint_folder = path_checkpoint,
                      )

# Ensure target labels are in 1-D array format
if isinstance(y_train_balanced, pd.DataFrame):
    y_train_balanced = y_train_balanced["number_calls"]  # Convert to pandas Series (1-D array)

# Now fit TPOT with the balanced training data
tpot.fit(df_train_balanced, y_train_balanced)

In [7]:
# Score the model
print("Test Score:", tpot.score(X_test_processed, y_test))

# Export the generated pipeline
tpot.export(path_pipeline)

# Log the TPOT pipeline
with open(path_log, 'r') as file:
    pipeline_code = file.read()  # Read the pipeline code
    mlflow.log_text(pipeline_code, artifact_file="pipeline_code.py")

  y = column_or_1d(y, warn=True)


Test Score: 0.9076280041797283


In [8]:
# Set a new experiment for TPOT
tpot_experiment_name = "TPOT_Model_Experiment"
mlflow.set_experiment(tpot_experiment_name)

# Assume y_pred is obtained from the trained TPOT model
y_pred = tpot.predict(X_test_processed)

# Set the tracking URI if needed (optional)
mlflow.set_tracking_uri(mlflow_tracking_uri)  # Use the captured tracking URI

# Log metrics to a new run in the TPOT experiment
with mlflow.start_run(nested=True) as run:  # Start a new run for the TPOT experiment
    # Log results using the log_results function
    log_results(y_test, y_pred, tpot.fitted_pipeline_, {})  # Pass empty params for TPOT since they may not exist

print("TPOT metrics and artifacts have been logged to a new MLflow experiment.")

2024/10/02 11:01:22 INFO mlflow.tracking.fluent: Experiment with name 'TPOT_Model_Experiment' does not exist. Creating a new experiment.




TPOT metrics and artifacts have been logged to a new MLflow experiment.


## Experiment 2

In [10]:
with open(path_tpot.join("classifier_config_dict.json"), "r") as file:
    classifier_config_dict = json.load(file)
    
# change the types to int8 for X_train
X_train = X_train.astype('int8')
y_train = y_train.astype('int8')

# balance the datasets using SMOTEENN
smote_tomek = SMOTETomek(random_state=42)

X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)

# convert the balanced datasets to DataFrame
df_train_balanced = pd.DataFrame(X_train_balanced, columns=X_train.columns)

In [19]:
# create a new experiment for the classifier
classifier_experiment_name = "Classifier_Model_Experiment"

tpot = TPOTClassifier(**config["TPOT"]["two"],
                      config_dict=classifier_config_dict)
tpot.fit(df_train_balanced, y_train_balanced)

  y = column_or_1d(y, warn=True)


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: KNeighborsClassifier(ZeroCount(input_matrix), n_neighbors=93, p=2, weights=distance)


In [56]:
# Set a new experiment for TPOT
tpot_experiment_name = "TPOT_Model_Experiment"
mlflow.set_experiment(tpot_experiment_name)

# Assume y_pred is obtained from the trained TPOT model
y_pred = tpot.predict(X_test)

# Set the tracking URI if needed (optional)
# mlflow.set_tracking_uri(mlflow_tracking_uri)  # Use the captured tracking URI

# Log metrics to a new run in the TPOT experiment
with mlflow.start_run(nested=True) as run:  # Start a new run for the TPOT experiment
    # Log results using the log_results function
    log_results(y_test, y_pred, tpot.fitted_pipeline_, {})  # Pass empty params for TPOT since they may not exist

print("TPOT metrics and artifacts have been logged to a new MLflow experiment.")

# Score the model
print("Test Score:", tpot.score(X_test, y_test))
print("Report:\n", classification_report(y_test, y_pred))  # report

# Export the generated pipeline
tpot.export(path_tpot.joinpath("precall_pipeline_2.py"))

  y = column_or_1d(y, warn=True)


Test Score: 0.9048305695746215
Report:
               precision    recall  f1-score   support

           0       0.12      0.53      0.19       847
           1       0.90      0.53      0.67      7153

    accuracy                           0.53      8000
   macro avg       0.51      0.53      0.43      8000
weighted avg       0.82      0.53      0.62      8000



In [103]:
# Calculate the True possitive rate
y_pred = tpot.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Display the confusion matrix
display(Markdown("Confusion Matrix _(normalized: True):_"))
display(pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, normalize='true').round(3),
    columns=["0", "1"],
    index=["0", "1"]
    )
)

print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")  # Calculate the F1 score
print(f"precision_score: {precision_score(y_test, y_pred):.3f}")  # Calculate the precision
print(f"True Positive Rate: {tp / (tp + fn):.3f}")

# Calculate the False possitive rate
print(f"False Positive Rate: {fp / (fp + tn):.3f}")
display(Markdown("Confusion Matrix _(normalized: pred):_"))
cm_pred = confusion_matrix(y_test, y_pred, normalize='pred')
display(pd.DataFrame(cm_pred, columns=["0", "1"], index=["0", "1"]))

Confusion Matrix _(normalized: True):_

Unnamed: 0,0,1
0,0.532,0.468
1,0.474,0.526


F1 Score: 0.666
precision_score: 0.905
True Positive Rate: 0.526
False Positive Rate: 0.468


Confusion Matrix _(normalized: pred):_

Unnamed: 0,0,1
0,0.117479,0.095169
1,0.882521,0.904831


## False negative rate: The Missed 1's

**Understanding the Problem:**
We want to determine the percentage of actual class 1 instances that the model failed to predict. This is essentially the **false negative rate**.

# Analizing results

### Key Comparisons:

1. **F1 Score**:
   - Hyperopt: 0.725
   - TPOT: 0.666
   - **Winner**: Hyperopt. The F1 score is a key measure of overall balance between precision and recall, and Hyperopt performs better here.
2. **Precision Score**:
   - Hyperopt: 0.907
   - TPOT: 0.905
   - **Winner**: Hyperopt (slightly). Precision measures how many of the predicted positives are correct. Hyperopt performs slightly better.

In [66]:
# def convert_to_list(d):
#     if isinstance(d, dict):
#         return {k: convert_to_list(v) for k, v in d.items()}
#     elif isinstance(d, list):
#         return [convert_to_list(i) for i in d]
#     elif isinstance(d, (range, np.ndarray)):
#         return list(d)
#     else:
#         return d

# with open("../models/tpot/pre/classifier_config_dict.json", "w") as json_file:
#     json.dump(convert_to_list(classifier_config_dict), json_file, indent=4)

# next steps

Predict on the entire dataset to calculate the profit increase:
- Human hours saved
- Calls reduction
- Deposit increase increase in a week
- Percentage of calls lost

Customer segmentation
- Try different techniques
- TSNE, PCA, uMAP