In [1]:
import os
import sys

# Check if "term_deposit" is a folder in the current path
while not os.path.isdir("term_deposit"):
    os.chdir("..")
    print(f"Changed directory to: {os.getcwd()}")

# Add the path to the sys.path list
sys.path.append("term_deposit")

# Import the custom functions
from term_deposit import causal_analysis as ca, utils as ut

Changed directory to: /workspaces/2-term_deposit_marketing


In [2]:
import json
from pathlib import Path
from IPython.display import display, Markdown

import toml
import pandas as pd
import mlflow
import mlflow.sklearn
from hyperopt.pyll import scope
from tpot import TPOTClassifier
from imblearn.combine import SMOTETomek
from hyperopt import (fmin, tpe,
                      hp, Trials,
                      STATUS_OK)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix,
                             average_precision_score)

In [18]:
# Read TPOT configuration from toml file
config = toml.load("config.toml")
paths = config['paths']
paths_data = paths['data']
path_base = Path(paths_data['base'])
mlflow_tracking_uri = Path(paths['tracking_uri'])
SEED = config["settings"]["general"]["seed"]

# Load the dataset
df = pd.read_parquet(path_base / 'interim/encoded_wNCall_Predictions.parquet')

# Display the first few rows to ensure the 
# data is loaded correctly
display(Markdown(f'The dataset has {df.shape[0]:,} rows and {df.shape[1]:,} columns.'))
display(Markdown(f'The dataset has the following columns: {", ".join(df.columns)}.'))
display(df.sample(3))
display(df.describe())

The dataset has 2,896 rows and 42 columns.

The dataset has the following columns: age, default, balance, housing, loan, day, job_admin, job_blue-collar, job_entrepreneur, job_housemaid, job_management, job_retired, job_self-employed, job_services, job_student, job_technician, job_unemployed, job_unknown, marital_divorced, marital_married, marital_single, education_primary, education_secondary, education_tertiary, education_unknown, contact_cellular, contact_telephone, contact_unknown, month_apr, month_aug, month_dec, month_feb, month_jan, month_jul, month_jun, month_mar, month_may, month_nov, month_oct, day_encoded, campaign, predictions.

Unnamed: 0,age,default,balance,housing,loan,day,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,day_encoded,campaign,predictions
33707,56,0,3391,0,0,21,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2.02744
37780,34,0,273,1,0,14,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1.634823
6821,23,0,1129,1,0,28,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1.59352


Unnamed: 0,age,default,balance,housing,loan,day,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,day_encoded,campaign,predictions
count,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,...,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0,2896.0
mean,39.844959,0.01692,1588.495856,0.505525,0.131215,15.827003,0.121202,0.184738,0.030387,0.018301,...,0.013122,0.132597,0.101519,0.043163,0.273135,0.075967,0.01692,0.106699,2.414365,2.388841
std,11.001984,0.128994,2924.894429,0.500056,0.337694,8.557604,0.326418,0.388152,0.171679,0.134061,...,0.113815,0.339197,0.302067,0.203259,0.445647,0.264991,0.128994,0.308783,2.306015,1.001631
min,19.0,0.0,-3058.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.566044
25%,31.0,0.0,141.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.693915
50%,37.0,0.0,620.0,1.0,0.0,16.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.156819
75%,47.0,0.0,1820.5,1.0,0.0,22.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,2.819323
max,95.0,1.0,45248.0,1.0,1.0,31.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,32.0,6.690618


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2896 entries, 5276 to 32039
Data columns (total 42 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  2896 non-null   uint8  
 1   default              2896 non-null   uint8  
 2   balance              2896 non-null   int64  
 3   housing              2896 non-null   uint8  
 4   loan                 2896 non-null   uint8  
 5   day                  2896 non-null   uint8  
 6   job_admin            2896 non-null   int8   
 7   job_blue-collar      2896 non-null   int8   
 8   job_entrepreneur     2896 non-null   int8   
 9   job_housemaid        2896 non-null   int8   
 10  job_management       2896 non-null   int8   
 11  job_retired          2896 non-null   int8   
 12  job_self-employed    2896 non-null   int8   
 13  job_services         2896 non-null   int8   
 14  job_student          2896 non-null   int8   
 15  job_technician       2896 non-null

In [20]:
# df_filtered
y = pd.read_parquet(path_base / 'processed/data_precall_encoded.parquet')['y']

In [26]:
# Define the path to the raw CSV file
raw_csv_path = Path(paths_data['raw'])

# Read the raw CSV file
df_raw = pd.read_csv(raw_csv_path)

# Display the first few rows of the dataframe to ensure it is loaded correctly
display(df_raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  int64 
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  object
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  y          40000 non-null  object
dtypes: int64(5), object(9)
memory usage: 4.3+ MB


None

In [None]:
df.sort_index()

Unnamed: 0,age,default,balance,housing,loan,day,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,day_encoded,campaign,predictions
83,59,0,2343,1,0,5,1,0,0,0,...,0,0,0,0,1,0,0,0,1,1.297444
87,41,0,1270,1,0,5,0,0,0,0,...,0,0,0,0,1,0,0,0,1,2.686381
390,60,0,545,1,0,6,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1.562541
446,39,0,45248,1,0,6,0,0,0,0,...,0,0,0,0,1,0,0,0,1,2.651497
624,30,0,484,1,0,6,0,0,0,0,...,0,0,0,0,1,0,0,0,1,2.361493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39975,29,0,5,0,0,3,0,1,0,0,...,0,0,1,0,0,0,0,0,1,1.833759
39978,31,0,882,0,0,3,0,0,0,0,...,0,0,1,0,0,0,0,0,1,1.498481
39982,25,0,1189,0,0,3,0,0,0,0,...,0,0,1,0,0,0,0,0,1,2.091526
39990,26,0,1231,1,0,3,0,0,0,0,...,0,0,1,0,0,0,0,0,2,1.898184


In [5]:
import numpy as np

# Filter out clients that required more than 12 calls to sign up
df_filtered = df[df['campaign'] <= 10]

# Create a new column 'label' based on the conditions
conditions = [
    (df_filtered['y'] == 1) & (df_filtered['campaign'] == 1),
    (df_filtered['y'] == 1) & (df_filtered['campaign'].isin([2, 3])),
    (df_filtered['y'] == 1) & (df_filtered['campaign'].isin([4, 5, 6])),
    (df_filtered['y'] == 1) & (df_filtered['campaign'].isin([7, 8, 9, 10])),
    (df_filtered['y'] == 0) & (df_filtered['campaign'] > 10)
]

choices = ['A', 'B', 'C', 'D', 'F']

df_filtered['label'] = np.select(conditions, choices, default=np.nan)

# Display the filtered dataframe
display(df_filtered)

KeyError: 'y'

In [None]:
# Separate features and target variable
y = df['y']  # Target variable
X = df.drop('y', axis=1)  # Features 

# Define numerical columns explicitly since all are either int or encoded as uint8
numerical_cols = X.columns.tolist()  # Since all columns are numeric in type (int64, int8, uint8)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols)
    ])

# Apply preprocessing to both training and test sets
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Convert the processed data into DataFrames with correct column names
X_train_processed = pd.DataFrame(X_train_processed, columns=preprocessor.get_feature_names_out())
X_test_processed = pd.DataFrame(X_test_processed, columns=preprocessor.get_feature_names_out())

# Apply SMOTETomek on the training data to balance the dataset
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train_processed, y_train)

# Convert back to DataFrame if needed
df_train_balanced = pd.DataFrame(X_train_balanced, columns=preprocessor.get_feature_names_out())

# Balanced data is ready for further use
df_train_balanced

Unnamed: 0,num__job_blue-collar,num__job_entrepreneur,num__job_housemaid,num__job_management,num__job_retired,num__job_self-employed,num__job_services,num__job_student,num__job_technician,num__job_unemployed,...,num__marital_single,num__education_secondary,num__education_tertiary,num__default_yes,num__housing_yes,num__loan_yes,num__contact_telephone,num__high_balance,num__age_group_18-30,num__age_group_60+
0,-0.641042,-0.179605,-0.100504,-0.399275,-0.226381,-0.155974,-0.390817,-0.112509,2.380476,-0.142857,...,1.842717,0.875217,-0.482354,-0.13346,0.353802,2.630740,0.0,-0.472484,-0.299864,-0.035377
1,1.559960,-0.179605,-0.100504,-0.399275,-0.226381,-0.155974,-0.390817,-0.112509,-0.420084,-0.142857,...,-0.542677,0.875217,-0.482354,-0.13346,0.353802,-0.380121,0.0,2.116472,-0.299864,-0.035377
2,-0.641042,5.567764,-0.100504,-0.399275,-0.226381,-0.155974,-0.390817,-0.112509,-0.420084,-0.142857,...,-0.542677,-1.142574,2.073165,-0.13346,0.353802,-0.380121,0.0,2.116472,-0.299864,-0.035377
3,-0.641042,-0.179605,-0.100504,-0.399275,-0.226381,-0.155974,-0.390817,-0.112509,-0.420084,-0.142857,...,-0.542677,0.875217,-0.482354,-0.13346,0.353802,-0.380121,0.0,2.116472,-0.299864,-0.035377
4,-0.641042,-0.179605,-0.100504,-0.399275,-0.226381,-0.155974,2.558744,-0.112509,-0.420084,-0.142857,...,1.842717,0.875217,-0.482354,-0.13346,0.353802,-0.380121,0.0,-0.472484,3.334848,-0.035377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1563,-0.641042,-0.179605,-0.100504,1.543699,-0.226381,-0.155974,-0.390817,-0.112509,-0.420084,-0.142857,...,-0.542677,-0.474909,1.227571,-0.13346,0.353802,1.634478,0.0,2.116472,-0.299864,-0.035377
1564,-0.641042,-0.179605,-0.100504,-0.399275,-0.226381,-0.155974,-0.390817,-0.112509,2.380476,-0.142857,...,0.772205,-0.237035,-0.482354,-0.13346,0.353802,-0.380121,0.0,2.116472,-0.299864,-0.035377
1565,0.832288,-0.179605,-0.100504,-0.399275,-0.226381,-0.155974,-0.390817,-0.112509,-0.420084,-0.142857,...,-0.542677,0.875217,-0.482354,-0.13346,0.353802,-0.380121,0.0,0.383449,-0.299864,-0.035377
1566,-0.320231,-0.179605,-0.100504,-0.399275,-0.226381,-0.155974,-0.390817,-0.112509,1.972276,-0.142857,...,-0.542677,0.875217,-0.482354,-0.13346,0.353802,-0.380121,0.0,-0.472484,2.805065,-0.035377


In [None]:
import mlflow
from hyperopt import fmin, tpe, Trials, hp, STATUS_OK
from hyperopt.pyll.base import scope
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

# Define the search space for Hyperopt
space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 5, 20, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 7, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 4, 1)),
}

# Create a Trials object to store information about the optimization process
trials = Trials()

# Ensure the correct MLflow tracking URI and experiment are set
mlflow_tracking_uri = "your_mlflow_tracking_uri2"  # Define your tracking URI
mlflow.set_tracking_uri(mlflow_tracking_uri)
experiment_name = "Your_Experiment_Name2"  # Replace with your experiment name
mlflow.set_experiment(experiment_name)

# Start the MLflow run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Run Hyperopt with the modified objective function, passing additional arguments
    best = fmin(
        fn=lambda params: ut.objective(params, data_train_balanced, X_test, y_train_balanced, y_test),  # Pass data to objective
        space=space,  # Search space
        algo=tpe.suggest,  # Tree-structured Parzen Estimator
        max_evals=50,  # Number of evaluations
        trials=trials  # Store results of each evaluation
    )

    print("Best parameters found:", best)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

job exception: objective() missing 4 required positional arguments: 'X_train', 'X_test', 'y_train', and 'y_test'



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]


TypeError: objective() missing 4 required positional arguments: 'X_train', 'X_test', 'y_train', and 'y_test'

## TPOT

Exploring different models with TPOT:

1. Config 1 as a 5 generations default TPOT config scoring over f1.

In [None]:
def ensure_path(path, filename):
    path = Path(path).joinpath(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    return path

path_tpot = Path(paths["TPOT"]["base"])

# Define the TPOT configuration
path_checkpoint = ensure_path(path=paths["TPOT"]["periodic_checkpoint_folder"],
            filename="checkpoint_{run_id}")

path_pipeline = ensure_path(path_tpot, filename="pipelines/pipeline_{run_id}.py")
path_log = ensure_path(path_tpot, filename="logs/log_{run_id}.txt")

In [None]:
tpot = TPOTClassifier(**config["TPOT"]["one"],
                      log_file = path_log,
                      periodic_checkpoint_folder = path_checkpoint,
                      )

# Ensure target labels are in 1-D array format
if isinstance(y_train_balanced, pd.DataFrame):
    y_train_balanced = y_train_balanced["number_calls"]  # Convert to pandas Series (1-D array)

# Now fit TPOT with the balanced training data
tpot.fit(df_train_balanced, y_train_balanced)

In [None]:
# Score the model
print("Test Score:", tpot.score(X_test_processed, y_test))

# Export the generated pipeline
tpot.export(path_pipeline)

# Log the TPOT pipeline
with open(path_log, 'r') as file:
    pipeline_code = file.read()  # Read the pipeline code
    mlflow.log_text(pipeline_code, artifact_file="pipeline_code.py")

  y = column_or_1d(y, warn=True)


Test Score: 0.9076280041797283


In [None]:
# Set a new experiment for TPOT
tpot_experiment_name = "TPOT_Model_Experiment"
mlflow.set_experiment(tpot_experiment_name)

# Assume y_pred is obtained from the trained TPOT model
y_pred = tpot.predict(X_test_processed)

# Set the tracking URI if needed (optional)
mlflow.set_tracking_uri(mlflow_tracking_uri)  # Use the captured tracking URI

# Log metrics to a new run in the TPOT experiment
with mlflow.start_run(nested=True) as run:  # Start a new run for the TPOT experiment
    # Log results using the log_results function
    log_results(y_test, y_pred, tpot.fitted_pipeline_, {})  # Pass empty params for TPOT since they may not exist

print("TPOT metrics and artifacts have been logged to a new MLflow experiment.")

2024/10/02 11:01:22 INFO mlflow.tracking.fluent: Experiment with name 'TPOT_Model_Experiment' does not exist. Creating a new experiment.




TPOT metrics and artifacts have been logged to a new MLflow experiment.


## Experiment 2

In [None]:
with open(path_tpot.join("classifier_config_dict.json"), "r") as file:
    classifier_config_dict = json.load(file)
    
# change the types to int8 for X_train
X_train = X_train.astype('int8')
y_train = y_train.astype('int8')

# balance the datasets using SMOTEENN
smote_tomek = SMOTETomek(random_state=42)

X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)

# convert the balanced datasets to DataFrame
df_train_balanced = pd.DataFrame(X_train_balanced, columns=X_train.columns)

In [None]:
# create a new experiment for the classifier
classifier_experiment_name = "Classifier_Model_Experiment"

tpot = TPOTClassifier(**config["TPOT"]["two"],
                      config_dict=classifier_config_dict)
tpot.fit(df_train_balanced, y_train_balanced)

  y = column_or_1d(y, warn=True)


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: KNeighborsClassifier(ZeroCount(input_matrix), n_neighbors=93, p=2, weights=distance)


In [None]:
# Set a new experiment for TPOT
tpot_experiment_name = "TPOT_Model_Experiment"
mlflow.set_experiment(tpot_experiment_name)

# Assume y_pred is obtained from the trained TPOT model
y_pred = tpot.predict(X_test)

# Set the tracking URI if needed (optional)
# mlflow.set_tracking_uri(mlflow_tracking_uri)  # Use the captured tracking URI

# Log metrics to a new run in the TPOT experiment
with mlflow.start_run(nested=True) as run:  # Start a new run for the TPOT experiment
    # Log results using the log_results function
    log_results(y_test, y_pred, tpot.fitted_pipeline_, {})  # Pass empty params for TPOT since they may not exist

print("TPOT metrics and artifacts have been logged to a new MLflow experiment.")

# Score the model
print("Test Score:", tpot.score(X_test, y_test))
print("Report:\n", classification_report(y_test, y_pred))  # report

# Export the generated pipeline
tpot.export(path_tpot.joinpath("precall_pipeline_2.py"))

  y = column_or_1d(y, warn=True)


Test Score: 0.9048305695746215
Report:
               precision    recall  f1-score   support

           0       0.12      0.53      0.19       847
           1       0.90      0.53      0.67      7153

    accuracy                           0.53      8000
   macro avg       0.51      0.53      0.43      8000
weighted avg       0.82      0.53      0.62      8000



In [None]:
# Calculate the True possitive rate
y_pred = tpot.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Display the confusion matrix
display(Markdown("Confusion Matrix _(normalized: True):_"))
display(pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, normalize='true').round(3),
    columns=["0", "1"],
    index=["0", "1"]
    )
)

print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")  # Calculate the F1 score
print(f"precision_score: {precision_score(y_test, y_pred):.3f}")  # Calculate the precision
print(f"True Positive Rate: {tp / (tp + fn):.3f}")

# Calculate the False possitive rate
print(f"False Positive Rate: {fp / (fp + tn):.3f}")
display(Markdown("Confusion Matrix _(normalized: pred):_"))
cm_pred = confusion_matrix(y_test, y_pred, normalize='pred')
display(pd.DataFrame(cm_pred, columns=["0", "1"], index=["0", "1"]))

Confusion Matrix _(normalized: True):_

Unnamed: 0,0,1
0,0.532,0.468
1,0.474,0.526


F1 Score: 0.666
precision_score: 0.905
True Positive Rate: 0.526
False Positive Rate: 0.468


Confusion Matrix _(normalized: pred):_

Unnamed: 0,0,1
0,0.117479,0.095169
1,0.882521,0.904831


## False negative rate: The Missed 1's

**Understanding the Problem:**
We want to determine the percentage of actual class 1 instances that the model failed to predict. This is essentially the **false negative rate**.

# Analizing results

### Key Comparisons:

1. **F1 Score**:
   - Hyperopt: 0.725
   - TPOT: 0.666
   - **Winner**: Hyperopt. The F1 score is a key measure of overall balance between precision and recall, and Hyperopt performs better here.
2. **Precision Score**:
   - Hyperopt: 0.907
   - TPOT: 0.905
   - **Winner**: Hyperopt (slightly). Precision measures how many of the predicted positives are correct. Hyperopt performs slightly better.

In [None]:
# def convert_to_list(d):
#     if isinstance(d, dict):
#         return {k: convert_to_list(v) for k, v in d.items()}
#     elif isinstance(d, list):
#         return [convert_to_list(i) for i in d]
#     elif isinstance(d, (range, np.ndarray)):
#         return list(d)
#     else:
#         return d

# with open("../models/tpot/pre/classifier_config_dict.json", "w") as json_file:
#     json.dump(convert_to_list(classifier_config_dict), json_file, indent=4)

### Interpretación del Reporte

1. **Precision y Recall**:
   - **Clase 0 (clientes no identificados)**:
     - **Precision**: 0.12. Esto indica que de todas las llamadas identificadas como clientes no identificados, solo el 12% eran realmente clientes no identificados. 
     - **Recall**: 0.53. Esto sugiere que el modelo identifica correctamente el 53% de los clientes no identificados de las llamadas totales.
   - **Clase 1 (clientes identificados)**:
     - **Precision**: 0.90. El modelo es bastante preciso al identificar clientes.
     - **Recall**: 0.53. El modelo solo identifica correctamente el 53% de los clientes.

2. **F1-Score**:
   - La F1 para la clase 0 es baja (0.19), lo que muestra un desbalance significativo en la capacidad del modelo para identificar clientes no identificados.
   - Para la clase 1, la F1 es de 0.67, lo que indica que aunque la precisión es alta, la capacidad de identificar todos los casos posibles no lo es tanto.

3. **Exactitud General**:
   - La exactitud es del 53%, lo que implica que el modelo acierta en poco más de la mitad de los casos.

### Análisis de Impacto

1. **Ahorro de Tiempo Humano**:
   - Dado que el modelo tiene una alta precisión para identificar clientes (0.90), podría reducir significativamente el tiempo que un agente humano dedica a verificar llamadas de clientes ya identificados. Esto es positivo, ya que automatiza gran parte del proceso.
   - Sin embargo, con un recall bajo (0.53) para la clase 1, muchas llamadas podrían no ser reconocidas correctamente, requiriendo intervención humana adicional, lo que disminuye la eficiencia.

2. **Pérdida de Clientes No Identificados**:
   - El modelo tiene una precisión muy baja (0.12) para los clientes no identificados, lo que implica que la mayoría de las predicciones para esta clase son incorrectas. Esto puede resultar en la pérdida de oportunidades al no identificar clientes potenciales correctamente.
   - Además, con un recall del 53% en la clase 0, solo la mitad de los clientes no identificados se detectan, dejando muchas oportunidades de contacto sin aprovechar.

### Conclusión

El modelo, tal y como está, podría ahorrar tiempo en la identificación de clientes ya reconocidos, pero tiene un rendimiento insuficiente en la detección de clientes no identificados, lo que implica pérdidas potenciales significativas. Para mejorar el impacto:
- **Ajustes en el modelo**: Aumentar el recall de la clase 1 y la precisión de la clase 0 para equilibrar mejor la identificación.
- **Reentrenamiento con más datos** o **ajuste de hiperparámetros**: Para mejorar la sensibilidad y la capacidad de identificar correctamente a los clientes potenciales.

# Next steps

Predict on the entire dataset to calculate the profit increase:
- Human hours saved
- Calls reduction
- Deposit increase increase in a week
- Percentage of calls lost

Customer segmentation
- Try different techniques
- TSNE, PCA, uMAP