In [1]:
import pandas as pd
import numpy as np

In [2]:
    base = pd.read_csv('data_chida_diego.csv')

In [3]:
base = base.sort_values(by=['year', 'month'], ascending=[True, True])

train_val_df = base[(base['year'] < 2023) | ((base['year'] == 2023) & (base['month'] < 5))]
test_df = base[(base['year'] > 2023) | ((base['year'] == 2023) & (base['month'] >= 5))]

In [4]:
train_val_df.churn_next_month.value_counts()

churn_next_month
0.0    4633833
1.0      61574
Name: count, dtype: int64

In [5]:
train_val_df.head()

Unnamed: 0,customer_id,month,amount,churn_next_month,date,year,type,antiguedad,componente_estacional,cluster,varianza,promedio_temporada,porcentaje_vs_promedio,tiempo_desde_ultima_compra,percentage_change
0,100000,11,49.0843,0.0,2019-11-01,2019,Estanquillos / kioscos,0,54.6334,0,746.184533,205.849512,0.238447,0,0.0
48,100001,11,131.2384,0.0,2019-11-01,2019,Abarrotes / Almacenes / Bodegas / Víveres,0,88.82718,0,1456.907931,311.054872,0.421914,0,0.0
106,100004,11,276.9193,0.0,2019-11-01,2019,Carnicería / Pollería / Pescadería,0,202.04275,3,10935.950513,187.478628,1.477071,0,0.0
185,100006,11,445.4996,0.0,2019-11-01,2019,Abarrotes / Almacenes / Bodegas / Víveres,0,707.21768,3,81216.261229,311.054872,1.432222,0,0.0
233,100007,11,57.62585,0.0,2019-11-01,2019,Estanquillos / kioscos,0,57.28525,0,1019.744541,205.849512,0.279942,0,0.0


In [6]:
import numpy as np
from sklearn.model_selection import train_test_split


# Drop the "customer_id" column
train_val_df = train_val_df.drop("customer_id", axis=1)

# Split the balanced dataframe into train and validation sets
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

# Select rows with "churn_next_month" value of 1
churn_1 = train_df[train_df["churn_next_month"] == 1]

# Randomly sample the same amount of rows with "churn_next_month" value of 0
churn_0 = train_df[train_df["churn_next_month"] == 0].sample(n=len(churn_1), random_state=42)

# Concatenate the sampled dataframes
train_df = pd.concat([churn_1, churn_0])


In [7]:
val_df.churn_next_month.value_counts()

churn_next_month
0.0    926682
1.0     12400
Name: count, dtype: int64

In [11]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Codificación de variables categóricas (como 'type')
label_encoder = LabelEncoder()
train_df['type_encoded'] = label_encoder.fit_transform(train_df['type'])
val_df['type_encoded'] = label_encoder.transform(val_df['type'])

# Eliminar la columna 'type' original ya codificada
train_df = train_df.drop('type', axis=1)
val_df = val_df.drop('type', axis=1)
train_df = train_df.drop('date', axis=1)
val_df = val_df.drop('date', axis=1)

# Dividir los datos en características y objetivo
X_train = train_df.drop('churn_next_month', axis=1)
y_train = train_df['churn_next_month']
X_val = val_df.drop('churn_next_month', axis=1)
y_val = val_df['churn_next_month']


In [12]:

# Inicializar y entrenar el modelo XGBoost
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=19,
    colsample_bytree=0.5,
    seed=42,
    scale_pos_weight=1/10
)
model.fit(X_train, y_train)


In [13]:
base=None
churn_0=None
churn_1=None
train_val_df = None
val_df = None
train_df = None
test_df = None

In [14]:
import itertools
from sklearn.metrics import f1_score

param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.03, 0.1, 0.2],
    'max_depth': [7, 13, 20],
    # 'min_child_weight': [1, 5, 10],
    # 'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.25, 0.6, 0.8],
    'scale_pos_weight':[1/3, 1/7, 1/10, 1/14, 1/20]
}

# param_grid = {
#     'n_estimators': [100],
#     'learning_rate': [0.1],
#     'max_depth': [19],
#     # 'min_child_weight': [1, 5, 10],
#     # 'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.5],
#     'scale_pos_weight':[1/10, 1/14]
# }

param_combinations = list(itertools.product(
    param_grid['n_estimators'],
    param_grid['learning_rate'],
    param_grid['max_depth'],
    # param_grid['min_child_weight'],
    # param_grid['subsample'],
    param_grid['colsample_bytree'],
    param_grid['scale_pos_weight']
))

best_score = 0
best_args = None
best_model = None
tot = len(param_combinations)
i=0

for (n_estimators, learning_rate, max_depth, colsample_bytree, scale_pos_weight) in param_combinations:
    i += 1
    print(f'--------------------------------------Run: {i} of {tot}---------------------------------------------')
    print((n_estimators, learning_rate, max_depth, colsample_bytree, scale_pos_weight))
    
    # Inicializar y entrenar el modelo XGBoost
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        colsample_bytree=colsample_bytree,
        seed=42,
        scale_pos_weight=scale_pos_weight
    )
    
    model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_val_pred = model.predict(X_val)

    # Calculate the F1 score
    val_f1 = f1_score(y_val, y_val_pred)
    
    print(f'F1: {val_f1}')
    
    
    if(val_f1 > best_score):
        best_score = val_f1
        best_args = (n_estimators, learning_rate, max_depth, colsample_bytree, scale_pos_weight)
        best_model = model
    
    print()







--------------------------------------Run: 1 of 270---------------------------------------------
(50, 0.03, 7, 0.25, 0.3333333333333333)
F1: 0.30786469099549874

--------------------------------------Run: 2 of 270---------------------------------------------
(50, 0.03, 7, 0.25, 0.14285714285714285)
F1: 0.35644573354760317

--------------------------------------Run: 3 of 270---------------------------------------------
(50, 0.03, 7, 0.25, 0.1)
F1: 0.3095345458189288

--------------------------------------Run: 4 of 270---------------------------------------------
(50, 0.03, 7, 0.25, 0.07142857142857142)
F1: 0.21795030288045494

--------------------------------------Run: 5 of 270---------------------------------------------
(50, 0.03, 7, 0.25, 0.05)
F1: 0.11719030843338846

--------------------------------------Run: 6 of 270---------------------------------------------
(50, 0.03, 7, 0.6, 0.3333333333333333)
F1: 0.2851866466735064

--------------------------------------Run: 7 of 270-------

In [15]:
best_score

0.41175889002405824

In [16]:
best_args

(100, 0.03, 13, 0.6, 0.05)

In [10]:
def confusion_matrix_to_dataframe(confusion_matrix):
    """
    Convert a confusion matrix to a DataFrame with specific columns and rows.
    
    Parameters:
    confusion_matrix (numpy.ndarray): A 2x2 confusion matrix.
    
    Returns:
    pandas.DataFrame: A DataFrame with columns 'Predicted Positive', 'Predicted Negative' 
                      and rows 'Actual Positive', 'Actual Negative'.
    """
    # Ensure the input is a 2x2 numpy array
    if confusion_matrix.shape != (2, 2):
        raise ValueError("Confusion matrix must be a 2x2 matrix")
    
    # Create the DataFrame
    df = pd.DataFrame(confusion_matrix, 
                      columns=['Predicted Negative', 'Predicted Positive'], 
                      index=['Actual Negative', 'Actual Positive'])
    
    return df

In [21]:
# Predecir los resultados para el conjunto de validación
y_pred_train = model.predict(X_train)

# Calcular y mostrar el accuracy y el informe de clasificación
accuracy = accuracy_score(y_train, y_pred_train)
conf_matrix = confusion_matrix(y_train, y_pred_train)
class_report = classification_report(y_train, y_pred_train)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix_to_dataframe(conf_matrix))
print("Classification Report:\n", class_report)

Accuracy: 0.9086102411843657
Confusion Matrix:
                  Predicted Negative  Predicted Positive
Actual Negative               98346                   2
Actual Positive               13480               35694
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      1.00      0.94     98348
         1.0       1.00      0.73      0.84     49174

    accuracy                           0.91    147522
   macro avg       0.94      0.86      0.89    147522
weighted avg       0.92      0.91      0.90    147522



In [29]:

# Predecir los resultados para el conjunto de validación
y_pred = model.predict(X_val)

# Calcular y mostrar el accuracy y el informe de clasificación
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix_to_dataframe(conf_matrix))
print("Classification Report:\n", class_report)


Accuracy: 0.9717149301125994
Confusion Matrix:
                  Predicted Negative  Predicted Positive
Actual Negative              904379               22303
Actual Positive                4259                8141
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.98      0.99    926682
         1.0       0.27      0.66      0.38     12400

    accuracy                           0.97    939082
   macro avg       0.63      0.82      0.68    939082
weighted avg       0.99      0.97      0.98    939082



In [30]:
# from sklearn.metrics import f1_score

f1_score(y_val, y_pred)

0.38002987582858744

In [14]:
# Asumiendo que test_df ya está cargado y preparado adecuadamente

# Codificar la variable categórica 'type' en test_df usando el mismo LabelEncoder
test_df['type_encoded'] = label_encoder.transform(test_df['type'])
test_df = test_df.drop("customer_id", axis=1)
test_df = test_df.drop('type', axis=1)
test_df = test_df.drop('date', axis=1)

# Dividir los datos en características y objetivo
X_test = test_df.drop('churn_next_month', axis=1)
y_test = test_df['churn_next_month']

# Predecir los resultados para el conjunto de prueba
y_pred_test = model.predict(X_test)

# Calcular y mostrar el accuracy y el informe de clasificación para el conjunto de prueba
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
class_report_test = classification_report(y_test, y_pred_test)

print("Accuracy on Test Set:", accuracy_test)
print("Confusion Matrix on Test Set:\n", conf_matrix_test)
print("Classification Report on Test Set:\n", class_report_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['type_encoded'] = label_encoder.transform(test_df['type'])


Accuracy on Test Set: 0.9292926746464718
Confusion Matrix on Test Set:
 [[544789  40339]
 [  1732   8142]]
Classification Report on Test Set:
               precision    recall  f1-score   support

         0.0       1.00      0.93      0.96    585128
         1.0       0.17      0.82      0.28      9874

    accuracy                           0.93    595002
   macro avg       0.58      0.88      0.62    595002
weighted avg       0.98      0.93      0.95    595002

