In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
import seaborn as sns
#from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Modelado y preprocesamiento
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,KFold

# Métricas
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    roc_auc_score
)



#from kmodes.kprototypes import KPrototypes

In [2]:
# Set options to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000) # Adjust width for better display of many columns
pd.set_option('display.max_colwidth', None) # Display full content of cells

In [3]:
categorical_variables = ['phone_carrier','phone_disposable_validation','phone_ported_original_carrier',
                         'phone_ported_validation','email_deliverable_validation','email_disposable_validation',
                        'email_domain_free_provider_flag','email_is_breached_flag','website_exists']

In [4]:
model_data = pd.read_csv("data_clean.csv")

In [5]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9129 entries, 0 to 9128
Data columns (total 30 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   phone_carrier                                          9129 non-null   object 
 1   phone_disposable_validation                            9129 non-null   bool   
 2   phone_ported_original_carrier                          9129 non-null   object 
 3   phone_ported_validation                                9129 non-null   bool   
 4   phone_social_number_of_names_returned                  9129 non-null   int64  
 5   phone_social_number_of_photos_returned                 9129 non-null   int64  
 6   phone_social_registered_ecommerce_profiles             9129 non-null   int64  
 7   phone_social_registered_email_provider_profiles        9129 non-null   int64  
 8   phone_social_registered_messaging_profiles      

In [7]:
y = model_data['label']
model_data_train = model_data.drop('label',axis=1)

In [8]:
model_data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9129 entries, 0 to 9128
Data columns (total 29 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   phone_carrier                                          9129 non-null   object 
 1   phone_disposable_validation                            9129 non-null   bool   
 2   phone_ported_original_carrier                          9129 non-null   object 
 3   phone_ported_validation                                9129 non-null   bool   
 4   phone_social_number_of_names_returned                  9129 non-null   int64  
 5   phone_social_number_of_photos_returned                 9129 non-null   int64  
 6   phone_social_registered_ecommerce_profiles             9129 non-null   int64  
 7   phone_social_registered_email_provider_profiles        9129 non-null   int64  
 8   phone_social_registered_messaging_profiles      

<h3> Catboost Classifier </h3>

In [9]:
X_train, X_test, y_train, y_test = train_test_split(model_data_train, y, test_size=0.2, random_state=42)

In [10]:
pos_count = sum(y)

In [11]:
pos_count

644

In [12]:
neg_count = len(model_data_train) - pos_count

In [13]:
neg_count

8485

In [14]:
scale_pos_weight_value = neg_count / pos_count

print(f"Calculated scale_pos_weight: {scale_pos_weight_value}")

Calculated scale_pos_weight: 13.175465838509316


In [15]:
base_model = CatBoostClassifier(iterations=100,
                               depth=6,
                               learning_rate=0.1,
                               loss_function='Logloss',
                               cat_features=categorical_variables,
                               eval_metric='F1',
                               verbose=True,
                               scale_pos_weight=scale_pos_weight_value
                              )

In [16]:
base_model.fit(X_train, y_train)

0:	learn: 0.6570932	total: 148ms	remaining: 14.6s
1:	learn: 0.5943596	total: 172ms	remaining: 8.4s
2:	learn: 0.5923449	total: 197ms	remaining: 6.36s
3:	learn: 0.6136128	total: 222ms	remaining: 5.32s
4:	learn: 0.6197993	total: 232ms	remaining: 4.41s
5:	learn: 0.6144828	total: 257ms	remaining: 4.03s
6:	learn: 0.6222115	total: 272ms	remaining: 3.61s
7:	learn: 0.6289514	total: 290ms	remaining: 3.33s
8:	learn: 0.6230620	total: 311ms	remaining: 3.14s
9:	learn: 0.6143322	total: 335ms	remaining: 3.02s
10:	learn: 0.6189358	total: 355ms	remaining: 2.87s
11:	learn: 0.6221422	total: 379ms	remaining: 2.78s
12:	learn: 0.6280510	total: 407ms	remaining: 2.72s
13:	learn: 0.6238624	total: 438ms	remaining: 2.69s
14:	learn: 0.6275339	total: 465ms	remaining: 2.63s
15:	learn: 0.6266817	total: 486ms	remaining: 2.55s
16:	learn: 0.6346279	total: 510ms	remaining: 2.49s
17:	learn: 0.6364048	total: 522ms	remaining: 2.38s
18:	learn: 0.6523959	total: 545ms	remaining: 2.32s
19:	learn: 0.6610596	total: 568ms	remainin

<catboost.core.CatBoostClassifier at 0x29d9e978710>

In [17]:
# 4. Evaluate the model
y_pred = base_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.61      0.74      1700
           1       0.08      0.44      0.13       126

    accuracy                           0.60      1826
   macro avg       0.51      0.53      0.44      1826
weighted avg       0.88      0.60      0.70      1826



In [18]:
feature_importances = base_model.get_feature_importance()

feature_importances_df = pd.DataFrame({'Feature': model_data_train.columns, 'Importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

print(feature_importances_df.head(20))

top_features = feature_importances_df.nlargest(10, 'Importance')['Feature']

                                                  Feature  Importance
2                           phone_ported_original_carrier   11.688295
10                       phone_social_registered_profiles   10.928113
25                       email_social_registered_profiles    9.649132
11          phone_social_registered_social_media_profiles    8.342622
16                               email_number_of_breaches    7.655931
21        email_social_registered_email_provider_profiles    6.791395
26          email_social_registered_social_media_profiles    6.585435
0                                           phone_carrier    6.344137
24          email_social_registered_professional_profiles    6.166218
20             email_social_registered_ecommerce_profiles    5.203487
15                                 email_is_breached_flag    4.597751
3                                 phone_ported_validation    3.509762
6              phone_social_registered_ecommerce_profiles    2.388198
5                  p

In [19]:
top_features

2                       phone_ported_original_carrier
10                   phone_social_registered_profiles
25                   email_social_registered_profiles
11      phone_social_registered_social_media_profiles
16                           email_number_of_breaches
21    email_social_registered_email_provider_profiles
26      email_social_registered_social_media_profiles
0                                       phone_carrier
24      email_social_registered_professional_profiles
20         email_social_registered_ecommerce_profiles
Name: Feature, dtype: object

In [22]:
# --- Hyperparameter Tuning with GridSearchCV ---
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier

param_grid = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5],
    'scale_pos_weight': [scale_pos_weight_value, scale_pos_weight_value*0.8, scale_pos_weight_value*1.2]
}

catboost_grid = CatBoostClassifier(
    loss_function='Logloss',
    cat_features=categorical_variables,
    eval_metric='F1',
    verbose=0
)

grid_search = GridSearchCV(catboost_grid, param_grid, scoring='f1', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

print('Best parameters:', grid_search.best_params_)
print('Best F1 score:', grid_search.best_score_)

# --- Feature Selection: Use Top Features ---
selected_features = list(top_features)
X_train_selected = X_train[selected_features].copy()
X_test_selected = X_test[selected_features].copy()

# --- Encode categorical features before SMOTE ---
from sklearn.preprocessing import LabelEncoder
encoders = {}
for col in selected_features:
    if X_train_selected[col].dtype == 'object':
        le = LabelEncoder()
        X_train_selected[col] = le.fit_transform(X_train_selected[col].astype(str))
        X_test_selected[col] = le.transform(X_test_selected[col].astype(str))
        encoders[col] = le

# --- Data Balancing: Try SMOTE Oversampling ---
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_selected, y_train)

# --- Retrain Model with Best Params and Balanced Data ---
best_params = grid_search.best_params_
balanced_model = CatBoostClassifier(
    **best_params,
    loss_function='Logloss',
    cat_features=categorical_variables,
    eval_metric='F1',
    verbose=0
)
balanced_model.fit(X_train_bal, y_train_bal)

# --- Predict Probabilities for Threshold Adjustment ---
y_pred_proba = balanced_model.predict_proba(X_test_selected)[:,1]

# --- Find Best Threshold for Precision/Recall ---
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Example: maximize F1
f1_scores = 2*precisions*recalls/(precisions+recalls+1e-8)
best_idx = f1_scores.argmax()
best_threshold = thresholds[best_idx]
print(f'Best threshold for F1: {best_threshold:.2f}')

# --- Final Prediction with Best Threshold ---
y_pred_final = (y_pred_proba >= best_threshold).astype(int)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_final))

Best parameters: {'depth': 4, 'iterations': 200, 'l2_leaf_reg': 5, 'learning_rate': 0.05, 'scale_pos_weight': 13.175465838509316}
Best F1 score: 0.14836264591223688


ValueError: y contains previously unseen labels: 'SUMA MOVIL SAS'