In [141]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
import seaborn as sns
#from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Modelado y preprocesamiento
#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,KFold

# Métricas
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    roc_auc_score
)



#from kmodes.kprototypes import KPrototypes

In [142]:
# Set options to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000) # Adjust width for better display of many columns
pd.set_option('display.max_colwidth', None) # Display full content of cells

In [143]:
categorical_variables = ['phone_carrier','phone_disposable_validation','phone_ported_original_carrier',
                         'phone_ported_validation','email_deliverable_validation','email_disposable_validation',
                        'email_domain_free_provider_flag','email_is_breached_flag','website_exists']

In [144]:
model_data = pd.read_csv("data_clean.csv")

In [145]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9129 entries, 0 to 9128
Data columns (total 30 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   phone_carrier                                          9129 non-null   object 
 1   phone_disposable_validation                            9129 non-null   bool   
 2   phone_ported_original_carrier                          9129 non-null   object 
 3   phone_ported_validation                                9129 non-null   bool   
 4   phone_social_number_of_names_returned                  9129 non-null   int64  
 5   phone_social_number_of_photos_returned                 9129 non-null   int64  
 6   phone_social_registered_ecommerce_profiles             9129 non-null   int64  
 7   phone_social_registered_email_provider_profiles        9129 non-null   int64  
 8   phone_social_registered_messaging_profiles      

In [146]:
model_data.head()

Unnamed: 0,phone_carrier,phone_disposable_validation,phone_ported_original_carrier,phone_ported_validation,phone_social_number_of_names_returned,phone_social_number_of_photos_returned,phone_social_registered_ecommerce_profiles,phone_social_registered_email_provider_profiles,phone_social_registered_messaging_profiles,phone_social_registered_professional_profiles,phone_social_registered_profiles,phone_social_registered_social_media_profiles,email_deliverable_validation,email_disposable_validation,email_domain_free_provider_flag,email_is_breached_flag,email_number_of_breaches,email_social_number_of_names_returned,email_social_number_of_photos_returned,email_social_registered_consumer_electronics_profiles,email_social_registered_ecommerce_profiles,email_social_registered_email_provider_profiles,email_social_registered_entertainment_profiles,email_social_registered_messaging_profiles,email_social_registered_professional_profiles,email_social_registered_profiles,email_social_registered_social_media_profiles,email_social_registered_travel_profiles,website_exists,label
0,Comcel S.A. (Claro),False,Claro Comunicacion Celular S.A. (Comcel),True,0,0,1,0,1,1,5,2,True,False,True,True,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.0,6.0,1.0,0.0,True,0
1,Telefonica Colombia (Movistar),False,Movistar Colombia Telecomunicaciones S.A. ESP,False,0,1,0,0,1,0,1,0,True,False,True,True,5.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,8.0,4.0,0.0,True,0
2,Comcel S.A. (Claro),False,Claro Comunicacion Celular S.A. (Comcel),False,0,0,0,0,1,1,4,2,True,False,True,True,3.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,2.0,10.0,4.0,0.0,True,0
3,Comcel S.A. (Claro),False,Claro Comunicacion Celular S.A. (Comcel),False,0,1,0,0,1,0,4,3,True,False,True,True,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,6.0,2.0,0.0,True,0
4,Comcel S.A. (Claro),False,Claro Comunicacion Celular S.A. (Comcel),True,0,0,0,0,1,1,4,2,True,False,True,True,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,5.0,1.0,0.0,True,0


In [147]:
model_data['label'].value_counts()

label
0    8485
1     644
Name: count, dtype: int64

In [148]:

counts = model_data['phone_carrier'].value_counts()

to_replace = counts[counts < 10].index


model_data['phone_carrier'] = model_data['phone_carrier'].replace(to_replace, 'Other')

In [149]:
for col in categorical_variables:
    counts = model_data[col].value_counts()
    to_replace = counts[counts < 10].index
    model_data[col] = model_data[col].replace(to_replace, 'Other')


In [150]:
model_data['phone_carrier'].value_counts()

phone_carrier
Comcel S.A. (Claro)                        5467
Colombia Movil (Tigo)                      1508
Telefonica Colombia (Movistar)             1488
Partners Telecom (WOM)                      186
Virgin Mobile (Colombia)                    135
ALMACENES EXITO INVERSIONES S.A.S.           99
EMPRESA DE TELECOMUNICACIONES DE BOGOTA      74
COMUNICACION CELULAR S.A.                    58
Other                                        35
Telefnica Mviles Colombia (Movistar)         22
Comcel                                       18
COLOMBIA TELECOMUNICACIONES S.A. ESP         15
Comunicacion Celular S.A.                    14
COLOMBIA MOVIL S.A. E.S.P.                   10
Name: count, dtype: int64

In [151]:
y = model_data['label']
model_data_train = model_data.drop('label',axis=1)

In [152]:
model_data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9129 entries, 0 to 9128
Data columns (total 29 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   phone_carrier                                          9129 non-null   object 
 1   phone_disposable_validation                            9129 non-null   bool   
 2   phone_ported_original_carrier                          9129 non-null   object 
 3   phone_ported_validation                                9129 non-null   bool   
 4   phone_social_number_of_names_returned                  9129 non-null   int64  
 5   phone_social_number_of_photos_returned                 9129 non-null   int64  
 6   phone_social_registered_ecommerce_profiles             9129 non-null   int64  
 7   phone_social_registered_email_provider_profiles        9129 non-null   int64  
 8   phone_social_registered_messaging_profiles      

<h3> Catboost Classifier </h3>

In [153]:
X_train, X_test, y_train, y_test = train_test_split(model_data_train, y, test_size=0.2, random_state=50)

In [154]:
train_df = X_train.copy()
train_df['target'] = y_train  # cambia 'target' por el nombre real de tu variable objetivo si es distinto

# Separar clases mayoritaria y minoritaria
df_majority = train_df[train_df.target == 0]
df_minority = train_df[train_df.target == 1]

# Aumentar (duplicar) la clase minoritaria hasta igualar la mayoría
df_minority_upsampled = resample(
    df_minority,
    replace=True,                      # con reemplazo
    n_samples=len(df_majority),        # mismo número que clase 0
    random_state=42
)

# Unir clases balanceadas
train_balanced = pd.concat([df_majority, df_minority_upsampled])

# Separar de nuevo en X e y
X_train_balanced = train_balanced.drop('target', axis=1)
y_train_balanced = train_balanced['target']

print("Antes del oversampling:", y_train.value_counts())
print("Después del oversampling:", y_train_balanced.value_counts())

Antes del oversampling: label
0    6796
1     507
Name: count, dtype: int64
Después del oversampling: target
0    6796
1    6796
Name: count, dtype: int64


In [155]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7303, 29), (1826, 29), (7303,), (1826,))

In [156]:
sum(y_train)

507

In [157]:
sum(y_test)

137

In [158]:
pos_count = sum(y)

In [159]:
pos_count

644

In [160]:
neg_count = len(model_data_train) - pos_count

In [161]:
neg_count

8485

In [162]:
scale_pos_weight_value = neg_count / pos_count

print(f"Calculated scale_pos_weight: {scale_pos_weight_value}")

Calculated scale_pos_weight: 13.175465838509316


In [163]:
base_model = CatBoostClassifier(iterations=500,            # more boosting rounds for better learning
                                depth=8,                    # slightly deeper trees
                                learning_rate=0.05,         # smaller LR + more iterations helps stability
                                l2_leaf_reg=5,              # regularization to avoid overfitting
                                loss_function='Logloss',
                                cat_features=categorical_variables,
                                eval_metric='F1',
                                scale_pos_weight=scale_pos_weight_value,
                                random_seed=42,
                                verbose=100
                              )

In [164]:
base_model.fit(X_train, y_train)

0:	learn: 0.5265258	total: 148ms	remaining: 1m 13s
100:	learn: 0.7632145	total: 2.57s	remaining: 10.1s
200:	learn: 0.8412037	total: 4.12s	remaining: 6.12s
300:	learn: 0.9100280	total: 6.13s	remaining: 4.05s
400:	learn: 0.9369293	total: 7.92s	remaining: 1.95s
499:	learn: 0.9493191	total: 9.81s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7d0b374a7cd0>

In [166]:
base_model_balanced.fit(X_train_balanced, y_train_balanced)

0:	learn: 0.6083735	total: 123ms	remaining: 1m 1s
100:	learn: 0.8240974	total: 4.75s	remaining: 18.8s
200:	learn: 0.8912721	total: 8.95s	remaining: 13.3s
300:	learn: 0.9175141	total: 13.6s	remaining: 8.98s
400:	learn: 0.9311661	total: 17.9s	remaining: 4.41s
499:	learn: 0.9413764	total: 22.1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7d0b352a1ad0>

In [167]:
# 4.Evaluate the model
y_pred = base_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred)) 


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.89      0.91      1689
           1       0.10      0.16      0.13       137

    accuracy                           0.83      1826
   macro avg       0.52      0.52      0.52      1826
weighted avg       0.87      0.83      0.85      1826



In [168]:
# 4.Evaluate the model
y_pred = base_model_balanced.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred)) 


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.87      0.90      1689
           1       0.09      0.16      0.11       137

    accuracy                           0.81      1826
   macro avg       0.51      0.51      0.51      1826
weighted avg       0.86      0.81      0.84      1826



In [None]:
feature_importances = base_model.get_feature_importance()

feature_importances_df = pd.DataFrame({'Feature': model_data_train.columns, 'Importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

print(feature_importances_df.head(20))

top_features = feature_importances_df.nlargest(10, 'Importance')['Feature']

                                                  Feature  Importance
25                       email_social_registered_profiles   11.318755
26          email_social_registered_social_media_profiles    8.966464
2                           phone_ported_original_carrier    8.960369
0                                           phone_carrier    8.684345
10                       phone_social_registered_profiles    8.493669
16                               email_number_of_breaches    7.124043
3                                 phone_ported_validation    6.637436
24          email_social_registered_professional_profiles    6.033395
11          phone_social_registered_social_media_profiles    5.427820
5                  phone_social_number_of_photos_returned    5.092335
9           phone_social_registered_professional_profiles    4.334775
7         phone_social_registered_email_provider_profiles    4.155297
20             email_social_registered_ecommerce_profiles    3.102682
15                  

In [None]:
top_features

25                 email_social_registered_profiles
26    email_social_registered_social_media_profiles
2                     phone_ported_original_carrier
0                                     phone_carrier
10                 phone_social_registered_profiles
16                         email_number_of_breaches
3                           phone_ported_validation
24    email_social_registered_professional_profiles
11    phone_social_registered_social_media_profiles
5            phone_social_number_of_photos_returned
Name: Feature, dtype: object