In [53]:
# Imports y configuración base
import math
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import LeaveOneOut, StratifiedKFold, RepeatedStratifiedKFold, StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [54]:
# Cargar BankChurners.csv y preparar X, y y el pipeline
import os

# Location of BankChurners.csv. The BANKCHURNERS_CSV environment variable
# overrides the default path so the notebook can run on other machines.
ruta_csv = os.environ.get(
    "BANKCHURNERS_CSV",
    r"C:\Users\INMORTAL\OneDrive\Documentos\python\notebooks\Classification Models\BankChurners.csv",
)
# Explicit raise instead of assert: asserts are skipped when Python runs with -O.
if not os.path.exists(ruta_csv):
    raise FileNotFoundError(f"No se encontró el archivo en: {ruta_csv}")

raw = pd.read_csv(ruta_csv)

# Drop the identifier column and the two Naive Bayes columns listed as leaking
# the target; only columns actually present in the file are dropped.
cols_fuga = [
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
]
cols_drop = [c for c in ["CLIENTNUM", *cols_fuga] if c in raw.columns]
raw = raw.drop(columns=cols_drop)

# Target: Attrition_Flag -> 1 for "Attrited Customer", 0 for "Existing Customer"
objetivo = "Attrition_Flag"
if objetivo not in raw.columns:
    raise KeyError("No se encontró Attrition_Flag en el CSV")

mapa_objetivo = {"Attrited Customer": 1, "Existing Customer": 0}
y_mapeado = raw[objetivo].map(mapa_objetivo)
# Labels outside the mapping (or missing values) become NaN; report them here
# instead of letting astype(int) fail with a generic casting error.
sin_mapear = y_mapeado.isna()
if sin_mapear.any():
    desconocidas = sorted(set(raw.loc[sin_mapear, objetivo].astype(str)))
    raise ValueError(f"Valores no reconocidos en {objetivo}: {desconocidas}")
y = y_mapeado.astype(int)

# Feature matrix: every remaining column except the target
X = raw.drop(columns=[objetivo])

# Split columns by dtype. Anything that is not numeric (object, category,
# string, ...) is treated as categorical so it never reaches the median imputer.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Preprocessing: median imputation + scaling for numeric columns;
# most-frequent imputation + one-hot encoding for categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols),
    ], remainder="drop"
)

# Full pipeline: preprocessing followed by an RBF SVM. class_weight="balanced"
# is set because the two target classes are imbalanced.
modelo_svm = Pipeline([
    ("pre", preprocessor),
    ("svc", svm.SVC(kernel="rbf", C=1.0, gamma="scale", class_weight="balanced"))
])

# Summary shown as the cell output: shapes, class counts and column counts
X.shape, y.shape, y.value_counts().to_dict(), {"num": len(num_cols), "cat": len(cat_cols)}

((10127, 19), (10127,), {0: 8500, 1: 1627}, {'num': 14, 'cat': 5})

In [55]:
# Example prediction with the fitted pipeline (optional).
# Note: cross_val_score fits clones of the estimator, so modelo_svm itself is
# still unfitted after validation; fit it explicitly before predicting.
# modelo_svm.fit(X, y)
# prediccion = modelo_svm.predict(X.iloc[[0]])  # keep a DataFrame: columns are selected by name
# print("Predicción ejemplo:", int(prediccion[0]))


In [56]:
# Verificar clases y definir n_splits de forma segura
import numpy as np

# Stratified validation needs at least two distinct classes in the target.
clases_unicas = np.unique(y)
if len(clases_unicas) < 2:
    counts = y.value_counts(dropna=False).to_dict()
    raise ValueError(f"Se requiere al menos 2 clases para validación. Distribución: {counts}")

conteos = y.value_counts().sort_index()
min_por_clase = int(conteos.min())
# The fold count must not exceed the size of the smallest class, and at least
# 2 folds are needed. With a single minority sample both cannot hold, so fail
# with a clear message instead of requesting more folds than the class has members.
if min_por_clase < 2:
    raise ValueError(
        f"La clase minoritaria tiene solo {min_por_clase} muestra(s); "
        f"se requieren al menos 2 para validación estratificada. Distribución: {conteos.to_dict()}"
    )
# Cap at 5 folds; min_por_clase >= 2 here, so the result is always in [2, 5].
n_splits_cv = min(5, min_por_clase)
print(f"Distribución de clases: {conteos.to_dict()} | n_splits={n_splits_cv}")


Distribución de clases: {0: 8500, 1: 1627} | n_splits=5


In [57]:
# Stratified k-fold cross-validation of the SVM pipeline, using the fold
# count (n_splits_cv) computed in the class-distribution check.
skf = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=n_splits_cv,
)
result_skf = cross_val_score(
    estimator=modelo_svm,
    X=X,
    y=y,
    cv=skf,
    n_jobs=-1,
)
# Report mean and standard deviation of the per-fold scores
print(f"StratifiedKFold({n_splits_cv}) - accuracy media: {result_skf.mean():.4f} (+/- {result_skf.std():.4f})")

StratifiedKFold(5) - accuracy media: 0.9179 (+/- 0.0038)


In [58]:
# Show the SVM pipeline configuration; as the last expression of the cell,
# the notebook renders the estimator's representation as the cell output.
modelo_svm

0,1,2
,steps,"[('pre', ...), ('svc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [59]:
# Repeated stratified k-fold: the stratified split is repeated with different
# shuffles to obtain a more stable estimate. It reuses n_splits_cv (the fold
# count validated against the minority class size) rather than a hard-coded 5,
# so both validation cells agree on the number of folds.
n_repeats_cv = 3
rskf = RepeatedStratifiedKFold(n_splits=n_splits_cv, n_repeats=n_repeats_cv, random_state=42)
result_rskf = cross_val_score(modelo_svm, X, y, cv=rskf, n_jobs=-1)
# The label is built from the actual parameters so it cannot drift from them
print(f"RepeatedStratifiedKFold({n_splits_cv}x{n_repeats_cv}) - accuracy media: {result_rskf.mean():.4f} (+/- {result_rskf.std():.4f})")

RepeatedStratifiedKFold(5x3) - accuracy media: 0.9165 (+/- 0.0045)


In [60]:
import math

def convertir_a_binario(valor):
    """Map an arbitrary value to 0 or 1.

    NaN maps to 0 for any numeric type that ``math.isnan`` can inspect
    (built-in ``float``, NumPy floating scalars such as ``np.float32``,
    ``decimal.Decimal``). Every other value maps to ``int(bool(valor))``:
    falsy values (0, '', None, False, empty containers) give 0 and
    everything else gives 1.

    Args:
        valor: Any object whose truth value can be evaluated.

    Returns:
        int: 0 or 1.
    """
    import numbers  # local import: only needed for the numeric check below

    # bool(nan) is True, so NaN has to be intercepted before the truthiness
    # test. Checking numbers.Number rather than float also covers numeric
    # types that are not float subclasses (e.g. np.float32, Decimal).
    if isinstance(valor, numbers.Number):
        try:
            if math.isnan(valor):
                return 0
        except (TypeError, ValueError, OverflowError):
            # Not convertible to float (complex, signaling Decimal NaN,
            # ints too large for a float): fall back to plain truthiness.
            pass
    return int(bool(valor))

In [61]:
import pandas as pd
import numpy as np

# Example table for convertir_a_binario: one row per sample value, with the
# value's type name and the 0/1 result the function returns for it.
valores = [
    *['palabra', '123', 'a1b', ''],      # strings, including the empty string
    *[5, 0.001, 0, -3],                  # numbers
    *[None, True, False],                # None and booleans
    *[[], [1], {}, {'a': 1}],            # empty and non-empty containers
    np.nan,                              # missing value
]

tipos = [type(v).__name__ for v in valores]
resultados = list(map(convertir_a_binario, valores))

df_resultados = pd.DataFrame(
    {
        'valor': valores,
        'tipo': tipos,
        'resultado_binario': resultados,
    }
)

df_resultados


Unnamed: 0,valor,tipo,resultado_binario
0,palabra,str,1
1,123,str,1
2,a1b,str,1
3,,str,0
4,5,int,1
5,0.001,float,1
6,0,int,0
7,-3,int,1
8,,NoneType,0
9,True,bool,1
