<a href="https://colab.research.google.com/github/BruceGabr/Neurokup-III/blob/main/notebooks/04_entrenamiento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importar librerías

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split

## Leer datasets preprocesados

In [3]:
train = pd.read_csv('train_clean.csv')
test_public = pd.read_csv('test_public_clean.csv')
test_private = pd.read_csv('test_private_clean.csv')

In [4]:
train_copy = train.copy()
test_private_copy = test_private.copy()
test_public_copy = test_public.copy()

In [5]:
train_copy.info()
test_public_copy.info()
test_private_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36168 entries, 0 to 36167
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   año_nacimiento         36168 non-null  int64  
 1   trabajo                36168 non-null  object 
 2   estado_civil           36168 non-null  object 
 3   educación              36168 non-null  object 
 4   fondos_promedio_anual  36168 non-null  float64
 5   deuda_personal         36168 non-null  bool   
 6   contacto               36168 non-null  object 
 7   duracion               36168 non-null  int64  
 8   campaña                36168 non-null  int64  
 9   p_dias                 36168 non-null  int64  
 10  contactos_previos      36168 non-null  int64  
 11  p_resultado            36168 non-null  object 
 12  y                      36168 non-null  int64  
 13  tiene_riesgo           36168 non-null  int64  
 14  tiene_hipoteca         36168 non-null  int64  
 15  me

## Preparar datos


In [8]:
# Preparar datos
df = train.copy()
# Codificar categóricas si las hay
df = pd.get_dummies(df, drop_first=True)
X = df.drop(columns=['y'])
y = df['y']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Regresión logística

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

In [19]:
# Predicciones
y_pred = model.predict(X_test)

In [20]:
f1 = f1_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print(f"F1-Score: {f1:.4f}")
print(f"F1-Score Macro: {f1_macro:.4f}")
print(f"F1-Score Weighted: {f1_weighted:.4f}")

F1-Score: 0.5323
F1-Score Macro: 0.7158
F1-Score Weighted: 0.8560


## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import numpy as np

# Modelo
rf = RandomForestClassifier(
    n_estimators=500,
    class_weight={0:1, 1:7},
    random_state=42,
    n_jobs=-1
)

In [13]:
# Entrenar
rf.fit(X_train, y_train)

In [14]:
# -----------------------------
# Probabilidades (NO predict)
# -----------------------------
y_proba = rf.predict_proba(X_test)[:, 1]

# -----------------------------
# Buscar mejor threshold
# -----------------------------
thresholds = np.arange(0.05, 0.6, 0.01)
f1_scores = []

for t in thresholds:
    y_pred_t = (y_proba >= t).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_t))

best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

# -----------------------------
# Predicción final con threshold óptimo
# -----------------------------
y_pred_opt = (y_proba >= best_threshold).astype(int)

In [18]:
# Métricas finales
f1 = f1_score(y_test, y_pred_opt)
f1_macro = f1_score(y_test, y_pred_opt, average='macro')
f1_weighted = f1_score(y_test, y_pred_opt, average='weighted')

print(f"\nBest threshold: {best_threshold:.2f}")
print(f"F1-Score: {f1:.4f}")
print(f"F1-Score Macro: {f1_macro:.4f}")
print(f"F1-Score Weighted: {f1_weighted:.4f}")


Best threshold: 0.24
F1-Score: 0.5937
F1-Score Macro: 0.7620
F1-Score Weighted: 0.8906
