<a href="https://colab.research.google.com/github/BruceGabr/Neurokup-III/blob/main/notebooks/04_entrenamiento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Conexión a Kaggle

In [None]:
# Start from a clean slate: delete Kaggle credential/config directories left on disk.
!rm -rf ~/.kaggle           # Remove old credential files
!rm -rf ~/.config/kaggle    # Remove the alternative configuration location

import os
import json
# Also clear Kaggle-related environment variables; pop(..., None) is a no-op
# when a variable is not set.
for var in ['KAGGLE_USERNAME', 'KAGGLE_KEY', 'KAGGLE_API_TOKEN']:
    os.environ.pop(var, None)  # Remove old environment variables

In [None]:
# Move a kaggle.json found in the current working directory into /root/.kaggle
# and restrict it to owner read/write (mode 600).
!mkdir -p /root/.kaggle
!mv kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# ========== 1. RESOLVE THE CREDENTIALS LOCATION ==========
import getpass

kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')

# ========== 2. LOAD THE CREDENTIALS ==========
# SECURITY: never hard-code the API key in the notebook. The key that used to be
# written in this cell was committed in plain text and must be treated as
# compromised (expire it from the Kaggle account settings and create a new one).
if os.path.exists(kaggle_json_path):
    # Reuse the kaggle.json that was already installed instead of overwriting it.
    with open(kaggle_json_path) as f:
        config = json.load(f)
else:
    # No file available: ask interactively; getpass keeps the key out of the
    # cell source and out of the rendered output.
    config = {
        "username": input("Kaggle username: ").strip(),
        "key": getpass.getpass("Kaggle API key: ").strip(),
    }
    with open(kaggle_json_path, 'w') as f:
        json.dump(config, f)

if not config.get("username") or not config.get("key"):
    raise ValueError(f"{kaggle_json_path} must contain non-empty 'username' and 'key' entries")

# Keep the credentials file readable by the owner only.
os.chmod(kaggle_json_path, 0o600)

# ========== 3. EXPORT THE ENVIRONMENT VARIABLES ==========
os.environ['KAGGLE_USERNAME'] = config['username']
os.environ['KAGGLE_KEY'] = config['key']

In [None]:
# Print the configuration values the Kaggle CLI currently sees.
!kaggle config view

Configuration values from /root/.kaggle
- username: brucemg
- path: None
- proxy: None
- competition: None


In [None]:
!pip install -q kaggle

In [None]:
# List competitions, keeping only the first lines of the table
# (presumably a quick check that the CLI credentials work).
!kaggle competitions list | head

ref                                                                                 deadline             category                reward  teamCount  userHasEntered  
----------------------------------------------------------------------------------  -------------------  ---------------  -------------  ---------  --------------  
https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-3       2026-04-15 23:59:00  Featured         2,207,152 Usd       1068           False  
https://www.kaggle.com/competitions/vesuvius-challenge-surface-detection            2026-02-13 23:59:00  Research           200,000 Usd        496           False  
https://www.kaggle.com/competitions/google-tunix-hackathon                          2026-01-12 23:59:00  Featured           100,000 Usd        104           False  
https://www.kaggle.com/competitions/csiro-biomass                                   2026-01-28 23:59:00  Research            75,000 Usd       2787           False  
https://ww

In [None]:
!kaggle competitions download -c neuro-kup-iii-beta
!unzip neuro-kup-iii-beta.zip

Downloading neuro-kup-iii-beta.zip to /content
  0% 0.00/1.59M [00:00<?, ?B/s]
100% 1.59M/1.59M [00:00<00:00, 686MB/s]
Archive:  neuro-kup-iii-beta.zip
  inflating: sample_submission.csv   
  inflating: test_private.csv        
  inflating: test_public.csv         
  inflating: train.csv               


In [None]:
# pandas is imported here because this cell runs before the notebook's main
# import cell; without it a fresh kernel fails with NameError on `pd`.
import pandas as pd

# Raw, unprocessed competition files extracted from the downloaded archive.
train_original = pd.read_csv('train.csv')
test_private_original = pd.read_csv('test_private.csv')
test_public_original = pd.read_csv('test_public.csv')

## Importar librerías

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split

## Leer datasets preprocesados

In [None]:
# Load the three preprocessed CSV files in a single pass.
train, test_public, test_private = (
    pd.read_csv(csv_name)
    for csv_name in ('train_clean.csv', 'test_public_clean.csv', 'test_private_clean.csv')
)

In [None]:
# Working copies, so the frames loaded from disk stay untouched.
train_copy, test_private_copy, test_public_copy = (
    loaded.copy() for loaded in (train, test_private, test_public)
)

In [None]:
# Print the column/dtype summary of each working copy: train, public test, private test.
for frame in (train_copy, test_public_copy, test_private_copy):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36168 entries, 0 to 36167
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   año_nacimiento         36168 non-null  int64  
 1   trabajo                36168 non-null  object 
 2   estado_civil           36168 non-null  object 
 3   educación              36168 non-null  object 
 4   fondos_promedio_anual  36168 non-null  float64
 5   deuda_personal         36168 non-null  bool   
 6   contacto               36168 non-null  object 
 7   duracion               36168 non-null  int64  
 8   campaña                36168 non-null  int64  
 9   p_dias                 36168 non-null  int64  
 10  contactos_previos      36168 non-null  int64  
 11  p_resultado            36168 non-null  object 
 12  y                      36168 non-null  int64  
 13  tiene_riesgo           36168 non-null  int64  
 14  tiene_hipoteca         36168 non-null  int64  
 15  me

## Preparar datos


In [None]:
# Build the modelling frame: one-hot encode the categorical columns of a copy
# of `train`, dropping the first level of each encoded column.
df = pd.get_dummies(train.copy(), drop_first=True)

# Target vector and feature matrix.
y = df['y']
X = df.drop('y', axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Regresión logística

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with balanced class weights, trained on the training split.
logreg_settings = {
    "max_iter": 1000,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
model = LogisticRegression(**logreg_settings)

model.fit(X_train, y_train)

In [None]:
# Class predictions of the logistic regression on the held-out split
y_pred = model.predict(X_test)

In [None]:
# f1_score is imported here because the only other import of it lives in a
# later cell; without this line a fresh top-to-bottom run raises NameError.
from sklearn.metrics import f1_score

# F1 of the logistic regression on the held-out split: default (positive class),
# macro average and support-weighted average.
f1 = f1_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print(f"F1-Score: {f1:.4f}")
print(f"F1-Score Macro: {f1_macro:.4f}")
print(f"F1-Score Weighted: {f1_weighted:.4f}")

F1-Score: 0.5323
F1-Score Macro: 0.7158
F1-Score Weighted: 0.8560


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import numpy as np

# Random forest of 500 trees; class 1 is weighted 7x relative to class 0.
rf_class_weight = {0: 1, 1: 7}
rf = RandomForestClassifier(
    class_weight=rf_class_weight,
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit the random forest on the training split
rf.fit(X_train, y_train)

In [None]:
# --- Class-1 probabilities from the random forest (not hard labels) ---
y_proba = rf.predict_proba(X_test)[:, 1]

# --- Score every candidate cut-off (0.05 onwards, step 0.01, below 0.6) by F1 ---
thresholds = np.arange(0.05, 0.6, 0.01)
f1_scores = [
    f1_score(y_test, (y_proba >= cutoff).astype(int))
    for cutoff in thresholds
]

# --- Keep the cut-off with the highest F1 (np.argmax returns the first maximum) ---
best_idx = np.argmax(f1_scores)
best_threshold, best_f1 = thresholds[best_idx], f1_scores[best_idx]

# --- Final labels at the selected cut-off ---
y_pred_opt = (y_proba >= best_threshold).astype(int)

In [None]:
# Final random-forest metrics at the selected cut-off
f1, f1_macro, f1_weighted = (
    f1_score(y_test, y_pred_opt),
    f1_score(y_test, y_pred_opt, average='macro'),
    f1_score(y_test, y_pred_opt, average='weighted'),
)

# One print call, one report line per argument.
print(
    f"\nBest threshold: {best_threshold:.2f}",
    f"F1-Score: {f1:.4f}",
    f"F1-Score Macro: {f1_macro:.4f}",
    f"F1-Score Weighted: {f1_weighted:.4f}",
    sep="\n",
)


Best threshold: 0.24
F1-Score: 0.5937
F1-Score Macro: 0.7620
F1-Score Weighted: 0.8906


## LightGBM

In [None]:
from lightgbm import LGBMClassifier

# LightGBM hyper-parameters gathered in one mapping so they can be read at a glance.
lgbm_params = {
    "n_estimators": 1200,
    "learning_rate": 0.03,
    "num_leaves": 96,
    "min_child_samples": 20,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 7.5,
    "random_state": 42,
    "n_jobs": -1,
}
lgbm = LGBMClassifier(**lgbm_params)


In [None]:
import re

def clean_columns(df):
    """Return a copy of `df` with normalised column names.

    Each column name is stripped of surrounding whitespace, lower-cased, and
    every character that does not match the regex class ``\\w`` is replaced by
    an underscore. The input frame itself is not modified.
    """
    normalised = df.columns.str.strip().str.lower()
    sanitised = normalised.str.replace(r"[^\w]", "_", regex=True)

    renamed = df.copy()
    renamed.columns = sanitised
    return renamed

In [None]:
# Normalise the column names of both splits with clean_columns.
X_train, X_test = clean_columns(X_train), clean_columns(X_test)

In [None]:
# Fit LightGBM on the training split (X_train carries the cleaned column names here).
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 3346, number of negative: 25588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014932 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 974
[LightGBM] [Info] Number of data points in the train set: 28934, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115642 -> initscore=-2.034358
[LightGBM] [Info] Start training from score -2.034358


In [None]:
import numpy as np
from sklearn.metrics import f1_score

# Class-1 probabilities from LightGBM on the held-out split
y_proba = lgbm.predict_proba(X_test)[:, 1]

# F1 at each candidate decision threshold (0.05 onwards, step 0.01, below 0.6)
thresholds = np.arange(0.05, 0.6, 0.01)
f1_scores = [
    f1_score(y_test, (y_proba >= candidate).astype(int))
    for candidate in thresholds
]

# Position of the first maximum, then the matching threshold and score
best_idx = np.argmax(f1_scores)
best_threshold, best_f1 = thresholds[best_idx], f1_scores[best_idx]

best_threshold, best_f1

(np.float64(0.44000000000000006), 0.5938430983118173)

In [None]:
# Labels at the selected threshold, then F1 under each averaging mode.
y_pred_opt = (y_proba >= best_threshold).astype(int)

print("Best threshold:", round(best_threshold, 2))
for label, averaging in (("F1:", "binary"), ("F1 macro:", "macro"), ("F1 weighted:", "weighted")):
    print(label, f1_score(y_test, y_pred_opt, average=averaging))

Best threshold: 0.44
F1: 0.5938430983118173
F1 macro: 0.7640806948119228
F1 weighted: 0.8942180888740715


In [None]:
# Build the submission: encode both test sets like the training matrix, score
# them with LightGBM at the tuned threshold, and pair the predictions with the
# ids taken from the raw test files.


def encode_like_training(frame, training_columns):
    """One-hot encode `frame` and conform it to `training_columns`.

    All category levels are encoded (no `drop_first`) and the result is then
    reindexed to `training_columns`. Reindexing removes the baseline level that
    was dropped when the training matrix was built, removes columns unseen in
    training, and adds training columns missing from `frame` filled with 0.
    Dropping the first level independently per test set could drop a different
    level than in training whenever a test set lacks the training baseline.

    Returns a new DataFrame whose columns equal `training_columns`, in order.
    """
    encoded = pd.get_dummies(frame)
    return encoded.reindex(columns=training_columns, fill_value=0)


# Ids come from the raw test dataframes.
test_public_original_ids = test_public_original['id']
test_private_original_ids = test_private_original['id']

# The public test frame may still carry the target column; drop it if present.
test_public_for_dummies = test_public_copy.drop(columns=['y'], errors='ignore')

# Encode and align both test sets with X (the training feature matrix).
test_private_final = encode_like_training(test_private_copy, X.columns)
test_public_final = encode_like_training(test_public_for_dummies, X.columns)

# Ids and features are matched by position, so the row counts must agree.
if len(test_private_final) != len(test_private_original_ids):
    raise ValueError("test_private rows do not match the number of raw test_private ids")
if len(test_public_final) != len(test_public_original_ids):
    raise ValueError("test_public rows do not match the number of raw test_public ids")

# Concatenate for prediction (private first, then public) and apply the same
# column-name cleaning that was applied to X_train before fitting LightGBM.
df_to_predict = clean_columns(
    pd.concat([test_private_final, test_public_final], ignore_index=True)
)

# Predict with the threshold selected in the LightGBM threshold sweep instead
# of the classifier's default decision rule.
predicciones = (lgbm.predict_proba(df_to_predict)[:, 1] >= best_threshold).astype(int)

# Ids in the same order as the concatenated prediction input.
submission_ids = pd.concat([test_private_original_ids, test_public_original_ids], ignore_index=True)

# Submission frame: 1 -> 'si', 0 -> 'no'.
submission_v2 = pd.DataFrame({
    "id": submission_ids,
    "y": np.where(predicciones == 1, 'si', 'no')
})

print("\n--- Predicciones para nuevos datos ---")
submission_v2


--- Predicciones para nuevos datos ---


Unnamed: 0,id,y
0,15738,no
1,35463,no
2,16904,no
3,1866,no
4,31271,no
...,...,...
9038,2917,no
9039,2229,no
9040,7963,no
9041,42431,no


In [None]:
# Write the submission without the DataFrame index, so the file holds only the id and y columns.
submission_v2.to_csv('submission_v2.csv', index=False)

In [None]:
# Upload submission_v2.csv to the neuro-kup-iii-beta competition with a short message.
!kaggle competitions submit -c neuro-kup-iii-beta -f submission_v2.csv -m "Submission_v2 LGBM"

100% 77.3k/77.3k [00:00<00:00, 116kB/s]
Successfully submitted to NeuroKup III Beta