
# Modelo de intención de voto con KNN

Notebook preparado para Google Colab. Contexto: una campaña política busca predecir la intención de voto de nuevos electores a partir del dataset `voter_intentions_3000.csv` (~3000 filas, 33 columnas). El modelo principal es **K vecinos más cercanos** por su sencillez, interpretabilidad y porque permite exponer un servicio rápido para prototipos.

> Reglas: no cambiar los nombres de las columnas del CSV original. Las copias transformadas deben indicar claramente que son derivadas.

Antes de ejecutar:
1. Sube el archivo `voter_intentions_3000.csv` a la sesión de Colab o móntalo desde Drive.
2. Verifica la ruta en la variable `DATA_PATH`.
3. Ejecuta todo para generar el artefacto `models/knn_voter_intention_pipeline.pkl` que consumirá el backend.


In [None]:

import os
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    f1_score,
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
import joblib

# Global plot style plus the input/output locations used by the cells below.
plt.style.use("ggplot")
DATA_PATH = "voter_intentions_3000.csv"  # Adjust this path to match your environment
MODEL_OUTPUT = Path("..", "models", "knn_voter_intention_pipeline.pkl")
# Make sure the artifact directory exists before anything is written to it.
MODEL_OUTPUT.parent.mkdir(parents=True, exist_ok=True)


In [None]:

# Load the original dataset (do not rename its columns).
df_raw = pd.read_csv(DATA_PATH)
n_rows, n_cols = df_raw.shape
print("Dimensiones:", (n_rows, n_cols))
# Last expression of the cell: the notebook renders the first five rows.
df_raw.head(5)


In [None]:

# Column dtypes / non-null counts, followed by basic summary statistics.
df_raw.info()

# One row per column; only the first 15 columns are displayed.
summary_all = df_raw.describe(include='all').T
summary_all.head(15)


In [None]:

# Distribution of the target variable, most frequent class first.
counts = df_raw['intended_vote'].value_counts().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 4))
counts.plot.bar(ax=ax, color="#3D8B7D")
ax.set_title("Distribuci?n de intended_vote")
ax.set_ylabel("Conteo")
plt.xticks(rotation=45, ha='right')  # slanted labels so class names stay readable
plt.tight_layout()
plt.show()

# Last expression of the cell: the notebook renders the raw counts below the chart.
counts


In [None]:
# Missing-value overview and numeric summary statistics.
# Fraction of NaN per column, worst columns first.
missing_ratio = df_raw.isna().mean().sort_values(ascending=False)
print("Top columnas con mayor proporcion de NaN:", missing_ratio.head(10))

# Basic statistics of the numeric columns (first five columns only).
numeric_summary = df_raw.describe().transpose()
numeric_summary.loc[:, ['mean', 'std', 'min', 'max']].head()

In [None]:

# Quick exploratory charts: age histogram, party-id vs preference scatter,
# and the histogram of the declared turnout value.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
age_ax, strength_ax, turnout_ax = axes

age_ax.hist(df_raw['age'].dropna(), bins=20, color="#8FBC91")
age_ax.set(title="Histograma de edad", xlabel="Edad", ylabel="Frecuencia")

strength_ax.scatter(
    df_raw['party_id_strength'],
    df_raw['preference_strength'],
    alpha=0.5,
    color="#3D8B7D",
)
strength_ax.set(
    title="Party ID vs Preference Strength",
    xlabel="party_id_strength",
    ylabel="preference_strength",
)

turnout_ax.hist(df_raw['will_turnout'].dropna(), bins=15, color="#DBC557")
turnout_ax.set(title="Distribuci?n de will_turnout", xlabel="Probabilidad declarada")

plt.tight_layout()
plt.show()


In [None]:

# Column roles: the target, the categorical predictors, and everything else.
TARGET = 'intended_vote'

categorical_features = [
    'gender',
    'education',
    'employment_status',
    'employment_sector',
    'income_bracket',
    'marital_status',
    'urbanicity',
    'region',
    'voted_last',
    'has_children',
    'union_member',
    'public_sector',
    'home_owner',
    'small_biz_owner',
    'owns_car',
    'wa_groups',
    'primary_choice',
    'secondary_choice',
]

# Every remaining column (in CSV order) is treated as numeric.
# NOTE(review): nothing here checks the dtype of those columns — confirm the CSV
# has no identifier or free-text column outside the categorical list.
_excluded_columns = {TARGET, *categorical_features}
numeric_features = [col for col in df_raw.columns if col not in _excluded_columns]

print("Categ?ricas:", categorical_features)
print("Num?ricas:", numeric_features)


In [None]:
# Simple imputation and, when a column has many gaps, a light supervised imputation.
#
# NOTE(review): the helper models are fitted on the whole dataset, i.e. before the
# stratified split performed in a later cell, so rows that end up in validation/test
# contribute to the imputed values. Consider moving this step after the split.


def _impute_with_model(frame, column, estimator, predictors, min_rows):
    """Fill the missing entries of ``frame[column]`` in place using ``estimator``.

    Args:
        frame: DataFrame to modify in place.
        column: Name of the column whose NaN entries are predicted.
        estimator: Unfitted estimator exposing ``fit`` and ``predict``.
        predictors: Column names used as model inputs; their NaN become 0.
        min_rows: The model is fitted only when strictly more than this many
            rows have a known value in ``column``.

    Returns:
        True when the imputation was applied, False when there were too few
        known rows to fit the model.
    """
    known = frame[column].notnull()
    if known.sum() <= min_rows:
        return False
    unknown = ~known
    estimator.fit(frame.loc[known, predictors].fillna(0), frame.loc[known, column])
    frame.loc[unknown, column] = estimator.predict(
        frame.loc[unknown, predictors].fillna(0)
    )
    return True


df = df_raw.copy()
missing_ratio = df.isnull().mean()
high_missing = missing_ratio[missing_ratio > 0.15]
print("Columnas con >15% de NaN (se intentara imputacion supervisada si es viable):")
print(high_missing)

# (column, estimator, predictor columns, minimum number of known rows)
# - preference_strength: linear regression on the other numeric columns.
# - voted_last: logistic regression on the numeric columns. `multi_class` is
#   intentionally not passed: 'auto' is the default and the parameter is
#   deprecated in recent scikit-learn releases, where passing it would end up
#   raising inside the try-block below and silently disabling this imputation.
_supervised_imputers = [
    (
        'preference_strength',
        LinearRegression(),
        [c for c in numeric_features if c != 'preference_strength'],
        20,
    ),
    (
        'voted_last',
        LogisticRegression(max_iter=200),
        numeric_features,
        30,
    ),
]

for _column, _estimator, _predictors, _min_rows in _supervised_imputers:
    if _column not in high_missing.index:
        continue
    try:
        if _impute_with_model(df, _column, _estimator, _predictors, _min_rows):
            print(
                f"Imputacion supervisada aplicada a {_column} "
                f"con {type(_estimator).__name__}."
            )
    except Exception as exc:
        # Deliberate best-effort: any failure falls back to the simple imputers
        # inside the modelling pipeline, but the reason is always reported.
        print("Fallo imputacion supervisada, se mantiene estrategia simple:", exc)

# The final pipeline keeps using statistical imputers for robustness in production.

In [None]:

# Stratified 70/15/15 split into train, validation and test.
X = df.drop(columns=[TARGET])
target_encoded = df[TARGET]  # raw labels; the integer encoding happens in a later cell

# Step 1: keep 70% for training and hold out the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    target_encoded,
    test_size=0.30,
    stratify=target_encoded,
    random_state=42,
)

# Step 2: cut the hold-out in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

print(f"Train: {X_train.shape} Val: {X_val.shape} Test: {X_test.shape}")


In [None]:

# Encoding, scaling and KNN classifier: build the preprocessing, then pick K
# on the validation split.

# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: mode imputation followed by one-hot encoding; categories
# unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Integer-encode the labels; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_val_enc = label_encoder.transform(y_val)
y_test_enc = label_encoder.transform(y_test)

candidate_ks = [1, 3, 5, 7, 9, 11, 13, 15]
val_scores = []

for k in candidate_ks:
    candidate = Pipeline([
        ("preprocess", preprocess),
        ("classifier", KNeighborsClassifier(n_neighbors=k, weights="distance")),
    ])
    val_pred = candidate.fit(X_train, y_train_enc).predict(X_val)
    score_row = {
        "k": k,
        "val_accuracy": accuracy_score(y_val_enc, val_pred),
        "val_f1_macro": f1_score(y_val_enc, val_pred, average="macro"),
    }
    val_scores.append(score_row)
    print(
        f"k={k} -> val_accuracy={score_row['val_accuracy']:.3f}, "
        f"val_f1_macro={score_row['val_f1_macro']:.3f}"
    )

# Performance curve: validation accuracy for each candidate K.
accuracy_curve = [row['val_accuracy'] for row in val_scores]
plt.figure(figsize=(8, 4))
plt.plot(candidate_ks, accuracy_curve, marker='o', color="#3D8B7D")
plt.title("Accuracy de validaci?n vs K")
plt.xlabel("K vecinos")
plt.ylabel("Accuracy validaci?n")
plt.grid(True)
plt.show()

# Highest validation accuracy wins; ties go to the smaller K.
best_k = min(val_scores, key=lambda row: (-row['val_accuracy'], row['k']))['k']
print("Mejor K seleccionado:", best_k)


In [None]:

# Train the final model on train+validation and evaluate it once on the test split.
X_trainval = pd.concat([X_train, X_val])
y_trainval = label_encoder.transform(pd.concat([y_train, y_val]))

# NOTE: `preprocess` is the same object that was used during the K search, so
# fitting the pipeline here refits that preprocessing on train+validation.
final_clf = KNeighborsClassifier(n_neighbors=best_k, weights="distance")
final_model = Pipeline([
    ("preprocess", preprocess),
    ("classifier", final_clf),
])
final_model.fit(X_trainval, y_trainval)

test_preds = final_model.predict(X_test)
test_acc = accuracy_score(y_test_enc, test_preds)
test_f1 = f1_score(y_test_enc, test_preds, average="macro")
print(f"Test accuracy: {test_acc:.3f} | Test F1 macro: {test_f1:.3f}")

# The leading line break has to be the "\n" escape sequence: a raw newline inside
# a single-quoted string literal is a SyntaxError and would stop the whole cell.
print("\nReporte de clasificaci?n en test:")
print(classification_report(y_test_enc, test_preds, target_names=label_encoder.classes_))


In [None]:

# Confusion matrix on the test split, drawn as a heat map.
cm = confusion_matrix(y_test_enc, test_preds)
class_names = label_encoder.classes_
tick_positions = range(len(class_names))

fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(cm, interpolation='nearest', cmap='Blues')
ax.set_title('Matriz de confusi?n (test)')
fig.colorbar(heatmap)
ax.set_xlabel('Predicci?n')
ax.set_ylabel('Real')

# One tick per class on both axes, labelled with the original class names.
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(class_names, rotation=45, ha='right')
ax.set_yticklabels(class_names)

plt.tight_layout()
plt.show()


In [None]:

# Save the complete fitted pipeline together with descriptive metadata.
fitted_knn = final_model.named_steps['classifier']

# Validation accuracy that the selected K obtained during the K search.
best_val_accuracy = next(m['val_accuracy'] for m in val_scores if m['k'] == best_k)

metadata = {
    "model_type": "KNeighborsClassifier",
    "k_value": int(best_k),
    "metric": fitted_knn.metric,
    "weights": fitted_knn.weights,
    # Timezone-aware UTC timestamp. `pd.Timestamp.utcnow()` is deprecated in
    # recent pandas releases; `Timestamp.now("UTC")` yields the same value.
    "trained_at": pd.Timestamp.now("UTC").isoformat(),
    "train_size": len(X_trainval),
    "val_accuracy": float(best_val_accuracy),
    "test_accuracy": float(test_acc),
    # Macro-F1 is already computed for the test split; storing it next to the
    # accuracy keeps per-class performance visible to whoever loads the artifact.
    "test_f1_macro": float(test_f1),
    "feature_columns": list(X.columns),
    "classes": label_encoder.classes_.tolist(),
    "notes": "Pipeline con imputaci?n simple + one-hot + escalado + KNN. Re-entrenar con datos reales antes de produccion.",
}

# The pipeline predicts integer codes; the class list maps them back to labels.
bundle = {
    "pipeline": final_model,
    "label_encoder_classes": label_encoder.classes_.tolist(),
    "metadata": metadata,
}

joblib.dump(bundle, MODEL_OUTPUT)
print(f"Modelo guardado en {MODEL_OUTPUT.resolve()}")



## Conclusiones y próximos pasos

- El modelo KNN muestra su desempeño según las métricas de validación/prueba. Valores bajos pueden deberse a clases minoritarias o a la clase "Undecided".
- K más altos suelen estabilizar el ruido; K bajos capturan más variabilidad pero pueden sobreajustar perfiles raros.
- Limita el uso a fines educativos. Para una campaña real: recolectar más datos, balancear clases y revisar continuamente el desempeño.
- El artefacto `models/knn_voter_intention_pipeline.pkl` debe copiarse al repositorio del backend para que los endpoints `/predict` funcionen.
- Comparte este notebook como público en Colab para que el equipo pueda reentrenar cuando se actualice el CSV.
