In [None]:
# libs
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# End-to-end baseline: load the processed survey data, build a preprocessing +
# random-forest pipeline, train on a stratified 80/20 split, and print the
# evaluation metrics on the held-out part.

# 1. Load the data
df = pd.read_csv("data/processed/stack_overflow.csv")  # adjust the path

# 2. Target and predictor variables
target = "WantsToChangeJob"  # example
y = df[target].map({"Yes": 1, "No": 0})
# Any label other than "Yes"/"No" (including a missing value) maps to NaN.
# Those rows cannot be used for supervised training and would make the
# stratified split / fit fail, so they are dropped from both X and y.
labelled = y.notna()
X = df.loc[labelled].drop(columns=[target])
# The NaNs forced a float dtype; restore integer class labels.
y = y.loc[labelled].astype("int64")

# 3. Identify numeric and categorical columns
# "number" covers every numeric width (int32, float32, ...), not only the
# 64-bit ones; columns matched by neither list are dropped by the
# ColumnTransformer (default remainder="drop").
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# 4. Preprocessing pipeline
# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the current name
# first and fall back to the legacy one so the cell runs on either version.
try:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_cols),
    ("cat", encoder, cat_cols),
])

# 5. Full pipeline with the model
pipeline = Pipeline(steps=[
    ("pre", preprocessor),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42)),
])

# 6. Train/test split and training
# Stratify on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
pipeline.fit(X_train, y_train)

# 7. Evaluation
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
