# Modelado v1.3 – Notas de Actividades
Replica **v1.2** pero usando únicamente las columnas de notas de las tareas y la variable global `nota_media`. Todas las *features* se estandarizan antes del modelado.

## 0. Librerías

In [1]:

# Core
import pandas as pd
import numpy as np
import os, joblib, matplotlib.pyplot as plt

# Scikit‑learn
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Red neuronal
from scikeras.wrappers import KerasClassifier
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Desbalance
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


2025-06-20 20:56:13.838680: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-20 20:56:13.839762: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-20 20:56:13.842761: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-06-20 20:56:13.854070: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750445773.876299  204724 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750445773.88

## 1. Carga y preparación de los datos

In [7]:

DATA_PATH = '/home/carlos/Documentos/TFG/spark-workspace/data/datasets/'
MODEL_PATH = 'models_v1.3'
os.makedirs(MODEL_PATH, exist_ok=True)

df = pd.read_parquet(f'{DATA_PATH}/dataset_2.0.parquet')

# Seleccionar sólo columnas de notas de actividades (contienen '(nota)' pero no empiezan con 'Clase')
grade_cols = [c for c in df.columns if '(nota)' in c and not c.startswith('Clase')]
df = df[['userid', 'abandona'] + grade_cols]

# Métrica global: nota_media
df['nota_media'] = df[grade_cols].mean(axis=1)

# Guardar dataset filtrado
df.to_parquet(f'{DATA_PATH}/dataset_3.0.parquet')

# Split
X = df.drop(columns=['userid', 'abandona'])
y = df['abandona']

display(df.head())
print('Shape X:', X.shape)

display(X.head())


Unnamed: 0,userid,abandona,Test Expr. (nota),Test Complejidad (nota),Act. 02 - Elecciones (nota),Act. 03 - Catalan (nota),Act. 04 - Primos (nota),Act. 05 - Vectores (nota),Act. 07 (nota),nota_media
0,e1f1d0f48ca77093f9d66cefd325504245277db3e6c145...,0,10.0,5.0,10.0,7.0,10.0,,8.5,8.416667
1,b5de2bb5b8538b199d6b3f0ecb32daa8a9d730ccc484db...,0,10.0,6.0,10.0,10.0,10.0,10.0,6.25,8.892857
2,90a634296aff946e9d045997d512d2b77dbc01880715c1...,1,10.0,8.66667,10.0,10.0,9.0,5.0,6.0,8.380953
3,b6b2a12e84ea8203775195ed2bb4e99c5788053782b0bd...,0,10.0,7.33333,10.0,10.0,10.0,10.0,10.0,9.619047
4,fd96e32a94a932f45eb32933d9ffeb71f4addf9153a76b...,0,6.0,6.0,10.0,10.0,,10.0,0.0,7.0


Shape X: (201, 8)


Unnamed: 0,Test Expr. (nota),Test Complejidad (nota),Act. 02 - Elecciones (nota),Act. 03 - Catalan (nota),Act. 04 - Primos (nota),Act. 05 - Vectores (nota),Act. 07 (nota),nota_media
0,10.0,5.0,10.0,7.0,10.0,,8.5,8.416667
1,10.0,6.0,10.0,10.0,10.0,10.0,6.25,8.892857
2,10.0,8.66667,10.0,10.0,9.0,5.0,6.0,8.380953
3,10.0,7.33333,10.0,10.0,10.0,10.0,10.0,9.619047
4,6.0,6.0,10.0,10.0,,10.0,0.0,7.0


## 2. Preprocesador y generador de pipelines

In [10]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer

# ─────────────────────────────────────────────────────────────
# 1️  Preprocesador: (-1 → 0)  ➜  imputar NaN a 0  ➜  escalar
# ─────────────────────────────────────────────────────────────
num_pipeline = Pipeline([
    ("minus1_to_0", FunctionTransformer(lambda X: np.where(X == -1, 0, X),
                                        feature_names_out="one-to-one")),
    ("impute_0",    SimpleImputer(strategy="constant", fill_value=0)),
    ("scale",       StandardScaler())
])

preprocessor = ColumnTransformer(
    [("num", num_pipeline, X.columns)],
    remainder="drop"
)

# ─────────────────────────────────────────────────────────────
# 2️  Generador de pipelines de modelos
# ─────────────────────────────────────────────────────────────
def make_pipeline(name, use_smote=False):
    if name == "LogReg":
        clf = LogisticRegression(max_iter=1000, class_weight="balanced")
    elif name == "Tree":
        clf = DecisionTreeClassifier(class_weight="balanced", random_state=42)
    elif name == "XGB":
        ratio = y.value_counts()[0] / y.value_counts()[1]
        clf = XGBClassifier(
            n_estimators=200, max_depth=4, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            scale_pos_weight=ratio, eval_metric="logloss",
            random_state=42
        )
    else:
        raise ValueError(name)

    steps = [("pre", preprocessor)]
    if use_smote:
        steps.append(("smote", SMOTE()))
    steps.append(("clf", clf))

    pipe = ImbPipeline(steps) if use_smote else Pipeline(steps)
    fname = os.path.join(MODEL_PATH, f"{name.lower()}{'_smote' if use_smote else ''}.pkl")
    return pipe, fname


## 3. Evaluación y guardado de modelos clásicos

In [11]:

scoring = ['f1', 'roc_auc', 'precision', 'recall']

def evaluate_and_save(name, use_smote=False):
    pipe, fname = make_pipeline(name, use_smote)
    cvres = cross_validate(pipe, X, y, cv=5, scoring=scoring, n_jobs=-1)
    metrics = {m: (cvres[f'test_{m}'].mean(), cvres[f'test_{m}'].std()) for m in scoring}
    pipe.fit(X, y)
    joblib.dump(pipe, fname)
    return {'model': f"{name}{'+SMOTE' if use_smote else ''}", **metrics}

results = []
for mdl in ['LogReg', 'Tree', 'XGB']:
    results.extend([evaluate_and_save(mdl, False),
                    evaluate_and_save(mdl, True)])

classic_df = pd.DataFrame({
    r['model']: {m: f"{mean:.3f} ± {std:.3f}" for m,(mean,std) in r.items() if m in scoring}
    for r in results
}).T
display(classic_df)


PicklingError: Can't pickle <function <lambda> at 0x7f29bd152cb0>: it's not found as __main__.<lambda>

## 4. Cross‑validation de la Red Neuronal

In [None]:

def build_model(meta, units=64, dr=0.3):
    n_feat = meta['n_features_in_']
    model = Sequential([
        Dense(units, activation='relu', input_shape=(n_feat,)),
        Dropout(dr),
        Dense(units//2, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

ratio = y.value_counts()[0] / y.value_counts()[1]
nn_clf = KerasClassifier(model=build_model, epochs=60, batch_size=16, verbose=0,
                         fit__class_weight={0:1, 1:ratio}, random_state=42)

pipe_no = Pipeline([('pre', preprocessor), ('nn', nn_clf)])
pipe_sm = ImbPipeline([('pre', preprocessor), ('smote', SMOTE()), ('nn', nn_clf)])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['f1', 'roc_auc', 'precision', 'recall']
nn_results = {}
for label, pipe in [('NN', pipe_no), ('NN+SMOTE', pipe_sm)]:
    cvres = cross_validate(pipe, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    nn_results[label] = {m: f"{cvres[f'test_{m}'].mean():.3f} ± {cvres[f'test_{m}'].std():.3f}" for m in scoring}

nn_df = pd.DataFrame(nn_results).T
display(nn_df)
