In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.keras
import shap
import joblib

In [22]:
# Load the final dataset from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer='Data_final.csv')

# Display the first five rows to get familiar with the columns.
data.head(n=5)

Unnamed: 0,cole_bilingue,estu_genero,cole_calendario,cole_naturaleza,fami_educacionmadre,fami_educacionpadre,fami_estratovivienda,fami_tienecomputador,fami_tieneinternet,fami_tieneautomovil,desemp_ingles,punt_ingles,punt_matematicas,punt_sociales_ciudadanas,punt_c_naturales,punt_lectura_critica,punt_global
0,No,F,A,OFICIAL,Secundaria (Bachillerato) completa,Secundaria (Bachillerato) incompleta,Estrato 2,Si,Si,Si,A1,37.0,50.0,39.0,41.0,46.0,217.0
1,No,M,A,OFICIAL,Secundaria (Bachillerato) incompleta,Secundaria (Bachillerato) incompleta,Estrato 3,Si,Si,Si,A1,52.0,43.0,44.0,50.0,56.0,243.0
2,No,F,A,OFICIAL,Técnica o tecnológica completa,Técnica o tecnológica completa,Estrato 3,Si,Si,Si,A2,64.0,59.0,60.0,51.0,62.0,292.0
3,No,M,A,NO OFICIAL,Secundaria (Bachillerato) completa,Técnica o tecnológica completa,Estrato 2,Si,Si,No,A2,60.0,64.0,62.0,55.0,61.0,302.0
4,No,F,A,NO OFICIAL,Educación profesional completa,Educación profesional incompleta,Estrato 3,Si,Si,Si,A1,52.0,52.0,49.0,48.0,53.0,253.0


In [6]:
# Define the features (X) and the target variable (y).
# Excluded from X: the target, the other punt_* score columns, and three
# school/student attributes that this analysis does not use as predictors.
excluded_cols = [
    'punt_global', 'punt_ingles', 'punt_matematicas', 'punt_sociales_ciudadanas',
    'punt_c_naturales', 'punt_lectura_critica',
    'cole_bilingue', 'estu_genero', 'cole_calendario',
]
X = data.drop(columns=excluded_cols)
# 'Unnamed: 0' is the row index that was written into the CSV. It is not a real
# feature, so drop it (when present) instead of scaling it and feeding it to
# the network as the only numeric input.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
y = data['punt_global']

# Identify categorical and numerical columns.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

# Preprocessor: scale numeric columns, one-hot encode categorical ones.
# handle_unknown='ignore' encodes categories that were not seen during fit as
# all zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Split the raw rows first: 80/20 into (train + validation)/test, then 80/20
# into train/validation. Same sizes and seeds as before, so each row lands in
# the same subset.
X_trainval_raw, X_test_raw, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_trainval_raw, y_trainval, test_size=0.2, random_state=42
)


def _to_dense(matrix):
    """Return `matrix` as a dense ndarray, whether it arrives sparse or dense."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)


# Fit the preprocessor on the training rows only, so that validation and test
# rows do not influence the fitted statistics/categories; then apply it to all
# three subsets.
X_train = _to_dense(preprocessor.fit_transform(X_train_raw))
X_val = _to_dense(preprocessor.transform(X_val_raw))
X_test = _to_dense(preprocessor.transform(X_test_raw))

# Check the shape of the training, validation and test sets.
X_train.shape, X_val.shape, X_test.shape

((75886, 40), (18972, 40), (23715, 40))

In [23]:
def create_model(learning_rate, layer_1_units, layer_2_units, layer_3_units, input_dim=None):
    """Build and compile a fully connected regression network.

    The network has three ReLU hidden layers followed by a single linear
    output unit, and is compiled with Adam, mean-squared-error loss and
    mean-absolute-error as an additional metric.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        layer_1_units: Number of units in the first hidden layer.
        layer_2_units: Number of units in the second hidden layer.
        layer_3_units: Number of units in the third hidden layer.
        input_dim: Number of input features. When omitted, it is taken from
            the notebook-level ``X_train`` (its second dimension), which is
            what the original implementation always did.

    Returns:
        The compiled ``Sequential`` model (untrained).
    """
    if input_dim is None:
        # Backward-compatible default: size the input from the global training matrix.
        input_dim = X_train.shape[1]

    # An explicit Input layer replaces the `input_dim=` keyword on the first
    # Dense layer, which newer Keras releases warn about.
    model = Sequential([
        tf.keras.Input(shape=(int(input_dim),)),
        Dense(layer_1_units, activation='relu'),
        Dense(layer_2_units, activation='relu'),
        Dense(layer_3_units, activation='relu'),
        Dense(1, activation='linear'),
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )

    return model


In [24]:
def cross_val_evaluate(params, k=5):
    """Run k-fold cross-validation of the network on the training split.

    Reads the notebook-level ``X_train`` and ``y_train``. For every fold a
    brand-new model is built with ``create_model``, trained on the fold's
    training part and scored on the fold's held-out part.

    Args:
        params: Mapping with the keys ``"learning_rate"``, ``"layer_1_units"``,
            ``"layer_2_units"``, ``"layer_3_units"``, ``"epochs"`` and
            ``"batch_size"``.
        k: Number of folds (shuffled, ``random_state=42``).

    Returns:
        A list with one dict per fold, each holding ``"MAE"``, ``"MSE"``,
        ``"RMSE"`` and ``"R2"`` computed on that fold's held-out rows.
    """
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)

    # Positional indexing below needs a plain array, not an index-labelled Series.
    y_train_np = np.asarray(y_train)

    results = []

    for train_idx, val_idx in kfold.split(X_train, y_train_np):
        X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
        y_fold_train, y_fold_val = y_train_np[train_idx], y_train_np[val_idx]

        # Build a fresh, untrained model for every fold. Reusing one model
        # across folds keeps the weights learned on earlier folds, so later
        # folds would be scored on rows the network has already trained on.
        model = create_model(
            params["learning_rate"],
            params["layer_1_units"],
            params["layer_2_units"],
            params["layer_3_units"],
        )
        model.fit(
            X_fold_train,
            y_fold_train,
            epochs=params["epochs"],
            batch_size=params["batch_size"],
            verbose=0,
        )

        # The network outputs shape (n, 1); flatten so it lines up with the 1-D targets.
        y_pred = np.asarray(model.predict(X_fold_val)).ravel()

        mae = mean_absolute_error(y_fold_val, y_pred)
        mse = mean_squared_error(y_fold_val, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_fold_val, y_pred)

        results.append({"MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2})

    return results



In [25]:
def shap_analysis(model, X_sample):
    """Compute SHAP values for `model` on `X_sample` and draw two summary plots.

    A ``shap.KernelExplainer`` is built with ``X_sample`` as background data,
    and SHAP values are computed for the same rows. Two plots are drawn: the
    default summary plot and the ``"bar"`` summary plot.

    Args:
        model: Any object exposing ``predict(X)``.
        X_sample: 2-D array of rows used both as background and as the rows
            to explain.

    Returns:
        None. The function only produces plots.
    """
    def _predict(batch):
        # A single-output Keras regressor returns shape (n, 1). Squeeze that
        # to (n,) so the explainer yields one 2-D (rows x features) matrix,
        # which is the layout summary_plot expects for a single-output model.
        preds = np.asarray(model.predict(batch))
        if preds.ndim == 2 and preds.shape[1] == 1:
            preds = preds[:, 0]
        return preds

    # Create a SHAP KernelExplainer with the model and the data sample.
    explainer = shap.KernelExplainer(_predict, X_sample)
    shap_values = explainer.shap_values(X_sample)

    # Draw the SHAP plots.
    shap.summary_plot(shap_values, X_sample)
    shap.summary_plot(shap_values, X_sample, plot_type="bar")


In [26]:
# Hyperparameters used for the cross-validation run and the final training.
best_params = dict(
    learning_rate=0.001,
    epochs=50,
    batch_size=32,
    layer_1_units=128,
    layer_2_units=64,
    layer_3_units=32,
)


In [27]:
# Cross-validate the chosen hyperparameters (5 folds) and collect the per-fold metrics.
results = cross_val_evaluate(params=best_params, k=5)

# One row per fold, one column per metric; print the column-wise averages.
results_df = pd.DataFrame(data=results)
print(results_df.mean(axis=0))



KeyboardInterrupt: 

In [20]:
# Train the final model with the best hyperparameters.
final_model = create_model(
    best_params["learning_rate"],
    best_params["layer_1_units"],
    best_params["layer_2_units"],
    best_params["layer_3_units"],
)

# Report loss/MAE on the held-out validation split after every epoch (the
# split was created during preprocessing but was never used). Keeping the
# returned History in a variable makes the training curves available later
# and stops its repr from being the cell output.
history = final_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, np.asarray(y_val)),
    epochs=best_params["epochs"],
    batch_size=best_params["batch_size"],
    verbose=1,
)

# SHAP analysis (disabled); uses a sample of the training data.
#shap_sample = X_train[:100]
#shap_analysis(final_model, shap_sample)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1b328814c10>

In [None]:
# Persist the trained Keras model to disk.
final_model.save(filepath='final_model.h5')

# Persist the preprocessor alongside it.
joblib.dump(value=preprocessor, filename='preprocessor.pkl')
