# modelos

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.preprocessing import PolynomialFeatures
from tensorflow import keras
# from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import re
import os
from sklearn.linear_model import Ridge, Lasso, ElasticNet


def train_and_evaluate_linar_model(ruta_guardar, X_train, X_test, y_train, y_test):
    """Tune, persist and evaluate a linear-regression pipeline.

    A ``scaler -> PCA -> LinearRegression`` pipeline is tuned with a 10-fold
    ``GridSearchCV`` scored by negated mean absolute error. Only the scaler
    choice is searched. The best pipeline is pickled to ``ruta_guardar`` and
    then scored on the test split.

    Args:
        ruta_guardar: Destination path of the pickle file. If it contains
            ``pca<digits>`` the number is printed; it is not used to
            configure the PCA step.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.

    Returns:
        A ``(best_model, results_df)`` tuple: the best pipeline found by the
        search and a one-row ``DataFrame`` with the columns ``Model``,
        ``Best_model``, ``Best_params``, ``Best_score``, ``MSE``, ``MAE`` and
        ``R-squared``.
    """
    # The number embedded in the file name is only reported. The PCA step
    # below runs with its default settings.
    match = re.search(r'pca(\d+)', ruta_guardar)
    if match:
        num_pca = int(match.group(1))
        print(f"El número después de 'pca' es: {num_pca}")

    # Pipeline with the preprocessing steps and the model.
    pipe = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ('classifier', LinearRegression()),
    ])

    # Search space: only the scaler varies (None disables scaling).
    linear_params = {
        'scaler': [StandardScaler(), MinMaxScaler(), None],
        'classifier': [LinearRegression()],
    }
    search_space = [linear_params]

    gs = GridSearchCV(estimator=pipe,
                      param_grid=search_space,
                      cv=10,
                      scoring='neg_mean_absolute_error',
                      verbose=2,
                      n_jobs=-1)
    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_
    # May be None when the search selected the "no scaling" option.
    best_scaler = best_model.named_steps['scaler']

    # Persist the whole pipeline together with its scaler step.
    with open(ruta_guardar, 'wb') as file:
        pickle.dump({
            'model': best_model,
            'scaler': best_scaler
        }, file)

    # Evaluate on the test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results_df = pd.DataFrame([{
        'Model': 'Linear Regression',
        'Best_model': best_model,
        'Best_params': gs.best_params_,
        'Best_score': gs.best_score_,
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2
    }])

    return best_model, results_df

# --------------------------------------------

def train_and_evaluate_polynomial_model(ruta_guardar, X_train, X_test, y_train, y_test,results_df):
    """Tune, persist and evaluate a polynomial ElasticNet pipeline.

    The pipeline is ``scaler -> PolynomialFeatures -> PCA -> regressor``. The
    grid replaces the regressor with ``ElasticNet`` and searches its
    ``alpha`` and ``l1_ratio`` as well as the scaler choice, using 5-fold
    cross-validation scored by negated mean absolute error. The best pipeline
    is pickled to ``ruta_guardar`` and scored on the test split.

    Args:
        ruta_guardar: Destination path of the pickle file. The polynomial
            degree is read from a name ending in
            ``<digits>_regresion[_pca<digits>]_poly_<degree>.pkl``. When the
            name does not match, a message is printed and degree 2 (the
            pipeline's own default) is used. If the path contains
            ``pca<digits>`` that number is printed; it is not used to
            configure the PCA step.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        results_df: Results accumulated so far; the new row is appended to it.

    Returns:
        A ``(best_model, results_df)`` tuple: the best pipeline found by the
        search and ``results_df`` extended with one row for this model.
    """
    pca_match = re.search(r'pca(\d+)', ruta_guardar)
    if pca_match:
        num_pca = int(pca_match.group(1))
        print(f"El número después de 'pca' es: {num_pca}")

    degree_match = re.search(r'\d+_regresion(_pca\d+)?_poly_(\d+)\.pkl$', ruta_guardar)
    if degree_match:
        grado = int(degree_match.groups()[-1])
    else:
        # Previously `grado` stayed undefined here and the grid definition
        # below raised NameError. Fall back to the pipeline's default degree.
        print("No se encontró el número")
        grado = 2

    # Pipeline with the preprocessing steps and the model.
    pipe = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("polynomial", PolynomialFeatures(degree=2, include_bias=False)),
        ("pca", PCA()),
        ("classifier", LinearRegression())
    ])

    # np.arange(0.05, 0.15, 0.01): start 0.05, step 0.01, stop 0.15
    # (nominally exclusive).
    polynomial_params = {
        'scaler': [StandardScaler(), None],
        'polynomial__degree': [grado],  # more degrees can be listed here
        'classifier': [ElasticNet()],
        'classifier__alpha': np.arange(0.05, 0.15, 0.01).tolist(),
        'classifier__l1_ratio': np.arange(0.05, 0.15, 0.01).tolist()
    }
    search_space = [polynomial_params]

    gs = GridSearchCV(estimator=pipe,
                      param_grid=search_space,
                      cv=5,
                      scoring='neg_mean_absolute_error',
                      verbose=2,
                      n_jobs=-1)
    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_
    # May be None when the search selected the "no scaling" option.
    best_scaler = best_model.named_steps['scaler']

    # Persist the whole pipeline together with its scaler step.
    with open(ruta_guardar, 'wb') as file:
        pickle.dump({
            'model': best_model,
            'scaler': best_scaler
        }, file)

    # Evaluate on the test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    new_results_df = pd.DataFrame([{
        'Model': f'Polynomial Regression_{grado}',
        'Best_model': best_model,
        'Best_params': gs.best_params_,
        'Best_score': gs.best_score_,
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2
    }])

    # Append this model's row to the accumulated results.
    results_df = pd.concat([results_df, new_results_df], ignore_index=True)

    return best_model, results_df

# --------------------------------------------

def train_and_evaluate_decision_tree_model(ruta_guardar, X_train, X_test, y_train, y_test,results_df):
    """Tune, persist and evaluate a decision-tree regression pipeline.

    A ``scaler -> DecisionTreeRegressor`` pipeline is tuned with a 10-fold
    ``GridSearchCV`` scored by negated mean absolute error. A bar chart of
    the best tree's feature importances is saved next to the pickle (same
    path with a ``.png`` extension), the best pipeline is pickled to
    ``ruta_guardar``, and the model is scored on the test split.

    Args:
        ruta_guardar: Destination path of the pickle file.
        X_train: Training features. When it exposes ``.columns`` those names
            label the importance chart; otherwise generic ``Feature <i>``
            labels are used.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        results_df: Results accumulated so far; the new row is appended to it.

    Returns:
        A ``(best_model, results_df)`` tuple: the best pipeline found by the
        search and ``results_df`` extended with one row for this model.
    """
    ruta_con_png = os.path.splitext(ruta_guardar)[0] + '.png'

    # Pipeline with the preprocessing step and the model.
    pipe = Pipeline(steps=[
        ("scaler", StandardScaler()),  # optional scaling (the grid may drop it)
        ("classifier", DecisionTreeRegressor())
    ])

    tree_params = {
        'scaler': [StandardScaler(), None],
        'classifier__max_depth': [None, 5, 10, 15],  # maximum tree depth
        'classifier__min_samples_split': [5, 10],  # min samples to split a node
        'classifier__min_samples_leaf': [2, 5]  # min samples in a leaf
    }
    search_space = [tree_params]

    gs = GridSearchCV(estimator=pipe,
                      param_grid=search_space,
                      cv=10,
                      scoring='neg_mean_absolute_error',
                      verbose=2,
                      n_jobs=-1)
    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_

    # Feature importances of the fitted tree.
    importances = best_model.named_steps['classifier'].feature_importances_

    # Plain arrays have no column names, so fall back to positional labels
    # instead of failing with AttributeError.
    if hasattr(X_train, 'columns'):
        features = X_train.columns
    else:
        features = [f'Feature {i}' for i in range(len(importances))]

    importance_df = pd.DataFrame({
        'Feature': features,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Save the chart without displaying it. The figure is closed even if
    # drawing or saving fails, so it does not stay open.
    plt.figure(figsize=(10, 6))
    try:
        plt.barh(importance_df['Feature'], importance_df['Importance'])
        plt.xlabel('Importance')
        plt.title('Feature Importances')
        plt.savefig(ruta_con_png)
    finally:
        plt.close()

    # May be None when the search selected the "no scaling" option.
    best_scaler = best_model.named_steps['scaler']

    # Persist the whole pipeline together with its scaler step.
    with open(ruta_guardar, 'wb') as file:
        pickle.dump({
            'model': best_model,
            'scaler': best_scaler
        }, file)

    # Evaluate on the test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    new_results_df = pd.DataFrame([{
        'Model': 'Decision Tree',
        'Best_model': best_model,
        'Best_params': gs.best_params_,
        'Best_score': gs.best_score_,
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2
    }])

    # Append this model's row to the accumulated results.
    results_df = pd.concat([results_df, new_results_df], ignore_index=True)

    return best_model, results_df

# --------------------------------------------

def train_and_evaluate_random_forest_model(ruta_guardar, X_train, X_test, y_train, y_test, results_df):
    """Tune, persist and evaluate a random-forest regression pipeline.

    A ``scaler -> PCA -> RandomForestRegressor`` pipeline is tuned with a
    10-fold ``GridSearchCV`` scored by negated mean absolute error. A bar
    chart of the best forest's feature importances is saved next to the
    pickle (same path with a ``.png`` extension), the best pipeline is
    pickled to ``ruta_guardar``, and the model is scored on the test split.

    Args:
        ruta_guardar: Destination path of the pickle file. If it contains
            ``pca<digits>`` the number is printed; it is not used to
            configure the PCA step.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        results_df: Results accumulated so far; the new row is appended to it.

    Returns:
        A ``(best_model, results_df)`` tuple: the best pipeline found by the
        search and ``results_df`` extended with one row for this model.
    """
    match = re.search(r'pca(\d+)', ruta_guardar)
    if match:
        num_pca = int(match.group(1))
        print(f"El número después de 'pca' es: {num_pca}")

    ruta_con_png = os.path.splitext(ruta_guardar)[0] + '.png'

    # Pipeline with the preprocessing steps and the model.
    pipe = Pipeline(steps=[
        ("scaler", StandardScaler()),  # optional scaling (the grid may drop it)
        ("pca", PCA()),  # runs with default settings
        ("classifier", RandomForestRegressor(random_state=42))
    ])

    forest_params = {
        'scaler': [StandardScaler(), None],
        'classifier__n_estimators': [100],  # number of trees
        'classifier__max_depth': [None, 5, 10],  # maximum depth of each tree
        'classifier__min_samples_split': [5, 10],  # min samples to split a node
        'classifier__min_samples_leaf': [2, 5]  # min samples in a leaf
    }

    gs = GridSearchCV(estimator=pipe,
                      param_grid=forest_params,
                      cv=10,
                      scoring='neg_mean_absolute_error',
                      verbose=2,
                      n_jobs=-1)
    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_

    # Feature importances of the fitted forest.
    importances = best_model.named_steps['classifier'].feature_importances_

    # The regressor sees principal components whenever the PCA step is
    # active, so label the importances accordingly. The other branches only
    # apply if the PCA step is removed or disabled.
    pca_step = best_model.named_steps.get('pca')
    if pca_step is not None:
        features = [f'PC{i + 1}' for i in range(pca_step.n_components_)]
    elif hasattr(X_train, 'columns'):
        features = X_train.columns
    else:
        features = [f'Feature {i}' for i in range(len(importances))]

    importance_df = pd.DataFrame({
        'Feature': features,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Save the chart without displaying it. The figure is closed even if
    # drawing or saving fails, so it does not stay open.
    plt.figure(figsize=(10, 6))
    try:
        plt.barh(importance_df['Feature'], importance_df['Importance'])
        plt.xlabel('Importance')
        plt.title('Feature Importances')
        plt.savefig(ruta_con_png)
    finally:
        plt.close()

    # May be None when the search selected the "no scaling" option.
    best_scaler = best_model.named_steps['scaler']

    # Persist the whole pipeline together with its scaler step.
    with open(ruta_guardar, 'wb') as file:
        pickle.dump({
            'model': best_model,
            'scaler': best_scaler
        }, file)

    # Evaluate on the test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    new_results_df = pd.DataFrame([{
        'Model': 'Random Forest',
        'Best_model': best_model,
        'Best_params': gs.best_params_,
        'Best_score': gs.best_score_,
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2
    }])

    # Append this model's row to the accumulated results.
    results_df = pd.concat([results_df, new_results_df], ignore_index=True)

    return best_model, results_df
   
# --------------------------------------------

def entrenar_xgboost_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df):
    """Tune, persist and evaluate an XGBoost regression pipeline.

    A ``scaler -> PCA -> XGBRegressor`` pipeline is tuned with a 3-fold
    ``GridSearchCV`` scored by negated mean absolute error. A bar chart of
    the best model's feature importances is saved next to the pickle (same
    path with a ``.png`` extension), the best pipeline is pickled to
    ``ruta_guardar``, and the model is scored on the test split.

    Args:
        ruta_guardar: Destination path of the pickle file. If it contains
            ``pca<digits>`` the number is printed; it is not used to
            configure the PCA step.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        results_df: Results accumulated so far; the new row is appended to it.

    Returns:
        A ``(best_model, results_df)`` tuple: the best pipeline found by the
        search and ``results_df`` extended with one row for this model.
    """
    match = re.search(r'pca(\d+)', ruta_guardar)
    if match:
        num_pca = int(match.group(1))
        print(f"El número después de 'pca' es: {num_pca}")

    ruta_con_png = os.path.splitext(ruta_guardar)[0] + '.png'

    # Pipeline with the preprocessing steps and the model.
    pipe = Pipeline(steps=[
        ("scaler", StandardScaler()),  # feature scaling (the grid may drop it)
        ("pca", PCA()),  # runs with default settings
        ("classifier", XGBRegressor(random_state=42, objective='reg:squarederror'))
    ])

    xgb_params = {
        'scaler': [StandardScaler(), MinMaxScaler(), None],
        'classifier__n_estimators': [50,100,200],  # number of trees
        'classifier__max_depth': [2,3,4,5],  # maximum depth
        'classifier__learning_rate': [0.05, 0.2],  # learning rate
        'classifier__subsample': [0.8],  # fraction of samples used
        'classifier__colsample_bytree': [0.6,0.8]  # fraction of features used
    }

    gs = GridSearchCV(estimator=pipe,
                      param_grid=xgb_params,
                      cv=3,
                      scoring='neg_mean_absolute_error',
                      verbose=2,
                      n_jobs=-1)
    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_

    # Feature importances of the fitted booster.
    importances = best_model.named_steps['classifier'].feature_importances_

    # The regressor sees principal components whenever the PCA step is
    # active, so label the importances accordingly. The other branches only
    # apply if the PCA step is removed or disabled.
    pca_step = best_model.named_steps.get('pca')
    if pca_step is not None:
        features = [f'PC{i + 1}' for i in range(pca_step.n_components_)]
    elif hasattr(X_train, 'columns'):
        features = X_train.columns
    else:
        features = [f'Feature {i}' for i in range(len(importances))]

    importance_df = pd.DataFrame({
        'Feature': features,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Save the chart without displaying it. The figure is closed even if
    # drawing or saving fails, so it does not stay open.
    plt.figure(figsize=(10, 6))
    try:
        plt.barh(importance_df['Feature'], importance_df['Importance'])
        plt.xlabel('Importance')
        plt.title('Feature Importances')
        plt.savefig(ruta_con_png)
    finally:
        plt.close()

    # May be None when the search selected the "no scaling" option.
    best_scaler = best_model.named_steps['scaler']

    # Persist the whole pipeline together with its scaler step.
    with open(ruta_guardar, 'wb') as file:
        pickle.dump({
            'model': best_model,
            'scaler': best_scaler
        }, file)

    # Evaluate on the test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    new_results_df = pd.DataFrame([{
        'Model': 'XGBoost (Pipeline)',
        'Best_model': best_model,
        'Best_params': gs.best_params_,
        'Best_score': gs.best_score_,
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2
    }])

    # Append this model's row to the accumulated results.
    results_df = pd.concat([results_df, new_results_df], ignore_index=True)

    return best_model, results_df

# --------------------------------------------

def entrenar_lightgbm_pipeline(ruta_guardar,X_train, X_test,y_train,y_test,results_df):
    """Tune, persist and evaluate a LightGBM regression pipeline.

    A ``scaler -> PCA -> LGBMRegressor`` pipeline is tuned with a 3-fold
    ``GridSearchCV`` scored by negated mean absolute error. A bar chart of
    the best model's feature importances is saved next to the pickle (same
    path with a ``.png`` extension), the best pipeline is pickled to
    ``ruta_guardar``, and the model is scored on the test split.

    Args:
        ruta_guardar: Destination path of the pickle file. If it contains
            ``pca<digits>`` the number is printed; it is not used to
            configure the PCA step.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        results_df: Results accumulated so far; the new row is appended to it.

    Returns:
        A ``(best_model, results_df)`` tuple: the best pipeline found by the
        search and ``results_df`` extended with one row for this model.
    """
    match = re.search(r'pca(\d+)', ruta_guardar)
    if match:
        num_pca = int(match.group(1))
        print(f"El número después de 'pca' es: {num_pca}")

    ruta_con_png = os.path.splitext(ruta_guardar)[0] + '.png'

    # Pipeline with the preprocessing steps and the model.
    pipe = Pipeline(steps=[
        ("scaler", StandardScaler()),  # feature scaling (the grid may drop it)
        ("pca", PCA()),  # runs with default settings
        ("classifier", lgb.LGBMRegressor(random_state=42))
    ])

    lgb_params = {
        'scaler': [StandardScaler(), MinMaxScaler(), None],
        'classifier__n_estimators': [100],  # number of trees
        'classifier__max_depth': [3, 6, 10],  # maximum depth
        'classifier__learning_rate': [0.01, 0.1],  # learning rate
        'classifier__subsample': [0.8],  # fraction of samples used
        'classifier__colsample_bytree': [0.6,0.8],  # fraction of features used
        'classifier__num_leaves': [31, 50, 100]  # number of leaves
    }

    gs = GridSearchCV(estimator=pipe,
                      param_grid=lgb_params,
                      cv=3,
                      scoring='neg_mean_absolute_error',
                      verbose=2,
                      n_jobs=-1)
    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_

    # Feature importances of the fitted booster.
    importances = best_model.named_steps['classifier'].feature_importances_

    # The regressor sees principal components whenever the PCA step is
    # active, so label the importances accordingly. The other branches only
    # apply if the PCA step is removed or disabled.
    pca_step = best_model.named_steps.get('pca')
    if pca_step is not None:
        features = [f'PC{i + 1}' for i in range(pca_step.n_components_)]
    elif hasattr(X_train, 'columns'):
        features = X_train.columns
    else:
        features = [f'Feature {i}' for i in range(len(importances))]

    importance_df = pd.DataFrame({
        'Feature': features,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Save the chart without displaying it. The figure is closed even if
    # drawing or saving fails, so it does not stay open.
    plt.figure(figsize=(10, 6))
    try:
        plt.barh(importance_df['Feature'], importance_df['Importance'])
        plt.xlabel('Importance')
        plt.title('Feature Importances')
        plt.savefig(ruta_con_png)
    finally:
        plt.close()

    # May be None when the search selected the "no scaling" option.
    best_scaler = best_model.named_steps['scaler']

    # Persist the whole pipeline together with its scaler step.
    with open(ruta_guardar, 'wb') as file:
        pickle.dump({
            'model': best_model,
            'scaler': best_scaler
        }, file)

    # Evaluate on the test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    new_results_df = pd.DataFrame([{
        'Model': 'LightGBM (Pipeline)',
        'Best_model': best_model,
        'Best_params': gs.best_params_,
        'Best_score': gs.best_score_,
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2
    }])

    # Append this model's row to the accumulated results.
    results_df = pd.concat([results_df, new_results_df], ignore_index=True)

    return best_model, results_df

# --------------------------------------------

def entrenar_red_neuronal(ruta_guardar, X_train, X_test, y_train, y_test,results_df):
    """Train a small dense regression network, pickle it and record test metrics.

    Parameters
    ----------
    ruta_guardar : str
        Destination of the pickle file. If the substring "log" appears anywhere
        in this path, the network is fitted on ``log1p(y_train)`` and its
        predictions are mapped back with ``expm1`` before scoring.
    X_train, X_test : feature matrices accepted by ``StandardScaler``.
    y_train, y_test : regression targets aligned with the feature matrices.
    results_df : pandas.DataFrame
        Results table; one 'Neural Network' row is appended to a copy of it.

    Returns
    -------
    pandas.DataFrame
        ``results_df`` with the new row concatenated (index reset).

    Notes
    -----
    The pickle holds ``{'model': <keras model>, 'scaler': <StandardScaler>}``.
    In "log" mode the stored model outputs values in log1p space, so whoever
    loads it must apply ``np.expm1`` to its predictions.
    """
    # Evaluate the path convention once so training and scoring cannot disagree.
    usar_log = re.search(r"log", ruta_guardar) is not None
    if usar_log:
        print("Se detectó 'log' en la ruta. Aplicando transformación logarítmica a la variable objetivo.")
        # Only the training target is transformed; y_test stays in original units
        # so the metrics are computed against untouched ground truth.
        y_train = np.log1p(y_train)

    # Hold out 10% of the training data for per-epoch validation.
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

    # Fit the scaler on the training portion only, then reuse it for the other splits.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    X_valid_scaled = scaler.transform(X_valid)

    # An explicit Input layer replaces `input_shape=` on the first Dense layer,
    # which recent Keras versions flag with a UserWarning.
    model = keras.models.Sequential([
        keras.Input(shape=X_train_scaled.shape[1:]),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(1)  # single linear output for regression
    ])

    model.compile(loss="mean_absolute_error",
                  metrics=['mean_absolute_error'],
                  optimizer=keras.optimizers.Adam(learning_rate=0.001))

    model.fit(X_train_scaled, y_train,
              epochs=50,
              batch_size=32,
              validation_data=(X_valid_scaled, y_valid))

    # Persist the network together with the scaler it was trained with.
    with open(ruta_guardar, 'wb') as file:
        pickle.dump({
            'model': model,
            'scaler': scaler
        }, file)

    y_pred_nn = model.predict(X_test_scaled)
    if usar_log:
        # Bring predictions back to the original target scale before scoring.
        y_pred_nn = np.expm1(y_pred_nn)

    mse_nn = mean_squared_error(y_test, y_pred_nn)
    mae_nn = mean_absolute_error(y_test, y_pred_nn)
    r2_nn = r2_score(y_test, y_pred_nn)

    # No grid search is run here, so the search-related columns are placeholders.
    new_results_df = pd.DataFrame([{
        'Model': 'Neural Network',
        'Best_model': '-',
        'Best_params': '-',
        'Best_score': '-',
        'MSE': mse_nn,
        'MAE': mae_nn,
        'R-squared': r2_nn
    }])
    results_df = pd.concat([results_df, new_results_df], ignore_index=True)

    return results_df

# Cargar los CSV

In [22]:
# Read the train and test splits from disk; column 0 of each CSV becomes the index.
X_train, y_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/train/X_train.csv", "../data/train/y_train.csv")
)
X_test, y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/test/X_test.csv", "../data/test/y_test.csv")
)

In [23]:
# Keep only the numeric feature columns of both splits; the targets are left as loaded.
X_train, X_test = (
    split.select_dtypes(include=["number"]) for split in (X_train, X_test)
)


# Prueba cols: Todas

In [None]:
X_train.head(1)

In [None]:
# Sanity check: one shape per line, in the order X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

In [None]:
# Experiment "todas": train every model family on all numeric columns and store
# each fitted model under ../models/todas/.
# The first call is not given a results table; its returned results_df is then
# passed to each later call, whose return value replaces it.
ruta_guardar = '../models/todas/1_regresion_lineal.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train,X_test,y_train,y_test)
ruta_guardar='../models/todas/2_regresion_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas/2_regresion_poly_3.pkl'
# NOTE: this rebinds best_model_regresion_poly, so only the poly_3 model stays in memory.
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas/3_decision_tree.pkl'
best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas/4_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas/5_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas/6_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas/7_red_neuronal.pkl'
# The neural-network trainer returns only the updated results table.
results_df = entrenar_red_neuronal(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/todas/reultados.csv")
results_df

In [None]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)


In [None]:
# Experiment "todas_PCA5": all numeric columns, models stored under ../models/todas_PCA5/.
# train_and_evaluate_linar_model reads the digits after the lowercase token "pca"
# in the file name (5 here); presumably the other trainers follow the same
# convention - verify in their definitions.
ruta_guardar = '../models/todas_PCA5/1_regresion_lineal_pca5.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train,X_test,y_train,y_test)
ruta_guardar='../models/todas_PCA5/2_regresion_pca5_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA5/2_regresion_pca5_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
# Decision tree is disabled for this experiment.
# ruta_guardar='../models/todas_PCA5/3_decision_tree.pkl'
# best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA5/4_pca5_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA5/5_pca5_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA5/6_pca5_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
# Neural network is disabled for this experiment.
# ruta_guardar='../models/todas_PCA5/7_red_neuronal.pkl'
# results_df = entrenar_red_neuronal(ruta_guardar,X_train,X_test,y_train,y_test,results_df)

# Write the summary into the same folder as this run's models
# (it previously went to "../models/todas_PCA/", which no other line of this run uses).
results_df.to_csv("../models/todas_PCA5/reultados.csv")
results_df

In [None]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

In [None]:
# Experiment "todas_PCA6": all numeric columns, models stored under ../models/todas_PCA6/.
# train_and_evaluate_linar_model reads the digits after the lowercase token "pca"
# in the file name (6 here); presumably the other trainers do the same - verify.
ruta_guardar = '../models/todas_PCA6/1_regresion_lineal_pca6.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train,X_test,y_train,y_test)
ruta_guardar='../models/todas_PCA6/2_regresion_pca6_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA6/2_regresion_pca6_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
# Decision tree is disabled for this experiment.
# ruta_guardar='../models/todas_PCA6/3_decision_tree.pkl'
# best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA6/4_pca6_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA6/5_pca6_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA6/6_pca6_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
# Neural network is disabled for this experiment.
# ruta_guardar='../models/todas_PCA6/7_red_neuronal.pkl'
# results_df = entrenar_red_neuronal(ruta_guardar,X_train,X_test,y_train,y_test,results_df)

# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/todas_PCA6/reultados.csv");
results_df

In [None]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

In [None]:
# Experiment "todas_PCA10": all numeric columns, models stored under ../models/todas_PCA10/.
# train_and_evaluate_linar_model extracts the component count with the
# case-sensitive regex r'pca(\d+)'; the upper-case folder name "PCA10" does not
# match it, so the count must be spelled in the file name. The names below used
# to say "pca6" (copied from the previous cell), which made this run read 6.
# Presumably the other trainers parse the path the same way - verify.
ruta_guardar = '../models/todas_PCA10/1_regresion_lineal_pca10.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train,X_test,y_train,y_test)
ruta_guardar='../models/todas_PCA10/2_regresion_pca10_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA10/2_regresion_pca10_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
# Decision tree is disabled for this experiment.
# ruta_guardar='../models/todas_PCA10/3_decision_tree.pkl'
# best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA10/4_pca10_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA10/5_pca10_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
ruta_guardar='../models/todas_PCA10/6_pca10_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train,X_test,y_train,y_test,results_df)
# Neural network is disabled for this experiment.
# ruta_guardar='../models/todas_PCA10/7_red_neuronal.pkl'
# results_df = entrenar_red_neuronal(ruta_guardar,X_train,X_test,y_train,y_test,results_df)

# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/todas_PCA10/reultados.csv")
results_df

In [None]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

# 8 FEATURES

In [None]:
# Column subset used by the "8features" experiments.
desired_cols = [
    "num_pos",
    "apariciones_totales",
    "tirosXp",
    "goles_esperados",
    "fuerajuegoXp",
    "total_disparos",
    "xG/Shots",
    "rating",
]

# Restrict both splits to those columns and confirm the resulting shapes.
X_train2, X_test2 = X_train[desired_cols], X_test[desired_cols]
print(X_train2.shape, X_test2.shape, y_train.shape, y_test.shape, sep="\n")

In [None]:
# Experiment "8features": every model family on the eight-column subset
# (X_train2 / X_test2); models are stored under ../models/8features/.
# The first call is not given a results table; its returned results_df is then
# passed to each later call, whose return value replaces it.
ruta_guardar = '../models/8features/1_regresion_lineal.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train2,X_test2,y_train,y_test)
ruta_guardar='../models/8features/2_regresion_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features/2_regresion_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features/3_decision_tree.pkl'
best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features/4_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features/5_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features/6_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features/7_red_neuronal.pkl'
# The neural-network trainer returns only the updated results table.
results_df = entrenar_red_neuronal(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/8features/reultados.csv")
results_df

In [None]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

In [None]:
# Experiment "8features_pca5": eight-column subset, models stored under
# ../models/8features_pca5/.
# train_and_evaluate_linar_model reads the digits after "pca" in the path (5 here);
# presumably the other trainers do the same - verify.
ruta_guardar = '../models/8features_pca5/1_regresion_lineal_pca5.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train2,X_test2,y_train,y_test)
ruta_guardar='../models/8features_pca5/2_regresion_pca5_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features_pca5/2_regresion_pca5_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# Decision tree is disabled for this experiment.
# ruta_guardar='../models/8features_pca5/3_decision_tree.pkl'
# best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features_pca5/4_pca5_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features_pca5/5_pca5_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features_pca5/6_pca5_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# Neural network is disabled for this experiment.
# ruta_guardar='../models/8features_pca5/7_red_neuronal.pkl'
# results_df = entrenar_red_neuronal(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)

# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/8features_pca5/reultados.csv");
results_df

In [None]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

In [None]:
# Experiment "8features_pca6": eight-column subset, models stored under
# ../models/8features_pca6/.
# train_and_evaluate_linar_model reads the digits after "pca" in the path (6 here);
# presumably the other trainers do the same - verify.
ruta_guardar = '../models/8features_pca6/1_regresion_lineal_pca6.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train2,X_test2,y_train,y_test)
ruta_guardar='../models/8features_pca6/2_regresion_pca6_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features_pca6/2_regresion_pca6_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# Decision tree is disabled for this experiment.
# ruta_guardar='../models/8features_pca6/3_decision_tree.pkl'
# best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features_pca6/4_pca6_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features_pca6/5_pca6_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
ruta_guardar='../models/8features_pca6/6_pca6_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# Neural network is disabled for this experiment.
# ruta_guardar='../models/8features_pca6/7_red_neuronal.pkl'
# results_df = entrenar_red_neuronal(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)

# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/8features_pca6/reultados.csv");
results_df

In [None]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

# 8 features con y poli

In [None]:
# Experiment "8features_y_poli": only the neural network is trained here, once on
# the raw target and once on the log-transformed target. The sklearn-based runs
# below are kept commented out (their paths still point at 8features_pca6).
# ruta_guardar = '../models/8features_pca6/1_regresion_lineal_pca6.pkl'
# best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train2,X_test2,y_train,y_test)
# ruta_guardar='../models/8features_pca6/2_regresion_pca6_poly_2.pkl'
# best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# ruta_guardar='../models/8features_pca6/2_regresion_pca6_poly_3.pkl'
# best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# ruta_guardar='../models/8features_pca6/3_decision_tree.pkl'
# best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# ruta_guardar='../models/8features_pca6/4_pca6_random_forest.pkl'
# best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# ruta_guardar='../models/8features_pca6/5_pca6_xgboost.pkl'
# best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# ruta_guardar='../models/8features_pca6/6_pca6_lightgbm.pkl'
# best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# Relies on results_df already existing (emptied by the preceding reset cell).
ruta_guardar='../models/8features_y_poli/7_red_neuronal.pkl'
results_df = entrenar_red_neuronal(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)
# "log" in this path makes entrenar_red_neuronal train on log1p(y_train).
ruta_guardar='../models/8features_y_poli/7_red_neuronal_log.pkl'
results_df = entrenar_red_neuronal(ruta_guardar,X_train2,X_test2,y_train,y_test,results_df)

# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/8features_y_poli/reultados.csv");
results_df

In [None]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

# 5 features

In [18]:
# Column subset used by the "5features" experiments.
# Earlier seven-column candidate, kept for reference:
# desired_cols = ["num_pos","apariciones_totales","rating","fuerajuegoXp","tirosXp","regatesXp","goles_esperados"]
desired_cols = [
    "num_pos",
    "rating",
    "tirosXp",
    "goles_esperados",
    "apariciones_totales",
]

# Restrict both splits to those columns, confirm the shapes and preview the test rows.
X_train4, X_test4 = X_train[desired_cols], X_test[desired_cols]
print(X_train4.shape, X_test4.shape, y_train.shape, y_test.shape, sep="\n")
X_test4.head()

(2552, 5)
(638, 5)
(2552, 1)
(638, 1)


Unnamed: 0,num_pos,rating,tirosXp,goles_esperados,apariciones_totales
1101,2,6.79,0.1,0.02,11
1073,2,6.35,0.9,1.38,11
844,10,6.74,0.5,1.97,12
445,2,6.58,0.2,0.03,5
1184,5,7.23,1.3,0.69,7


In [19]:
# Join the five test features with the target column-wise.
para_pruebas = pd.concat((X_test4, y_test), axis="columns")

# Rank rows from highest to lowest "goles" and save them for manual testing.
para_pruebas_5f = para_pruebas.sort_values(by="goles", ascending=False)
para_pruebas_5f.to_csv("../data/test/df_pruebas_5_features.csv")

In [44]:
# Experiment "mae/5features": every model family on the five-column subset
# (X_train4 / X_test4); models are stored under ../models/mae/5features/.
# The first call is not given a results table; its returned results_df is then
# passed to each later call, whose return value replaces it.
ruta_guardar = '../models/mae/5features/1_regresion_lineal.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train4,X_test4,y_train,y_test)
ruta_guardar='../models/mae/5features/2_regresion_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train4,X_test4,y_train,y_test,results_df)
ruta_guardar='../models/mae/5features/2_regresion_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train4,X_test4,y_train,y_test,results_df)
ruta_guardar='../models/mae/5features/3_decision_tree.pkl'
best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train4,X_test4,y_train,y_test,results_df)
ruta_guardar='../models/mae/5features/4_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train4,X_test4,y_train,y_test,results_df)
ruta_guardar='../models/mae/5features/5_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train4,X_test4,y_train,y_test,results_df)
ruta_guardar='../models/mae/5features/6_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train4,X_test4,y_train,y_test,results_df)
ruta_guardar='../models/mae/5features/7_red_neuronal.pkl'
# The neural-network trainer returns only the updated results table.
results_df = entrenar_red_neuronal(ruta_guardar,X_train4,X_test4,y_train,y_test,results_df)
# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/mae/5features/reultados.csv")
results_df

Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 24 candidates, totalling 240 fits


  return fit_method(estimator, *args, **kwargs)


Fitting 3 folds for each of 144 candidates, totalling 432 fits
Fitting 3 folds for each of 108 candidates, totalling 324 fits


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 2552, number of used features: 5
[LightGBM] [Info] Start training from score 0.825235
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.6757 - mean_absolute_error: 0.6757 - val_loss: 0.4715 - val_mean_absolute_error: 0.4715
Epoch 2/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4419 - mean_absolute_error: 0.4419 - val_loss: 0.4431 - val_mean_absolute_error: 0.4431
Epoch 3/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4084 - mean_absolute_error: 0.4084 - val_loss: 0.4303 - val_mean_absolute_error: 0.4303
Epoch 4/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3908 - mean_absolute_error: 0.3908 - val_loss: 0.4319 - val_mean_absolute_error: 0.4319
Epoch 5/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3884 - mean_absolute_error: 0.3884 - val_loss: 0.4381 - val_mean_absolute_error: 0.4381
Epoch 6/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3648 - 

Unnamed: 0,Model,Best_model,Best_params,Best_score,MSE,MAE,R-squared
0,Linear Regression,"(MinMaxScaler(), PCA(), LinearRegression())","{'classifier': LinearRegression(), 'scaler': M...",-0.475117,0.563664,0.450837,0.81758
1,Polynomial Regression_2,"(StandardScaler(), PolynomialFeatures(include_...","{'classifier': ElasticNet(), 'classifier__alph...",-0.445089,0.499207,0.426913,0.838441
2,Polynomial Regression_3,"(None, PolynomialFeatures(degree=3, include_bi...","{'classifier': ElasticNet(), 'classifier__alph...",-0.444579,0.482099,0.403519,0.843977
3,Decision Tree,"(StandardScaler(), DecisionTreeRegressor(max_d...","{'classifier__max_depth': 5, 'classifier__min_...",-0.454769,0.693077,0.438632,0.775698
4,Random Forest,"(StandardScaler(), PCA(), (DecisionTreeRegress...","{'classifier__max_depth': 10, 'classifier__min...",-0.427227,0.619884,0.407281,0.799386
5,XGBoost (Pipeline),"(StandardScaler(), PCA(), XGBRegressor(base_sc...","{'classifier__colsample_bytree': 0.8, 'classif...",-0.441998,0.532844,0.409604,0.827554
6,LightGBM (Pipeline),"(StandardScaler(), PCA(), LGBMRegressor(colsam...","{'classifier__colsample_bytree': 0.8, 'classif...",-0.456857,0.553666,0.409482,0.820816
7,Neural Network,-,-,-,0.491886,0.357442,0.84081


In [43]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

# 4 features

In [9]:
# Column subset used by the "4features" experiments.
# Earlier seven-column candidate, kept for reference:
# desired_cols = ["num_pos","apariciones_totales","rating","fuerajuegoXp","tirosXp","regatesXp","goles_esperados"]
desired_cols = [
    "num_pos",
    "rating",
    "tirosXp",
    "goles_esperados",
]

# Restrict both splits to those columns and confirm the resulting shapes.
X_train3, X_test3 = X_train[desired_cols], X_test[desired_cols]
print(X_train3.shape, X_test3.shape, y_train.shape, y_test.shape, sep="\n")


(2552, 4)
(638, 4)
(2552, 1)
(638, 1)


In [11]:
X_train3.head()

Unnamed: 0,num_pos,rating,tirosXp,goles_esperados
576,10,6.98,2.2,1.02
2110,5,6.65,0.3,0.08
2869,5,6.53,0.6,0.67
2513,10,6.05,0.0,0.3
2751,2,6.44,0.3,0.14


In [None]:
# Join the four test features with the target column-wise.
para_pruebas = pd.concat((X_test3, y_test), axis="columns")

# Rank rows from highest to lowest "goles" and save them for manual testing.
para_pruebas_4f = para_pruebas.sort_values(by="goles", ascending=False)
para_pruebas_4f.to_csv("../data/test/df_pruebas_4_features.csv")

In [14]:
# Experiment "mae/4features": every model family on the four-column subset
# (X_train3 / X_test3); models are stored under ../models/mae/4features/.
# The first call is not given a results table; its returned results_df is then
# passed to each later call, whose return value replaces it.
ruta_guardar = '../models/mae/4features/1_regresion_lineal.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train3,X_test3,y_train,y_test)
ruta_guardar='../models/mae/4features/2_regresion_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train3,X_test3,y_train,y_test,results_df)
ruta_guardar='../models/mae/4features/2_regresion_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train3,X_test3,y_train,y_test,results_df)
ruta_guardar='../models/mae/4features/3_decision_tree.pkl'
best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train3,X_test3,y_train,y_test,results_df)
ruta_guardar='../models/mae/4features/4_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train3,X_test3,y_train,y_test,results_df)
ruta_guardar='../models/mae/4features/5_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train3,X_test3,y_train,y_test,results_df)
ruta_guardar='../models/mae/4features/6_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train3,X_test3,y_train,y_test,results_df)
ruta_guardar='../models/mae/4features/7_red_neuronal.pkl'
# The neural-network trainer returns only the updated results table.
results_df = entrenar_red_neuronal(ruta_guardar,X_train3,X_test3,y_train,y_test,results_df)
# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/mae/4features/reultados.csv")
results_df

Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 10 folds for each of 32 candidates, totalling 320 fits
Fitting 10 folds for each of 24 candidates, totalling 240 fits


  return fit_method(estimator, *args, **kwargs)


Fitting 3 folds for each of 144 candidates, totalling 432 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 3 folds for each of 108 candidates, totalling 324 fits


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 2552, number of used features: 4
[LightGBM] [Info] Start training from score 0.825235


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.6955 - mean_absolute_error: 0.6955 - val_loss: 0.4726 - val_mean_absolute_error: 0.4726
Epoch 2/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4248 - mean_absolute_error: 0.4248 - val_loss: 0.4376 - val_mean_absolute_error: 0.4376
Epoch 3/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3831 - mean_absolute_error: 0.3831 - val_loss: 0.4312 - val_mean_absolute_error: 0.4312
Epoch 4/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.3804 - mean_absolute_error: 0.3804 - val_loss: 0.4304 - val_mean_absolute_error: 0.4304
Epoch 5/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3727 - mean_absolute_error: 0.3727 - val_loss: 0.4385 - val_mean_absolute_error: 0.4385
Epoch 6/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss

Unnamed: 0,Model,Best_model,Best_params,Best_score,MSE,MAE,R-squared
0,Linear Regression,"(MinMaxScaler(), PCA(), LinearRegression())","{'classifier': LinearRegression(), 'scaler': M...",-0.478196,0.567629,0.456085,0.816297
1,Polynomial Regression_2,"(None, PolynomialFeatures(include_bias=False, ...","{'classifier': ElasticNet(), 'classifier__alph...",-0.442076,0.542606,0.426487,0.824395
2,Polynomial Regression_3,"(None, PolynomialFeatures(degree=3, include_bi...","{'classifier': ElasticNet(), 'classifier__alph...",-0.418438,0.4967,0.40174,0.839252
3,Decision Tree,"(StandardScaler(), DecisionTreeRegressor(max_d...","{'classifier__max_depth': 5, 'classifier__min_...",-0.455182,0.694482,0.439788,0.775243
4,Random Forest,"(None, PCA(), (DecisionTreeRegressor(max_depth...","{'classifier__max_depth': 10, 'classifier__min...",-0.42303,0.498419,0.393147,0.838696
5,XGBoost (Pipeline),"(None, PCA(), XGBRegressor(base_score=None, bo...","{'classifier__colsample_bytree': 0.8, 'classif...",-0.438118,0.573191,0.421612,0.814497
6,LightGBM (Pipeline),"(None, PCA(), LGBMRegressor(colsample_bytree=0...","{'classifier__colsample_bytree': 0.8, 'classif...",-0.447527,0.528481,0.408596,0.828967
7,Neural Network,-,-,-,0.496089,0.353099,0.83945


In [15]:
# Drop every row but keep the columns, so the next experiment starts from an empty table.
results_df = results_df.head(0)

# 3Features

In [None]:
# Column subset used by the "3features" experiments.
desired_cols = [
    "rating",
    "tirosXp",
    "goles_esperados",
]

# Restrict both splits to those columns, confirm the shapes and preview the training rows.
X_train5, X_test5 = X_train[desired_cols], X_test[desired_cols]
print(X_train5.shape, X_test5.shape, y_train.shape, y_test.shape, sep="\n")
X_train5.head()

In [None]:
# para_pruebas = pd.concat([X_test5, y_test], axis=1)

# para_pruebas = para_pruebas.sort_values("goles",ascending=False)
# para_pruebas.to_csv("../data/test/df_pruebas_3_features.csv")

In [None]:
# Experiment "mae/3features": every model family on the three-column subset
# (X_train5 / X_test5); models are stored under ../models/mae/3features/.
# The first call is not given a results table; its returned results_df is then
# passed to each later call, whose return value replaces it.
ruta_guardar = '../models/mae/3features/1_regresion_lineal.pkl'
best_model_regresion_lineal,results_df = train_and_evaluate_linar_model(ruta_guardar,X_train5,X_test5,y_train,y_test)
ruta_guardar='../models/mae/3features/2_regresion_poly_2.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train5,X_test5,y_train,y_test,results_df)
ruta_guardar='../models/mae/3features/2_regresion_poly_3.pkl'
best_model_regresion_poly,results_df = train_and_evaluate_polynomial_model(ruta_guardar,X_train5,X_test5,y_train,y_test,results_df)
ruta_guardar='../models/mae/3features/3_decision_tree.pkl'
best_model_decision_tree,results_df = train_and_evaluate_decision_tree_model(ruta_guardar,X_train5,X_test5,y_train,y_test,results_df)
ruta_guardar='../models/mae/3features/4_random_forest.pkl'
best_model_random_forest,results_df = train_and_evaluate_random_forest_model(ruta_guardar,X_train5,X_test5,y_train,y_test,results_df)
ruta_guardar='../models/mae/3features/5_xgboost.pkl'
best_model_xgboost,results_df = entrenar_xgboost_pipeline(ruta_guardar,X_train5,X_test5,y_train,y_test,results_df)
ruta_guardar='../models/mae/3features/6_lightgbm.pkl'
best_model_lightgbm,results_df = entrenar_lightgbm_pipeline(ruta_guardar,X_train5,X_test5,y_train,y_test,results_df)
ruta_guardar='../models/mae/3features/7_red_neuronal.pkl'
# The neural-network trainer returns only the updated results table.
results_df = entrenar_red_neuronal(ruta_guardar,X_train5,X_test5,y_train,y_test,results_df)
# Persist the comparison table next to the models, then display it as the cell output.
results_df.to_csv("../models/mae/3features/reultados.csv")
results_df

# Modelos inicio

## Regresión lineal simple

In [None]:
# Stand-alone linear-regression experiment: grid-search a scaler + PCA +
# LinearRegression pipeline on the full numeric feature set, pickle the best
# pipeline and report its test metrics.
ruta_guardar = '../models/1_regresion_lineal.pkl'

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pickle
from sklearn.decomposition import PCA

# Start an empty list to collect the result rows
model_results = []

# Pipeline skeleton; the grid below may swap or drop the scaler step.
pipe = Pipeline(steps=[("scaler", StandardScaler()),
                       ("pca", PCA()),
                       ('classifier', LinearRegression())
])

# 3 scaler options x 2 PCA settings: 10 components, or enough components to
# explain 95% of the variance (a float in (0, 1) is a variance ratio for PCA).
linear_params = {
    'scaler': [StandardScaler(), MinMaxScaler(), None],
    'pca__n_components': [10, 0.95],
    'classifier': [LinearRegression()]
}

search_space = [
    linear_params
]

# 10-fold cross-validated search, scored by R^2, using all available cores.
gs = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 10,
                  scoring='r2',
                  verbose=2,
                  n_jobs=-1)

gs.fit(X_train, y_train)

# Save the best pipeline to a .pkl file
best_model = gs.best_estimator_
with open(ruta_guardar, 'wb') as file:
    pickle.dump(best_model, file)

# Evaluate on the held-out test split
Y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, Y_pred)
mae = mean_absolute_error(y_test, Y_pred)
r2 = r2_score(y_test, Y_pred)

# Report the metrics
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Record this model's metrics
model_results.append({
    'Model': 'Linear Regression',
    'MSE': mse,
    'MAE': mae,
    'R-squared': r2
})

# Turn the collected rows into a DataFrame (this rebinds the notebook-level results_df)
results_df = pd.DataFrame(model_results)

# Show the results
print(results_df)




## Regresión Polinómica de grado 2

In [None]:
# Train the polynomial-regression pipeline through the shared helper and pickle it.
ruta_guardar = '../models/2_regresion_polinomica.pkl'
# NOTE(review): every other call in this notebook passes results_df as a sixth
# argument; confirm the helper's signature gives it a default, otherwise this
# call raises TypeError.
best_model, results_df = train_and_evaluate_polynomial_model(ruta_guardar, X_train, X_test, y_train, y_test)


In [None]:
# Stand-alone degree-2 polynomial-regression experiment: grid-search a
# scaler + PolynomialFeatures + PCA + LinearRegression pipeline, pickle the
# best pipeline and report its test metrics.
# NOTE: this writes to the same path as the helper-based cell just above,
# so whichever cell runs last determines the file's contents.
ruta_guardar = '../models/2_regresion_polinomica.pkl'

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.decomposition import PCA  # used by the pipeline below; keeps the cell self-contained
import pickle

# Start an empty list to collect the result rows
model_results = []

# Pipeline for polynomial regression; the grid below may swap or drop the scaler step.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("polynomial", PolynomialFeatures(degree=2, include_bias=False)),
    ("pca", PCA()),
    ("classifier", LinearRegression())
])

# Search space: 3 scaler options x 2 PCA settings (10 components, or enough
# components to explain 95% of the variance).
polynomial_params = {
    'scaler': [StandardScaler(), MinMaxScaler(), None],
    'polynomial__degree': [2],  # more degrees can be added here to try them
    'pca__n_components': [10, 0.95],
    'classifier': [LinearRegression()]
}

search_space = [
    polynomial_params
]

# 10-fold cross-validated search, scored by R^2, using all available cores.
gs = GridSearchCV(estimator=pipe,
                  param_grid=search_space,
                  cv=10,
                  scoring='r2',
                  verbose=2,
                  n_jobs=-1)

# Fit on the training data
gs.fit(X_train, y_train)

# Save the best pipeline to a .pkl file
best_model = gs.best_estimator_
with open(ruta_guardar, 'wb') as file:
    pickle.dump(best_model, file)

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Record this model's metrics. The label was 'Linear Regression' (copied from
# the previous cell); it now matches the 'Polynomial Regression_2' label that
# appears in the notebook's result tables.
model_results.append({
    'Model': 'Polynomial Regression_2',
    'MSE': mse,
    'MAE': mae,
    'R-squared': r2
})

# Turn the collected rows into a DataFrame (this rebinds the notebook-level results_df)
results_df = pd.DataFrame(model_results)

# Show the results
print(results_df)

## Árbol de decisión

In [None]:
# Train, tune and evaluate the decision-tree model through the helper defined
# elsewhere in this file. Judging by how the result is unpacked, the helper
# returns the best estimator and a results DataFrame, and presumably writes
# the pickle to `ruta_guardar` -- confirm against the helper.
ruta_guardar = '../models/3_arbol_decision.pkl'
best_model, results_df = train_and_evaluate_decision_tree_model(ruta_guardar, X_train, X_test, y_train, y_test)


In [None]:
ruta_guardar = '../models/3_arbol_decision.pkl'

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle

# Inline decision-tree regression: grid-search a scaler -> tree pipeline on
# X_train / y_train, pickle the winner to `ruta_guardar`, then score it on
# X_test / y_test.

# Metric rows gathered by this cell (one dict per evaluated model).
model_results = []

# Pipeline for the decision tree (the scaler can be disabled by the search).
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("classifier", DecisionTreeRegressor())
])

# Search space: 3 * 4 * 3 * 3 = 108 combinations, i.e. 1,080 fits at cv=10.
tree_params = {
    'scaler': [StandardScaler(), MinMaxScaler(), None],  # None disables scaling
    'classifier__max_depth': [None, 5, 10, 15],  # maximum tree depth
    'classifier__min_samples_split': [2, 5, 10],  # min samples to split a node
    'classifier__min_samples_leaf': [1, 2, 5]  # min samples in a leaf
}

search_space = [
    tree_params
]

# 10-fold cross-validation, ranked by R^2, using every available core.
gs = GridSearchCV(estimator=pipe,
                  param_grid=search_space,
                  cv=10,
                  scoring='r2',
                  verbose=2,
                  n_jobs=-1)

# Fit the search on the training data.
gs.fit(X_train, y_train)

# Save the best pipeline to a .pkl file.
best_model = gs.best_estimator_
with open(ruta_guardar, 'wb') as file:
    pickle.dump(best_model, file)

# Hold-out evaluation on the test split.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Record the metrics under this model's own name (the row was previously
# mislabelled 'Linear Regression', a copy-paste leftover).
model_results.append({
    'Model': 'Decision Tree',
    'MSE': mse,
    'MAE': mae,
    'R-squared': r2
})

# Show the metrics as a one-row table.
results_df = pd.DataFrame(model_results)
print(results_df)


## Random Forest

In [None]:
# Train, tune and evaluate the random-forest model through the helper defined
# elsewhere in this file. Judging by how the result is unpacked, the helper
# returns the best estimator and a results DataFrame, and presumably writes
# the pickle to `ruta_guardar` -- confirm against the helper.
ruta_guardar = '../models/4_random_forest.pkl'
best_model, results_df = train_and_evaluate_random_forest_model(ruta_guardar, X_train, X_test, y_train, y_test)


In [None]:
ruta_guardar = '../models/4_random_forest.pkl'

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# PCA is used by the pipeline below; importing it here keeps the cell
# runnable on its own, like the linear-regression cell.
from sklearn.decomposition import PCA
import pickle

# Inline random-forest regression: grid-search a scaler -> PCA -> forest
# pipeline on X_train / y_train, pickle the winner to `ruta_guardar`, then
# score it on X_test / y_test.

# Metric rows gathered by this cell (one dict per evaluated model).
model_results = []

# Pipeline for the random forest (fixed seed for reproducible forests).
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("classifier", RandomForestRegressor(random_state=42))
])

# Search space: 3 * 3 * 3 * 3 * 2 * 2 = 324 combinations, i.e. 3,240 fits
# at cv=10.
forest_params = {
    'scaler': [StandardScaler(), MinMaxScaler(), None],  # None disables scaling
    "pca__n_components": [5, 10, 0.95],  # fixed sizes, or 95% explained variance
    'classifier__n_estimators': [100, 200, 300],  # number of trees in the forest
    'classifier__max_depth': [None, 5, 10],  # maximum depth of each tree
    'classifier__min_samples_split': [2, 5],  # min samples to split a node
    'classifier__min_samples_leaf': [1, 2]  # min samples in a leaf
}


search_space = [
    forest_params
]

# 10-fold cross-validation, ranked by R^2, using every available core.
gs = GridSearchCV(estimator=pipe,
                  param_grid=search_space,
                  cv=10,
                  scoring='r2',
                  verbose=2,
                  n_jobs=-1)

# Fit the search on the training data.
gs.fit(X_train, y_train)

# Save the best pipeline to a .pkl file.
best_model = gs.best_estimator_
with open(ruta_guardar, 'wb') as file:
    pickle.dump(best_model, file)

# Hold-out evaluation on the test split.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Record the metrics under this model's own name (the row was previously
# mislabelled 'Linear Regression', a copy-paste leftover).
model_results.append({
    'Model': 'Random Forest',
    'MSE': mse,
    'MAE': mae,
    'R-squared': r2
})

# Show the metrics as a one-row table.
results_df = pd.DataFrame(model_results)
print(results_df)



## Red neuronal simple

In [None]:
ruta_guardar = '../models/5_red_neuronal_simple.pkl'

import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

# Simple feed-forward regression network trained on standardized features.
# Expects the notebook-level feature matrix `X` and target `Y` to exist; the
# metric functions used at the end come from the notebook-level imports.

model_results = []

# 80/20 train/test split, then 10% of the training part is held out for
# validation during fitting.
# NOTE: both splits rebind the notebook-level X_train / X_test / y_train /
# y_test names, so cells executed after this one see these partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Fit the scaler on the training rows only and reuse it for the other splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Architecture: 64 -> 32 ReLU hidden units, single linear output unit.
model = keras.models.Sequential([
    keras.layers.Dense(
        64, activation='relu', input_shape=X_train_scaled.shape[1:]
    ),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1),
])

# Optimize mean squared error with Adam; mean absolute error is tracked only
# as a reporting metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="mean_squared_error",
    metrics=['mean_absolute_error'],
)

# Train for 50 epochs in batches of 32, monitoring the validation split.
history = model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_valid_scaled, y_valid),
    batch_size=32,
    epochs=50,
)

# Persist the trained network. The fitted `scaler` is not stored with it.
# NOTE(review): pickling a Keras model is not supported by every
# TensorFlow/Keras version -- confirm, or consider the native model.save().
with open(ruta_guardar, 'wb') as file:
    pickle.dump(model, file)

# Hold-out evaluation on the scaled test split.
y_pred_nn = model.predict(X_test_scaled)
r2_nn = r2_score(y_test, y_pred_nn)
mae_nn = mean_absolute_error(y_test, y_pred_nn)
mse_nn = mean_squared_error(y_test, y_pred_nn)

# Record the metrics and show them as a one-row table.
model_results.append(
    {'Model': 'Neural Network', 'MSE': mse_nn, 'MAE': mae_nn, 'R-squared': r2_nn}
)
results_df = pd.DataFrame(model_results)
print(results_df)


## Red neuronal con pipeline

In [None]:
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Función para definir el modelo
def crear_modelo():
    """Build and compile the dense regression network used by the pipeline.

    The input width is taken from the notebook-level ``X_train`` at call
    time, so that variable must exist before this function is invoked.

    Returns:
        A compiled ``keras.models.Sequential`` with two ReLU hidden layers
        (64 and 32 units) and a single-unit output layer, using mean squared
        error loss, Adam (learning rate 0.001) and a mean-absolute-error
        metric.
    """
    n_features = X_train.shape[1]

    network = keras.models.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(n_features,)),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(1),
    ])
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return network


In [None]:
ruta_guardar = '../models/6_red_neuronal_pipeline.pkl'
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# NOTE(review): this wrapper module is not shipped by every TensorFlow
# release, and the file header has the same import commented out -- confirm
# it resolves in the target environment.
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
import pickle

# Neural network wrapped in a scikit-learn pipeline: grid-search epochs and
# batch size for a scaler -> KerasRegressor pipeline, evaluate the winner on
# the test split, then pickle it to `ruta_guardar`. Expects the notebook-level
# `X`, `Y` and `crear_modelo` to exist.

# Metric rows gathered by this cell (one dict per evaluated model).
model_results = []

# 80/20 train/test split, then a further 10% of the training part is set
# aside as X_valid / y_valid (not used again inside this cell).
# NOTE: both splits rebind the notebook-level X_train / X_test / y_train /
# y_test names, so cells executed after this one see these partitions.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Standardize the features, then fit the Keras network built by crear_modelo.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("keras_regressor", KerasRegressor(build_fn=crear_modelo, epochs=50, batch_size=32, verbose=0))
])

# Hyper-parameters explored by the grid search (2 * 2 = 4 combinations).
param_grid = {
    'keras_regressor__epochs': [50, 100],
    'keras_regressor__batch_size': [16, 32],
}

# 3-fold cross-validation. scoring='r2' is stated explicitly so candidates
# are ranked by the same criterion as every other search in this notebook,
# rather than by the wrapped estimator's own default score.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='r2', verbose=2, n_jobs=-1)

# Fit the search on the training data.
grid_search.fit(X_train, y_train)

# Best pipeline found by the search.
best_model = grid_search.best_estimator_

# Hold-out evaluation on the test split.
Y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, Y_pred)
mae = mean_absolute_error(y_test, Y_pred)
r2 = r2_score(y_test, Y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Save the best pipeline.
# NOTE(review): pickling a pipeline that holds a Keras model is not supported
# by every TensorFlow/Keras version -- confirm in the target environment.
with open(ruta_guardar, 'wb') as file:
    pickle.dump(best_model, file)


# Record the metrics under this model's own name (the row was previously
# mislabelled 'Linear Regression', a copy-paste leftover).
model_results.append({
    'Model': 'Neural Network (Pipeline)',
    'MSE': mse,
    'MAE': mae,
    'R-squared': r2
})

# Show the metrics as a one-row table.
results_df = pd.DataFrame(model_results)
print(results_df)


## XGBoost

In [None]:
ruta_guardar = '../models/7_xgboost.pkl'

import pickle

from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBRegressor

# XGBoost regression: grid-search a scaler -> PCA -> XGBRegressor pipeline on
# X_train / y_train, pickle the winner to `ruta_guardar`, then score it on
# X_test / y_test.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("classifier", XGBRegressor(objective='reg:squarederror', random_state=42)),
])

# Seven parameters with three candidates each: 3**7 = 2,187 combinations,
# i.e. 21,870 fits at cv=10.
xgb_params = dict(
    scaler=[StandardScaler(), MinMaxScaler(), None],  # None disables scaling
    pca__n_components=[5, 10, 0.95],  # fixed sizes, or 95% explained variance
    classifier__n_estimators=[100, 200, 300],  # number of boosted trees
    classifier__max_depth=[3, 6, 10],  # maximum tree depth
    classifier__learning_rate=[0.01, 0.1, 0.2],  # learning rate
    classifier__subsample=[0.8, 0.9, 1.0],  # fraction of rows per tree
    classifier__colsample_bytree=[0.8, 0.9, 1.0],  # fraction of columns per tree
)

# 10-fold cross-validation, ranked by R^2, using every available core.
gs = GridSearchCV(
    pipe,
    xgb_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
gs.fit(X_train, y_train)

# Persist the best pipeline found by the search.
best_model = gs.best_estimator_
with open(ruta_guardar, 'wb') as file:
    pickle.dump(best_model, file)

# Hold-out evaluation on the test split.
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


In [None]:
ruta_guardar = '../models/8_lightgbm.pkl'

import pickle

import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# LightGBM regression: grid-search a scaler -> PCA -> LGBMRegressor pipeline
# on X_train / y_train, pickle the winner to `ruta_guardar`, then score it on
# X_test / y_test.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("classifier", lgb.LGBMRegressor(random_state=42)),
])

# Seven parameters with three candidates each: 3**7 = 2,187 combinations,
# i.e. 21,870 fits at cv=10.
lgb_params = dict(
    scaler=[StandardScaler(), MinMaxScaler(), None],  # None disables scaling
    pca__n_components=[5, 10, 0.95],  # fixed sizes, or 95% explained variance
    classifier__n_estimators=[100, 200, 300],  # number of boosted trees
    classifier__max_depth=[5, 10, -1],  # maximum tree depth
    classifier__learning_rate=[0.01, 0.05, 0.1],  # learning rate
    classifier__num_leaves=[31, 50, 100],  # number of leaves per tree
    classifier__subsample=[0.8, 0.9, 1.0],  # fraction of rows sampled
)

# 10-fold cross-validation, ranked by R^2, using every available core.
gs = GridSearchCV(
    pipe,
    lgb_params,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
gs.fit(X_train, y_train)

# Persist the best pipeline found by the search.
best_model = gs.best_estimator_
with open(ruta_guardar, 'wb') as file:
    pickle.dump(best_model, file)

# Hold-out evaluation on the test split.
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)
