In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

def _build_model(numeric_cols, categorical_cols):
    """Assemble the unfitted preprocessing + linear-regression pipeline.

    Args:
        numeric_cols: Names of the columns to standardize with
            ``StandardScaler``.
        categorical_cols: Names of the columns to one-hot encode. The encoder
            is created with ``handle_unknown='ignore'`` so that categories
            absent from the training split do not raise at prediction time.

    Returns:
        A ``Pipeline`` with two steps: ``'preprocessor'`` (a
        ``ColumnTransformer``) followed by ``'regressor'`` (a
        ``LinearRegression``).
    """
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route each group of columns to its own transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols),
        ])

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ])


def main(data_path='Copa_Libertadores_2023_Complete_Goal_Scorers.csv',
         model_path='mejor_modelo.gz',
         preprocessor_path='pipeline_total.gz',
         test_size=0.2,
         random_state=42):
    """Train, evaluate and persist a linear regression that predicts ``Goals``.

    Reads the CSV at ``data_path``, holds out part of the rows for testing,
    fits the preprocessing + regression pipeline on the remainder, prints the
    mean squared error on the held-out rows, and writes two joblib artifacts.

    All parameters default to the values that were previously hard-coded, so
    calling ``main()`` with no arguments behaves as before.

    Args:
        data_path: CSV file to load. It must contain a ``Goals`` column plus
            the feature columns listed in the body of this function.
        model_path: Destination for the fitted full pipeline.
        preprocessor_path: Destination for the fitted preprocessing step alone.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The mean squared error on the held-out test rows (also printed).
    """
    # Load the data.
    data = pd.read_csv(data_path)

    # Separate features from the target.
    X = data.drop(columns=['Goals'])
    y = data['Goals']

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Columns consumed by the model, grouped by how they are preprocessed.
    categorical_cols = ['Player', 'Team', 'Country', 'Team Nationality']
    numeric_cols = ['Penalty Goals', 'Team Total Goals']

    # Build and train the full pipeline.
    model = _build_model(numeric_cols, categorical_cols)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')

    # Persist the full model and, separately, its preprocessing step. The
    # step is taken from the pipeline so the saved object is the one that was
    # fitted together with the regressor.
    joblib.dump(model, model_path)
    joblib.dump(model.named_steps['preprocessor'], preprocessor_path)

    return mse

# Script entry point: run the training routine only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()