In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Raw CSV of the Copa Libertadores 2023 goal scorers, served from GitHub.
url = (
    "https://raw.githubusercontent.com/Izainea/proyecto_prueba/refs/heads/main/Datos/Copa_Libertadores_2023_Complete_Goal_Scorers.csv"
)

In [6]:
# Read the CSV at `url` into a DataFrame, then preview its first three rows.
datos = pd.read_csv(filepath_or_buffer=url)
datos.head(n=3)

Unnamed: 0,Player,Team,Country,Team Nationality,Goals,Penalty Goals,Team Total Goals
0,Germán Cano,Fluminense RJ,Argentina,Brasil,13,0,24
1,Paulinho,Atlético Mineiro,Brasil,Brasil,7,0,14
2,Dorlan Pabón,Atlético Nacional,Colombia,Colombia,6,2,12


In [9]:
# Count missing values per column (`isna` is the canonical name of `isnull`).
datos.isna().sum()

Player              0
Team                0
Country             0
Team Nationality    0
Goals               0
Penalty Goals       0
Team Total Goals    0
dtype: int64

In [11]:
# Separate the target column ('Goals') from the features (every other column).
y = datos['Goals']
X = datos.drop('Goals', axis='columns')

In [12]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Feature columns grouped by the kind of preprocessing they receive:
# categorical columns and numeric columns.
# NOTE(review): 'Player' looks like a per-row identifier — confirm that
# treating it as a categorical feature is intended.
categorical_cols = [
    'Player',
    'Team',
    'Country',
    'Team Nationality',
]
numeric_cols = [
    'Penalty Goals',
    'Team Total Goals',
]

In [15]:
# Numeric preprocessing: a single-step pipeline that applies StandardScaler.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

In [16]:
# Categorical preprocessing: one-hot encoding. handle_unknown='ignore' keeps
# transform from raising on categories that were not present during fit.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [17]:
# Route each column group to its transformer. Columns named in neither list
# are handled by ColumnTransformer's `remainder` setting (default: 'drop').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [18]:
# Preprocessing-only pipeline (no estimator step); it wraps the same
# `preprocessor` instance built above.
pipeline = Pipeline([('preprocessor', preprocessor)])

# Entrenamiento del Modelo de Machine Learning

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [20]:
# Full model: the preprocessing step followed by a linear regression.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit the whole pipeline on the training split.
model.fit(X_train, y_train)

# Predict on the held-out split and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.7738397150355307


# Guardado del Modelo con Joblib

In [21]:
import joblib

# Persist the fitted model pipeline. Per the joblib docs, the '.gz' extension
# selects the compression method automatically.
joblib.dump(value=model, filename='mejor_modelo.gz')

# Persist the preprocessing-only pipeline as a separate artifact. This stays
# the cell's last expression so the list of written files is displayed.
joblib.dump(value=pipeline, filename='pipeline_total.gz')

['pipeline_total.gz']