In [1]:
from google.colab import files
uploaded = files.upload()

Saving 09_db_delantero_filtered_top40pct.csv to 09_db_delantero_filtered_top40pct.csv


In [2]:
import pandas as pd
# Leemos el fichero
df = pd.read_csv('09_db_delantero_filtered_top40pct.csv', sep=",", encoding='utf-8')
# Mostramos las 5 primeras filas
df.head()

Unnamed: 0,Nombre completo,Lugar de nacimiento (país),Nacionalidad,Posición principal,Club actual,Proveedor,Fin de contrato,Fecha de fichaje,comprado_por,gk_diving,...,longshots,understat_matches,understat_assists,crossing,understat_shots,ballcontrol,acceleration,vision,Valor de mercado actual (numérico),Cluster
0,jeremy-doku,11.0,69,2,54,6.0,30/06/2028,24/08/2023,1.487382,-0.321842,...,0.749581,0.591692,2.958668,0.637932,1.677758,0.769843,1.820397,0.771632,55000000.0,3
1,عمر مرموش,24.0,105,2,54,6.0,30/06/2029,23/01/2025,3.96545,-0.321842,...,1.164485,0.591692,2.034918,0.342824,2.700295,1.046008,1.295837,1.12827,75000000.0,3
2,Philip Walter Foden,46.0,196,2,54,6.0,30/06/2027,01/07/2017,,-0.466613,...,1.348887,1.175273,2.958668,1.129779,3.955226,1.211706,1.230267,1.484908,130000000.0,2
3,Erling Braut Håland,46.0,267,2,54,6.0,30/06/2034,01/07/2022,10.935016,-0.51487,...,1.348887,0.786219,1.573043,0.047716,4.745368,0.880309,0.836848,0.771632,200000000.0,2
4,oscar-bobb,65.0,269,2,54,7.0,30/06/2029,01/07/2023,,-0.321842,...,0.196376,-0.867261,-0.274456,0.146085,-0.739146,0.659378,1.099127,0.628976,25000000.0,1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping

# Lista de columnas a excluir
exclude_columns = [
    'Lugar de nacimiento (país)', 'Nacionalidad', 'Club actual', 'Proveedor',
    'Fin de contrato', 'Fecha de fichaje', 'comprado_por',
    'Nombre completo', 'Posición principal'
]

# Obtener los valores únicos de la columna 'Cluster'
clusters = df['Cluster'].unique()

# Lista para guardar resultados por cluster
resultados = []

# Función para crear modelo y evitar retracing
def build_model(input_shape):
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model

# EarlyStopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1
)

for cluster_id in clusters:
    df_cluster = df[df['Cluster'] == cluster_id].copy()

    # Eliminar columnas no deseadas y filas con nulos
    df_filtrado = df_cluster.drop(columns=exclude_columns, errors='ignore').dropna()


    # Separar X e y
    y = df_filtrado['Valor de mercado actual (numérico)']
    X = df_filtrado.drop(columns=['Valor de mercado actual (numérico)', 'Cluster'], errors='ignore')

    # Escalado
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

    # División entrenamiento/prueba
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)


    # Crear y entrenar modelo
    model = build_model(X_train.shape[1])
    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=0
    )

    # Predicciones
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Invertir escalado
    y_train_original = scaler_y.inverse_transform(y_train).ravel()
    y_test_original = scaler_y.inverse_transform(y_test).ravel()
    y_train_pred_original = scaler_y.inverse_transform(y_train_pred).ravel()
    y_test_pred_original = scaler_y.inverse_transform(y_test_pred).ravel()

    # Métricas
    r2_train = r2_score(y_train_original, y_train_pred_original)
    r2_test = r2_score(y_test_original, y_test_pred_original)
    mae_test = mean_absolute_error(y_test_original, y_test_pred_original)
    rmse_test = np.sqrt(mean_squared_error(y_test_original, y_test_pred_original))
    ratio_overfitting = r2_test / r2_train if r2_train != 0 else np.nan
    n_muestras = df_filtrado.shape[0]

    resultados.append({
        'Posición': 'Modelo por cluster',
        'Clúster': cluster_id,
        'N Muestras': n_muestras,
        'R2 Train': r2_train,
        'R2 Test': r2_test,
        'MAE Test': mae_test,
        'RMSE Test': rmse_test,
        'Ratio Overfitting': ratio_overfitting
    })

# Mostrar tabla de resultados
print(f"{'Posición':<20} {'Clúster':<10} {'N Muestras':<12} {'R2 Train':<10} {'R2 Test':<10} {'MAE Test':<10} {'RMSE Test':<10} {'Ratio Overfitting'}")
for res in resultados:
    print(f"{res['Posición']:<20} {res['Clúster']:<10} {res['N Muestras']:<12} {res['R2 Train']:<10.4f} {res['R2 Test']:<10.4f} {res['MAE Test']:<10.4f} {res['RMSE Test']:<10.4f} {res['Ratio Overfitting']:.4f}")


Epoch 12: early stopping
Restoring model weights from the end of the best epoch: 7.




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Epoch 38: early stopping
Restoring model weights from the end of the best epoch: 33.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step




Epoch 12: early stopping
Restoring model weights from the end of the best epoch: 7.
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Epoch 6: early stopping
Restoring model weights from the end of the best epoch: 1.
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Posición             Clúster    N Muestras   R2 Train   R2 Test    MAE Test   RMSE Test  Ratio Overfitting
Modelo por cluster   3          78           0.2958     -0.3089    16331618.0000 21199783.8246 -1.0442
Modelo por cluster   2          4            0.9029     nan        45286944.0000 45286944.0000 nan
Modelo por cluster   1          227          0.3871     0.0388     5814309.6739 7472136.3907 0.1003
Modelo por cluster   0          58           -0.0703    -0.0398    2224721.6562 2942041.8238 0.5661


In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Lista de columnas a excluir
exclude_columns = [
    'Lugar de nacimiento (país)', 'Nacionalidad', 'Club actual', 'Proveedor',
    'Fin de contrato', 'Fecha de fichaje', 'comprado_por',
    'Nombre completo', 'Posición principal'
]

# Obtener valores únicos de clusters
clusters = df['Cluster'].unique()

# Lista para resultados
resultados = []

for cluster_id in clusters:
    df_cluster = df[df['Cluster'] == cluster_id].copy()

    # Filtrar columnas y eliminar nulos
    df_filtrado = df_cluster.drop(columns=exclude_columns, errors='ignore').dropna()


    # Separar X e y
    y = df_filtrado['Valor de mercado actual (numérico)']
    X = df_filtrado.drop(columns=['Valor de mercado actual (numérico)', 'Cluster'], errors='ignore')

    # Escalado
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).ravel()

    # Train/Test split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)


    # Entrenar modelo
    modelo = LinearRegression()
    modelo.fit(X_train, y_train)

    # Predicciones
    y_train_pred = modelo.predict(X_train)
    y_test_pred = modelo.predict(X_test)

    # Invertir escalado
    y_train_orig = scaler_y.inverse_transform(y_train.reshape(-1, 1)).ravel()
    y_test_orig = scaler_y.inverse_transform(y_test.reshape(-1, 1)).ravel()
    y_train_pred_orig = scaler_y.inverse_transform(y_train_pred.reshape(-1, 1)).ravel()
    y_test_pred_orig = scaler_y.inverse_transform(y_test_pred.reshape(-1, 1)).ravel()

    # Métricas
    r2_train = r2_score(y_train_orig, y_train_pred_orig)
    r2_test = r2_score(y_test_orig, y_test_pred_orig)
    mae_test = mean_absolute_error(y_test_orig, y_test_pred_orig)
    rmse_test = np.sqrt(mean_squared_error(y_test_orig, y_test_pred_orig))
    ratio_overfitting = r2_test / r2_train if r2_train != 0 else np.nan

    resultados.append({
        'Modelo': 'Lineal',
        'Clúster': cluster_id,
        'N Muestras': df_filtrado.shape[0],
        'R2 Train': r2_train,
        'R2 Test': r2_test,
        'MAE Test': mae_test,
        'RMSE Test': rmse_test,
        'Ratio Overfitting': ratio_overfitting
    })

# Mostrar resultados
print(f"{'Modelo':<10} {'Clúster':<10} {'N Muestras':<12} {'R2 Train':<10} {'R2 Test':<10} {'MAE Test':<10} {'RMSE Test':<10} {'Ratio Overfitting'}")
for res in resultados:
    print(f"{res['Modelo']:<10} {res['Clúster']:<10} {res['N Muestras']:<12} {res['R2 Train']:<10.4f} {res['R2 Test']:<10.4f} {res['MAE Test']:<10.4f} {res['RMSE Test']:<10.4f} {res['Ratio Overfitting']:.4f}")


Modelo     Clúster    N Muestras   R2 Train   R2 Test    MAE Test   RMSE Test  Ratio Overfitting
Lineal     3          78           0.5676     0.0376     15030677.6847 18178280.9629 0.0662
Lineal     2          4            1.0000     nan        33260121.1510 33260121.1510 nan
Lineal     1          227          0.5052     0.2088     5400655.7929 6779183.0266 0.4133
Lineal     0          58           0.5863     -0.3591    2711341.0523 3363565.9797 -0.6126


