# Predicting Forward Performance in Big 5 Leagues from Brazil Serie A

This notebook analyzes how forwards perform when moving from Brazil's Serie A to the Big 5 European leagues. It:
- Matches players who have played in both leagues.
- Calculates the percentage change in key performance statistics.
- Trains a neural network model to predict performance differences.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf  # Ensure TensorFlow is installed

# Load datasets
brasil_serieA_df = pd.read_csv("data/Fixed_Brasil_SerieA.csv")
big5_forwards_df = pd.read_csv("data/Big5CombinedForwards.csv")

2025-02-18 15:07:30.429442: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-18 15:07:30.456117: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-18 15:07:30.765793: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-18 15:07:30.874251: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739891251.153585   27433 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739891251.23

## Match Players in Both Leagues

In [3]:
# Standardize player names
brasil_serieA_df["Player"] = brasil_serieA_df["Player"].str.strip().str.lower()
big5_forwards_df["Player"] = big5_forwards_df["Player"].str.strip().str.lower()

# Group Big 5 players by name and average stats if they appear multiple times
numeric_columns = big5_forwards_df.select_dtypes(include=[np.number]).columns
big5_grouped = big5_forwards_df.groupby("Player")[numeric_columns].mean().reset_index()

# Merge datasets based on player name
matched_players = brasil_serieA_df.merge(big5_grouped, on="Player", suffixes=("_brasil", "_big5"))

## Compute Percentage Differences

In [5]:
performance_columns = ["Gls/90", "G/Sh", "G/SoT", "SoT%", "SoT/90", "Sh/90", "PK/90", "PKatt/90"]

# Calculate percentage difference
for col in performance_columns:
    matched_players[f"{col}_diff"] = ((matched_players[f"{col}_big5"] - matched_players[f"{col}_brasil"]) / matched_players[f"{col}_brasil"]) * 100

# Compute average percentage difference across all metrics
matched_players["Avg_Percent_Diff"] = matched_players[[f"{col}_diff" for col in performance_columns]].mean(axis=1)

## Prepare Data for Machine Learning

In [7]:
# Define features and target variables
X = matched_players[[f"{col}_brasil" for col in performance_columns]].values  # Features from Brazil Serie A
y = matched_players[[f"{col}_diff" for col in performance_columns]].values  # Target: Percentage change in Big 5 leagues

# Replace infinity and NaN values with a large finite number or zero
X = np.nan_to_num(X, nan=0.0, posinf=1e10, neginf=-1e10)
y = np.nan_to_num(y, nan=0.0, posinf=1e10, neginf=-1e10)

# Standardize data
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

## Train a Neural Network Model

In [8]:
# Define the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(y_train.shape[1])  # Output layer
])

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_test, y_test), verbose=1)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-02-18 15:11:11.326980: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 213ms/step - loss: 1.2081 - mae: 0.7909 - val_loss: 1.1931 - val_mae: 0.8152
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 1.1038 - mae: 0.7718 - val_loss: 1.1709 - val_mae: 0.8055
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 1.0994 - mae: 0.7571 - val_loss: 1.1527 - val_mae: 0.7973
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 1.0922 - mae: 0.7635 - val_loss: 1.1394 - val_mae: 0.7905
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 0.9572 - mae: 0.7061 - val_loss: 1.1291 - val_mae: 0.7847
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - loss: 0.8799 - mae: 0.7075 - val_loss: 1.1208 - val_mae: 0.7797
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 0.9397 - mae: 0.72

## Save Model and Scalers

In [9]:
# Save the trained model
model.save("brazil_to_big5_model.h5")
np.save("scaler_X.npy", scaler_X.scale_)
np.save("scaler_y.npy", scaler_y.scale_)



## Predict

In [10]:
model = tf.keras.models.load_model("brazil_to_big5_model.h5")
scaler_X = StandardScaler()
scaler_X.scale_ = np.load("scaler_X.npy", allow_pickle=True)
scaler_y = StandardScaler()
scaler_y.scale_ = np.load("scaler_y.npy", allow_pickle=True)

# Define performance metrics used in training
performance_columns = ["Gls/90", "G/Sh", "G/SoT", "SoT%", "SoT/90", "Sh/90", "G-PK/90", "PK/90", "PKatt/90"]

# Function to get stats from user input
def get_player_stats():
    print("Enter the player's performance stats from Brazil Serie A:")
    stats = []
    for metric in performance_columns:
        value = float(input(f"{metric}: "))
        stats.append(value)
    return np.array(stats).reshape(1, -1)

# Function to make a prediction
def predict_performance(stats):
    stats_scaled = scaler_X.transform(stats)  # Apply the same scaling
    predicted_diff_scaled = model.predict(stats_scaled)
    predicted_diff = scaler_y.inverse_transform(predicted_diff_scaled)  # Reverse scaling
    predicted_big5_stats = stats + (predicted_diff * stats / 100)  # Adjust based on percentage difference
    return predicted_big5_stats

# Main function
def main():
    stats = get_player_stats()
    predicted_stats = predict_performance(stats)
    
    print("\nPredicted performance in a Big 5 league:")
    for metric, value in zip(performance_columns, predicted_stats[0]):
        print(f"{metric}: {value:.2f}")

if __name__ == "__main__":
    main()
    

TypeError: Could not locate function 'mse'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': 'keras.metrics', 'class_name': 'function', 'config': 'mse', 'registered_name': 'mse'}