# Predicting Forward Performance in Big 5 Leagues from Brazil Serie A

This notebook analyzes how forwards perform when moving from Brazil's Serie A to the Big 5 European leagues. It:
- Matches players who have played in both leagues.
- Calculates the percentage change in key performance statistics.
- Trains a neural network model to predict performance differences.
- Allows user input to predict a player's performance in a Big 5 league based on their Brazil Serie A stats.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf  # Ensure TensorFlow is installed
import keras as ks


In [3]:
# Load datasets
brasil_serieA_df = pd.read_csv("data/Fixed_Brasil_SerieA.csv")
big5_forwards_df = pd.read_csv("data/Big5CombinedForwards.csv")


## Match Players in Both Leagues

In [4]:
# Standardize player names
brasil_serieA_df["Player"] = brasil_serieA_df["Player"].str.strip().str.lower()
big5_forwards_df["Player"] = big5_forwards_df["Player"].str.strip().str.lower()

# Group Big 5 players by name and average stats if they appear multiple times
numeric_columns = big5_forwards_df.select_dtypes(include=[np.number]).columns
big5_grouped = big5_forwards_df.groupby("Player")[numeric_columns].mean().reset_index()

# Merge datasets based on player name
matched_players = brasil_serieA_df.merge(big5_grouped, on="Player", suffixes=("_brasil", "_big5"))
matched_players.head()


Unnamed: 0,Player,Gls/90_brasil,G/Sh_brasil,G/SoT_brasil,SoT%_brasil,SoT/90_brasil,Sh/90_brasil,G-PK/90,PK/90_brasil,PKatt/90_brasil,...,Sh/90_big5,PK/90_big5,PKatt/90_big5,Dist_big5,Age_big5,MP_big5,90s_big5,Starts_big5,Subs_big5,unSub_big5
0,andré silva,0.4,0.2,0.54,37.1,0.8,2.1,0.4,0.0,0.0,...,3.033333,0.066667,0.066667,12.766667,23.5,28.833333,20.95,20.833333,8.0,4.0
1,antony,0.2,0.06,0.15,40.0,1.0,2.5,0.2,0.0,0.0,...,3.25,0.0,0.0,19.25,22.5,27.0,17.35,19.0,8.0,2.0
2,dimitri payet,0.2,0.15,0.5,30.8,0.3,1.0,0.1,0.1,0.1,...,2.06,0.1,0.1,23.46,32.0,29.6,24.88,25.8,3.8,0.6
3,douglas costa,0.1,0.07,0.2,34.9,0.7,2.1,0.1,0.0,0.0,...,1.8,0.0,0.0,19.7,26.0,31.0,19.9,18.0,13.0,6.0
4,felipe anderson,0.1,0.08,0.29,29.2,0.5,1.6,0.1,0.0,0.0,...,1.516667,0.0,0.0,17.333333,27.0,32.666667,26.516667,27.666667,5.0,1.0


## Compute Percentage Differences

In [None]:
performance_columns = ["Gls/90", "G/Sh", "G/SoT", "SoT%", "SoT/90", "Sh/90", "PK/90", "PKatt/90, Dist"]

# Calculate percentage difference
for col in performance_columns:
    matched_players[f"{col}_diff"] = ((matched_players[f"{col}_big5"] - matched_players[f"{col}_brasil"]) / matched_players[f"{col}_brasil"]) * 100

# Compute average percentage difference across all metrics
matched_players["Avg_Percent_Diff"] = matched_players[[f"{col}_diff" for col in performance_columns]].mean(axis=1)


## Prepare Data for Machine Learning

In [6]:
# Define features and target variables
X = matched_players[[f"{col}_brasil" for col in performance_columns]].values  # Features from Brazil Serie A
y = matched_players[[f"{col}_diff" for col in performance_columns]].values  # Target: Percentage change in Big 5 leagues

# Replace infinity and NaN values with a large finite number or zero
X = np.nan_to_num(X, nan=0.0, posinf=1e10, neginf=-1e10)
y = np.nan_to_num(y, nan=0.0, posinf=1e10, neginf=-1e10)

# Standardize data
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)


## Train a Neural Network Model

In [8]:
# Define the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(y_train.shape[1])  # Output layer
])

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_test, y_test), verbose=3)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Save Model and Scalers

In [None]:
# Save the trained model
model.save("brazil_to_big5_model.h5")
np.save("scaler_X.npy", [scaler_X.mean_, scaler_X.scale_])
np.save("scaler_y.npy", [scaler_y.mean_, scaler_y.scale_])




In [21]:
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import ipywidgets as widgets
from IPython.display import display, clear_output

# Load trained model
model = tf.keras.models.load_model(
    "/workspaces/FBREF_Analysis/CONMEBOL vs Big5 Analysis/brazil_to_big5_model.h5", 
    custom_objects={"mse": tf.keras.losses.MeanSquaredError()}
)

# Properly load scaler parameters (mean and scale)
scaler_X_params = np.load("/workspaces/FBREF_Analysis/CONMEBOL vs Big5 Analysis/scaler_X.npy", allow_pickle=True)
scaler_y_params = np.load("/workspaces/FBREF_Analysis/CONMEBOL vs Big5 Analysis/scaler_y.npy", allow_pickle=True)

scaler_X = StandardScaler()
scaler_X.mean_, scaler_X.scale_ = scaler_X_params

scaler_y = StandardScaler()
scaler_y.mean_, scaler_y.scale_ = scaler_y_params

# Widget labels (your provided feature names)
labels = ["Gls/90", "G/Sh", "G/SoT", "SoT%", "SoT/90", "Sh/90", "PK/90", "PKatt/90", "Dist"]

# Create widgets for user input
input_widgets = [widgets.FloatText(description=label, value=0.0) for label in labels]

# Display input widgets
for widget in input_widgets:
    display(widget)

# Button setup
predict_button = widgets.Button(description="Predict Performance", button_style='success')
output_area = widgets.Output()

display(predict_button, output_area)

# Define prediction function triggered on click
def predict_performance(button):
    with output_area:
        clear_output(wait=True)
        try:
            # Capture inputs
            player_stats = np.array([w.value for w in input_widgets]).reshape(1, -1)
            print("Player Stats Input:", player_stats.flatten())

            # Prediction pipeline
            scaled_input = scaler_X.transform(player_stats)
            scaled_prediction = model.predict(scaled_input)
            prediction = scaler_y.inverse_transform(scaled_prediction)

            # Display results explicitly
            print("\nPredicted Performance in Top 5 Leagues:")
            for label, value in zip(labels, prediction.flatten()):
                print(f"{label}: {value:.3f}")

        except Exception as e:
            print("Error during prediction:", e)

# Bind button click to prediction function
predict_button.on_click(predict_performance)




ValueError: too many values to unpack (expected 2)

## Predict Player Performance in Big 5 Leagues

In [None]:
'''
# Load trained model and scalers
import ipywidgets as widgets
from IPython.display import display
model = tf.keras.models.load_model("/workspaces/FBREF_Analysis/CONMEBOL vs Big5 Analysis/brazil_to_big5_model.h5", custom_objects={"mse": tf.keras.losses.MeanSquaredError()})
scaler_X.scale_ = np.load("scaler_X.npy", allow_pickle=True)
scaler_y.scale_ = np.load("scaler_y.npy", allow_pickle=True)

# Function to get stats from user input
def get_player_stats():
    stats = []
    print("Enter the player's performance stats from Brazil Serie A:")
    for metric in performance_columns:
        value = widgets.FloatText(description=f"{metric}:")
        display(value)
        stats.append(value)
    return stats

# Function to make a prediction
def predict_performance(stats):
    stats_values = np.array([stat.value for stat in stats]).reshape(1, -1)
    stats_scaled = scaler_X.transform(stats_values)
    predicted_diff_scaled = model.predict(stats_scaled)
    predicted_diff = scaler_y.inverse_transform(predicted_diff_scaled)
    predicted_big5_stats = stats_values + (predicted_diff * stats_values / 100)
    return predicted_big5_stats

# Main function
def main():
    stats = get_player_stats()
    button = widgets.Button(description="Predict")
    display(button)

    def on_button_click(b):
        predicted_stats = predict_performance(stats)
        for metric, value in zip(performance_columns, predicted_stats[0]):
            print(f"{metric}: {value:.2f}")

    button.on_click(on_button_click)

if __name__ == "__main__":
    main()
'''
