In [2]:
import pandas as pd

# Load the dataset
file_path = '../data/visualizations/combined_data.csv'
data = pd.read_csv(file_path)

# Standardize column names
data.columns = data.columns.str.strip().str.lower()

# Ensure round is numeric
data['round'] = data['round'].str.extract('(\\d+)').astype(int)

# Group by 'round' and calculate aggregate statistics
grouped_data = data.groupby('round').agg({
    'round': 'first',
    'expected goals': 'mean',
    'venue': 'first',
    'opponent': 'first',
    'goals': 'mean',
    'shots': 'mean',
    'touches': 'mean',
    'ast': 'mean',
    'passcmp': 'mean',
    'passcmp%': 'mean',
    'min': 'mean',
    'formation': 'first',
    'opp formation': 'first',
    'poss': 'mean',
    'gca': 'mean',
    'sca': 'mean',
    'gls': 'mean',
    'sh': 'mean'
})

# Add rolling and cumulative features (excluding the current match)
rolling_window = 5

# Team-level rolling and cumulative features
team_rolling_features = ['expected goals', 'goals', 'shots', 'touches', 'poss', 'gca', 'sca']
for feature in team_rolling_features:
    grouped_data[f'rolling_{feature}'] = grouped_data[feature].shift(1).rolling(window=rolling_window, min_periods=1).mean()
    grouped_data[f'cumulative_{feature}'] = grouped_data[feature].shift(1).expanding().mean()

# Player-level rolling and cumulative features for Haaland
haaland_features = ['gls', 'ast', 'sh', 'min', 'touches', 'poss', 'gca', 'sca']
for feature in haaland_features:
    grouped_data[f'rolling_haaland_{feature}'] = grouped_data[feature].shift(1).rolling(window=rolling_window, min_periods=1).mean()
    grouped_data[f'cumulative_haaland_{feature}'] = grouped_data[feature].shift(1).expanding().mean()

# Add Haaland's performance rating
# Performance Rating Formula: (Goals * 4 + Assists * 3 + xG Contribution) / Minutes Played
grouped_data['haaland_performance_rating'] = (
    grouped_data['gls'] * 4 + grouped_data['ast'] * 3 + grouped_data['expected goals']
) / grouped_data['min'].replace(0, 1)  # Replace 0 minutes to avoid division by zero

# Add rolling and cumulative ratings (excluding current match)
grouped_data['rolling_haaland_rating'] = grouped_data['haaland_performance_rating'].shift(1).rolling(window=rolling_window, min_periods=1).mean()
grouped_data['cumulative_haaland_rating'] = grouped_data['haaland_performance_rating'].shift(1).expanding().mean()

# Define the parse_formation function
def parse_formation(formation):
    parts = str(formation).split('-')
    if len(parts) == 3:
        return [int(part) for part in parts]
    elif len(parts) == 4:
        parts[1] = int(parts[1]) + int(parts[2])
        parts[2] = parts[3]
        return parts[:3]
    return [0, 0, 0]

# Parse formation columns
grouped_data[['defenders', 'midfielders', 'forwards']] = pd.DataFrame(
    grouped_data['formation'].apply(parse_formation).tolist(), index=grouped_data.index)
grouped_data[['defenders_opp', 'midfielders_opp', 'forwards_opp']] = pd.DataFrame(
    grouped_data['opp formation'].apply(parse_formation).tolist(), index=grouped_data.index)

# Drop original formation columns
grouped_data = grouped_data.drop(columns=['formation', 'opp formation'])
grouped_data=grouped_data.fillna(0)
# Add Opponent Strength Data
team_data = pd.DataFrame({
    'opponent': [
        'Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton',
        'Burnley', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham',
        'Liverpool', 'Luton Town', 'Manchester Utd', 'Newcastle Utd',
        "Nott'ham Forest", 'Sheffield Utd', 'Tottenham', 'West Ham', 'Wolves'
    ],
    'opponent_strength': [
        4.07, 0.57, -0.48, -0.02, -0.21, -0.79, 0.88, -0.18, -0.10, -0.40,
        3.45, -0.90, -0.29, 0.96, -0.44, -1.03, 0.71, -0.51, -0.54
    ],
})

grouped_data = pd.merge(grouped_data, team_data, on='opponent', how='left')

# Save the processed data
output_file_path = '../data/model/processed_xg_rolling_features_enhanced.csv'
grouped_data.to_csv(output_file_path, index=False)
print(f"Processed data saved to {output_file_path}")


Processed data saved to ../data/model/processed_xg_rolling_features_enhanced.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Paths to the data
haaland_features_path = '../data/model/enhanced_haaland_match_features.csv'
xg_features_path = '../data/model/processed_xg_rolling_features_enhanced.csv'

# Load datasets
haaland_features = pd.read_csv(haaland_features_path)
xg_features = pd.read_csv(xg_features_path)

# Standardize column names to lowercase
haaland_features.columns = haaland_features.columns.str.strip().str.lower()
xg_features.columns = xg_features.columns.str.strip().str.lower()

# Ensure 'round' column is consistent
xg_features['round'] = xg_features['round'].astype(int)
haaland_features['round'] = haaland_features['round'].astype(int)

# Merge datasets on 'round', using left join
merged_data = xg_features

# Filter for rounds less than 33
filtered_data = merged_data[merged_data['round']]

# Define feature columns for prediction
xg_features_only = [
    'rolling_expected goals', 'cumulative_expected goals',
    'rolling_goals', 'cumulative_goals', 'rolling_shots', 'cumulative_shots',
    'rolling_touches', 'cumulative_touches', 'rolling_poss', 'cumulative_poss',
    'rolling_gca', 'cumulative_gca', 'rolling_sca', 'cumulative_sca',
    'rolling_haaland_gls', 'cumulative_haaland_gls',
    'rolling_haaland_ast', 'cumulative_haaland_ast',
    'rolling_haaland_sh', 'cumulative_haaland_sh',
    'rolling_haaland_min', 'cumulative_haaland_min',
    'rolling_haaland_touches', 'cumulative_haaland_touches',
    'rolling_haaland_poss', 'cumulative_haaland_poss',
    'rolling_haaland_gca', 'cumulative_haaland_gca',
    'rolling_haaland_sca', 'cumulative_haaland_sca',
    'haaland_performance_rating', 'rolling_haaland_rating', 'cumulative_haaland_rating',
    'defenders', 'midfielders', 'forwards',
    'defenders_opp', 'midfielders_opp', 'forwards_opp', 'opponent_strength','min'
]

# Ensure all required features are present in the filtered dataset
X = filtered_data[xg_features_only].dropna()  # Remove rows with missing values
y = filtered_data.loc[X.index, 'expected goals']  # Match rows for 'expected goals'

# Compute feature importance using RandomForestRegressor (you can use any model)
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X, y)

# Extract feature importances
feature_importance = rf_model.feature_importances_

# Normalize feature importances to get feature weights (can scale by any factor)
feature_weights = feature_importance / feature_importance.sum()

# Create a DataFrame for feature importance and weights
feature_importance_df = pd.DataFrame({
    'Feature': xg_features_only,
    'Importance': feature_importance,
    'Weight': feature_weights
}).sort_values(by='Importance', ascending=False)

# Save the feature importance to CSV
feature_importance_path = '../data/model/feature_importance.csv'
feature_importance_df.to_csv(feature_importance_path, index=False)

# Print feature importance
print("Feature Importance and Weights:")
print(feature_importance_df)

# Scale the features based on the importance
X_scaled = X.copy()
for i, feature in enumerate(xg_features_only):
    X_scaled[feature] = X[feature] * feature_weights[i]

# Train a Gradient Boosting Regressor with the scaled features
gb_model = GradientBoostingRegressor(random_state=42, n_estimators=100)
gb_model.fit(X_scaled, y)

# Predict on the same data
y_pred = gb_model.predict(X_scaled)

# Evaluate model performance on the same data
mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

# Display evaluation metrics
print("Model Evaluation Metrics (with Gradient Boosting and Feature Scaling):")
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R² Score:", r2)


Feature Importance and Weights:
                       Feature  Importance    Weight
30  haaland_performance_rating    0.251792  0.251792
37             midfielders_opp    0.066260  0.066260
21      cumulative_haaland_min    0.053377  0.053377
0       rolling_expected goals    0.050219  0.050219
39           opponent_strength    0.047445  0.047445
1    cumulative_expected goals    0.044544  0.044544
29      cumulative_haaland_sca    0.042089  0.042089
3             cumulative_goals    0.040751  0.040751
13              cumulative_sca    0.037124  0.037124
20         rolling_haaland_min    0.034078  0.034078
17      cumulative_haaland_ast    0.030294  0.030294
7           cumulative_touches    0.026661  0.026661
22     rolling_haaland_touches    0.023987  0.023987
31      rolling_haaland_rating    0.019231  0.019231
6              rolling_touches    0.018505  0.018505
28         rolling_haaland_sca    0.018072  0.018072
19       cumulative_haaland_sh    0.017533  0.017533
4             

In [4]:
def predict_and_compare_expected_goals(venue, opponent, min, model, xg_features, xg_features_only):
    """
    Compares the actual Expected Goals in the filtered data with the predicted Expected Goals.
    
    Parameters:
    - venue: Venue of the match
    - opponent: Opponent team
    - min: Additional minutes to include in the prediction
    - model: The pre-trained prediction model (e.g., GradientBoostingRegressor)
    - xg_features: The dataset used for making predictions
    - xg_features_only: List of features used for prediction
    
    Returns:
    - Comparison of actual and predicted Expected Goals
    """
    # Filter the data for the specified venue and opponent
    filtered_data = xg_features[(xg_features['venue'] == venue) & 
                                (xg_features['opponent'] == opponent)].copy()  # Use a copy to avoid modifying the original

    if filtered_data.empty:
        return "No matching data for the specified venue and opponent."
    
    # Update the 'Min' feature for prediction
    filtered_data['min'] = min
    
    # Print the updated 'min' values to verify
    print(filtered_data['min'].tolist())
    
    # Extract actual Expected Goals
    actual_expected_goals = filtered_data['expected goals'].tolist()
    
    # Prepare features for prediction
    features = filtered_data[xg_features_only]

    # Use the model to predict Expected Goals
    predicted_expected_goals = model.predict(features)

    # Create a comparison dataframe
    comparison = pd.DataFrame({
        'Actual Expected Goals': actual_expected_goals,
        'Predicted Expected Goals': predicted_expected_goals
    })

    # Return the comparison
    return comparison


In [5]:
import pickle

# Bundle the model and preprocessor
model_preprocessor_bundle = {
    'model': gb_model,
    
}

# Save to a pickle file
file_path = '../data/model/gb_model.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(model_preprocessor_bundle, file)

print(f"Model and preprocessor saved to {file_path}")


Model and preprocessor saved to ../data/model/gb_model.pkl


In [10]:
# Example usage
# Assuming you have the trained model and data ready
xg_features_only = [
    'rolling_expected goals', 'cumulative_expected goals',
    'rolling_goals', 'cumulative_goals', 'rolling_shots', 'cumulative_shots',
    'rolling_touches', 'cumulative_touches', 'rolling_poss', 'cumulative_poss',
    'rolling_gca', 'cumulative_gca', 'rolling_sca', 'cumulative_sca',
    'rolling_haaland_gls', 'cumulative_haaland_gls',
    'rolling_haaland_ast', 'cumulative_haaland_ast',
    'rolling_haaland_sh', 'cumulative_haaland_sh',
    'rolling_haaland_min', 'cumulative_haaland_min',
    'rolling_haaland_touches', 'cumulative_haaland_touches',
    'rolling_haaland_poss', 'cumulative_haaland_poss',
    'rolling_haaland_gca', 'cumulative_haaland_gca',
    'rolling_haaland_sca', 'cumulative_haaland_sca',
    'haaland_performance_rating', 'rolling_haaland_rating', 'cumulative_haaland_rating',
    'defenders', 'midfielders', 'forwards',
    'defenders_opp', 'midfielders_opp', 'forwards_opp', 'opponent_strength','min'
]
xg_features = pd.read_csv(xg_features_path)
xg_features.columns = xg_features.columns.str.strip().str.lower()

# Ensure 'round' column is consistent
xg_features['round'] = xg_features['round'].astype(int)

comparison_result = predict_and_compare_expected_goals('Home', "Luton Town", 90, gb_model, xg_features, xg_features_only)

# Display the comparison result
print(comparison_result)


[90]
   Actual Expected Goals  Predicted Expected Goals
0                    4.2                  2.805339
