In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from helpers import load_csv, save_csv, log, check_file_exists, load_and_merge_prediction_data, preprocess_prediction_data, 
    make_predictions, save_predictions

In [2]:
def load_and_merge_data(data_directory, positions):
    """Load the per-position ``<POS>_final.csv`` files and stack them row-wise.

    For every entry in ``positions`` the file
    ``<data_directory>/processed_data/<position>/<position>_final.csv`` is
    looked up. Missing files are logged as warnings and skipped; files for
    which ``load_csv`` returns ``None`` are skipped silently.

    Args:
        data_directory (str): Root directory that contains ``processed_data``.
        positions (Iterable[str]): Position folder/file prefixes to load.

    Returns:
        pd.DataFrame: All loaded frames concatenated with a fresh integer
        index, or an empty DataFrame when nothing could be loaded.
    """
    loaded_frames = []
    for position in positions:
        position_file = os.path.join(data_directory, "processed_data", position, f"{position}_final.csv")
        if not check_file_exists(position_file):
            log(f"File {position_file} not found. Skipping.", level="WARNING")
            continue

        position_data = load_csv(position_file)
        if position_data is None:
            continue

        loaded_frames.append(position_data)
        log(f"Merged {position_file}")

    if not loaded_frames:
        return pd.DataFrame()

    # Concatenate once: growing a frame inside the loop re-copies all rows
    # accumulated so far on every iteration, and seeding the concat with an
    # empty frame can interfere with dtype inference.
    return pd.concat(loaded_frames, ignore_index=True)

In [3]:
def calculate_blended_rate(data, cumulative_total_col, cumulative_metric_col, smoothing_factor=5):
    """
    Calculate a per-player conversion rate shrunk towards the league-wide rate.

    Each player's rate is ``max(cumulative_total) / max(cumulative_metric)``.
    It is blended with the league rate (sum of player totals divided by sum of
    player metrics) using the weight ``metric / (metric + smoothing_factor)``,
    so players with little accumulated metric lean on the league rate.

    Args:
        data (pd.DataFrame): The input data containing 'id', cumulative totals, and cumulative metrics.
        cumulative_total_col (str): The column name for the cumulative total metric (e.g., 'cumulative_goals').
        cumulative_metric_col (str): The column name for the cumulative metric (e.g., 'cumulative_xg').
        smoothing_factor (float): Smoothing factor for blending.

    Returns:
        pd.DataFrame: DataFrame with 'id' and the blended rate column ('blended_rate').
    """
    # One row per player, using the largest cumulative value seen for each column.
    grouped = data.groupby("id").agg(
        cumulative_total=(cumulative_total_col, "max"),
        cumulative_metric=(cumulative_metric_col, "max")
    ).reset_index()

    # Player-level rate. Division by a zero metric yields NaN (0/0) or +/-inf
    # (x/0). Both are mapped to 0: such a player has weight 0 below, and
    # leaving inf in place would turn ``0 * inf`` into NaN in the blend.
    raw_rate = grouped["cumulative_total"] / grouped["cumulative_metric"]
    grouped["player_rate"] = raw_rate.replace([np.inf, -np.inf], np.nan).fillna(0)

    # League-wide rate; falls back to 1 when no metric has accumulated at all.
    total_sum = grouped["cumulative_total"].sum()
    metric_sum = grouped["cumulative_metric"].sum()
    league_rate = total_sum / metric_sum if metric_sum > 0 else 1

    # Blend: the more metric a player has accumulated, the more their own rate counts.
    grouped["weight"] = grouped["cumulative_metric"] / (grouped["cumulative_metric"] + smoothing_factor)
    grouped["blended_rate"] = (
        grouped["weight"] * grouped["player_rate"] +
        (1 - grouped["weight"]) * league_rate
    )

    return grouped[["id", "blended_rate"]]


def add_blended_conversion_rate(data, metric_type):
    """
    Attach a per-player blended conversion rate column to ``data``.

    The rate is computed by ``calculate_blended_rate`` and joined back on 'id'
    with a left merge, so every input row is kept.

    Args:
        data (pd.DataFrame): Player stats containing 'id' and the cumulative
            columns for the chosen metric.
        metric_type (str): 'goals' (uses cumulative_goals / cumulative_xg and
            adds 'blended_goal_conversion_rate') or 'assists' (uses
            cumulative_assists / cumulative_xa and adds
            'blended_assist_conversion_rate').

    Returns:
        pd.DataFrame: A new DataFrame with the blended rate column added.

    Raises:
        ValueError: If ``metric_type`` is neither 'goals' nor 'assists'.
    """
    # Pick the source columns and the output column name for the metric.
    if metric_type == "goals":
        total_col, metric_col, rate_name = (
            "cumulative_goals", "cumulative_xg", "blended_goal_conversion_rate"
        )
    elif metric_type == "assists":
        total_col, metric_col, rate_name = (
            "cumulative_assists", "cumulative_xa", "blended_assist_conversion_rate"
        )
    else:
        raise ValueError("Invalid metric type. Use 'goals' or 'assists'.")

    player_rates = calculate_blended_rate(
        data, cumulative_total_col=total_col, cumulative_metric_col=metric_col
    ).rename(columns={"blended_rate": rate_name})

    # Left merge keeps every original row and broadcasts the player's rate onto it.
    return data.merge(player_rates, on="id", how="left")

In [4]:
def calculate_points_for_goals(data, prediction_column, output_column):
    """
    Convert predicted goals into points using a per-position goal value.

    Position codes 2, 3 and 4 (defender, midfielder, forward) are worth 6, 5
    and 4 points per goal respectively. Any other code has no mapping and
    produces NaN in the output column.

    Args:
        data (pd.DataFrame): Dataset containing 'POS' and the prediction column.
        prediction_column (str): Column with predicted goals.
        output_column (str): Column that receives the calculated points.

    Returns:
        pd.DataFrame: The same DataFrame object, with ``output_column`` added
        (the input is modified in place).
    """
    goal_points_by_position = {2: 6, 3: 5, 4: 4}  # 2 -> Defender, 3 -> Midfielder, 4 -> Forward

    points_per_goal = data["POS"].map(goal_points_by_position)
    data[output_column] = points_per_goal * data[prediction_column]
    return data

In [5]:
def preprocess_data(data):
    """Perform data preprocessing and feature engineering.

    Steps, in order:
      1. Rename 'id' to 'unique_id'.
      2. Cast 'was_home' to int and derive 'home_crowd_effect' = was_home * crowds.
      3. Sort rows by unique_id, season and gameweek.
      4. One-hot encode POS, home_crowd_effect, unique_id, own_team and
         opponent_team.
      5. Restore the raw 'unique_id' and 'POS' columns, which the encoding
         step consumes, so they remain available downstream.

    The caller's DataFrame is left untouched; all changes are made on a copy.

    Args:
        data (pd.DataFrame): Merged player data containing at least 'id',
            'was_home', 'crowds', 'POS', 'season', 'gameweek', 'own_team'
            and 'opponent_team'.

    Returns:
        pd.DataFrame: A new, sorted and encoded DataFrame.
    """
    # rename() without inplace returns a new frame, so none of the column
    # assignments below leak back into the caller's DataFrame.
    data = data.rename(columns={"id": "unique_id"})

    # Feature engineering
    data["was_home"] = data["was_home"].astype(int)
    data["home_crowd_effect"] = data["was_home"] * data["crowds"]

    # get_dummies drops the columns it encodes, so stash the raw values first.
    data["_unique_id_copy"] = data["unique_id"]
    data["_pos_copy"] = data["POS"]

    # Sort for rolling and cumulative calculations
    data = data.sort_values(by=["unique_id", "season", "gameweek"])

    # One-hot encoding for categorical columns
    dummy_columns = ["POS", "home_crowd_effect", "unique_id", "own_team", "opponent_team"]
    data = pd.get_dummies(data, columns=dummy_columns)

    data["unique_id"] = data["_unique_id_copy"]
    data["POS"] = data["_pos_copy"]
    # NOTE(review): only '_unique_id_copy' is removed; '_pos_copy' stays in the
    # output as before. It may be part of a trained model's feature set, so
    # confirm before dropping it.
    data = data.drop(columns=["_unique_id_copy"])

    return data

In [6]:
def make_predictions(data, model_path, prediction_column):
    """Load a saved XGBoost regressor, predict, and store the result on ``data``.

    The model input is built by reindexing ``data`` to the feature names stored
    in the booster: columns the model does not know are dropped and features
    missing from ``data`` are filled with 0. Negative predictions are clamped
    to 0.

    Args:
        data (pd.DataFrame): Preprocessed data to score.
        model_path (str): Path of the saved model file.
        prediction_column (str): Name of the column that receives predictions.

    Returns:
        pd.DataFrame: The same DataFrame object, with ``prediction_column``
        added (the input is modified in place).
    """
    regressor = xgb.XGBRegressor()
    regressor.load_model(model_path)
    print(f"Loaded model from {model_path}")

    # Align the columns with exactly what the model was trained on.
    feature_names = regressor.get_booster().feature_names
    model_input = data.reindex(columns=feature_names, fill_value=0)

    # Predict and clamp negative values to 0.
    raw_predictions = regressor.predict(model_input)
    data[prediction_column] = np.clip(raw_predictions, a_min=0, a_max=None)
    return data

In [7]:
def save_predictions(data, output_columns, prediction_column, output_file):
    """Pivot predictions to one row per player and write them to a CSV file.

    The selected columns are reshaped so that each player
    (unique_id, first_name, second_name) is one row and each gameweek becomes
    a column named ``gw_<gameweek>_<prediction_column>``. Rows are sorted in
    descending order by the gameweek columns, earliest gameweek first as the
    primary key.

    Args:
        data (pd.DataFrame): Data containing ``output_columns`` and
            ``prediction_column``. It is not modified.
        output_columns (list[str]): Identifier columns to select; must include
            'unique_id', 'first_name', 'second_name' and 'gameweek'.
        prediction_column (str): Column holding the values to export.
        output_file (str): Destination CSV path. Parent directories are
            created when the path has a directory component.

    Raises:
        ValueError: Propagated from ``DataFrame.pivot`` when a player has more
            than one row for the same gameweek.
    """
    selected = data[output_columns + [prediction_column]]
    # assign() builds a new frame, so the gameweek column really becomes
    # integer-typed and the caller's data is never written through a slice.
    selected = selected.assign(gameweek=selected["gameweek"].astype(int))

    wide = selected.pivot(
        index=["unique_id", "first_name", "second_name"],
        columns="gameweek",
        values=prediction_column,
    ).reset_index()

    # Rename gameweek columns for clarity; numpy integers are accepted as well
    # as plain ints, since either can appear as a column label.
    wide.columns = [
        f"gw_{col}_{prediction_column}" if isinstance(col, (int, np.integer)) else col
        for col in wide.columns
    ]

    # Sort by gameweek columns, ordered by gameweek number rather than lexically.
    gameweek_columns = sorted(
        [col for col in wide.columns if col.startswith("gw_")],
        key=lambda name: int(name.split('_')[1])
    )
    wide = wide.sort_values(by=gameweek_columns, ascending=False)

    # Save to file. dirname is '' for a bare file name, and os.makedirs('')
    # raises, so only create a directory when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    wide.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

In [8]:
# --- Configuration: input location, model files and output files ---
data_directory = "Fantasy-Premier-League/data/2024-25"
positions = ["DEF", "MID", "FWD"]
model_path_xg = "models/xgboost_xg_prediction_model.json"
model_path_xa = "models/xgboost_xa_prediction_model.json"
output_file_xg = "predictions/xG_predictions.csv"
output_file_goals = "predictions/goal_predictions.csv"
output_file_goalpoints = "predictions/goal_points.csv"
output_file_xa = "predictions/xA_predictions.csv"
output_file_assists = "predictions/assist_predictions.csv"
output_file_assistpoints = "predictions/assist_points.csv"

# Identifier columns written alongside every exported prediction.
_id_columns = ["unique_id", "first_name", "second_name", "gameweek"]

# --- Load, enrich with blended conversion rates, and preprocess ---
merged_data = load_and_merge_data(data_directory, positions)
merged_data = add_blended_conversion_rate(merged_data, metric_type="goals")
merged_data = add_blended_conversion_rate(merged_data, metric_type="assists")
preprocessed_data = preprocess_data(merged_data)

# --- Goals: predicted xG -> predicted goals -> predicted goal points ---
predicted_data_xg = make_predictions(preprocessed_data, model_path_xg, "predicted_xG")
predicted_data_xg["predicted_goals"] = (
    predicted_data_xg["predicted_xG"] * predicted_data_xg["blended_goal_conversion_rate"]
)
predicted_data_xg = calculate_points_for_goals(
    predicted_data_xg,
    prediction_column="predicted_goals",
    output_column="predicted_goal_points",
)

for _column, _path in [
    ("predicted_xG", output_file_xg),
    ("predicted_goals", output_file_goals),
    ("predicted_goal_points", output_file_goalpoints),
]:
    save_predictions(predicted_data_xg, _id_columns, _column, _path)

# --- Assists: predicted xA -> predicted assists -> predicted assist points ---
predicted_data_xa = make_predictions(preprocessed_data, model_path_xa, "predicted_xA")
predicted_data_xa["predicted_assists"] = (
    predicted_data_xa["predicted_xA"] * predicted_data_xa["blended_assist_conversion_rate"]
)
# Each predicted assist is multiplied by 3 to obtain points.
predicted_data_xa["predicted_assist_points"] = predicted_data_xa["predicted_assists"] * 3

for _column, _path in [
    ("predicted_xA", output_file_xa),
    ("predicted_assists", output_file_assists),
    ("predicted_assist_points", output_file_assistpoints),
]:
    save_predictions(predicted_data_xa, _id_columns, _column, _path)

INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/DEF/DEF_final.csv
INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/MID/MID_final.csv
INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/FWD/FWD_final.csv
Loaded model from models/xgboost_xg_prediction_model.json
Predictions saved to predictions/xG_predictions.csv
Predictions saved to predictions/goal_predictions.csv
Predictions saved to predictions/goal_points.csv
Loaded model from models/xgboost_xa_prediction_model.json
Predictions saved to predictions/xA_predictions.csv
Predictions saved to predictions/assist_predictions.csv
Predictions saved to predictions/assist_points.csv
