In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from helpers import load_csv, save_csv, log, check_file_exists

In [2]:
def load_and_merge_data(data_directory, positions):
    """Load the per-position "<position>_final.csv" files and stack them into one frame.

    Args:
        data_directory (str): Root directory that contains the "processed_data" folder.
        positions (Iterable[str]): Position folder names (e.g. "DEF"); each one is read
            from "<data_directory>/processed_data/<position>/<position>_final.csv".

    Returns:
        pd.DataFrame: All successfully loaded files concatenated with a fresh
        0..n-1 index, or an empty DataFrame when nothing could be loaded.
    """
    position_frames = []
    for position in positions:
        position_file = os.path.join(
            data_directory, "processed_data", position, f"{position}_final.csv"
        )
        if not check_file_exists(position_file):
            log(f"File {position_file} not found. Skipping.", level="WARNING")
            continue

        position_data = load_csv(position_file)
        if position_data is None:
            # presumably load_csv signals a read failure with None (inferred from
            # the original None check) -- surface it instead of dropping the
            # position silently.
            log(f"Could not load {position_file}. Skipping.", level="WARNING")
            continue

        position_frames.append(position_data)
        log(f"Merged {position_file}")

    # pd.concat raises on an empty list, so keep the empty-frame fallback explicit.
    if not position_frames:
        return pd.DataFrame()

    # One concat at the end avoids re-copying the accumulated frame per position.
    return pd.concat(position_frames, ignore_index=True)

In [3]:
def calculate_blended_conversion_rate(data, k=5):
    """
    Calculate a per-player goals/xG conversion rate shrunk towards the league-wide rate.

    Each player's totals are taken as the maximum of their cumulative columns. The
    player's own rate is weighted by total_xg / (total_xg + k) and the league rate
    by the remainder, so players with little xG stay close to the league rate.

    Args:
        data (pd.DataFrame): The input data containing 'id', 'cumulative_goals', and 'cumulative_xg'.
        k (float): Smoothing factor for blending; larger values pull harder towards the league rate.

    Returns:
        pd.DataFrame: DataFrame with 'id' and 'blended_conversion_rate' (one row per id).
    """
    # Group by player (id) and take the final cumulative values
    grouped = data.groupby("id").agg(
        total_goals=("cumulative_goals", "max"),
        total_xg=("cumulative_xg", "max")
    ).reset_index()

    # Player-level conversion rate. goals / 0 yields +/-inf (not NaN), which
    # fillna alone would miss and which later turns 0 * inf into NaN; treat
    # every undefined ratio as 0.
    player_rate = grouped["total_goals"] / grouped["total_xg"]
    grouped["player_conversion_rate"] = (
        player_rate.replace([np.inf, -np.inf], np.nan).fillna(0)
    )

    # League-wide conversion rate (falls back to 1 when there is no xG at all)
    total_goals = grouped["total_goals"].sum()
    total_xg = grouped["total_xg"].sum()
    league_conversion_rate = total_goals / total_xg if total_xg > 0 else 1

    # Blend using xG as the weight. An undefined weight (missing xG, or
    # xG == 0 together with k == 0) means "no evidence", so rely on the league rate.
    grouped["weight"] = (grouped["total_xg"] / (grouped["total_xg"] + k)).fillna(0)
    grouped["blended_conversion_rate"] = (
        grouped["weight"] * grouped["player_conversion_rate"] +
        (1 - grouped["weight"]) * league_conversion_rate
    )

    return grouped[["id", "blended_conversion_rate"]]

def add_blended_conversion_rate(data):
    """
    Attach each player's blended conversion rate to every one of their rows.

    Args:
        data (pd.DataFrame): Player stats; must contain 'id' plus the columns
            required by calculate_blended_conversion_rate.

    Returns:
        pd.DataFrame: A new frame equal to `data` with a
        'blended_conversion_rate' column joined in on 'id' (left join).
    """
    per_player_rates = calculate_blended_conversion_rate(data)
    # Left join keeps every original row, including ids without a computed rate.
    return data.merge(per_player_rates, how="left", on="id")

In [4]:
def calculate_points_for_goals(data, prediction_column, output_column):
    """
    Convert predicted goals into points using a per-position points-per-goal table.

    Args:
        data (pd.DataFrame): Dataset containing 'POS' and the predicted goals column.
        prediction_column (str): Name of the column holding predicted goals.
        output_column (str): Name of the column that receives the points.

    Returns:
        pd.DataFrame: The same `data` object, with `output_column` written in place.
        Rows whose 'POS' is not in the table get NaN.
    """
    # Points awarded per goal, keyed by position code.
    goal_points_by_position = {
        2: 6,  # Defender
        3: 5,  # Midfielder
        4: 4,  # Forward
    }

    points_per_goal = data["POS"].map(goal_points_by_position)
    data[output_column] = points_per_goal * data[prediction_column]
    return data

In [5]:
def preprocess_data(data):
    """Perform data preprocessing and feature engineering without touching the input frame.

    Steps: rename 'id' to 'unique_id', cast 'was_home' to int, derive
    'home_crowd_effect' (was_home * crowds), sort by player/season/gameweek,
    one-hot encode the categorical columns, then restore plain 'unique_id'
    and 'POS' columns (get_dummies consumes the originals).

    Args:
        data (pd.DataFrame): Must contain 'id', 'was_home', 'crowds', 'POS',
            'season', 'gameweek', 'own_team' and 'opponent_team'.

    Returns:
        pd.DataFrame: A new, sorted frame with the dummy columns plus
        'unique_id' and 'POS'. The caller's frame is left unchanged.
    """
    # rename() returns a new frame, so every assignment below lands on the copy
    # and the caller's DataFrame keeps its 'id' column and original dtypes.
    data = data.rename(columns={"id": "unique_id"})

    # Feature engineering
    data["was_home"] = data["was_home"].astype(int)
    data["home_crowd_effect"] = data["was_home"] * data["crowds"]

    # get_dummies drops the encoded source columns; stash the two that are
    # still needed downstream so they can be restored afterwards.
    data["_unique_id_copy"] = data["unique_id"]
    data["_pos_copy"] = data["POS"]

    # Sort for rolling and cumulative calculations
    data = data.sort_values(by=["unique_id", "season", "gameweek"])

    # One-hot encoding for categorical columns
    dummy_columns = ["POS", "home_crowd_effect", "unique_id", "own_team", "opponent_team"]
    data = pd.get_dummies(data, columns=dummy_columns)

    data["unique_id"] = data["_unique_id_copy"]
    data["POS"] = data["_pos_copy"]

    # NOTE(review): '_pos_copy' is intentionally left in the result, exactly as
    # before -- confirm the trained model does not list it as a feature before
    # dropping it as well.
    return data.drop(columns=["_unique_id_copy"])

In [6]:
def make_predictions(data, model_path, prediction_column):
    """Load a saved XGBoost regressor, predict on `data`, and store the result.

    The input is aligned to the feature names stored in the model: columns the
    model does not know are ignored, and features missing from `data` are
    filled with 0.

    Args:
        data (pd.DataFrame): Preprocessed feature frame.
        model_path (str): Path of the saved model passed to `load_model`.
        prediction_column (str): Name of the column that receives the predictions.

    Returns:
        pd.DataFrame: The same `data` object with `prediction_column` added in place.

    Raises:
        ValueError: If the loaded model carries no feature names, since the
            input columns cannot be aligned to it.
    """
    model = xgb.XGBRegressor()
    model.load_model(model_path)
    print(f"Loaded model from {model_path}")

    # Ensure only trained features are used. Without stored names,
    # reindex(columns=None) would silently pass every column through.
    trained_feature_names = model.get_booster().feature_names
    if not trained_feature_names:
        raise ValueError(
            f"Model at {model_path} has no stored feature names; "
            "cannot align input columns to the trained features."
        )
    prediction_data = data.reindex(columns=trained_feature_names, fill_value=0)

    # Generate predictions; negative outputs are clamped to 0
    predictions = np.clip(model.predict(prediction_data), a_min=0, a_max=None)
    data[prediction_column] = predictions
    return data

In [7]:
def save_predictions(data, output_columns, prediction_column, output_file):
    """Pivot per-gameweek predictions into one row per player and write them to CSV.

    Args:
        data (pd.DataFrame): Frame holding the columns in `output_columns`
            and `prediction_column`.
        output_columns (list[str]): Columns to carry over; must include
            'unique_id', 'first_name', 'second_name' and 'gameweek'.
        prediction_column (str): Column whose values fill the pivoted table.
        output_file (str): Destination CSV path. Its parent directory is
            created when the path has one.

    Returns:
        None. The result is a CSV with the three identifier columns followed by
        one "gw_<n>_<prediction_column>" column per gameweek, rows sorted by the
        gameweek columns in descending order.
    """
    # assign() builds a new frame, so the integer cast really changes the dtype.
    # A `.loc[:, "gameweek"] = ...` write on a column subset can keep the old
    # dtype and raises SettingWithCopyWarning.
    output_data = data[output_columns + [prediction_column]].assign(
        gameweek=lambda frame: frame["gameweek"].astype(int)
    )
    output_data = output_data.pivot(
        index=["unique_id", "first_name", "second_name"],
        columns="gameweek",
        values=prediction_column,
    ).reset_index()

    # Rename gameweek columns for clarity; accept numpy integers as well as
    # Python ints, since the labels come from a pandas index.
    output_data.columns = [
        f"gw_{col}_{prediction_column}" if isinstance(col, (int, np.integer)) else col
        for col in output_data.columns
    ]

    # Sort by gameweek columns, in numeric gameweek order
    gameweek_columns = sorted(
        [col for col in output_data.columns if col.startswith("gw_")],
        key=lambda x: int(x.split('_')[1])  # Extract the gameweek number for proper sorting
    )
    if gameweek_columns:
        output_data = output_data.sort_values(by=gameweek_columns, ascending=False)

    # Save to file. os.makedirs("") raises, so only create a directory when
    # the path actually has one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_data.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

In [11]:
# --- Configuration -----------------------------------------------------------
data_directory = "Fantasy-Premier-League/data/2024-25"
positions = ["DEF", "MID", "FWD"]
model_path = "models/xgboost_xg_prediction_model.json"
prediction_column = "predicted_xG"

output_file_xg = "predictions/xG_predictions.csv"
output_file_goals = "predictions/goal_predictions.csv"
output_file_goalpoints = "predictions/goal_points.csv"

# --- Build the feature frame -------------------------------------------------
# Load every position file, then attach the per-player blended conversion rate.
merged_data = load_and_merge_data(data_directory, positions)
merged_data = add_blended_conversion_rate(merged_data)
preprocessed_data = preprocess_data(merged_data)

# --- Predict xG, then derive goals and points --------------------------------
predicted_data = make_predictions(preprocessed_data, model_path, prediction_column)

# Expected goals scaled by how well the player converts chances.
predicted_data["predicted_goals"] = predicted_data[prediction_column].mul(
    predicted_data["blended_conversion_rate"]
)

predicted_data = calculate_points_for_goals(
    predicted_data,
    prediction_column="predicted_goals",
    output_column="predicted_points",
)

# --- Write one CSV per predicted quantity ------------------------------------
identifier_columns = ["unique_id", "first_name", "second_name", "gameweek"]
outputs = (
    ("predicted_xG", output_file_xg),
    ("predicted_goals", output_file_goals),
    ("predicted_points", output_file_goalpoints),
)
for value_column, destination in outputs:
    save_predictions(predicted_data, identifier_columns, value_column, destination)

INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/DEF/DEF_final.csv
INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/MID/MID_final.csv
INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/FWD/FWD_final.csv
Loaded model from models/xgboost_xg_prediction_model.json
Predictions saved to predictions/xG_predictions.csv
Predictions saved to predictions/goal_predictions.csv
Predictions saved to predictions/goal_points.csv
