In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from helpers import load_csv, save_csv, log, check_file_exists

In [2]:
def load_and_merge_data(data_directory, positions):
    """Load the per-position CSV files and stack them into one DataFrame.

    For every entry in ``positions`` the file
    ``<data_directory>/processed_data/<position>/<position>_final.csv`` is
    looked up. Files that do not exist, or for which ``load_csv`` returns
    ``None``, are skipped with a warning in the log.

    Args:
        data_directory: Root directory that contains ``processed_data``.
        positions: Iterable of position codes (e.g. ``["DEF", "MID"]``).

    Returns:
        A DataFrame with the rows of all loaded files and a fresh 0..n-1
        index, or an empty DataFrame when nothing could be loaded.
    """
    frames = []
    for position in positions:
        position_file = os.path.join(data_directory, "processed_data", position, f"{position}_final.csv")
        if not check_file_exists(position_file):
            log(f"File {position_file} not found. Skipping.", level="WARNING")
            continue

        position_data = load_csv(position_file)
        if position_data is None:
            # Previously this case was skipped without any trace in the log.
            log(f"Could not load {position_file}. Skipping.", level="WARNING")
            continue

        frames.append(position_data)
        log(f"Merged {position_file}")

    # pd.concat raises on an empty list, so keep the "nothing loaded" contract explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)

In [3]:
def preprocess_data(data):
    """Build the model-ready feature frame without modifying ``data``.

    Steps:
      1. Rename the ``id`` column to ``unique_id``.
      2. Cast ``was_home`` to int and derive
         ``home_crowd_effect = was_home * crowds``.
      3. Sort rows by ``unique_id``, ``season`` and ``gameweek``.
      4. One-hot encode ``POS``, ``home_crowd_effect``, ``unique_id``,
         ``own_team`` and ``opponent_team``.
      5. Re-attach the plain ``unique_id`` column (as the last column) so the
         rows can still be identified after encoding.

    Args:
        data: DataFrame containing at least the columns ``id`` (or
            ``unique_id``), ``was_home``, ``crowds``, ``season``,
            ``gameweek``, ``POS``, ``own_team`` and ``opponent_team``.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    # NOTE(review): home_crowd_effect is numeric yet it is one-hot encoded;
    # presumably this mirrors how the model was trained -- confirm before changing.
    dummy_columns = ["POS", "home_crowd_effect", "unique_id", "own_team", "opponent_team"]

    was_home = data["was_home"].astype(int)
    features = (
        data.rename(columns={"id": "unique_id"})  # returns a copy, caller's frame keeps "id"
        .assign(was_home=was_home, home_crowd_effect=was_home * data["crowds"])
        .sort_values(by=["unique_id", "season", "gameweek"])
    )

    # get_dummies drops the encoded source columns, so keep the ids aside first.
    unique_ids = features["unique_id"]
    features = pd.get_dummies(features, columns=dummy_columns)
    features["unique_id"] = unique_ids

    return features

In [4]:
def make_predictions(data, model_path, prediction_column):
    """Score ``data`` with a saved XGBoost regressor.

    The model stored at ``model_path`` is loaded, ``data`` is reindexed to the
    feature names recorded in the model's booster (columns missing from
    ``data`` are filled with 0, extra columns are ignored for scoring), and
    the predictions -- with negative values raised to 0 -- are written to
    ``data[prediction_column]``.

    Args:
        data: Feature DataFrame; it is modified in place.
        model_path: Path of the saved XGBoost model file.
        prediction_column: Name of the column that receives the predictions.

    Returns:
        The same ``data`` object, now including ``prediction_column``.
    """
    regressor = xgb.XGBRegressor()
    regressor.load_model(model_path)
    print(f"Loaded model from {model_path}")

    # Align the frame with exactly the columns the model was fitted on.
    feature_names = regressor.get_booster().feature_names
    model_input = data.reindex(columns=feature_names, fill_value=0)

    # Negative outputs are floored at zero before being stored.
    raw_predictions = regressor.predict(model_input)
    data[prediction_column] = np.clip(raw_predictions, 0, None)
    return data

In [5]:
def save_predictions(data, output_columns, prediction_column, output_file):
    """Pivot the predictions to one row per player and write them to CSV.

    The selected columns are reshaped so that each
    (``unique_id``, ``first_name``, ``second_name``) combination becomes one
    row and each gameweek becomes a column named
    ``gw_<gameweek>_<prediction_column>``. Rows are sorted in descending order
    by the gameweek columns (earliest gameweek has the highest priority).

    Args:
        data: DataFrame holding the identifier columns, ``gameweek`` and
            ``prediction_column``. It is not modified.
        output_columns: Columns to keep besides ``prediction_column``; must
            include ``unique_id``, ``first_name``, ``second_name`` and
            ``gameweek``.
        prediction_column: Name of the column with the predicted values.
        output_file: Destination CSV path. Its parent directory is created
            when missing; a bare file name writes to the current directory.
    """
    selected = data[output_columns + [prediction_column]]
    # assign() replaces the column, so the integer dtype really sticks.
    # Writing through .loc[:, "gameweek"] keeps a float column float on pandas 2.x.
    selected = selected.assign(gameweek=selected["gameweek"].astype(int))

    wide = selected.pivot(
        index=["unique_id", "first_name", "second_name"],
        columns="gameweek",
        values=prediction_column,
    )

    # Rename while the columns still hold only gameweek numbers; this avoids
    # guessing afterwards which labels are integers (NumPy ints are not `int`).
    gameweek_columns = [f"gw_{gameweek}_{prediction_column}" for gameweek in sorted(wide.columns)]
    wide.columns = [f"gw_{gameweek}_{prediction_column}" for gameweek in wide.columns]
    output_data = wide.reset_index()

    if gameweek_columns:
        output_data = output_data.sort_values(by=gameweek_columns, ascending=False)

    # os.makedirs("") raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_data.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

In [6]:
# Configuration for the xG prediction run.
data_directory = "Fantasy-Premier-League/data/2024-25"
positions = ["DEF", "MID", "FWD"]
model_path = "models/xgboost_xg_prediction_model.json"
output_file_g = "predictions/xG_predictions.csv"
prediction_column = "predicted_xG"
# Identifier columns written next to the predictions.
output_columns = ["unique_id", "first_name", "second_name", "gameweek"]

# Load and merge data
merged_data = load_and_merge_data(data_directory, positions)
if merged_data.empty:
    # Fail here with a clear message instead of a KeyError deep inside preprocessing.
    raise ValueError(f"No data loaded from {data_directory} for positions {positions}")

# Preprocess the data
preprocessed_data = preprocess_data(merged_data)

# Make predictions
predicted_data = make_predictions(preprocessed_data, model_path, prediction_column)

# Save the predictions (reuse prediction_column so the two names cannot drift apart)
save_predictions(predicted_data, output_columns, prediction_column, output_file_g)

INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/DEF/DEF_final.csv
INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/MID/MID_final.csv
INFO: Merged Fantasy-Premier-League/data/2024-25/processed_data/FWD/FWD_final.csv
Loaded model from models/xgboost_xg_prediction_model.json
Predictions saved to predictions/xG_predictions.csv
