In [1]:
import pandas as pd
import os
import xgboost as xgb

In [16]:
# Locations of the per-position inputs, the combined CSV, and the saved model.
data_directory = "Fantasy-Premier-League/data/2024-25"
output_file = os.path.join(data_directory, "DEF_MID_FWD_combined.csv")
model_path = "models/xgboost_xg_prediction_model.json"

positions = ["DEF", "MID", "FWD"]

# Collect every position file that exists and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previously
# loaded rows on each pass and starts from an empty frame.
position_frames = []
for position in positions:
    position_file = os.path.join(data_directory, position, f"{position}_merged_with_fixtures.csv")
    if os.path.exists(position_file):
        position_frames.append(pd.read_csv(position_file))
        print(f"Merged {position_file}")
    else:
        print(f"File {position_file} not found. Skipping.")

# Fail here with a clear message rather than writing an empty CSV and hitting
# an unrelated KeyError further down when the expected columns are missing.
if not position_frames:
    raise FileNotFoundError(
        f"No '<POS>_merged_with_fixtures.csv' files found under {data_directory} "
        f"for positions {positions}."
    )

merged_data = pd.concat(position_frames, ignore_index=True)

merged_data.to_csv(output_file, index=False)
print(f"Combined data saved to {output_file}")

# Identifier columns that are not model features; they are kept so the
# predictions can be labelled when the output table is built.
output_columns = ["unique_id", "first_name", "second_name", "gameweek"]

# Align the raw column names with the names used downstream.
merged_data = merged_data.rename(columns={"mean_xg_5": "rolling_xg_5", "id": "unique_id"})

# Interaction feature: the crowd value only counts for home fixtures.
merged_data["was_home"] = merged_data["was_home"].astype(int)
merged_data["home_crowd_effect"] = merged_data["was_home"] * merged_data["crowds"]
merged_data = merged_data.sort_values(by=["unique_id", "season", "gameweek"])

# Convert categorical columns to dummies.
# drop_first is deliberately NOT used here: it drops the first category present
# in *this* data, which need not be the category dropped when the model was
# trained (e.g. with a single season loaded, the only season_* column would be
# removed and later zero-filled). Every category is encoded instead, and the
# reindex against the model's feature names keeps exactly the trained columns.
# NOTE(review): assumes training used the same "POS_"/"season_" dummy naming — confirm.
merged_data = pd.get_dummies(merged_data, columns=["POS", "season"], drop_first=False)

# Load the trained regressor from disk.
model = xgb.XGBRegressor()
model.load_model(model_path)
print(f"Loaded model from {model_path}")

# The booster's stored feature names define the columns (and order) to predict on.
trained_feature_names = model.get_booster().feature_names
if trained_feature_names is None:
    # Without names, reindex(columns=None) would pass every column (including
    # text identifiers) to the model instead of the trained feature set.
    raise ValueError(
        f"Model at {model_path} has no stored feature names; "
        "cannot align prediction columns with the trained features."
    )

# Surface trained features that are absent from the data: reindex fills them
# with 0 without any notice, which can hide a renamed or missing input column.
# Dummy columns for categories not present in this data are expected here.
missing_features = [name for name in trained_feature_names if name not in merged_data.columns]
if missing_features:
    print(f"Features missing from data, filled with 0: {missing_features}")

# Ensure only trained features are used for prediction
prediction_data = merged_data.reindex(columns=trained_feature_names, fill_value=0)

# Generate predictions; the array is assigned positionally, row for row.
predictions = model.predict(prediction_data)
merged_data["predicted_xG"] = predictions

# Build a wide table: one row per player, one predicted-xG column per gameweek.
id_columns = ["unique_id", "first_name", "second_name"]

# .assign returns a new frame, so the integer cast actually changes the dtype
# and never writes through a slice of merged_data (no SettingWithCopyWarning).
output_data = merged_data[output_columns + ["predicted_xG"]].assign(
    gameweek=lambda frame: frame["gameweek"].astype(int)
)

# pivot raises ValueError when a player has more than one row in a gameweek.
# Sum those rows into a single gameweek value; data without duplicates takes
# the same path as before.
# NOTE(review): summing assumes duplicate rows are separate fixtures in the
# same gameweek — confirm against the source data.
key_columns = id_columns + ["gameweek"]
if output_data.duplicated(subset=key_columns).any():
    output_data = output_data.groupby(key_columns, as_index=False, dropna=False)["predicted_xG"].sum()

output_data = output_data.pivot(index=id_columns, columns="gameweek", values="predicted_xG")

# Rename the gameweek columns while they are still the only columns, in
# ascending gameweek order. This avoids type-sniffing a mixed column index,
# which misses NumPy integer or float labels.
output_data = output_data.sort_index(axis=1)
gameweek_columns = [f"gw_{int(gameweek)}_xG" for gameweek in output_data.columns]
output_data.columns = gameweek_columns
output_data = output_data.reset_index()

# Sort players by predicted xG, earliest gameweek first as the primary key.
output_data = output_data.sort_values(by=gameweek_columns, ascending=False)


# Write the wide predictions table to predictions/xG_predictions.csv,
# creating the target folder first if it does not exist yet.
predictions_folder = "predictions"
prediction_output_file = os.path.join(predictions_folder, "xG_predictions.csv")

os.makedirs(predictions_folder, exist_ok=True)
output_data.to_csv(path_or_buf=prediction_output_file, index=False)

print(f"Predictions saved to {prediction_output_file}")

Merged Fantasy-Premier-League/data/2024-25/DEF/DEF_merged_with_fixtures.csv
Merged Fantasy-Premier-League/data/2024-25/MID/MID_merged_with_fixtures.csv
Merged Fantasy-Premier-League/data/2024-25/FWD/FWD_merged_with_fixtures.csv
Combined data saved to Fantasy-Premier-League/data/2024-25/DEF_MID_FWD_combined.csv
Loaded model from models/xgboost_xg_prediction_model.json
Predictions saved to predictions/xG_predictions.csv
