In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from helpers import combine_position_data, log, load_csv, save_csv, calculate_season_average_until_gw
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE
from sklearn.ensemble import VotingRegressor
import xgboost as xgb
from itertools import combinations
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error

In [5]:
data_directory = "Fantasy-Premier-League/data"
seasons = ["2022-23", "2023-24", "2024-25"]
positions = ["MID"]
output_file_name = "mid_training_data.csv"

log("Combining position data...", level="INFO")
combine_position_data(data_directory, seasons, positions, output_file_name)

INFO: Combining position data...
INFO: Loaded data from MID_players.csv for season 2022-23.
INFO: Loaded data from MID_players.csv for season 2023-24.
INFO: Loaded data from MID_players.csv for season 2024-25.
INFO: Combined data saved to Fantasy-Premier-League/data/training_data/mid_training_data.csv.


In [17]:
training_data_dir = os.path.join(data_directory, "training_data")
training_file = os.path.join(training_data_dir, output_file_name)
mid_data = load_csv(training_file)
if mid_data is None:
    log("Failed to load training data. Exiting.", level="ERROR")
    exit()

# ========================
# Feature Engineering
# ========================
mid_data = mid_data[mid_data["minutes"] > 50]
mid_data['was_home'] = mid_data['was_home'].astype(int)
mid_data["def_atk_diff"] = mid_data["own_defense"]-mid_data["opponent_attack"]
mid_data["atk_def_diff"] = mid_data["own_attack"]-mid_data["opponent_defense"]
mid_data = mid_data.sort_values(by=["unique_id", "season", "gameweek"]).reset_index(drop=True)

rolling_periods = [15]
base_features = ["expected_assists", "expected_goals", "ict_index", "team_deep", "shots", "key_passes"]

for period in rolling_periods:
    mid_data[f"rolling_total_points_{period}"] = (
        mid_data.groupby("unique_id")["total_points"]
        .shift(1)
        .rolling(window=period, min_periods=1)
        .mean()
        .reset_index(level=0, drop=True)
    )

# Usuwamy wszystkie inne cechy i zostawiamy tylko rolling features dla punktów
features = [f"rolling_total_points_{period}" for period in rolling_periods]
target = "total_points"

# ========================
# Train-Test Split
# ========================
X = mid_data[features]
y = mid_data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost Model
model = CatBoostRegressor(
    iterations=200,  # Liczba iteracji
    learning_rate=0.1,  # Współczynnik uczenia
    depth=6,  # Głębokość drzewa
    random_seed=42,
    verbose=50
)
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Evaluate Model
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Feature Importance Plot
xgb.plot_importance(model, max_num_features=10)
plt.title("Feature Importance for Midfielders")
plt.show()

# Save Model
# ========================
models_folder = "models"
os.makedirs(models_folder, exist_ok=True)
model_path = os.path.join(models_folder, "mid_prediction_model.json")
model.save_model(model_path)
log(f"Model saved at: {model_path}", level="INFO")

# ========================
# Optional Hyperparameter Tuning
# ========================
def hyperparameter_tuning(X_train, y_train):
    from sklearn.model_selection import GridSearchCV

    log("Starting hyperparameter tuning...", level="INFO")
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    model = XGBRegressor(random_stateexpected_goals_conceded=42)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)

    log(f"Best parameters found: {grid_search.best_params_}", level="INFO")
    log(f"Best RMSE: {-grid_search.best_score_}", level="INFO")
    return grid_search.best_estimator_

# Uncomment to perform tuning
# best_model = hyperparameter_tuning(X_train, y_train)

0:	learn: 3.1208696	total: 871us	remaining: 173ms
50:	learn: 3.0369595	total: 36.5ms	remaining: 107ms
100:	learn: 3.0275024	total: 73ms	remaining: 71.5ms
150:	learn: 3.0167683	total: 109ms	remaining: 35.4ms
199:	learn: 3.0090743	total: 143ms	remaining: 0us
RMSE: 3.1624
R²: 0.0667
Mean Absolute Error (MAE): 2.2327


ValueError: tree must be Booster, XGBModel or dict instance