In [1]:
!pip install pulp




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import joblib
import pandas as pd
from pathlib import Path
from pulp import LpProblem, LpMaximize, LpVariable, lpSum, PULP_CBC_CMD
import xgboost as xgb
import lightgbm as lgb
import numpy as np

In [3]:
BASE_DIR = Path("C:/Users/User/Desktop/end-to-end_ml_project/FPL_best_xi")
MODELS_DIR = BASE_DIR / "src/models"
DATA_PATH = BASE_DIR / "data/raw/players_current.csv"

xgb_model = joblib.load(MODELS_DIR / "xgb_model.joblib")
preproc   = joblib.load(MODELS_DIR / "preprocessor.joblib")
players   = pd.read_csv(DATA_PATH)

In [4]:
import os

In [5]:
print(f" Loaded {len(players)} players from API data")

 Loaded 747 players from API data


In [6]:
players = players.rename(columns={"web_name": "name"})
players["season"] = "2024-25"
players["gameweek"] = 11

for col in ["total_points", "minutes", "goals_scored", "assists", "clean_sheets"]:
    if col not in players.columns:
        players[col] = 0

In [7]:
print("ðŸ”§ Aligning live dataset schema...")

# Numeric encodings
players["season_code"] = 0
players["pos_code"] = players["element_type"].map({1: 0, 2: 1, 3: 2, 4: 3}).fillna(2).astype(int)
players["value"] = players["now_cost"] / 10.0

# Selected %
if "selected_by_percent" in players.columns:
    players["selected"] = (
        players["selected_by_percent"]
        .astype(str)
        .str.replace("%", "", regex=False)
        .replace(["nan", "None", ""], "0")
        .astype(float)
        .fillna(0)
    )
else:
    players["selected"] = 0.0

# Position names for categorical encoding
pos_map = {1: "GKP", 2: "DEF", 3: "MID", 4: "FWD"}
players["position"] = players["element_type"].map(pos_map)

print(" Schema alignment complete.")

ðŸ”§ Aligning live dataset schema...
 Schema alignment complete.


In [8]:
print("Cleaning and aligning before transform...")

# Get expected feature names from preprocessor
if hasattr(preproc, "feature_names_in_"):
    expected_cols = list(preproc.feature_names_in_)
    print(f"Model expects {len(expected_cols)} columns.")
else:
    expected_cols = players.select_dtypes(include=[np.number]).columns.tolist()
    print("No feature_names_in_ found, using numeric columns only.")

# Filter to only expected model columns
players_for_model = players.reindex(columns=expected_cols, fill_value=0)

# Coerce all to numeric safely
for c in players_for_model.columns:
    players_for_model[c] = pd.to_numeric(players_for_model[c], errors="coerce").fillna(0)

# Replace infinities
players_for_model = players_for_model.replace([np.inf, -np.inf], 0).fillna(0)

print("Clean dataframe ready for model transform.")

Cleaning and aligning before transform...
Model expects 36 columns.
Clean dataframe ready for model transform.


In [9]:
print(preproc.feature_names_in_)

['position' 'team' 'minutes' 'goals_scored' 'assists' 'clean_sheets'
 'goals_conceded' 'saves' 'yellow_cards' 'red_cards' 'bonus' 'bps' 'value'
 'selected' 'transfers_in' 'transfers_out' 'team_code' 'pos_code'
 'season_code' 'total_points_roll3' 'total_points_roll5'
 'total_points_roll10' 'minutes_roll3' 'minutes_roll5' 'minutes_roll10'
 'goals_scored_roll3' 'goals_scored_roll5' 'goals_scored_roll10'
 'assists_roll3' 'assists_roll5' 'assists_roll10' 'clean_sheets_roll3'
 'clean_sheets_roll5' 'clean_sheets_roll10' 'last_points' 'form_points']


In [10]:
print(" Predicting expected points (safe version)...")

EXPECTED_FEATURES = [
    'position', 'team', 'minutes', 'goals_scored', 'assists', 'clean_sheets',
    'goals_conceded', 'saves', 'yellow_cards', 'red_cards', 'bonus', 'bps',
    'value', 'selected', 'transfers_in', 'transfers_out', 'team_code',
    'pos_code', 'season_code',
    'total_points_roll3', 'total_points_roll5', 'total_points_roll10',
    'minutes_roll3', 'minutes_roll5', 'minutes_roll10',
    'goals_scored_roll3', 'goals_scored_roll5', 'goals_scored_roll10',
    'assists_roll3', 'assists_roll5', 'assists_roll10',
    'clean_sheets_roll3', 'clean_sheets_roll5', 'clean_sheets_roll10',
    'last_points', 'form_points'
]

# ðŸ”§ ensure all expected columns exist
for col in EXPECTED_FEATURES:
    if col not in players.columns:
        players[col] = 0

# ðŸ”§ select only those columns, in exact order
players_for_model = players[EXPECTED_FEATURES].copy()

# ðŸ”§ force numeric dtype
players_for_model = players_for_model.apply(pd.to_numeric, errors='coerce').fillna(0)

# ðŸ”§ run transform + predict
X_curr = preproc.transform(players_for_model)
dtest = xgb.DMatrix(X_curr)
players['pred_points'] = xgb_model.predict(dtest)

print(" Predictions complete!")


 Predicting expected points (safe version)...


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
print(" Running team optimizer...")

player_ids = players.index.tolist()
prob = LpProblem("FPL_Best_XI", LpMaximize)

# Variables
in_squad = LpVariable.dicts("in_squad", player_ids, cat="Binary")
starter = LpVariable.dicts("starter", player_ids, cat="Binary")
captain = LpVariable.dicts("captain", player_ids, cat="Binary")

# Objective
prob += lpSum(starter[i] * players.loc[i, "pred_points"] for i in player_ids) \
        + lpSum(captain[i] * players.loc[i, "pred_points"] for i in player_ids)

# Constraints
prob += lpSum(in_squad[i] * (players.loc[i, "now_cost"] / 10.0) for i in player_ids) <= 100.0
prob += lpSum(in_squad[i] for i in player_ids) == 15
prob += lpSum(starter[i] for i in player_ids) == 11

# Formation rules
for pos, min_req in {"GKP": 1, "DEF": 3, "MID": 2, "FWD": 1}.items():
    prob += lpSum(starter[i] for i in player_ids if players.loc[i, "position"] == pos) >= min_req

# Squad composition
prob += lpSum(in_squad[i] for i in player_ids if players.loc[i, "position"] == "GKP") == 2
prob += lpSum(in_squad[i] for i in player_ids if players.loc[i, "position"] == "DEF") == 5
prob += lpSum(in_squad[i] for i in player_ids if players.loc[i, "position"] == "MID") == 5
prob += lpSum(in_squad[i] for i in player_ids if players.loc[i, "position"] == "FWD") == 3

# Subset + captain constraint
for i in player_ids:
    prob += starter[i] <= in_squad[i]
    prob += captain[i] <= starter[i]
prob += lpSum(captain[i] for i in player_ids) == 1

# Max 3 from one team
for t in players["team"].unique():
    prob += lpSum(in_squad[i] for i in player_ids if players.loc[i, "team"] == t) <= 3

# Solve
prob.solve(PULP_CBC_CMD(msg=False))
