In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from RF_Goalkeeper import create_goalkeeper_features, train_and_evaluate_rf
from RF_Defender import create_defender_features, train_and_evaluate_rf
from RF_Midfielder import create_midfielder_features, train_and_evaluate_rf
from RF_Forward import create_forward_features, train_and_evaluate_rf

from ortools.linear_solver import pywraplp

In [2]:
base_path = '../Data/2023-24/gws/gw{}.csv'  # Path template; {} is replaced by the gameweek number

num_gameweeks = 24  # Number of gameweek files to load; adjust based on available data

# Read each gameweek file, tag its rows with the gameweek number, and keep the
# frames in a dict keyed by gameweek so later cells can look up a single week.
gw_data_frames = {}
for gw in range(1, num_gameweeks + 1):
    df = pd.read_csv(base_path.format(gw))
    df['gw'] = gw
    gw_data_frames[gw] = df

# All gameweeks stacked into one long frame with a fresh 0..n-1 index
combined_df = pd.concat(list(gw_data_frames.values()), ignore_index=True)

In [3]:
# Gameweek whose actual points are used as the training target
training_target_gw = 24

# Training features are built with the gameweek *before* the target, so the
# model is not given the points it is asked to predict.
# NOTE(review): assumes create_goalkeeper_features(frames, n) summarises
# gameweeks up to n as one row per player indexed by name -- confirm in RF_Goalkeeper.
goalkeeper_features_train = create_goalkeeper_features(gw_data_frames, training_target_gw - 1)

# Training target: points scored by goalkeepers in the target gameweek
gw_24_data = gw_data_frames[training_target_gw]
gw_24_goalkeeper = gw_24_data.loc[gw_24_data['position'] == 'GK']
y_train = gw_24_goalkeeper[['name', 'total_points']].set_index('name').sort_index()

# Keep only goalkeepers present in both the features and the target
common_indices_train = goalkeeper_features_train.index.intersection(y_train.index)
X_train_filtered = goalkeeper_features_train.loc[common_indices_train]
y_train_filtered = y_train.loc[common_indices_train]

# A name that appears more than once on either side makes .loc return extra
# rows, which would silently misalign X and y; stop with a clear message instead.
if len(X_train_filtered) != len(y_train_filtered):
    raise ValueError(
        "Goalkeeper features and targets are misaligned "
        f"({len(X_train_filtered)} feature rows vs {len(y_train_filtered)} target rows); "
        "check for duplicate player names."
    )

# Fit the Random Forest on (features before the target gameweek) -> (target gameweek points)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_filtered, y_train_filtered.values.ravel())

# Prediction features now include the target gameweek itself; derived from
# training_target_gw rather than a repeated literal so the two cannot drift apart.
goalkeeper_features_up_to_24 = create_goalkeeper_features(gw_data_frames, training_target_gw)

# No target exists for the next gameweek yet, so every goalkeeper in the
# feature set is scored.
X_pred_25 = goalkeeper_features_up_to_24
predictions_25 = rf_model.predict(X_pred_25)

# Keep the player index from the feature frame so predictions stay labelled
goalkeeper_predictions = pd.Series(predictions_25, index=X_pred_25.index, name='Predicted Points for GW 25')

print(goalkeeper_predictions)

Aaron Ramsdale                    0.10
Adam Davies                       0.00
Adrián San Miguel del Castillo    0.00
Alfie Whiteman                    0.00
Alisson Ramses Becker             2.09
                                  ... 
Vincent Angelini                  0.00
Wayne Hennessey                   0.00
Wes Foderingham                   2.87
Zack Steffen                      0.00
Đorđe Petrović                    2.98
Name: Predicted Points for GW 25, Length: 94, dtype: float64


In [4]:
# Gameweek whose actual points are used as the training target
training_target_gw = 24

# Training features are built with the gameweek *before* the target, so the
# model is not given the points it is asked to predict.
# NOTE(review): assumes create_defender_features(frames, n) summarises
# gameweeks up to n as one row per player indexed by name -- confirm in RF_Defender.
defender_features_train = create_defender_features(gw_data_frames, training_target_gw - 1)

# Training target: points scored by defenders in the target gameweek
gw_24_data = gw_data_frames[training_target_gw]
gw_24_defender = gw_24_data.loc[gw_24_data['position'] == 'DEF']
y_train = gw_24_defender[['name', 'total_points']].set_index('name').sort_index()

# Keep only defenders present in both the features and the target
common_indices_train = defender_features_train.index.intersection(y_train.index)
X_train_filtered = defender_features_train.loc[common_indices_train]
y_train_filtered = y_train.loc[common_indices_train]

# A name that appears more than once on either side makes .loc return extra
# rows, which would silently misalign X and y; stop with a clear message instead.
if len(X_train_filtered) != len(y_train_filtered):
    raise ValueError(
        "Defender features and targets are misaligned "
        f"({len(X_train_filtered)} feature rows vs {len(y_train_filtered)} target rows); "
        "check for duplicate player names."
    )

# Fit the Random Forest on (features before the target gameweek) -> (target gameweek points)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_filtered, y_train_filtered.values.ravel())

# Prediction features now include the target gameweek itself; derived from
# training_target_gw rather than a repeated literal so the two cannot drift apart.
defender_features_pred = create_defender_features(gw_data_frames, training_target_gw)

# No target exists for the next gameweek yet, so every defender in the
# feature set is scored.
X_pred_25 = defender_features_pred
predictions_25 = rf_model.predict(X_pred_25)

# Keep the player index from the feature frame so predictions stay labelled
defender_predictions = pd.Series(predictions_25, index=X_pred_25.index, name='Predicted Points for GW 25')

print(defender_predictions)

Aaron Cresswell              0.43000
Aaron Hickey                 0.21000
Aaron Wan-Bissaka            0.49000
Abdul Rahman Baba            0.01253
Adam Smith                   2.52000
                              ...   
Willy Boly                   0.28000
Willy Kambwala               0.12000
Yasser Larouci               0.18000
Yerson Mosquera              0.01253
Álvaro Fernández Carreras    0.01253
Name: Predicted Points for GW 25, Length: 261, dtype: float64


In [5]:
# Gameweek whose actual points are used as the training target
training_target_gw = 24

# Training features are built with the gameweek *before* the target, so the
# model is not given the points it is asked to predict.
# NOTE(review): assumes create_midfielder_features(frames, n) summarises
# gameweeks up to n as one row per player indexed by name -- confirm in RF_Midfielder.
midfielder_features_train = create_midfielder_features(gw_data_frames, training_target_gw - 1)

# Training target: points scored by midfielders in the target gameweek
gw_24_data = gw_data_frames[training_target_gw]
gw_24_midfielder = gw_24_data.loc[gw_24_data['position'] == 'MID']
y_train = gw_24_midfielder[['name', 'total_points']].set_index('name').sort_index()

# Keep only midfielders present in both the features and the target
common_indices_train = midfielder_features_train.index.intersection(y_train.index)
X_train_filtered = midfielder_features_train.loc[common_indices_train]
y_train_filtered = y_train.loc[common_indices_train]

# A name that appears more than once on either side makes .loc return extra
# rows, which would silently misalign X and y; stop with a clear message instead.
if len(X_train_filtered) != len(y_train_filtered):
    raise ValueError(
        "Midfielder features and targets are misaligned "
        f"({len(X_train_filtered)} feature rows vs {len(y_train_filtered)} target rows); "
        "check for duplicate player names."
    )

# Fit the Random Forest on (features before the target gameweek) -> (target gameweek points)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_filtered, y_train_filtered.values.ravel())

# Prediction features now include the target gameweek itself; derived from
# training_target_gw rather than a repeated literal so the two cannot drift apart.
midfielder_features_pred = create_midfielder_features(gw_data_frames, training_target_gw)

# No target exists for the next gameweek yet, so every midfielder in the
# feature set is scored.
X_pred_25 = midfielder_features_pred
predictions_25 = rf_model.predict(X_pred_25)

# Keep the player index from the feature frame so predictions stay labelled
midfielder_predictions = pd.Series(predictions_25, index=X_pred_25.index, name='Predicted Points for GW 25')

print(midfielder_predictions)

Aaron Ramsey                      0.600000
Abdoulaye Doucouré                0.460000
Adam Lallana                      0.570000
Adam Wharton                      2.070000
Adama Traoré                      0.210000
                                    ...   
Yegor Yarmolyuk                   0.610000
Youri Tielemans                   1.240000
Yves Bissouma                     0.980000
Zack Nelson                       0.009534
Ângelo Gabriel Borges Damaceno    0.009534
Name: Predicted Points for GW 25, Length: 357, dtype: float64


In [6]:
# Gameweek whose actual points are used as the training target
training_target_gw = 24

# Training features are built with the gameweek *before* the target, so the
# model is not given the points it is asked to predict.
# NOTE(review): assumes create_forward_features(frames, n) summarises
# gameweeks up to n as one row per player indexed by name -- confirm in RF_Forward.
forward_features_train = create_forward_features(gw_data_frames, training_target_gw - 1)

# Training target: points scored by forwards in the target gameweek
gw_24_data = gw_data_frames[training_target_gw]
gw_24_forward = gw_24_data.loc[gw_24_data['position'] == 'FWD']
y_train = gw_24_forward[['name', 'total_points']].set_index('name').sort_index()

# Keep only forwards present in both the features and the target
common_indices_train = forward_features_train.index.intersection(y_train.index)
X_train_filtered = forward_features_train.loc[common_indices_train]
y_train_filtered = y_train.loc[common_indices_train]

# A name that appears more than once on either side makes .loc return extra
# rows, which would silently misalign X and y; stop with a clear message instead.
if len(X_train_filtered) != len(y_train_filtered):
    raise ValueError(
        "Forward features and targets are misaligned "
        f"({len(X_train_filtered)} feature rows vs {len(y_train_filtered)} target rows); "
        "check for duplicate player names."
    )

# Fit the Random Forest on (features before the target gameweek) -> (target gameweek points)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_filtered, y_train_filtered.values.ravel())

# Prediction features now include the target gameweek itself; derived from
# training_target_gw rather than a repeated literal so the two cannot drift apart.
forward_features_pred = create_forward_features(gw_data_frames, training_target_gw)

# No target exists for the next gameweek yet, so every forward in the
# feature set is scored.
X_pred_25 = forward_features_pred
predictions_25 = rf_model.predict(X_pred_25)

# Keep the player index from the feature frame so predictions stay labelled
forward_predictions = pd.Series(predictions_25, index=X_pred_25.index, name='Predicted Points for GW 25')

print(forward_predictions)

Aaron Connolly              0.027854
Ademola Ola-Adebomi         0.027854
Admiral Muskwe              0.027854
Alejo Véliz                 0.280000
Aleksandar Mitrović         0.000000
                              ...   
William Osula               0.500000
Wout Weghorst               0.027854
Yoane Wissa                 1.570000
Youssef Ramalho Chermiti    0.580000
Zeki Amdouni                2.020000
Name: Predicted Points for GW 25, Length: 105, dtype: float64


In [7]:
# Stack the four per-position prediction Series end to end (GK, DEF, MID, FWD);
# the result is still a Series indexed by player name.
all_predictions = pd.concat(
    [
        goalkeeper_predictions,
        defender_predictions,
        midfielder_predictions,
        forward_predictions,
    ]
)

In [8]:
# Inspect the combined predictions for all four positions
all_predictions

Aaron Ramsdale                    0.100000
Adam Davies                       0.000000
Adrián San Miguel del Castillo    0.000000
Alfie Whiteman                    0.000000
Alisson Ramses Becker             2.090000
                                    ...   
William Osula                     0.500000
Wout Weghorst                     0.027854
Yoane Wissa                       1.570000
Youssef Ramalho Chermiti          0.580000
Zeki Amdouni                      2.020000
Name: Predicted Points for GW 25, Length: 817, dtype: float64

In [9]:
# Move the player-name index into a regular column, then label the two
# resulting columns explicitly.
all_predictions_df = all_predictions.reset_index().set_axis(
    ['Player Name', 'Predicted Points for GW 25'], axis=1
)

In [10]:
# Inspect the predictions as a two-column DataFrame (player name, predicted points)
all_predictions_df

Unnamed: 0,Player Name,Predicted Points for GW 25
0,Aaron Ramsdale,0.100000
1,Adam Davies,0.000000
2,Adrián San Miguel del Castillo,0.000000
3,Alfie Whiteman,0.000000
4,Alisson Ramses Becker,2.090000
...,...,...
812,William Osula,0.500000
813,Wout Weghorst,0.027854
814,Yoane Wissa,1.570000
815,Youssef Ramalho Chermiti,0.580000


In [11]:
# Price, position and team are looked up from the same gameweek that feeds the
# prediction features, so reuse training_target_gw instead of repeating the
# literal 24 (the two values must always agree).
current_gw = training_target_gw
current_gw_data = gw_data_frames[current_gw]

In [12]:
# Inspect the raw player rows of the current gameweek
current_gw_data

Unnamed: 0,name,position,team,xP,assists,bonus,bps,clean_sheets,creativity,element,...,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,gw
0,Femi Seriki,DEF,Sheffield Utd,0.0,0,0,0,0,0.0,653,...,1,0.0,0,-345,147,492,39,False,0,24
1,Josh Brooking,DEF,Chelsea,0.0,0,0,0,0,0.0,723,...,1,0.0,0,12,66,54,40,False,0,24
2,Radek Vítek,GK,Man Utd,0.0,0,0,0,0,0.0,669,...,1,0.0,0,-93,0,93,40,False,0,24
3,Jack Hinshelwood,MID,Brighton,0.8,0,0,0,0,0.0,621,...,2,0.0,0,2415,4343,1928,45,False,0,24
4,Jadon Sancho,MID,Man Utd,0.0,0,0,0,0,0.0,397,...,1,0.0,0,-133,0,133,67,False,0,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,Kyle Walker,DEF,Man City,2.0,0,0,11,0,22.5,369,...,2,0.0,1,-25628,76904,102532,55,True,0,24
810,Jacob Brown,FWD,Luton,0.3,0,0,0,0,0.0,631,...,1,0.0,0,-293,346,639,49,True,0,24
811,Vicente Guaita,GK,Crystal Palace,0.0,0,0,0,0,0.0,227,...,1,0.0,0,-181,0,181,44,True,0,24
812,Braian Ojeda Rodríguez,MID,Nott'm Forest,0.0,0,0,0,0,0.0,459,...,2,0.0,0,0,0,0,45,True,0,24


In [13]:
# Build the name -> attributes lookup once instead of re-indexing the frame for
# every column. Series.map needs a uniquely-indexed lookup, so if a player has
# more than one row in the gameweek file only the last row is kept.
player_lookup = current_gw_data.drop_duplicates(subset='name', keep='last').set_index('name')

# Attach price, position and team to each prediction by player name; players
# that do not appear in the current gameweek file get NaN in all three columns.
for target_column, source_column in [('Value', 'value'), ('Position', 'position'), ('Team', 'team')]:
    all_predictions_df[target_column] = all_predictions_df['Player Name'].map(player_lookup[source_column])

In [14]:
# Inspect the predictions after Value, Position and Team have been added
all_predictions_df

Unnamed: 0,Player Name,Predicted Points for GW 25,Value,Position,Team
0,Aaron Ramsdale,0.100000,45.0,GK,Arsenal
1,Adam Davies,0.000000,40.0,GK,Sheffield Utd
2,Adrián San Miguel del Castillo,0.000000,39.0,GK,Liverpool
3,Alfie Whiteman,0.000000,39.0,GK,Spurs
4,Alisson Ramses Becker,2.090000,57.0,GK,Liverpool
...,...,...,...,...,...
812,William Osula,0.500000,43.0,FWD,Sheffield Utd
813,Wout Weghorst,0.027854,55.0,FWD,Burnley
814,Yoane Wissa,1.570000,56.0,FWD,Brentford
815,Youssef Ramalho Chermiti,0.580000,48.0,FWD,Everton


In [15]:
# Write the enriched predictions to dataset.csv in the working directory, omitting the row index
all_predictions_df.to_csv("dataset.csv", index = False)

In [16]:
# Alias, not a copy: in-place changes made through df also show up in all_predictions_df
df = all_predictions_df

In [62]:
# Alias, not a copy: the column added below is also added to all_predictions_df
df = all_predictions_df

# Number of players of each position the squad must contain
positions = {
    'GK': 2,
    'DEF': 5,
    'MID': 5,
    'FWD': 3
}

# Predicted points per unit of price; NaN for players whose Value is missing
df['Value for Money'] = df['Predicted Points for GW 25'] / df['Value']

# Drop the bottom 5% of players by value for money. Comparisons with NaN are
# False, so players without a price are dropped here as well.
threshold = df['Value for Money'].quantile(0.05)
df_filtered = df[df['Value for Money'] >= threshold]

# Per position, keep at most (required players x multiplier) candidates ranked
# by value for money. With a multiplier of 100 the cap is far above the number
# of players available, so in practice it removes nobody; lower it to shrink
# the optimisation problem.
top_n_multiplier = 100

# Build every per-position slice first and concatenate once, rather than
# growing a DataFrame inside a loop starting from an empty frame.
df_reduced = pd.concat(
    [
        df_filtered[df_filtered['Position'] == position].nlargest(
            positions[position] * top_n_multiplier, 'Value for Money'
        )
        for position in ['GK', 'DEF', 'MID', 'FWD']
    ]
)

len(df_reduced)


813

In [67]:
df = df_reduced

# Squad-selection model: choose 15 players maximising total predicted points
# subject to budget, per-position and per-team limits.

# Raise instead of calling exit(): inside a notebook exit() shuts the kernel down.
solver = pywraplp.Solver.CreateSolver('SCIP')
if not solver:
    raise RuntimeError('SCIP solver not available.')

# SCIP tuning parameters, one "name=value" per line. The string is handed to the
# solver's parameter parser verbatim, so it must not contain inline comments.
# The last entry (thread count) should be adjusted to the CPU cores available.
scip_parameters = "\n".join([
    "heuristics/rens/freq=10",
    "constraints/setppc/upgrade=FALSE",
    "separating/maxrounds=0",
    "presolving/maxrestarts=0",
    "parallel/maxnthreads=4",
])
# The call reports failure through its return value; surface it instead of
# silently solving with default settings.
if not solver.SetSolverSpecificParametersAsString(scip_parameters):
    print('Warning: SCIP rejected one or more solver-specific parameters; continuing without them.')

# One row per player name, first occurrence wins (same row the previous
# `.values[0]` lookups returned). Plain dicts replace a full-frame scan per lookup.
players = df.drop_duplicates(subset='Player Name', keep='first').set_index('Player Name')
points_by_player = players['Predicted Points for GW 25'].to_dict()
value_by_player = players['Value'].to_dict()
position_by_player = players['Position'].to_dict()
team_by_player = players['Team'].to_dict()

# Decision variables: 1 if the player is in the squad, 0 otherwise
player_vars = {name: solver.IntVar(0, 1, f"var_{name}") for name in players.index}

# Objective: maximise total predicted points
objective = solver.Objective()
for name, var in player_vars.items():
    objective.SetCoefficient(var, float(points_by_player[name]))
objective.SetMaximization()

# Budget constraint.
# NOTE(review): Value is presumably in tenths of GBP 1m, making 1000 a 100.0m budget -- confirm.
BUDGET = 1000
solver.Add(solver.Sum([float(value_by_player[name]) * var for name, var in player_vars.items()]) <= BUDGET)

# Squad size
SQUAD_SIZE = 15
solver.Add(solver.Sum(player_vars.values()) == SQUAD_SIZE)

# Group the variables by position and by team in a single pass
vars_by_position = {}
vars_by_team = {}
for name, var in player_vars.items():
    vars_by_position.setdefault(position_by_player[name], []).append(var)
    team = team_by_player[name]
    if pd.notna(team):
        vars_by_team.setdefault(team, []).append(var)

# Position constraints: exact number of players per position
positions = {'GK': 2, 'DEF': 5, 'MID': 5, 'FWD': 3}
for position, required_count in positions.items():
    solver.Add(solver.Sum(vars_by_position.get(position, [])) == required_count)

# Team constraint (max 3 players from the same team)
MAX_PER_TEAM = 3
for team_vars in vars_by_team.values():
    solver.Add(solver.Sum(team_vars) <= MAX_PER_TEAM)

# Solve the problem
status = solver.Solve()

# Defined before the status check so later cells never see a missing name or a
# stale list from an earlier run when no optimal solution is found.
selected_players = []

# Output
if status == pywraplp.Solver.OPTIMAL:
    print('Solution:')
    total_points = 0
    total_value = 0

    for name, var in player_vars.items():
        # Solution values are floats; compare against 0.5 rather than exactly 1
        # so a value such as 0.9999999 still counts as selected.
        if var.solution_value() > 0.5:
            print(f"{name}: Selected")
            total_points += points_by_player[name]
            total_value += value_by_player[name]
            selected_players.append(name)
    print(f"Total Predicted Points: {total_points}")
    print(f"Total Value: {total_value}")
else:
    print('The problem does not have an optimal solution.')

Solution:
Mark Flekken: Selected
Ederson Santana de Moraes: Selected
Gabriel dos Santos Magalhães: Selected
Nathan Aké: Selected
William Saliba: Selected
Fabian Schär: Selected
Virgil van Dijk: Selected
Bruno Guimarães Rodriguez Moura: Selected
Douglas Luiz Soares de Paulo: Selected
Pascal Groß: Selected
Conor Gallagher: Selected
Bukayo Saka: Selected
Rodrigo Muniz Carvalho: Selected
Ivan Toney: Selected
Erling Haaland: Selected
Total Predicted Points: 103.33999999999999
Total Value: 961.0


In [70]:
# Boolean mask marking the rows of the candidate pool that the solver picked
is_selected = df['Player Name'].isin(selected_players)
selected_players_data = df.loc[is_selected]

selected_players_data


Unnamed: 0,Player Name,Predicted Points for GW 25,Value,Position,Team,Value for Money
58,Mark Flekken,3.92,45.0,GK,Brentford,0.087111
22,Ederson Santana de Moraes,3.78,55.0,GK,Man City,0.068727
174,Gabriel dos Santos Magalhães,8.61,50.0,DEF,Arsenal,0.1722
280,Nathan Aké,5.87,50.0,DEF,Man City,0.1174
349,William Saliba,6.4,56.0,DEF,Arsenal,0.114286
169,Fabian Schär,5.72,53.0,DEF,Newcastle,0.107925
345,Virgil van Dijk,5.16,63.0,DEF,Liverpool,0.081905
403,Bruno Guimarães Rodriguez Moura,8.56,58.0,MID,Newcastle,0.147586
442,Douglas Luiz Soares de Paulo,8.04,55.0,MID,Aston Villa,0.146182
638,Pascal Groß,9.26,64.0,MID,Brighton,0.144687


In [81]:
# Pick the starting eleven from the 15-player squad, then captain and vice captain.
points_col = 'Predicted Points for GW 25'

# Exactly one goalkeeper starts: the one with the highest predicted points
best_gk = selected_players_data[selected_players_data['Position'] == 'GK'].nlargest(1, points_col)

outfield = selected_players_data[selected_players_data['Position'] != 'GK']

# A valid FPL line-up needs at least 3 defenders and at least 1 forward, so
# reserve those places first. Taking the ten best outfield players regardless of
# position could otherwise leave fewer than 3 DEF or no FWD in the line-up.
min_starters = {'DEF': 3, 'FWD': 1}
guaranteed = pd.concat(
    [
        outfield[outfield['Position'] == position].nlargest(minimum, points_col)
        for position, minimum in min_starters.items()
    ]
)

# Fill the remaining outfield places with the best of the players not yet chosen
others = outfield[~outfield.index.isin(guaranteed.index)]
fillers = others.nlargest(10 - len(guaranteed), points_col)

# Re-select from `outfield` so the ten starters come out ordered by predicted
# points (highest first), the same ordering a plain top-10 would give.
starter_labels = guaranteed.index.union(fillers.index)
best_ten = outfield[outfield.index.isin(starter_labels)].nlargest(10, points_col)

best_eleven = pd.concat([best_gk, best_ten])

best_eleven["Captain"] = False
best_eleven["Vice Captain"] = False

# Captain: highest predicted points in the eleven
captain_label = best_eleven[points_col].idxmax()
best_eleven.loc[captain_label, 'Captain'] = True

# Vice captain: highest predicted points among everyone except the captain
vice_captain_label = best_eleven.loc[~best_eleven["Captain"], points_col].idxmax()
best_eleven.loc[vice_captain_label, 'Vice Captain'] = True

best_eleven

Unnamed: 0,Player Name,Predicted Points for GW 25,Value,Position,Team,Value for Money,Captain,Vice Captain
58,Mark Flekken,3.92,45.0,GK,Brentford,0.087111,False,False
406,Bukayo Saka,9.88,90.0,MID,Arsenal,0.109778,True,False
638,Pascal Groß,9.26,64.0,MID,Brighton,0.144687,False,True
174,Gabriel dos Santos Magalhães,8.61,50.0,DEF,Arsenal,0.1722,False,False
403,Bruno Guimarães Rodriguez Moura,8.56,58.0,MID,Newcastle,0.147586,False,False
754,Erling Haaland,8.33,143.0,FWD,Man City,0.058252,False,False
442,Douglas Luiz Soares de Paulo,8.04,55.0,MID,Aston Villa,0.146182,False,False
422,Conor Gallagher,7.77,54.0,MID,Chelsea,0.143889,False,False
802,Rodrigo Muniz Carvalho,6.86,44.0,FWD,Fulham,0.155909,False,False
349,William Saliba,6.4,56.0,DEF,Arsenal,0.114286,False,False
