## Imports

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ML Helpers/Metrics
import pickle
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

## Load 2023 Targets

In [2]:
# Read the feature/target table from disk into the notebook-wide frame `df`.
df = pd.read_csv("output/data/ml_soccer_targets.csv")
# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)
# Trailing bare expression: the notebook renders the first five rows.
df.head()

(707, 58)


Unnamed: 0,season_name,player_id,player_name,minutes_played,offensive_goals_added,offensive_goals_added_90,player_age,guaranteed_compensation_1,minutes_played_1,share_team_touches_1,attempted_passes_1,completed_passes_1,total_distance_yds_1,total_vertical_distance_yds_1,xcompleted_passes_1,goals_added_above_avg_Dribbling_1,goals_added_above_avg_Fouling_1,goals_added_above_avg_Interrupting_1,goals_added_above_avg_Passing_1,goals_added_above_avg_Receiving_1,goals_added_above_avg_Shooting_1,guaranteed_compensation_2,minutes_played_2,share_team_touches_2,attempted_passes_2,completed_passes_2,total_distance_yds_2,total_vertical_distance_yds_2,xcompleted_passes_2,goals_added_above_avg_Dribbling_2,goals_added_above_avg_Fouling_2,goals_added_above_avg_Interrupting_2,goals_added_above_avg_Passing_2,goals_added_above_avg_Receiving_2,goals_added_above_avg_Shooting_2,guaranteed_compensation_3,minutes_played_3,share_team_touches_3,attempted_passes_3,completed_passes_3,total_distance_yds_3,total_vertical_distance_yds_3,xcompleted_passes_3,goals_added_above_avg_Dribbling_3,goals_added_above_avg_Fouling_3,goals_added_above_avg_Interrupting_3,goals_added_above_avg_Passing_3,goals_added_above_avg_Receiving_3,goals_added_above_avg_Shooting_3,played_two_years_ago,played_three_years_ago,primary_position_AM,primary_position_CB,primary_position_CM,primary_position_DM,primary_position_FB,primary_position_ST,primary_position_W
0,2023,0Oq630dXQ6,Kevin Cabral,,,,23.0,1650000.0,1896.0,0.035077,550.0,423.0,8499.1957,-1016.9031,432.3451,-0.7633,0.3519,-0.0297,-0.9557,0.4421,0.0378,1440000.0,2182.0,0.047993,583.0,446.0,9341.4938,-1334.5887,477.6853,-0.5895,0.2924,-0.5129,-1.1422,0.9445,0.1802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False,False,False,False,False,True
1,2023,0Oq632k7Q6,Danny Leyva,,,,19.0,129423.0,1046.0,0.064742,678.0,579.0,12537.3004,3213.688,568.4282,-0.1141,0.0084,0.1877,-0.0573,-0.2458,-0.1194,100798.0,989.0,0.046333,532.0,440.0,9292.0537,2135.7791,435.4407,-0.3145,-0.022491,-0.0111,-0.3453,-0.3065,-0.1386,83052.0,68.0,0.0333,39.0,36.0,625.4843,64.6718,33.8605,-0.0292,0.003,-0.0272,-0.0269,-0.0043,-0.0126,True,True,False,False,False,True,False,False,False
2,2023,0Oq633dAQ6,Griffin Yow,,,,20.0,136118.0,307.0,0.027614,58.0,42.0,842.7285,-188.5922,43.0811,-0.0688,0.2766,0.0214,-0.177,-0.1559,-0.0243,116118.0,290.0,0.026645,100.0,58.0,2021.9371,472.1529,68.2641,0.1141,-0.0039,0.0577,0.0634,-0.0889,0.0505,96118.0,448.0,0.025842,92.0,68.0,1589.4724,-164.4759,70.9204,0.0205,-0.0109,0.0289,-0.1898,-0.2867,-0.1032,True,True,True,False,False,False,False,False,False
3,2023,0Oq63JA7Q6,Wyatt Omsberg,,,,27.0,157000.0,1376.0,0.095829,611.0,445.0,15899.5392,7463.2308,457.049,-0.001392,0.1109,-0.5812,-0.1617,0.3337,0.0691,85444.0,840.0,0.041811,372.0,288.0,8592.6451,2899.8791,286.225,-0.0906,-0.038999,-0.5106,-0.0764,-0.061,-0.0469,81375.0,215.0,0.0509,134.0,101.0,3318.9127,1003.8907,107.1716,0.0152,0.0208,0.2268,-0.0025,-0.0487,-0.024,True,True,False,True,False,False,False,False,False
4,2023,0Oq63P32Q6,Nkosi Burgess,,,,25.0,197950.0,1523.0,0.058337,867.0,739.0,20794.2994,8484.7484,739.3657,0.0667,-0.0077,0.1025,0.42,-0.2575,-0.1205,66724.0,2009.0,0.091764,1049.0,872.0,26608.9496,10418.1654,879.8107,0.0459,0.2026,-0.0677,0.0285,-0.3093,-0.1185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,True,False,False,False,False,False


In [3]:
# Load Models
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must only ever come from a trusted source (e.g. this project's own
# training run) — never from an untrusted download.
with open('models/final/oga90_model.pkl', 'rb') as file:
    oga90_model = pickle.load(file)

In [4]:
# Make inferences
# Every column listed here is removed from the frame; whatever remains is
# handed to the model as its input matrix.
target_cols = [
    'season_name',
    'player_id',
    'player_name',
    'minutes_played',
    'offensive_goals_added',
    'offensive_goals_added_90',
]
X = df.drop(target_cols, axis=1)
oga90 = oga90_model.predict(X)

In [5]:
# filter to only columns we care about
# Explicit .copy() so the new column is added to an independent frame instead
# of relying on .loc happening to return a copy of df.
proj = df.loc[:, ['season_name', 'player_id', 'player_name', 'guaranteed_compensation_1']].copy()
# Attach the model output; assumes oga90 has one prediction per row of df, in
# the same row order.
proj["offensive_goals_added_90_proj"] = oga90

# Rank players from highest to lowest projection (plain string literal: the
# previous f-string had no placeholders).
proj = proj.sort_values(by="offensive_goals_added_90_proj", ascending=False)
proj.head(15)

Unnamed: 0,season_name,player_id,player_name,guaranteed_compensation_1,offensive_goals_added_90_proj
316,2023,NWMWnVEQlz,Lorenzo Insigne,14000000.0,0.185098
659,2023,vzqorrRk5a,Carles Gil,3545830.0,0.121517
333,2023,Oa5wY8RXQ1,Hany Mukhtar,1926250.0,0.108887
661,2023,wvq90Ym5Wn,Xherdan Shaqiri,8153000.0,0.0982
359,2023,Pk5LedyLqO,Thiago Almada,2332000.0,0.096212
576,2023,kRQabvYbMK,Walker Zimmerman,2345210.0,0.080223
208,2023,EGMPVykqaY,Andreu Fontàs,1125000.0,0.07494
362,2023,Pk5LgAwOMO,Juan Hernández,2886000.0,0.071975
331,2023,Oa5wVzeWM1,José Cifuentes,411750.0,0.071344
584,2023,ljqE2VkOQx,Luciano Acosta,2222850.0,0.070564


In [6]:
# Summary statistics of the projected values across all players.
proj["offensive_goals_added_90_proj"].describe()

count    707.000000
mean      -0.017351
std        0.029364
min       -0.100116
25%       -0.034576
50%       -0.023072
75%       -0.002941
max        0.185098
Name: offensive_goals_added_90_proj, dtype: float64

In [7]:
# Persist the sorted projections; index=False leaves the DataFrame index out of the CSV.
proj.to_csv("output/mls_2023_projections_ridge.csv", index=False)

## Combination

Some of the model projections seem high, or skewed by salary. We can combine them with the MARCEL projections to create a more balanced system. And when both models agree, we can have more confidence in the results.

In [8]:
# Load in Marcel Projections
# Two columns are dropped, and the per-90 projection is renamed so that it does
# not share a name with the ML projection column held in `proj`.
marcel = (
    pd.read_csv("output/mls_2023_projections_marcel.csv")
    .drop(columns=["pt_projection", "offensive_goals_added_proj"])
    .rename(columns={"offensive_goals_added_90_proj": "offensive_goals_added_90_marcel"})
)
marcel.head()

Unnamed: 0,player_id,player_name,season_name,offensive_goals_added_90_marcel
0,4JMA9R42MK,Adam Buksa,2023,0.147533
1,vzqorrRk5a,Carles Gil,2023,0.14247
2,Oa5wVzeWM1,José Cifuentes,2023,0.11966
3,eV5D9A9qKn,Sebastián Blanco,2023,0.11513
4,KAqBrrVqbg,Darwin Quintero,2023,0.114102


# Results

* José Cifuentes looks like a steal for that salary - perhaps why he went to Scotland in 2023
* Carles Gil was an All-Star and scored 11 goals in 2023
* Lorenzo Insigne is the highest-paid MLS player - the "Italian Messi"
* Hany Mukhtar was 2022 MVP and scored 15 goals in 2023
* Adam Buksa was transferred to Ligue 1 for a substantial fee
* Luiz Araújo transferred to Serie A in Brazil for 9 million euros
* Walker Zimmerman - 2023 All-Star, USA National team cap

In [9]:
# Average Marcel and ML Projections for a "wisdom of the crowd" approach
# merge() defaults to an inner join, so only players found in both projection
# sets (matching on id, name and season) are kept.
combo = (
    proj.merge(marcel, on=["player_id", "player_name", "season_name"])
    .assign(
        # Simple unweighted mean of the two projections.
        offensive_goals_added_combo=lambda merged: (
            merged["offensive_goals_added_90_proj"] + merged["offensive_goals_added_90_marcel"]
        ) / 2
    )
    .sort_values(by="offensive_goals_added_combo", ascending=False)
)
combo.head(15)

Unnamed: 0,season_name,player_id,player_name,guaranteed_compensation_1,offensive_goals_added_90_proj,offensive_goals_added_90_marcel,offensive_goals_added_combo
1,2023,vzqorrRk5a,Carles Gil,3545830.0,0.121517,0.14247,0.131993
0,2023,NWMWnVEQlz,Lorenzo Insigne,14000000.0,0.185098,0.038966,0.112032
2,2023,Oa5wY8RXQ1,Hany Mukhtar,1926250.0,0.108887,0.112664,0.110775
19,2023,4JMA9R42MK,Adam Buksa,1106250.0,0.048244,0.147533,0.097889
8,2023,Oa5wVzeWM1,José Cifuentes,411750.0,0.071344,0.11966,0.095502
15,2023,eV5D9A9qKn,Sebastián Blanco,1708000.0,0.057232,0.11513,0.086181
11,2023,gpMOa0lnqz,Luiz Araújo,4480330.0,0.067163,0.098848,0.083006
6,2023,EGMPVykqaY,Andreu Fontàs,1125000.0,0.07494,0.090438,0.082689
5,2023,kRQabvYbMK,Walker Zimmerman,2345210.0,0.080223,0.080333,0.080278
4,2023,Pk5LedyLqO,Thiago Almada,2332000.0,0.096212,0.063462,0.079837


In [10]:
# Persist the sorted combined projections; index=False leaves the DataFrame index out of the CSV.
combo.to_csv("output/mls_2023_projections_combined.csv", index=False)