In [18]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
import joblib


In [2]:
## players 2016

In [3]:
df_raw = pd.read_csv('src/players/data/players_16.csv')
df = df_raw.copy()

In [4]:
columns_midfielder = ["age",
    "attacking_crossing",
    "attacking_finishing",
    "attacking_heading_accuracy",
    "attacking_short_passing",
    "attacking_volleys",
    "dribbling",
    "mentality_aggression",
    "mentality_interceptions",
    "mentality_penalties",
    "mentality_positioning",
    "mentality_vision",
    "movement_acceleration",
    "movement_agility",
    "movement_balance",
    "movement_reactions",
    "movement_sprint_speed",
    "pace",
    "passing",
    "physic",
    "power_jumping",
    "power_long_shots",
    "power_shot_power",
    "power_stamina",
    "power_strength",
    "shooting",
    "skill_ball_control",
    "skill_curve",
    "skill_dribbling",
    "skill_fk_accuracy",
    "skill_long_passing",
    "value_eur",
    ]

In [5]:
columns_attacker = ["age", 
        "attacking_crossing", 
        "attacking_finishing", 
        "attacking_heading_accuracy", 
        "attacking_short_passing",
        "attacking_volleys",
        "dribbling",
        "mentality_aggression",
        "mentality_interceptions",
        "mentality_penalties",
        "mentality_positioning",
        "mentality_vision",
        "movement_acceleration",
        "movement_agility",
        "movement_balance",
        "movement_reactions",
        "movement_sprint_speed",
        "pace",
        "passing",
        "physic",
        "power_jumping",
        "power_long_shots",
        "power_shot_power",
        "power_stamina",
        "power_strength",
        "shooting",
        "skill_ball_control",
        "skill_curve",
        "skill_dribbling",
        "skill_fk_accuracy",
        "skill_long_passing",
        "value_eur",]

In [6]:
columns_defender = [
    "age",
    "defending",
    "defending_marking",
    "defending_sliding_tackle",
    "defending_standing_tackle",
    "dribbling",
    "mentality_aggression",
    "mentality_interceptions",
    "mentality_penalties",
    "mentality_positioning",
    "mentality_vision",
    "movement_acceleration",
    "movement_agility",
    "movement_balance",
    "movement_reactions",
    "movement_sprint_speed",
    "pace",
    "passing",
    "physic",
    "power_jumping",
    "power_long_shots",
    "power_shot_power",
    "power_stamina",
    "power_strength",
    "shooting",
    "skill_ball_control",
    "skill_curve",
    "skill_dribbling",
    "skill_fk_accuracy",
    "skill_long_passing",
    "value_eur",
]

In [7]:
list_of_positions = list(df['team_position'].unique())
list_of_positions

['RW',
 'LM',
 'ST',
 'LW',
 'RCB',
 'CAM',
 'RM',
 'RDM',
 'CB',
 'LCB',
 'LDM',
 'LCM',
 'CDM',
 'CF',
 'RCM',
 'LB',
 'RS',
 'LS',
 'CM',
 'RB',
 'RAM',
 'LAM',
 'RWB',
 'RF',
 'LWB',
 'LF']

In [8]:
position_defenders = ['CB', 'LCB', 'RB', 'LWB', 'RWB', 'LB', 'RCB', ]
position_midfielders = ['CAM', 'RM', 'RDM', 'RCM', 'CM', 'CDM', 'LAM', 'LDM', ]
position_attackers = ['RW', 'ST', 'LW', 'RS', 'LS', 'LF', 'RF']

In [9]:
best_players_by_position = {}
for position in list_of_positions:
    df_1 = df[df['team_position'] == position]
    best_players_by_position[position] = df_1['short_name'].head(1)

In [10]:
midfield = ['CAM', 'RM', 'RDM', 'LDM', 'LCM', 'CDM', 'LAM', 'RAM', 'CM', 'RCM']
attack = ['RW', 'LM', 'ST', 'LW', 'LF', 'RF', 'LS', 'RS', 'CF']
defend = ['RCB', 'CB', 'LCB', 'LWB', 'RWB', 'RB', 'LB']

In [11]:
def position_filter(position, dataframe):
    midfield = ['CAM', 'RM', 'RDM', 'LDM', 'LCM', 'CDM', 'LAM', 'RAM', 'CM', 'RCM']
    attack = ['RW', 'LM', 'ST', 'LW', 'LF', 'RF', 'LS', 'RS', 'CF']
    defend = ['RCB', 'CB', 'LCB', 'LWB', 'RWB', 'RB', 'LB']
    
    if position == 'midfielders':
        df = dataframe[dataframe["team_position"].isin(midfield)]
        return df
    elif position == 'attackers':
        df = dataframe[dataframe["team_position"].isin(attack)]
        return df
    elif position == 'defenders':
        df = dataframe[dataframe["team_position"].isin(defend)]
        return df
    else:
        print('wrong position')

In [12]:
df_midfielders = position_filter("midfielders", df)
df_attackers = position_filter("attackers", df)
df_defenders = position_filter("defenders", df)

In [13]:
df_defenders = df_defenders[df_defenders['value_eur'] != 0]
df_attackers = df_attackers[df_attackers['value_eur'] != 0]
df_midfielders = df_midfielders[df_midfielders['value_eur'] != 0]

In [14]:
midfield_corr = df_midfielders[columns_midfielder].corr()
best_midfield_corr = midfield_corr[midfield_corr['value_eur'] > 0.5]
midfield_columns_for_estimation = [column for column in best_midfield_corr.index]

In [15]:
attack_corr = df_attackers[columns_attacker].corr()
best_attack_corr = attack_corr[attack_corr['value_eur'] > 0.4]
attack_columns_for_estimation = [column for column in best_attack_corr.index]

In [16]:
defend_corr = df_defenders[columns_defender].corr()
best_defend_corr = defend_corr[defend_corr['value_eur'] > 0.4]
defend_columns_for_estimation = [column for column in best_defend_corr.index]

In [20]:
## midfielder estimate
import time
from sklearn.model_selection import GridSearchCV
start = time.time()
X_mid = df_midfielders[midfield_columns_for_estimation]
y_mid = X_mid.pop('value_eur')
scaler_mid = MinMaxScaler()
scaler_mid.fit(X_mid)
scaler_mid.transform(X_mid)
X_train_mid, X_test_mid, y_train_mid, y_test_mid = train_test_split(X_mid, y_mid)

param_grid = [{'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 20], 'min_samples_leaf': [3, 4, 5, 10, 15]}]
reg_mid_16 = RandomForestRegressor()
gs_mid_16 = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, scoring='r2')
reg_mid_16 = reg_mid_16.fit(X_train_mid, y_train_mid)
gs_mid_16 = gs_mid_16.fit(X_train_mid, y_train_mid)
end = time.time()
print(f"midfielders reg: {reg_mid_16}")
print(f"midfielders gs: {gs_mid_16}")
print({end - start})

midfielders reg: RandomForestRegressor()
midfielders gs: GridSearchCV(estimator=RandomForestRegressor(),
             param_grid=[{'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 20],
                          'min_samples_leaf': [3, 4, 5, 10, 15]}],
             scoring='r2')
{39.503352880477905}


In [93]:
print(gs_mid_16.best_params_)

{'max_depth': 10, 'min_samples_leaf': 3}


In [94]:
## attacker estimate

X_att = df_attackers[attack_columns_for_estimation]
y_att = X_att.pop('value_eur')
scaler_att = MinMaxScaler()
scaler_att.fit(X_att)
scaler_att.transform(X_att)
X_train_att, X_test_att, y_train_att, y_test_att = train_test_split(X_att, y_att)
param_grid = [{'max_depth': [ 7, 8, 9], 'min_samples_leaf': [3, 4]}]
reg_att_16 = RandomForestRegressor()
gs_att_16 = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, scoring='r2')
reg_att_16 = reg_att_16.fit(X_train_att, y_train_att)
gs_att_16 = gs_att_16.fit(X_train_att, y_train_att)
print(f"defender reg: {reg_att_16}")
print(f"defender gs: {gs_att_16}")


attacker reg: 0.8101923580652037
attacker gs: 0.7476539978668831


In [95]:
print(gs_att_16.best_params_)

{'max_depth': 9, 'min_samples_leaf': 3}


In [110]:
## defender estimate

X_def = df_defenders[defend_columns_for_estimation]
y_def = X_def.pop('value_eur')
scaler_def = MinMaxScaler()
scaler_def.fit(X_def)
scaler_def.transform(X_def)
X_train_def, X_test_def, y_train_def, y_test_def = train_test_split(X_def, y_def)
param_grid = [{'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 20], 'min_samples_leaf': [3, 4, 5, 10, 15]}]
reg_def_16 = RandomForestRegressor()
gs_def_16 = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, scoring='r2')
reg_def_16 = reg_def_16.fit(X_train_def, y_train_def)
gs_def_16 = gs_def_16.fit(X_train_def, y_train_def)
print(f"defender reg: {reg_def_16}")
print(f"defender gs: {gs_def_16}")

defender reg: RandomForestRegressor()
defender gs: GridSearchCV(estimator=RandomForestRegressor(),
             param_grid=[{'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 20],
                          'min_samples_leaf': [3, 4, 5, 10, 15]}],
             scoring='r2')


In [96]:
print(gs_def_16.best_params_)

{'max_depth': 7, 'min_samples_leaf': 10}


In [97]:
pred_defender = reg_def_16.predict([[90, 87, 90, 90, 89, 85]])[0]
print(f"Predicted value for defensor: {pred_defender/1000000} mln euro")

Predicted value for defensor: 32.795 mln euro


In [98]:
pred_attacker = reg_att_16.predict([[90, 87, 90, 90, 89, 85, 89, 90, 90]])[0]
print(f"Predicted value for attacker: {pred_attacker/1000000} mln euro")

Predicted value for attacker: 64.165 mln euro


In [99]:
pred_midfielder = reg_mid_16.predict([[95, 95, 95, 95]])[0]
print(f"Predicted value for midfielder: {pred_midfielder/1000000} mln euro")

Predicted value for midfielder: 42.655 mln euro


In [100]:
pred_attacker = reg_att_16.predict([[92, 87, 95, 88, 90, 86, 88, 96, 96]])[0]
print(f"Predicted value for attacker: {pred_attacker/1000000} mln euro")

Predicted value for attacker: 68.905 mln euro


In [101]:
px.scatter(df_attackers, x='overall', y='value_eur', hover_name="age", title="Attackers 2016")

In [102]:
px.scatter(df_defenders, x='overall', y='value_eur', hover_name="age", title="Defenders 2016")

In [103]:
fig = px.scatter(df_midfielders, x='overall', y='value_eur', hover_name="age", title="Midfielders 2016")
fig.show()

In [104]:
## players_17

In [105]:
df_raw = pd.read_csv('src/players/data/players_17.csv')
df = df_raw.copy()

df_midfielders = position_filter("midfielders", df)
df_attackers = position_filter("attackers", df)
df_defenders = position_filter("defenders", df)

df_defenders = df_defenders[df_defenders['value_eur'] != 0]
df_attackers = df_attackers[df_attackers['value_eur'] != 0]
df_midfielders = df_midfielders[df_midfielders['value_eur'] != 0]

midfield_corr = df_midfielders[columns_midfielder].corr()
best_midfield_corr = midfield_corr[midfield_corr['value_eur'] > 0.5]
midfield_columns_for_estimation = [column for column in best_midfield_corr.index]

attack_corr = df_attackers[columns_attacker].corr()
best_attack_corr = attack_corr[attack_corr['value_eur'] > 0.4]
attack_columns_for_estimation = [column for column in best_attack_corr.index]

defend_corr = df_defenders[columns_defender].corr()
best_defend_corr = defend_corr[defend_corr['value_eur'] > 0.4]
defend_columns_for_estimation = [column for column in best_defend_corr.index]

In [106]:
## midfielder estimate
from sklearn.model_selection import GridSearchCV
X_mid = df_midfielders[midfield_columns_for_estimation]
y_mid = X_mid.pop('value_eur')
scaler_mid = MinMaxScaler()
scaler_mid.fit(X_mid)
scaler_mid.transform(X_mid)
X_train_mid, X_test_mid, y_train_mid, y_test_mid = train_test_split(X_mid, y_mid)

param_grid = [{'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 20], 'min_samples_leaf': [3, 4, 5, 10, 15]}]
reg_mid = RandomForestRegressor()
gs_mid = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, scoring='r2')
reg_mid.fit(X_train_mid, y_train_mid)
gs_mid.fit(X_train_mid, y_train_mid)
print(f"midfielders reg: {reg_mid.score(X_test_mid, y_test_mid)}")
print(f"midfielders gs: {gs_mid.score(X_test_mid, y_test_mid)}")

midfielders reg: 0.7480787398894477
midfielders gs: 0.7559420689053253


In [107]:
## attacker estimate

X_att = df_attackers[attack_columns_for_estimation]
y_att = X_att.pop('value_eur')
scaler_att = MinMaxScaler()
scaler_att.fit(X_att)
scaler_att.transform(X_att)
X_train_att, X_test_att, y_train_att, y_test_att = train_test_split(X_att, y_att)
param_grid = [{'max_depth': [ 7, 8, 9], 'min_samples_leaf': [3, 4]}]
reg_att_17 = RandomForestRegressor()
gs_att_17 = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, scoring='r2')
reg_att_17.fit(X_train_att, y_train_att)
gs_att_17.fit(X_train_att, y_train_att)
print(f"attacker reg: {reg_att_17.score(X_test_att, y_test_att)}")
print(f"attacker gs: {gs_att_17.score(X_test_att, y_test_att)}")

attacker reg: 0.8125878752909594
attacker gs: 0.8156360723014466


In [108]:
## defender estimate

X_def = df_defenders[defend_columns_for_estimation]
y_def = X_def.pop('value_eur')
scaler_def = MinMaxScaler()
scaler_def.fit(X_def)
scaler_def.transform(X_def)
X_train_def, X_test_def, y_train_def, y_test_def = train_test_split(X_def, y_def)
param_grid = [{'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 20], 'min_samples_leaf': [3, 4, 5, 10, 15]}]
reg_def_17 = RandomForestRegressor()
gs_def_17 = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, scoring='r2')
reg_def_17.fit(X_train_def, y_train_def)
gs_def_17.fit(X_train_def, y_train_def)
print(f"defender reg: {reg_def_17.score(X_test_def, y_test_def)}")
print(f"defender gs: {gs_def_17.score(X_test_def, y_test_def)}")

defender reg: 0.73803102448901
defender gs: 0.7853453029943768


In [135]:
def model_to_estimate_player_value(dataframe, columns_list, name):
       
    X = dataframe[columns_list]
    y= X.pop('value_eur')
    scaler = MinMaxScaler()
    scaler.fit(X)
    scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    param_grid = [{'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 20], 'min_samples_leaf': [3, 4, 5, 10, 15]}]
    gs = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, scoring='r2')
    gs = gs.fit(X_train, y_train)
    joblib.dump(gs, f'model_{name}.pkl')
    return gs

In [136]:
def columns_for_estimation(dataframe, position):
    if position == "attacker":
        columns = columns_attacker
    elif position == "defender":
        columns = columns_defender
    elif position == "midfielder":
        columns = columns_midfielder

    dataframe = dataframe[dataframe['value_eur'] != 0]
    corr = dataframe[columns].corr()
    best_corr = corr[corr['value_eur'] > 0.4]
    columns_for_estimation = [column for column in best_corr.index]

    return columns_for_estimation

In [137]:
df_raw_16 = pd.read_csv('src/players/data/players_16.csv')
df_16 = df_raw_16.copy()


df_midfielders_16 = position_filter("midfielders", df_16)
df_attackers_16 = position_filter("attackers", df_16)
df_defenders_16 = position_filter("defenders", df_16)

midfielders_columns_16 = columns_for_estimation(df_midfielders_16, "midfielder")
attackers_columns_16 = columns_for_estimation(df_attackers_16, "attacker")
defenders_columns_16 = columns_for_estimation(df_defenders_16, "defender")

mid_16 = model_to_estimate_player_value(df_midfielders_16, midfielders_columns_16, "mid_16")
att_16 = model_to_estimate_player_value(df_attackers_16, attackers_columns_16, "att_16")
def_16 = model_to_estimate_player_value(df_defenders_16, defenders_columns_16, "def_16")




In [131]:
print(mid_16.best_estimator_)
print(att_16.best_estimator_)
print(def_16.best_estimator_)

RandomForestRegressor(max_depth=8, min_samples_leaf=3)
RandomForestRegressor(max_depth=7, min_samples_leaf=3)
RandomForestRegressor(max_depth=6, min_samples_leaf=4)


In [4]:
model = joblib.load("model_att_16.pkl")
pred_attacker = model.predict([[90, 90, 90, 90, 90, 90, 90, 90, 90]])[0]
print(f"Predicted value for attacker: {pred_attacker/1000000} mln euro")


Predicted value for attacker: 59.459134920634924 mln euro
