In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [2]:
df = pd.read_csv('eda_df.csv')

In [3]:
#checking correlation in all the data
df.corr()['latest_market_value'].sort_values(ascending = False).round(2)

latest_market_value            1.00
highest_market_value           0.94
appearances                    0.36
assists                        0.35
minutes_played                 0.34
goals                          0.33
last_assists                   0.31
last_appearances               0.30
last_goals                     0.29
last_minutes_played            0.28
assists_per_game               0.21
last_assists_per_game          0.20
goals_per_game                 0.19
last_goals_per_game            0.17
yellow                         0.17
last_yellow                    0.14
minutes_per_game               0.10
country_2                      0.09
last_minutes_per_game          0.09
secondary_position             0.05
last_red                       0.05
last_sent_off                  0.04
clean_sheets                   0.03
sent_off                       0.03
red                            0.03
last_clean_sheets              0.02
second_yellow                  0.01
last_second_yellow          

# Attempt using all data

In [4]:
#some of the best features in correlation
df_players_model = df[['age','minutes_played', 'appearances', 'goals','assists', 'last_minutes_played', 'last_appearances', 'last_goals','last_assists','latest_market_value', 'league', 'foot', 'outfitter']]

#df_players_model = df[['age', 'minutes_per_game', 'goals_per_game', 'assists_per_game','latest_market_value', 'league']]
#df_players_model = df[['clean_sheets', 'latest_market_value', 'appearances', 'minutes_played']]


#getting dummies for non numerical variables
df_dum = pd.get_dummies(df_players_model)

# train test split 
from sklearn.model_selection import train_test_split

X = df_dum.drop('latest_market_value', axis =1)
y = df_dum.latest_market_value.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101, shuffle = True)


#setting up algorithms
lm = LinearRegression()
lm.fit(X_train, y_train)

rf = RandomForestRegressor(random_state = 101)


In [5]:
y_predicted = lm.predict(X_test)
print('Linear Regression r2 result = %s' % metrics.r2_score(y_test, y_predicted).round(2))
print('Random Forest r2 result =', np.mean(cross_val_score(rf,X_train,y_train,scoring = 'r2', cv= 3, )).round(2))

Linear Regression r2 result = 0.4
Random Forest r2 result = 0.5


# Attempt using only offensive players

In [6]:
#filtering only forwards
df_players = df[df['main_position'].str.contains('Forward')]

In [7]:
df_players.corr()['latest_market_value'].sort_values(ascending = False).round(2)

latest_market_value            1.00
highest_market_value           0.93
goals                          0.48
assists                        0.43
minutes_played                 0.39
last_goals                     0.37
appearances                    0.36
last_assists                   0.34
last_minutes_played            0.32
goals_per_game                 0.31
last_appearances               0.31
assists_per_game               0.28
last_goals_per_game            0.22
last_assists_per_game          0.21
minutes_per_game               0.20
last_minutes_per_game          0.16
yellow                         0.15
last_yellow                    0.14
last_red                       0.12
last_sent_off                  0.07
country_2                      0.07
red                            0.04
secondary_position             0.04
sent_off                       0.03
on_loan_from                   0.00
age                            0.00
second_yellow                 -0.00
last_second_yellow          

In [8]:
df_players_model = df_players[['age','minutes_played', 'appearances', 'goals','assists', 'last_minutes_played', 'last_appearances', 'last_goals','last_assists','latest_market_value', 'league', 'foot', 'outfitter']]

#df_players_model = df[['age', 'minutes_per_game', 'goals_per_game', 'assists_per_game','latest_market_value', 'league']]
#df_players_model = df[['clean_sheets', 'latest_market_value', 'appearances', 'minutes_played']]

#getting dummies for non numerical variables
df_dum = pd.get_dummies(df_players_model)

# train test split 
from sklearn.model_selection import train_test_split

X = df_dum.drop('latest_market_value', axis =1)
y = df_dum.latest_market_value.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101, shuffle = True)


#setting up algorithms
lm = LinearRegression()
lm.fit(X_train, y_train)

rf = RandomForestRegressor(random_state = 101)

In [9]:
y_predicted = lm.predict(X_test)
print('Linear Regression r2 result = %s' % metrics.r2_score(y_test, y_predicted).round(2))
print('Random Forest r2 result =', np.mean(cross_val_score(rf,X_train,y_train,scoring = 'r2', cv= 3, )).round(2))

Linear Regression r2 result = 0.39
Random Forest r2 result = 0.17


# Attempt using only keepers

In [10]:
#filtering only keepers
df_players = df[(df['main_position'] == 'Goalkeeper')]

In [11]:
df_players.corr()['latest_market_value'].sort_values(ascending = False).round(2)

latest_market_value            1.00
highest_market_value           0.94
clean_sheets                   0.43
appearances                    0.40
minutes_played                 0.40
last_clean_sheets              0.37
last_appearances               0.32
last_minutes_played            0.32
goals_conceded                 0.29
last_goals_conceded            0.21
yellow                         0.13
last_yellow                    0.12
minutes_per_game               0.10
last_minutes_per_game          0.09
red                            0.09
sent_off                       0.08
height                         0.03
country_2                      0.01
last_red                       0.00
age                           -0.00
last_sent_off                 -0.00
last_goals_per_game           -0.01
last_goals                    -0.01
last_second_yellow            -0.01
on_loan_from                  -0.01
goals_per_game                -0.02
goals                         -0.02
second_yellow               

In [12]:
#df_players_model = df_players[['age','minutes_played', 'appearances', 'goals','assists', 'last_minutes_played', 'last_appearances', 'last_goals','last_assists','latest_market_value', 'league', 'foot', 'outfitter']]
#df_players_model = df[['age', 'minutes_per_game', 'goals_per_game', 'assists_per_game','latest_market_value', 'league']]

#some of the best features for keepers
df_players_model = df[['clean_sheets', 'latest_market_value', 'appearances', 'minutes_played']]


df_dum = pd.get_dummies(df_players_model)

# train test split 
from sklearn.model_selection import train_test_split

X = df_dum.drop('latest_market_value', axis =1)
y = df_dum.latest_market_value.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101, shuffle = True)


#setting up algorithms
lm = LinearRegression()
lm.fit(X_train, y_train)

rf = RandomForestRegressor(random_state = 101)

In [13]:
y_predicted = lm.predict(X_test)
print('Linear Regression r2 result = %s' % metrics.r2_score(y_test, y_predicted).round(2))
print('Random Forest r2 result =', np.mean(cross_val_score(rf,X_train,y_train,scoring = 'r2', cv= 3, )).round(2))

Linear Regression r2 result = 0.12
Random Forest r2 result = -0.0


Unfortunately the best result I managed to get using these 2 algorithms explained around 0.4 and 0.5 per cent of the data, which is not good enough for a prediction model. In my perspectiv
e, there isn't enough stats so that one could actually predict player prices since football is way too complex to sum up its valuable stats in goals, assists, appearances, age, etc. Football takes into account many other factors that will hardly be found in any football stats website. Not to mention that these stats are way too general. For example, center-backs and defensive midfielders should have tackles and ball recovery stats to be taken into account too. Keepers should have a saves stat and so on. 