In [None]:

Dataset Overview:
    
This dataset is designed for the development of a predictive model focused 
on estimating the transfer values of football players. The project involves 
leveraging information derived from football players, aiming to build a model
capable of predicting transfer fees based on various data points. The player-centric
data encompasses fundamental details like age, height, and playing position, as 
well as professional statistics including goal scoring and 
assists (across the seasons 2021-2022 and 2022-2023), insights into injuries, 
and a comprehensive record of individual and team awards throughout their careers.


In [48]:
import pandas as pd
import numpy as np

In [8]:
file = '/kaggle/input/football-players-transfer-fee-prediction-dataset/final_data.csv'

In [9]:
df = pd.read_csv(file)

In [10]:
df.head(5)

Unnamed: 0,player,team,name,position,height,age,appearance,goals,assists,yellow cards,...,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,/david-de-gea/profil/spieler/59377,Manchester United,David de Gea,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,...,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,/jack-butland/profil/spieler/128899,Manchester United,Jack Butland,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,...,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0
2,/tom-heaton/profil/spieler/34130,Manchester United,Tom Heaton,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,...,0.616438,0.924658,292,697,84,4,600000,6000000,1,0
3,/lisandro-martinez/profil/spieler/480762,Manchester United,Lisandro Martínez,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,...,0.0,0.0,6408,175,22,9,50000000,50000000,2,0
4,/raphael-varane/profil/spieler/164770,Manchester United,Raphaël Varane,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,...,0.0,0.0,5031,238,51,21,40000000,80000000,2,0


In [11]:
df = df._get_numeric_data()

In [12]:
df.head(5)

Unnamed: 0,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,189.0,32.0,104,0.0,0.0,0.009585,0.0,0.0,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,196.0,30.0,15,0.0,0.0,0.069018,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0
2,188.0,37.0,4,0.0,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,600000,6000000,1,0
3,175.0,25.0,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,0.0,6408,175,22,9,50000000,50000000,2,0
4,191.0,30.0,63,0.017889,0.017889,0.053667,0.0,0.0,0.0,0.0,5031,238,51,21,40000000,80000000,2,0


In [13]:
df.info


<bound method DataFrame.info of            height   age  appearance     goals   assists  yellow cards  \
0      189.000000  32.0         104  0.000000  0.000000      0.009585   
1      196.000000  30.0          15  0.000000  0.000000      0.069018   
2      188.000000  37.0           4  0.000000  0.000000      0.000000   
3      175.000000  25.0          82  0.028090  0.056180      0.224719   
4      191.000000  30.0          63  0.017889  0.017889      0.053667   
...           ...   ...         ...       ...       ...           ...   
10749  181.240353  20.0          16  0.175953  0.087977      0.263930   
10750  190.000000  24.0          26  0.372671  0.186335      0.186335   
10751  181.240353  19.0          20  0.375000  0.000000      0.187500   
10752  181.240353  20.0          17  0.312139  0.104046      0.000000   
10753  170.000000  18.0          21  0.000000  0.000000      0.086042   

       second yellow cards  red cards  goals conceded  clean sheets  \
0                   

In [14]:
df.describe()

Unnamed: 0,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
count,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0
mean,181.240353,26.041903,36.407011,0.125554,0.086978,0.189757,0.004666,0.006826,0.131655,0.044881,2470.789381,117.961689,15.826297,1.960759,3622971.0,6152606.0,2.713223,0.307513
std,6.969818,4.777629,26.526541,0.235585,0.143351,0.432388,0.025232,0.081143,0.442335,0.924437,2021.703271,175.206827,23.383606,3.743936,9095410.0,13389880.0,0.986356,0.461485
min,156.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,176.0,22.0,12.0,0.0,0.0,0.053191,0.0,0.0,0.0,0.0,660.0,0.0,0.0,0.0,300000.0,450000.0,2.0,0.0
50%,181.240353,26.0,35.0,0.045969,0.040773,0.15025,0.0,0.0,0.0,0.0,2101.5,37.0,5.0,1.0,800000.0,1500000.0,3.0,0.0
75%,186.0,29.0,59.0,0.172263,0.133136,0.248276,0.0,0.0,0.0,0.0,3968.0,181.0,24.0,2.0,3000000.0,5000000.0,4.0,1.0
max,206.0,43.0,107.0,11.25,4.0,30.0,1.0,6.923077,9.0,90.0,9510.0,2349.0,339.0,92.0,180000000.0,200000000.0,4.0,1.0


In [15]:
df.isnull().sum()

height                 0
age                    0
appearance             0
goals                  0
assists                0
yellow cards           0
second yellow cards    0
red cards              0
goals conceded         0
clean sheets           0
minutes played         0
days_injured           0
games_injured          0
award                  0
current_value          0
highest_value          0
position_encoded       0
winger                 0
dtype: int64

In [49]:
df.corr()['current_value'].sort_values()

goals conceded        -0.063529
age                   -0.050156
second yellow cards   -0.013889
yellow cards          -0.011766
clean sheets          -0.011684
red cards             -0.010171
winger                 0.005034
height                 0.040535
position_encoded       0.085154
days_injured           0.107686
goals                  0.121574
games_injured          0.135654
assists                0.137415
award                  0.300454
appearance             0.419656
minutes played         0.420542
highest_value          0.834553
current_value          1.000000
Name: current_value, dtype: float64

In [17]:
#fit a linear regression to predict the 'highest_value' using the 'current_value'

In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [51]:
x = df[['highest_value']]
y = df['current_value']

In [52]:
lm = LinearRegression()

In [55]:
lm.fit(x,y)
yhat = lm.predict(x)
lm.score(x,y)

0.6964788735669278

In [57]:
MSE = mean_squared_error(yhat,y)
MSE

25106899777969.08

In [22]:
# fit a linear regression to predict the 'highest_value' using the 'feature' list.

In [60]:
feature = ['goals conceded','yellow cards','winger','height','red cards','clean sheets','second yellow cards','position_encoded','age','goals','assists','days_injured','games_injured','minutes played','appearance','award','highest_value']

In [61]:
z = df[feature]

In [63]:
lm.fit(z,y)
yhat = lm.predict(z)
yhat

array([41699601.55448914, 11556401.07227261, -1581094.54067938, ...,
        1591497.90251865,  1453978.41573946,  1528984.3263688 ])

In [65]:
lm.score(z,y)

0.7519221203778701

In [26]:
#create a pipeline to predict the 'highest_value' using 'feature'

In [66]:
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [28]:
Input=[('scale',StandardScaler()),('polynomial', PolynomialFeatures(include_bias=False)),('model',LinearRegression())]

In [29]:
pipe = Pipeline(Input)

In [67]:
pipe.fit(z,y)

In [68]:
pipe.score(z,y)

0.9289158325591603

In [69]:
MSE = mean_squared_error(yhat,y)
MSE

20520701586738.805

In [32]:
# do a Ridge fit using feature list.your parameter at 0.1

In [33]:
from sklearn.linear_model import Ridge

In [34]:
Ridgemodel = Ridge(alpha = 0.1)

In [70]:
Ridgemodel.fit(z,y)
Ridgemodel.score(z,y)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0.7519221202751951

In [36]:
from sklearn.model_selection import train_test_split

In [71]:
x_data = df[feature]
y_data = df['current_value']

In [73]:
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size = 0.20,random_state = 0)

In [74]:
Ridgemodel.fit(x_train,y_train)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [75]:
Ridgemodel.score(x_test,y_test)

0.763376757893317

In [41]:
pr = PolynomialFeatures(degree = 4)

In [76]:
x_train_pr =pr.fit_transform(x_train)
x_test_pr = pr.fit_transform(x_test)

Ridgemodel.fit(x_train_pr,y_train)
Ridgemodel.score(x_test_pr,y_test)

0.6129962192674878

In [43]:
from sklearn.ensemble import RandomForestRegressor

In [44]:
rp = RandomForestRegressor()

In [45]:
rp.fit(x_train,y_train)
rp.score(x_test,y_test)

0.8226811688413505

In [47]:
# Conclusion

# From the model about, the Pipeline Regression with a R^2 value of  0.93 
# and a mean_squared_error of 20520701586738.805 is best fit to predict 
# the value of football players.

