In [None]:

Dataset Overview:
    
This dataset is designed for the development of a predictive model focused 
on estimating the transfer values of football players. The project involves 
leveraging information derived from football players, aiming to build a model
capable of predicting transfer fees based on various data points. The player-centric
data encompasses fundamental details like age, height, and playing position, as 
well as professional statistics including goal scoring and 
assists (across the seasons 2021-2022 and 2022-2023), insights into injuries, 
and a comprehensive record of individual and team awards throughout their careers.


In [1]:
import pandas as pd
import numpy as np


In [2]:
# Path to the transfer-fee dataset CSV under the Kaggle input directory.
file = (
    '/kaggle/input/football-players-transfer-fee-prediction-dataset/'
    'final_data.csv'
)

In [3]:
# Read the CSV located at `file` into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=file)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,player,team,name,position,height,age,appearance,goals,assists,yellow cards,...,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,/david-de-gea/profil/spieler/59377,Manchester United,David de Gea,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,...,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,/jack-butland/profil/spieler/128899,Manchester United,Jack Butland,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,...,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0
2,/tom-heaton/profil/spieler/34130,Manchester United,Tom Heaton,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,...,0.616438,0.924658,292,697,84,4,600000,6000000,1,0
3,/lisandro-martinez/profil/spieler/480762,Manchester United,Lisandro Martínez,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,...,0.0,0.0,6408,175,22,9,50000000,50000000,2,0
4,/raphael-varane/profil/spieler/164770,Manchester United,Raphaël Varane,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,...,0.0,0.0,5031,238,51,21,40000000,80000000,2,0


In [5]:
# Keep only the numeric columns for modelling (drops the text columns such as
# player, team, name and position). Uses the public `select_dtypes` API rather
# than the private `_get_numeric_data` helper; 'bool' is listed as well so the
# selection matches what the private helper would have kept.
df = df.select_dtypes(include=['number', 'bool'])

In [6]:
# Preview the first five rows of the numeric-only frame.
df.head(n=5)

Unnamed: 0,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,189.0,32.0,104,0.0,0.0,0.009585,0.0,0.0,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,196.0,30.0,15,0.0,0.0,0.069018,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0
2,188.0,37.0,4,0.0,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,600000,6000000,1,0
3,175.0,25.0,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,0.0,6408,175,22,9,50000000,50000000,2,0
4,191.0,30.0,63,0.017889,0.017889,0.053667,0.0,0.0,0.0,0.0,5031,238,51,21,40000000,80000000,2,0


In [None]:
# Print the column dtypes and non-null counts. `info` is a method, so it must
# be called; the bare attribute would only display the bound-method repr.
df.info()


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
count,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0,10754.0
mean,181.240353,26.041903,36.407011,0.125554,0.086978,0.189757,0.004666,0.006826,0.131655,0.044881,2470.789381,117.961689,15.826297,1.960759,3622971.0,6152606.0,2.713223,0.307513
std,6.969818,4.777629,26.526541,0.235585,0.143351,0.432388,0.025232,0.081143,0.442335,0.924437,2021.703271,175.206827,23.383606,3.743936,9095410.0,13389880.0,0.986356,0.461485
min,156.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,176.0,22.0,12.0,0.0,0.0,0.053191,0.0,0.0,0.0,0.0,660.0,0.0,0.0,0.0,300000.0,450000.0,2.0,0.0
50%,181.240353,26.0,35.0,0.045969,0.040773,0.15025,0.0,0.0,0.0,0.0,2101.5,37.0,5.0,1.0,800000.0,1500000.0,3.0,0.0
75%,186.0,29.0,59.0,0.172263,0.133136,0.248276,0.0,0.0,0.0,0.0,3968.0,181.0,24.0,2.0,3000000.0,5000000.0,4.0,1.0
max,206.0,43.0,107.0,11.25,4.0,30.0,1.0,6.923077,9.0,90.0,9510.0,2349.0,339.0,92.0,180000000.0,200000000.0,4.0,1.0


In [8]:
# Count the missing values in each column.
df.isna().sum()

height                 0
age                    0
appearance             0
goals                  0
assists                0
yellow cards           0
second yellow cards    0
red cards              0
goals conceded         0
clean sheets           0
minutes played         0
days_injured           0
games_injured          0
award                  0
current_value          0
highest_value          0
position_encoded       0
winger                 0
dtype: int64

In [9]:
# Correlation of every column with the target, listed in ascending order.
df.corr().loc[:, 'highest_value'].sort_values(ascending=True)

goals conceded        -0.061183
second yellow cards   -0.014155
yellow cards          -0.009772
clean sheets          -0.008302
red cards             -0.007257
winger                 0.001944
height                 0.036776
position_encoded       0.101053
age                    0.123468
goals                  0.129037
assists                0.145407
days_injured           0.234352
games_injured          0.285244
minutes played         0.399389
appearance             0.418394
award                  0.528641
current_value          0.834553
highest_value          1.000000
Name: highest_value, dtype: float64

In [None]:
# Fit a linear regression to predict 'highest_value' using 'current_value'

In [12]:
from sklearn.linear_model import LinearRegression

In [10]:
# Single-predictor inputs (kept two-dimensional by the list selector) and the
# target column.
x = df.loc[:, ['current_value']]
y = df.loc[:, 'highest_value']

In [13]:
# Linear-regression estimator; note that this same object is fitted again on a
# different feature set further down, which overwrites the earlier fit.
lm = LinearRegression()

In [14]:
# Fit on the single predictor and report the score on the very same data
# (i.e. a training score, not a held-out one).
lm.fit(x, y).score(x, y)

0.6964788735669278

In [None]:
# fit a linear regression to predict the 'highest_value' using the 'feature' list.

In [15]:
# Predictor columns shared by the multi-feature models in the cells below.
feature = [
    'position_encoded',
    'age',
    'goals',
    'assists',
    'days_injured',
    'games_injured',
    'minutes played',
    'appearance',
    'award',
    'current_value',
]

In [16]:
# Multi-feature design matrix: every row, only the columns named in `feature`.
z = df.loc[:, feature]

In [17]:
# Refit the same estimator on the full feature list and report the score on
# the data it was just fitted to.
lm.fit(z, y).score(z, y)

0.7957211050105177

In [None]:
# Create a pipeline to predict 'highest_value' using the 'feature' list

In [18]:
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [19]:
# Pipeline steps, in execution order: standardise the features, expand them
# with PolynomialFeatures (bias column disabled), then fit a linear regression.
# The original statement assigned the name twice in one chain; once is enough.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression()),
]

In [20]:
# Assemble the estimator from the (name, transformer/model) step list.
pipe = Pipeline(steps=Input)

In [21]:
# Fit every pipeline step on the full feature matrix and target.
pipe.fit(z, y=y)

In [22]:
# Score the fitted pipeline on the same data it was trained on.
pipe.score(z, y=y)

0.8388193097600639

In [None]:
# Do a Ridge fit using the feature list, with the regularization parameter (alpha) set to 0.1

In [23]:
from sklearn.linear_model import Ridge

In [50]:
# Ridge regression with regularisation strength alpha = 0.1. This single
# instance is refitted by several later cells, each fit replacing the last.
Ridgemodel = Ridge(alpha = 0.1)

In [51]:
# Fit the ridge model on the full data and score it on that same data.
Ridgemodel.fit(z, y).score(z, y)

0.7957211050083155

In [26]:
from sklearn.model_selection import train_test_split

In [34]:
# Features and target that will be divided into train and test partitions.
x_data = df.loc[:, feature]
y_data = df.loc[:, 'highest_value']

In [35]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.20,
    random_state=0,
)

In [37]:
# Refit the ridge model using the training partition only.
Ridgemodel.fit(x_train, y=y_train)

In [39]:
# Evaluate the ridge model on the held-out test partition.
Ridgemodel.score(x_test, y=y_test)

0.8092572136604689

In [46]:
# Degree-2 polynomial feature expansion. Unlike the PolynomialFeatures step in
# the pipeline above, include_bias is left at its default here.
pr = PolynomialFeatures(degree = 2)

In [47]:
# Expand both partitions to polynomial terms. The transformer is fitted on the
# training partition only and then applied, unchanged, to the test partition;
# calling fit_transform on the test data would re-fit it on held-out rows.
x_train_pr = pr.fit_transform(x_train)
x_test_pr = pr.transform(x_test)

# NOTE(review): the recorded output of this cell includes a warning raised from
# the ridge solver's linalg.solve call — presumably ill-conditioning caused by
# the unscaled polynomial features; consider standardising first (TODO confirm).
Ridgemodel.fit(x_train_pr, y_train)
Ridgemodel.score(x_test_pr, y_test)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0.8115373719070069

In [None]:
# Conclusion

Of the models above, the pipeline regression, with an R^2 value of 
0.839 (computed on the same data it was fitted to), is the best fit for predicting the value of football players.
