In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"D:\Ultimate Programming\Data Bases\Project Dataset\price of a Football player.csv")
df.head(2)

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3.0,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2.0,Germany,0,4,1,1,0


In [3]:
df['market_value'] = df.pop('market_value')

q1 = df['page_views'].quantile(0.25)
q3 = df['page_views'].quantile(0.75)
iqr = q3 - q1
min_range = q1 - iqr*1.5
max_range = q3 + iqr*1.5

df = df[df['page_views']<max_range]

In [4]:
df.head(3)

Unnamed: 0,name,club,age,position,position_cat,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing,market_value
2,Petr Cech,Arsenal,35,GK,4,1529,5.5,5.90%,134,2.0,Czech Republic,0,6,1,1,0,7.0
4,Laurent Koscielny,Arsenal,31,CB,3,912,6.0,0.70%,121,2.0,France,0,4,1,1,0,22.0
5,Hector Bellerin,Arsenal,22,RB,3,1675,6.0,13.70%,119,2.0,Spain,0,2,1,1,0,30.0


In [5]:
# Encoding the dataset
from sklearn.preprocessing import LabelEncoder
name = df.drop('name', axis=1, inplace=True)
le = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    df[col] = le.fit_transform(df[col])

In [6]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
for col in df.select_dtypes(include=['int64', 'float64']).columns :
    df[col] = ss.fit_transform(df[[col]])

In [7]:
df['region'] = df['region'].fillna(df['region'].mean())

In [8]:
df.isnull().sum()

club            0
age             0
position        0
position_cat    0
page_views      0
fpl_value       0
fpl_sel         0
fpl_points      0
region          0
nationality     0
new_foreign     0
age_cat         0
club_id         0
big_club        0
new_signing     0
market_value    0
dtype: int64

In [9]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [10]:
lr = LinearRegression()
knr = KNeighborsRegressor()
svm = SVR()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()

In [11]:
regressions = {
    'SVC' : svm,
    'KN' : knr,  
    'DT': dt, 
    'LR': lr, 
    'RF': rf, 
    'GbBoost': gb
}

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def train_regression(reg, x_train, x_test, y_train, y_test):
    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)
    mse = mean_squared_error(y_test, y_pred) * 100
    mae = mean_absolute_error(y_test, y_pred) * 100
    r_score = r2_score(y_test, y_pred) * 100
    return mse, mae, r_score  

In [13]:
x = df.iloc[: ,:-1]
y = df['market_value']

In [14]:
print(x.shape, y.shape)

(423, 15) (423,)


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [17]:
train_regression(lr, x_train, x_test, y_train, y_test)

(31.490852393744508, 38.89097790576891, 49.13834358286435)

In [19]:
for name, reg in regressions.items():
    mse, mae, r2 = train_regression(reg, x_train, x_test, y_train, y_test)
    print(f"For {name}:, mse - {mse:.2f}, mae - {mae:.2f}, r2 - {r2:.2f}")

For SVC:, mse - 22.57, mae - 33.57, r2 - 63.55
For KN:, mse - 28.88, mae - 38.71, r2 - 53.36
For DT:, mse - 42.03, mae - 40.38, r2 - 32.11
For LR:, mse - 31.49, mae - 38.89, r2 - 49.14
For RF:, mse - 24.25, mae - 34.54, r2 - 60.84
For GbBoost:, mse - 20.30, mae - 31.86, r2 - 67.22
