In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **1.  DATA PREPARATION**

---

Includes:
* Loading the dataset
* Handling missing, duplicate and unnecessary data
* Imputing numeric and non-numeric values
* Encoding non-numeric data (Label Encoder)


In [None]:
import numpy as np
import pandas as pd
from numpy import array
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

Loading the dataset

In [None]:
# Importing the data sets for players 21 and 22
players_21_data = pd.read_csv('/content/drive/MyDrive/MidSem Project/players_21.csv', low_memory=False)
players_22_data = pd.read_csv('/content/drive/MyDrive/MidSem Project/players_22.csv', low_memory=False)

In [None]:
# Creating Data Frames using the loaded data sets
fifa_21 = pd.DataFrame(players_21_data.copy())
fifa_21.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,103500000.0,560000.0,33,...,52+3,52+3,52+3,62+3,19+3,https://cdn.sofifa.net/players/158/023/21_120.png,https://cdn.sofifa.net/teams/241/60.png,https://cdn.sofifa.net/flags/es.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",92,92,63000000.0,220000.0,35,...,54+3,54+3,54+3,61+3,20+3,https://cdn.sofifa.net/players/020/801/21_120.png,https://cdn.sofifa.net/teams/45/60.png,https://cdn.sofifa.net/flags/it.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
2,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,91,91,111000000.0,240000.0,31,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/21_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,,https://cdn.sofifa.net/flags/pl.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,132000000.0,270000.0,28,...,49+3,49+3,49+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/21_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CAM, CM",91,91,129000000.0,370000.0,29,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/21_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


Dropping Unnecessary Data

In [None]:
# Dropping all the duplicate data in the players_21 dataset
fifa_21 = fifa_21.drop_duplicates()

# Drop columns with 30% or more missing values
threshold = 0.3
fifa_21 = fifa_21.dropna(axis=1, thresh=int(threshold * len(fifa_21)))

# Drop columns containing "url"
fifa_21 = fifa_21.drop(columns=[col for col in fifa_21.columns if 'url' in col.lower()])

Separating the numerical and non-numerical values

In [None]:
# Numeric Values
numeric_fifa_21 = fifa_21.select_dtypes(include=['number'])
numeric_fifa_21 = pd.DataFrame(numeric_fifa_21)

In [None]:
numeric_fifa_21

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,158023,93,93,103500000.0,560000.0,33,170,72,241.0,1.0,...,75,96,32,35,24,6,11,15,14,8
1,20801,92,92,63000000.0,220000.0,35,187,83,45.0,1.0,...,84,95,28,32,24,7,11,15,14,11
2,188545,91,91,111000000.0,240000.0,31,184,80,21.0,1.0,...,88,88,35,42,19,15,6,12,8,10
3,190871,91,91,132000000.0,270000.0,28,175,68,73.0,1.0,...,92,93,35,30,29,9,9,15,15,11
4,192985,91,91,129000000.0,370000.0,29,181,70,10.0,1.0,...,84,91,68,65,53,15,13,5,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,257710,47,52,70000.0,1000.0,21,177,70,112165.0,1.0,...,35,40,45,56,47,12,13,8,14,6
18940,257933,47,53,70000.0,1000.0,21,174,68,112540.0,1.0,...,35,35,43,42,53,8,8,13,14,10
18941,257936,47,47,45000.0,2000.0,28,185,79,111774.0,1.0,...,36,35,38,43,45,8,5,11,5,7
18942,258736,47,67,130000.0,500.0,17,171,58,1920.0,4.0,...,50,45,18,11,13,11,13,9,9,6


In [None]:
# Non-numeric values
categorical_fifa_21 = fifa_21.select_dtypes(include=['object', 'category'])

In [None]:
categorical_fifa_21

Unnamed: 0,short_name,long_name,player_positions,dob,club_name,league_name,club_position,club_joined,nationality_name,preferred_foot,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",1987-06-24,FC Barcelona,Spain Primera Division,CAM,2004-07-01,Argentina,Left,...,65+3,65+3,65+3,66+3,62+3,52+3,52+3,52+3,62+3,19+3
1,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",1985-02-05,Juventus,Italian Serie A,LS,2018-07-10,Portugal,Right,...,61+3,61+3,61+3,65+3,61+3,54+3,54+3,54+3,61+3,20+3
2,R. Lewandowski,Robert Lewandowski,ST,1988-08-21,FC Bayern München,German 1. Bundesliga,ST,2014-07-01,Poland,Right,...,65+3,65+3,65+3,64+3,61+3,60+3,60+3,60+3,61+3,19+3
3,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",1992-02-05,Paris Saint-Germain,French Ligue 1,LW,2017-08-03,Brazil,Right,...,62+3,62+3,62+3,67+3,62+3,49+3,49+3,49+3,62+3,20+3
4,K. De Bruyne,Kevin De Bruyne,"CAM, CM",1991-06-28,Manchester City,English Premier League,RCM,2015-08-30,Belgium,Right,...,80+3,80+3,80+3,79+3,75+3,69+3,69+3,69+3,75+3,21+3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,Zhang Mengxuan,张梦炫,CB,1999-04-26,Chongqing Liangjiang Athletic,Chinese Super League,SUB,2020-08-01,China PR,Right,...,41+2,41+2,41+2,42+2,45+2,47+2,47+2,47+2,45+2,15+2
18940,Huang Wenzhou,黄文卓,CM,1999-01-07,Shanghai Port FC,Chinese Super League,RES,2020-08-01,China PR,Right,...,48+2,48+2,48+2,47+2,47+2,46+2,46+2,46+2,47+2,15+2
18941,Song Yue,宋岳,CM,1991-11-20,Tianjin Jinmen Tiger FC,Chinese Super League,RES,2020-08-01,China PR,Right,...,47,47,47,47,47,46+1,46+1,46+1,47,11+2
18942,V. Da Silva,Ivanilson Loforte Tique Da Silva,ST,2003-03-30,Oldham Athletic,English League Two,SUB,2020-08-01,England,Right,...,32+2,32+2,32+2,35+2,33+2,26+2,26+2,26+2,33+2,14+2


Imputing Numerical & Non-numerical Values

In [None]:
# Imputation for Numeric Values
imp_numeric_fifa_21 = numeric_fifa_21.fillna(numeric_fifa_21.median())

In [None]:
imp_numeric_fifa_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18944 entries, 0 to 18943
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    18944 non-null  int64  
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   value_eur                    18944 non-null  float64
 4   wage_eur                     18944 non-null  float64
 5   age                          18944 non-null  int64  
 6   height_cm                    18944 non-null  int64  
 7   weight_kg                    18944 non-null  int64  
 8   club_team_id                 18944 non-null  float64
 9   league_level                 18944 non-null  float64
 10  club_jersey_number           18944 non-null  float64
 11  club_contract_valid_until    18944 non-null  float64
 12  nationality_id               18944 non-null  int64  
 13  weak_foot       

In [None]:
# Imputation for Categorical Values
imp = SimpleImputer(strategy='most_frequent')
imp_categorical_fifa_21 = imp.fit_transform(categorical_fifa_21)
imp_categorical_fifa_21 = pd.DataFrame(imp_categorical_fifa_21)

Integer Encoding Categorical Values

In [None]:
imp_categorical_fifa_21

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",1987-06-24,FC Barcelona,Spain Primera Division,CAM,2004-07-01,Argentina,Left,...,65+3,65+3,65+3,66+3,62+3,52+3,52+3,52+3,62+3,19+3
1,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",1985-02-05,Juventus,Italian Serie A,LS,2018-07-10,Portugal,Right,...,61+3,61+3,61+3,65+3,61+3,54+3,54+3,54+3,61+3,20+3
2,R. Lewandowski,Robert Lewandowski,ST,1988-08-21,FC Bayern München,German 1. Bundesliga,ST,2014-07-01,Poland,Right,...,65+3,65+3,65+3,64+3,61+3,60+3,60+3,60+3,61+3,19+3
3,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",1992-02-05,Paris Saint-Germain,French Ligue 1,LW,2017-08-03,Brazil,Right,...,62+3,62+3,62+3,67+3,62+3,49+3,49+3,49+3,62+3,20+3
4,K. De Bruyne,Kevin De Bruyne,"CAM, CM",1991-06-28,Manchester City,English Premier League,RCM,2015-08-30,Belgium,Right,...,80+3,80+3,80+3,79+3,75+3,69+3,69+3,69+3,75+3,21+3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,Zhang Mengxuan,张梦炫,CB,1999-04-26,Chongqing Liangjiang Athletic,Chinese Super League,SUB,2020-08-01,China PR,Right,...,41+2,41+2,41+2,42+2,45+2,47+2,47+2,47+2,45+2,15+2
18940,Huang Wenzhou,黄文卓,CM,1999-01-07,Shanghai Port FC,Chinese Super League,RES,2020-08-01,China PR,Right,...,48+2,48+2,48+2,47+2,47+2,46+2,46+2,46+2,47+2,15+2
18941,Song Yue,宋岳,CM,1991-11-20,Tianjin Jinmen Tiger FC,Chinese Super League,RES,2020-08-01,China PR,Right,...,47,47,47,47,47,46+1,46+1,46+1,47,11+2
18942,V. Da Silva,Ivanilson Loforte Tique Da Silva,ST,2003-03-30,Oldham Athletic,English League Two,SUB,2020-08-01,England,Right,...,32+2,32+2,32+2,35+2,33+2,26+2,26+2,26+2,33+2,14+2


In [None]:
label_encoder = LabelEncoder()
# Create a new DataFrame to store the encoded values
encoded_df = pd.DataFrame()

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Loop through each column in the DataFrame
for column in imp_categorical_fifa_21.columns:
    # Encode the current column and add it to the new DataFrame
    encoded_df[column] = label_encoder.fit_transform(imp_categorical_fifa_21[column])

# encoded_df contains the integer encoded values of imp_categorical_fifa_21
encoded_df.info()

encoded_df.columns = categorical_fifa_21.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       18944 non-null  int64
 1   1       18944 non-null  int64
 2   2       18944 non-null  int64
 3   3       18944 non-null  int64
 4   4       18944 non-null  int64
 5   5       18944 non-null  int64
 6   6       18944 non-null  int64
 7   7       18944 non-null  int64
 8   8       18944 non-null  int64
 9   9       18944 non-null  int64
 10  10      18944 non-null  int64
 11  11      18944 non-null  int64
 12  12      18944 non-null  int64
 13  13      18944 non-null  int64
 14  14      18944 non-null  int64
 15  15      18944 non-null  int64
 16  16      18944 non-null  int64
 17  17      18944 non-null  int64
 18  18      18944 non-null  int64
 19  19      18944 non-null  int64
 20  20      18944 non-null  int64
 21  21      18944 non-null  int64
 22  22      18944 non-null  int64
 23  23      189

In [None]:
encoded_df

Unnamed: 0,short_name,long_name,player_positions,dob,club_name,league_name,club_position,club_joined,nationality_name,preferred_foot,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,10059,10302,539,942,238,41,0,8,6,0,...,136,136,136,133,110,59,59,59,110,18
1,3261,3332,587,391,361,24,13,1135,123,1,...,110,110,110,126,104,66,66,66,104,20
2,14329,14687,560,1271,240,19,27,289,122,1,...,136,136,136,119,104,96,96,96,104,18
3,13007,12964,331,2438,461,17,14,888,20,1,...,116,116,116,139,110,52,52,52,110,20
4,8987,9626,10,2229,407,15,19,487,14,1,...,232,232,232,204,185,158,158,158,185,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,17684,18137,55,4982,138,6,28,1769,30,1,...,39,39,39,39,44,46,46,46,44,9
18940,6680,18648,149,4875,560,6,21,1769,30,1,...,52,52,52,48,49,44,44,44,49,9
18941,15978,18041,149,2366,608,6,21,1769,30,1,...,49,49,49,46,48,43,43,43,48,1
18942,16799,7152,560,6175,444,14,28,1769,48,1,...,21,21,21,30,26,10,10,10,26,7


In [None]:
new_fifa_21 = pd.concat([imp_numeric_fifa_21, encoded_df], axis=1)

In [None]:
new_fifa_21

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,158023,93,93,103500000.0,560000.0,33,170,72,241.0,1.0,...,136,136,136,133,110,59,59,59,110,18
1,20801,92,92,63000000.0,220000.0,35,187,83,45.0,1.0,...,110,110,110,126,104,66,66,66,104,20
2,188545,91,91,111000000.0,240000.0,31,184,80,21.0,1.0,...,136,136,136,119,104,96,96,96,104,18
3,190871,91,91,132000000.0,270000.0,28,175,68,73.0,1.0,...,116,116,116,139,110,52,52,52,110,20
4,192985,91,91,129000000.0,370000.0,29,181,70,10.0,1.0,...,232,232,232,204,185,158,158,158,185,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,257710,47,52,70000.0,1000.0,21,177,70,112165.0,1.0,...,39,39,39,39,44,46,46,46,44,9
18940,257933,47,53,70000.0,1000.0,21,174,68,112540.0,1.0,...,52,52,52,48,49,44,44,44,49,9
18941,257936,47,47,45000.0,2000.0,28,185,79,111774.0,1.0,...,49,49,49,46,48,43,43,43,48,1
18942,258736,47,67,130000.0,500.0,17,171,58,1920.0,4.0,...,21,21,21,30,26,10,10,10,26,7


In [None]:
new_fifa_21.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18944 entries, 0 to 18943
Data columns (total 98 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    18944 non-null  int64  
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   value_eur                    18944 non-null  float64
 4   wage_eur                     18944 non-null  float64
 5   age                          18944 non-null  int64  
 6   height_cm                    18944 non-null  int64  
 7   weight_kg                    18944 non-null  int64  
 8   club_team_id                 18944 non-null  float64
 9   league_level                 18944 non-null  float64
 10  club_jersey_number           18944 non-null  float64
 11  club_contract_valid_until    18944 non-null  float64
 12  nationality_id               18944 non-null  int64  
 13  weak_foot       

# **2. FEATURE ENGINEERING**

---

Includes:
- Selecting eatures with high correlation
- Use correlation analysis

In [None]:
# Correlation Analysis Model
overall_corr = new_fifa_21.corr()['overall']

In [None]:
# Display table with
overall_corr

sofifa_id   -0.486575
overall      1.000000
potential    0.636366
value_eur    0.553449
wage_eur     0.586851
               ...   
lcb          0.475691
cb           0.475691
rcb          0.475691
rb           0.530054
gk           0.164540
Name: overall, Length: 98, dtype: float64

In [None]:
# Correlation threshold (0.5)
high_corr_values = overall_corr[(overall_corr > 0.5) | (overall_corr < -0.5)]

In [None]:
high_corr_values

overall                    1.000000
potential                  0.636366
value_eur                  0.553449
wage_eur                   0.586851
release_clause_eur         0.610319
passing                    0.660281
dribbling                  0.592937
attacking_short_passing    0.502191
movement_reactions         0.867234
power_shot_power           0.558372
mentality_vision           0.509087
mentality_composure        0.705252
ls                         0.593021
st                         0.593021
rs                         0.593021
lw                         0.596837
lf                         0.606748
cf                         0.606748
rf                         0.606748
rw                         0.596837
lam                        0.614777
cam                        0.614777
ram                        0.614777
lm                         0.617624
lcm                        0.647694
cm                         0.647694
rcm                        0.647694
rm                         0

In [None]:
# Dropping the columns with low correlation to Overall
filtered_fifa_21 = new_fifa_21.loc[:, high_corr_values.index]

In [None]:
filtered_fifa_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18944 entries, 0 to 18943
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   overall                  18944 non-null  int64  
 1   potential                18944 non-null  int64  
 2   value_eur                18944 non-null  float64
 3   wage_eur                 18944 non-null  float64
 4   release_clause_eur       18944 non-null  float64
 5   passing                  18944 non-null  float64
 6   dribbling                18944 non-null  float64
 7   attacking_short_passing  18944 non-null  int64  
 8   movement_reactions       18944 non-null  int64  
 9   power_shot_power         18944 non-null  int64  
 10  mentality_vision         18944 non-null  int64  
 11  mentality_composure      18944 non-null  int64  
 12  ls                       18944 non-null  int64  
 13  st                       18944 non-null  int64  
 14  rs                    

In [None]:
#Scaling the data
sc = StandardScaler()
scaled_fifa_21 = sc.fit_transform(filtered_fifa_21)
fifa_21 = pd.DataFrame(scaled_fifa_21, columns=filtered_fifa_21.columns)
fifa_21

Unnamed: 0,overall,potential,value_eur,wage_eur,release_clause_eur,passing,dribbling,attacking_short_passing,movement_reactions,power_shot_power,...,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,rb
0,3.902006,3.586563,13.071234,27.845078,13.676627,3.482519,3.410670,2.216206,3.554438,2.119026,...,3.312891,3.312891,3.032407,1.088219,0.885213,0.885213,0.885213,1.088219,0.575984,0.575984
1,3.759192,3.422893,7.810099,10.660644,7.310503,2.451148,2.778572,1.598583,3.664174,2.719163,...,2.846936,2.846936,2.974117,0.936808,0.402104,0.402104,0.402104,0.936808,0.449876,0.449876
2,3.616378,3.259222,14.045519,11.671493,13.024735,2.141737,2.357173,1.735833,3.444701,2.344077,...,2.601696,2.601696,2.721528,0.785397,0.885213,0.885213,0.885213,0.785397,0.449876,0.449876
3,3.616378,3.259222,16.773515,13.187767,16.538836,2.966834,3.305320,1.941707,3.225227,1.668922,...,3.067652,3.067652,3.012977,1.218000,0.513591,0.513591,0.513591,1.218000,0.575984,0.575984
4,3.616378,3.259222,16.383801,18.242012,15.978617,3.688793,2.673222,2.422080,3.225227,2.494112,...,3.337415,3.337415,2.993547,2.623959,2.668997,2.668997,2.668997,2.623959,2.152338,2.152338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,-2.667458,-3.123941,-0.364795,-0.408153,-0.414712,-3.221388,-3.753112,-1.901280,-1.493455,-1.706851,...,-1.469281,-1.469281,-1.436476,-0.945015,-0.917153,-0.917153,-0.917153,-0.945015,-0.811208,-0.811208
18940,-2.667458,-2.960270,-0.364795,-0.408153,-0.413184,-0.849236,-1.646118,-0.254285,-1.273981,-1.481799,...,-0.782610,-0.782610,-0.950728,-0.750344,-0.675599,-0.675599,-0.675599,-0.750344,-0.706118,-0.706118
18941,-2.667458,-3.942295,-0.368043,-0.357611,-0.415730,-0.849236,-1.751467,-0.185661,-1.932402,-0.656610,...,-0.807134,-0.807134,-0.795289,-0.793604,-0.731342,-0.731342,-0.731342,-0.793604,-0.727136,-0.727136
18942,-2.667458,-0.668878,-0.357001,-0.433425,-0.403711,-1.777469,-1.014019,-0.940533,-0.944771,-0.581593,...,-0.954278,-0.954278,-0.873008,-1.139686,-1.251612,-1.251612,-1.251612,-1.139686,-1.189533,-1.189533


In [None]:
fifa_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   overall                  18944 non-null  float64
 1   potential                18944 non-null  float64
 2   value_eur                18944 non-null  float64
 3   wage_eur                 18944 non-null  float64
 4   release_clause_eur       18944 non-null  float64
 5   passing                  18944 non-null  float64
 6   dribbling                18944 non-null  float64
 7   attacking_short_passing  18944 non-null  float64
 8   movement_reactions       18944 non-null  float64
 9   power_shot_power         18944 non-null  float64
 10  mentality_vision         18944 non-null  float64
 11  mentality_composure      18944 non-null  float64
 12  ls                       18944 non-null  float64
 13  st                       18944 non-null  float64
 14  rs                    

# **3. TRAINING MODEL**

---

Includes:
- Create and train models
- Train 5 RegressorModels (RandomForest, XGBoost, Gradient Boost, Decision Tree Regressor, Support Vector Machine)
- Predict values for each Regressor Model

In [None]:
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
X = filtered_fifa_21.drop(columns=['overall'])
y = filtered_fifa_21['overall']

In [None]:
X

Unnamed: 0,potential,value_eur,wage_eur,release_clause_eur,passing,dribbling,attacking_short_passing,movement_reactions,power_shot_power,mentality_vision,...,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,rb
0,93,103500000.0,560000.0,138400000.0,91.0,95.0,91,94,86,95,...,216,216,249,133,136,136,136,133,110,110
1,92,63000000.0,220000.0,75900000.0,81.0,89.0,82,95,94,82,...,197,197,246,126,110,110,110,126,104,104
2,91,111000000.0,240000.0,132000000.0,78.0,85.0,84,93,89,79,...,187,187,233,119,136,136,136,119,104,104
3,91,132000000.0,270000.0,166500000.0,86.0,94.0,87,91,80,90,...,206,206,248,139,116,116,116,139,110,110
4,91,129000000.0,370000.0,161000000.0,93.0,88.0,94,91,91,94,...,217,217,247,204,232,232,232,204,185,185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,52,70000.0,1000.0,57000.0,26.0,27.0,31,48,35,25,...,21,21,19,39,39,39,39,39,44,44
18940,53,70000.0,1000.0,72000.0,49.0,47.0,55,50,38,53,...,49,49,44,48,52,52,52,48,49,49
18941,47,45000.0,2000.0,47000.0,49.0,46.0,56,44,49,44,...,48,48,52,46,49,49,49,46,48,48
18942,67,130000.0,500.0,165000.0,40.0,53.0,45,53,50,49,...,42,42,48,30,21,21,21,30,26,26


In [None]:
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=42)

***Decision Tree Regressor***

In [None]:
dt = DecisionTreeRegressor(random_state=48)
dt.fit(Xtrain, ytrain)
dt_y_pred = dt.predict(Xtest)
dt_mae = mean_absolute_error(ytest, dt_y_pred)
print("Mean Absolute Error:", dt_mae)

Mean Absolute Error: 0.6801266825019794


***RandomForest Regressor***

In [None]:
rf = RandomForestRegressor(n_estimators=300, max_depth = 40, random_state=42, n_jobs= -1)
rf.fit(Xtrain, ytrain)
rf_y_pred=rf.predict(Xtest)
rf_mae = mean_absolute_error(ytest, rf_y_pred)
print("Mean Absolute Error:", rf_mae)

Mean Absolute Error: 0.5069446643793437


**XGBoost Regressor**

In [None]:
xgb_reg = xgb.XGBRegressor(
    objective ='reg:squarederror',
    colsample_bytree = 0.3,
    learning_rate = 0.1,
    max_depth = 5, alpha = 10,
    n_estimators = 100,
    random_state=42)
xgb_reg.fit(Xtrain, ytrain)
xgb_y_pred = xgb_reg.predict(Xtest)
xgb_mae = mean_absolute_error(ytest, xgb_y_pred)
print("Mean Absolute Error:", xgb_mae)

Mean Absolute Error: 0.8369165170346882


***Gradient Boost***

In [None]:
gb = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state = 42,
    max_depth = 3)
gb.fit(Xtrain, ytrain)
gb_y_pred = gb.predict(Xtest)
gb_mae = mean_absolute_error(ytest, gb_y_pred)
print("Mean Absolute Error:", gb_mae)

Mean Absolute Error: 1.0298629041580827


***Support Vector Machine (SVM)***

In [None]:
svr = SVR(kernel='rbf')
svr.fit(Xtrain, ytrain)
svr_y_pred = svr.predict(Xtest)
svr_mae = mean_absolute_error(ytest, svr_y_pred)
print("Mean Absolute Error:", svr_mae)

Mean Absolute Error: 2.454147638292999


# **4. EVALUATION**
Includes:
- Using Mean Absolute
- Fine Tuning the model
- Repeated training and testing

Training with GridSearch

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold,GridSearchCV

CROSS VALIDATION FOR BEST MODEL

In [None]:
rf_classifier = RandomForestClassifier()

In [None]:
# Define the hyperparameters and their possible values for tuning
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10]   # Minimum number of samples required to split an internal node
}

# Perform Grid Search Cross-Validation
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X, y)

# Print the best parameters and corresponding accuracy score
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy Score: {:.2f}".format(grid_search.best_score_))

# Optionally, you can also perform cross-validation with the best parameters found
best_rf_classifier = grid_search.best_estimator_
cv_scores = cross_val_score(best_rf_classifier, X, y, cv=5)
print("Cross-Validation Scores: ", cv_scores)
print("Mean CV Accuracy: {:.2f}".format(cv_scores.mean()))



Best Parameters:  {'max_depth': 30, 'min_samples_split': 5, 'n_estimators': 150}
Best Accuracy Score: 0.71




Cross-Validation Scores:  [0.63631565 0.71443653 0.7212985  0.72921615 0.71013728]
Mean CV Accuracy: 0.70


TESTING WITH FIFA 22 DATASET

In [None]:
# Creating Data Frames using the loaded data sets
fifa_22 = pd.DataFrame(players_22_data.copy())
fifa_22.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [None]:
# Dropping all the duplicate data in the players_21 dataset
fifa_22 = fifa_22.drop_duplicates()

# Numeric Values
numeric_fifa_22 = fifa_22.select_dtypes(include=['number'])
numeric_fifa_22 = pd.DataFrame(numeric_fifa_22)

# Non-numeric values
categorical_fifa_22 = fifa_22.select_dtypes(include=['object', 'category'])

# Imputation for Numeric Values
imp_numeric_fifa_22 = numeric_fifa_22.fillna(numeric_fifa_22.median())

# Imputation for Categorical Values
imp_categorical_fifa_22 = imp.fit_transform(categorical_fifa_22)
imp_categorical_fifa_22 = pd.DataFrame(imp_categorical_fifa_22)

encoded_22_df = pd.DataFrame()

# Encoding the categorical values
for column in imp_categorical_fifa_22.columns:
    # Encode the current column and add it to the new DataFrame
    encoded_22_df[column] = label_encoder.fit_transform(imp_categorical_fifa_22[column])

# encoded_df contains the integer encoded values of imp_categorical_fifa_21


encoded_22_df.columns = categorical_fifa_22.columns

new_fifa_22 = pd.concat([imp_numeric_fifa_22, encoded_22_df], axis=1)

#Scaling the data
sc = StandardScaler()
scaled_fifa_22 = sc.fit_transform(new_fifa_22)
fifa_22 = pd.DataFrame(scaled_fifa_22, columns=new_fifa_22.columns)

In [None]:
X = new_fifa_22.drop(columns=['overall'])
y = new_fifa_22['overall']

Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=42)

Best Parameters:
* max_depth =  30
* min_samples_split = 5
* n_estimators = 150

In [None]:
rf_clf = RandomForestClassifier(n_estimators= 150, max_depth = 30, min_samples_split= 5, random_state=42)
rf_clf.fit(X, y)
y_pred = rf_clf.predict(Xtest)
mae = mean_absolute_error(ytest, y_pred)
print("MAE:", mae)

MAE: 0.0


In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(ytest, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


Saving the model to a .pkl fiie

In [140]:
import pickle

# Specifying the file name
pickle_filename = "randomForest_model.pkl"

# Saving the model to the file
with open(pickle_filename, 'wb') as model_file:
    pickle.dump(rf_clf, model_file)