In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the CSV files can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Question 1.**
---
**Data Preparation**

---
Includes:
* Loading the dataset
* Handling missing, duplicate and unnecessary data
* Imputing numeric and non-numeric values
* Encoding non-numeric data (Label Encoder)


In [None]:
import numpy as np
import pandas as pd
from numpy import array
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

Loading the dataset

In [None]:
# Read the raw players_21 and players_22 tables from the mounted Drive folder
players_21_data = pd.read_csv(
    '/content/drive/MyDrive/MidSem Project/players_21.csv',
    low_memory=False,
)
players_22_data = pd.read_csv(
    '/content/drive/MyDrive/MidSem Project/players_22.csv',
    low_memory=False,
)

In [None]:
# Work on a copy so the raw players_21 table stays untouched, then summarise it
fifa_21 = players_21_data.copy()
fifa_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Columns: 110 entries, sofifa_id to nation_flag_url
dtypes: float64(16), int64(44), object(50)
memory usage: 15.9+ MB


Dropping Unnecessary Data

In [None]:
# Dropping all the duplicate data in the players_21 dataset
fifa_21 = fifa_21.drop_duplicates()

# Drop columns with 30% or more missing values.
# The missing share is computed explicitly: `dropna(thresh=...)` counts the
# NON-missing values a column needs in order to be kept, so passing
# `threshold * len(fifa_21)` there would keep every column that is at least
# 30% populated (i.e. up to 70% missing).
threshold = 0.3
missing_share = fifa_21.isna().mean()
fifa_21 = fifa_21.loc[:, missing_share < threshold]

# Drop columns containing "url"
url_columns = [col for col in fifa_21.columns if 'url' in col.lower()]
fifa_21 = fifa_21.drop(columns=url_columns)

# Drop player positions as they realistically do not determine a player's rating
position_columns = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
]
# Re-assign instead of mutating in place so re-running the cell chain stays predictable
fifa_21 = fifa_21.drop(columns=position_columns)

In [None]:
# Print the frame after the duplicate/column drops to inspect what remains
print(fifa_21)

       sofifa_id         short_name                            long_name  \
0         158023           L. Messi       Lionel Andrés Messi Cuccittini   
1          20801  Cristiano Ronaldo  Cristiano Ronaldo dos Santos Aveiro   
2         188545     R. Lewandowski                   Robert Lewandowski   
3         190871          Neymar Jr        Neymar da Silva Santos Júnior   
4         192985       K. De Bruyne                      Kevin De Bruyne   
...          ...                ...                                  ...   
18939     257710     Zhang Mengxuan                                  张梦炫   
18940     257933      Huang Wenzhou                                  黄文卓   
18941     257936           Song Yue                                   宋岳   
18942     258736        V. Da Silva     Ivanilson Loforte Tique Da Silva   
18943     258760           B. Hough                            Ben Hough   

      player_positions  overall  potential    value_eur  wage_eur  age  \
0           R

Separating the numerical and non-numerical values

In [None]:
# Keep only the columns of fifa_21 that have a numeric dtype
numeric_fifa_21 = pd.DataFrame(fifa_21.select_dtypes(include='number'))

In [None]:
# Display the numeric columns selected from fifa_21
numeric_fifa_21

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,158023,93,93,103500000.0,560000.0,33,170,72,241.0,1.0,...,75,96,32,35,24,6,11,15,14,8
1,20801,92,92,63000000.0,220000.0,35,187,83,45.0,1.0,...,84,95,28,32,24,7,11,15,14,11
2,188545,91,91,111000000.0,240000.0,31,184,80,21.0,1.0,...,88,88,35,42,19,15,6,12,8,10
3,190871,91,91,132000000.0,270000.0,28,175,68,73.0,1.0,...,92,93,35,30,29,9,9,15,15,11
4,192985,91,91,129000000.0,370000.0,29,181,70,10.0,1.0,...,84,91,68,65,53,15,13,5,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,257710,47,52,70000.0,1000.0,21,177,70,112165.0,1.0,...,35,40,45,56,47,12,13,8,14,6
18940,257933,47,53,70000.0,1000.0,21,174,68,112540.0,1.0,...,35,35,43,42,53,8,8,13,14,10
18941,257936,47,47,45000.0,2000.0,28,185,79,111774.0,1.0,...,36,35,38,43,45,8,5,11,5,7
18942,258736,47,67,130000.0,500.0,17,171,58,1920.0,4.0,...,50,45,18,11,13,11,13,9,9,6


In [None]:
# Keep only the object/category (non-numeric) columns of fifa_21
non_numeric_dtypes = ['object', 'category']
categorical_fifa_21 = fifa_21.select_dtypes(include=non_numeric_dtypes)

In [None]:
# Display the non-numeric columns selected from fifa_21
categorical_fifa_21

Unnamed: 0,short_name,long_name,player_positions,dob,club_name,league_name,club_position,club_joined,nationality_name,preferred_foot,work_rate,body_type,real_face,player_traits
0,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",1987-06-24,FC Barcelona,Spain Primera Division,CAM,2004-07-01,Argentina,Left,Medium/Low,Unique,Yes,"Finesse Shot, Long Shot Taker (AI), Speed Drib..."
1,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",1985-02-05,Juventus,Italian Serie A,LS,2018-07-10,Portugal,Right,High/Low,Unique,Yes,"Power Free-Kick, Flair, Long Shot Taker (AI), ..."
2,R. Lewandowski,Robert Lewandowski,ST,1988-08-21,FC Bayern München,German 1. Bundesliga,ST,2014-07-01,Poland,Right,High/Medium,Unique,Yes,"Solid Player, Finesse Shot, Outside Foot Shot,..."
3,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",1992-02-05,Paris Saint-Germain,French Ligue 1,LW,2017-08-03,Brazil,Right,High/Medium,Unique,Yes,"Injury Prone, Flair, Speed Dribbler (AI), Outs..."
4,K. De Bruyne,Kevin De Bruyne,"CAM, CM",1991-06-28,Manchester City,English Premier League,RCM,2015-08-30,Belgium,Right,High/High,Unique,Yes,"Injury Prone, Leadership, Early Crosser, Long ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,Zhang Mengxuan,张梦炫,CB,1999-04-26,Chongqing Liangjiang Athletic,Chinese Super League,SUB,2020-08-01,China PR,Right,Low/Low,Normal (170-185),No,
18940,Huang Wenzhou,黄文卓,CM,1999-01-07,Shanghai Port FC,Chinese Super League,RES,2020-08-01,China PR,Right,Low/Low,Lean (170-185),No,
18941,Song Yue,宋岳,CM,1991-11-20,Tianjin Jinmen Tiger FC,Chinese Super League,RES,2020-08-01,China PR,Right,Low/Low,Lean (185+),No,
18942,V. Da Silva,Ivanilson Loforte Tique Da Silva,ST,2003-03-30,Oldham Athletic,English League Two,SUB,2020-08-01,England,Right,Medium/Medium,Lean (170-185),No,


Imputing Numerical & Non-numerical Values

In [None]:
# Fill the gaps in every numeric column with that column's median
column_medians = numeric_fifa_21.median()
imp_numeric_fifa_21 = numeric_fifa_21.fillna(value=column_medians)

In [None]:
# Summarise the imputed numeric frame (non-null counts and dtypes per column)
imp_numeric_fifa_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18944 entries, 0 to 18943
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    18944 non-null  int64  
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   value_eur                    18944 non-null  float64
 4   wage_eur                     18944 non-null  float64
 5   age                          18944 non-null  int64  
 6   height_cm                    18944 non-null  int64  
 7   weight_kg                    18944 non-null  int64  
 8   club_team_id                 18944 non-null  float64
 9   league_level                 18944 non-null  float64
 10  club_jersey_number           18944 non-null  float64
 11  club_contract_valid_until    18944 non-null  float64
 12  nationality_id               18944 non-null  int64  
 13  weak_foot       

In [None]:
# Imputation for Categorical Values: fill gaps with each column's most frequent value
imp = SimpleImputer(strategy='most_frequent')

# SimpleImputer returns a bare array, so the original column names and row
# index are re-attached here. Without them the frame is labelled 0..N-1 on both
# axes and its rows can no longer be matched back to fifa_21 by index.
imp_categorical_fifa_21 = pd.DataFrame(
    imp.fit_transform(categorical_fifa_21),
    columns=categorical_fifa_21.columns,
    index=categorical_fifa_21.index,
)

Integer Encoding Categorical Values

In [None]:
# Display the categorical frame after most-frequent imputation
imp_categorical_fifa_21

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",1987-06-24,FC Barcelona,Spain Primera Division,CAM,2004-07-01,Argentina,Left,Medium/Low,Unique,Yes,"Finesse Shot, Long Shot Taker (AI), Speed Drib..."
1,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",1985-02-05,Juventus,Italian Serie A,LS,2018-07-10,Portugal,Right,High/Low,Unique,Yes,"Power Free-Kick, Flair, Long Shot Taker (AI), ..."
2,R. Lewandowski,Robert Lewandowski,ST,1988-08-21,FC Bayern München,German 1. Bundesliga,ST,2014-07-01,Poland,Right,High/Medium,Unique,Yes,"Solid Player, Finesse Shot, Outside Foot Shot,..."
3,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",1992-02-05,Paris Saint-Germain,French Ligue 1,LW,2017-08-03,Brazil,Right,High/Medium,Unique,Yes,"Injury Prone, Flair, Speed Dribbler (AI), Outs..."
4,K. De Bruyne,Kevin De Bruyne,"CAM, CM",1991-06-28,Manchester City,English Premier League,RCM,2015-08-30,Belgium,Right,High/High,Unique,Yes,"Injury Prone, Leadership, Early Crosser, Long ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,Zhang Mengxuan,张梦炫,CB,1999-04-26,Chongqing Liangjiang Athletic,Chinese Super League,SUB,2020-08-01,China PR,Right,Low/Low,Normal (170-185),No,Speed Dribbler (AI)
18940,Huang Wenzhou,黄文卓,CM,1999-01-07,Shanghai Port FC,Chinese Super League,RES,2020-08-01,China PR,Right,Low/Low,Lean (170-185),No,Speed Dribbler (AI)
18941,Song Yue,宋岳,CM,1991-11-20,Tianjin Jinmen Tiger FC,Chinese Super League,RES,2020-08-01,China PR,Right,Low/Low,Lean (185+),No,Speed Dribbler (AI)
18942,V. Da Silva,Ivanilson Loforte Tique Da Silva,ST,2003-03-30,Oldham Athletic,English League Two,SUB,2020-08-01,England,Right,Medium/Medium,Lean (170-185),No,Speed Dribbler (AI)


In [None]:
# One LabelEncoder is re-fitted for every column, so after this cell it only
# holds the classes of the last column it was fitted on.
label_encoder = LabelEncoder()

# Integer-encode every imputed categorical column.
# The row index of the source frame is kept so that a column-wise concat with
# the numeric frame aligns row-for-row even when drop_duplicates() removed rows
# (a fresh 0..N-1 index would silently misalign and introduce NaN rows).
encoded_df = pd.DataFrame(
    {
        column: label_encoder.fit_transform(imp_categorical_fifa_21[column])
        for column in imp_categorical_fifa_21.columns
    },
    index=categorical_fifa_21.index,
)

# Restore the descriptive column names before reporting, so info() lists them
encoded_df.columns = categorical_fifa_21.columns

# encoded_df contains the integer encoded values of imp_categorical_fifa_21
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       18944 non-null  int64
 1   1       18944 non-null  int64
 2   2       18944 non-null  int64
 3   3       18944 non-null  int64
 4   4       18944 non-null  int64
 5   5       18944 non-null  int64
 6   6       18944 non-null  int64
 7   7       18944 non-null  int64
 8   8       18944 non-null  int64
 9   9       18944 non-null  int64
 10  10      18944 non-null  int64
 11  11      18944 non-null  int64
 12  12      18944 non-null  int64
 13  13      18944 non-null  int64
dtypes: int64(14)
memory usage: 2.0 MB


In [None]:
# Display the integer-encoded categorical columns
encoded_df

Unnamed: 0,short_name,long_name,player_positions,dob,club_name,league_name,club_position,club_joined,nationality_name,preferred_foot,work_rate,body_type,real_face,player_traits
0,10059,10302,539,942,238,41,0,8,6,0,7,9,1,172
1,3261,3332,587,391,361,24,13,1135,123,1,1,9,1,699
2,14329,14687,560,1271,240,19,27,289,122,1,2,9,1,805
3,13007,12964,331,2438,461,17,14,888,20,1,2,9,1,343
4,8987,9626,10,2229,407,15,19,487,14,1,0,9,1,355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,17684,18137,55,4982,138,6,28,1769,30,1,4,4,0,891
18940,6680,18648,149,4875,560,6,21,1769,30,1,4,1,0,891
18941,15978,18041,149,2366,608,6,21,1769,30,1,4,2,0,891
18942,16799,7152,560,6175,444,14,28,1769,48,1,8,1,0,891


In [None]:
# Place the imputed numeric columns and the integer-encoded categorical columns side by side
frames_21 = [imp_numeric_fifa_21, encoded_df]
new_fifa_21 = pd.concat(frames_21, axis='columns')

In [None]:
# Display the combined numeric + encoded frame
new_fifa_21

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,club_name,league_name,club_position,club_joined,nationality_name,preferred_foot,work_rate,body_type,real_face,player_traits
0,158023,93,93,103500000.0,560000.0,33,170,72,241.0,1.0,...,238,41,0,8,6,0,7,9,1,172
1,20801,92,92,63000000.0,220000.0,35,187,83,45.0,1.0,...,361,24,13,1135,123,1,1,9,1,699
2,188545,91,91,111000000.0,240000.0,31,184,80,21.0,1.0,...,240,19,27,289,122,1,2,9,1,805
3,190871,91,91,132000000.0,270000.0,28,175,68,73.0,1.0,...,461,17,14,888,20,1,2,9,1,343
4,192985,91,91,129000000.0,370000.0,29,181,70,10.0,1.0,...,407,15,19,487,14,1,0,9,1,355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,257710,47,52,70000.0,1000.0,21,177,70,112165.0,1.0,...,138,6,28,1769,30,1,4,4,0,891
18940,257933,47,53,70000.0,1000.0,21,174,68,112540.0,1.0,...,560,6,21,1769,30,1,4,1,0,891
18941,257936,47,47,45000.0,2000.0,28,185,79,111774.0,1.0,...,608,6,21,1769,30,1,4,2,0,891
18942,258736,47,67,130000.0,500.0,17,171,58,1920.0,4.0,...,444,14,28,1769,48,1,8,1,0,891


In [None]:
# List every column of the combined frame with its non-null count and dtype
new_fifa_21.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18944 entries, 0 to 18943
Data columns (total 71 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    18944 non-null  int64  
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   value_eur                    18944 non-null  float64
 4   wage_eur                     18944 non-null  float64
 5   age                          18944 non-null  int64  
 6   height_cm                    18944 non-null  int64  
 7   weight_kg                    18944 non-null  int64  
 8   club_team_id                 18944 non-null  float64
 9   league_level                 18944 non-null  float64
 10  club_jersey_number           18944 non-null  float64
 11  club_contract_valid_until    18944 non-null  float64
 12  nationality_id               18944 non-null  int64  
 13  weak_foot       

# **Question 2.**
---
**Feature Extraction & Engineering**

---
Includes:
- Selecting features with high correlation
- Use correlation analysis

In [None]:
# Correlation analysis: build the full correlation matrix, then keep the
# column that relates every feature to the target `overall`
correlation_matrix = new_fifa_21.corr()
overall_corr = correlation_matrix['overall']

In [None]:
# Print each column's correlation with the target column: overall
print(overall_corr)

sofifa_id        -0.486575
overall           1.000000
potential         0.636366
value_eur         0.553449
wage_eur          0.586851
                    ...   
preferred_foot   -0.052533
work_rate        -0.233214
body_type         0.163377
real_face         0.447426
player_traits    -0.378064
Name: overall, Length: 71, dtype: float64


In [None]:
# Keep only the entries whose absolute correlation with `overall` exceeds 0.5
high_corr_values = overall_corr[overall_corr.abs() > 0.5]

In [None]:
# Show the correlations that passed the +-0.5 threshold
high_corr_values

overall                    1.000000
potential                  0.636366
value_eur                  0.553449
wage_eur                   0.586851
release_clause_eur         0.610319
passing                    0.660281
dribbling                  0.592937
attacking_short_passing    0.502191
movement_reactions         0.867234
power_shot_power           0.558372
mentality_vision           0.509087
mentality_composure        0.705252
Name: overall, dtype: float64

In [None]:
# Keep only the columns whose correlation with Overall passed the threshold
selected_columns = high_corr_values.index
filtered_fifa_21 = new_fifa_21[selected_columns]

In [None]:
# Summarise the frame that holds only the highly correlated columns
filtered_fifa_21.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18944 entries, 0 to 18943
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   overall                  18944 non-null  int64  
 1   potential                18944 non-null  int64  
 2   value_eur                18944 non-null  float64
 3   wage_eur                 18944 non-null  float64
 4   release_clause_eur       18944 non-null  float64
 5   passing                  18944 non-null  float64
 6   dribbling                18944 non-null  float64
 7   attacking_short_passing  18944 non-null  int64  
 8   movement_reactions       18944 non-null  int64  
 9   power_shot_power         18944 non-null  int64  
 10  mentality_vision         18944 non-null  int64  
 11  mentality_composure      18944 non-null  int64  
dtypes: float64(5), int64(7)
memory usage: 1.9 MB


In [None]:
# Names of the predictor columns (the target `overall` is deliberately excluded)
values = [
    'potential',
    'value_eur',
    'wage_eur',
    'release_clause_eur',
    'passing',
    'dribbling',
    'attacking_short_passing',
    'movement_reactions',
    'power_shot_power',
    'mentality_vision',
    'mentality_composure',
]

In [None]:
# Scaling the data
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so held-out rows influence the scaling statistics — confirm
# this is acceptable.
sc = StandardScaler()
features_21 = filtered_fifa_21.drop(columns='overall')
scaled_fifa_21 = sc.fit_transform(features_21)

# Label the scaled array with the columns and index of the frame that was
# actually scaled. Relying on a separately maintained name list would silently
# mislabel features if its order ever diverged, and dropping the index would
# break row alignment with the target taken from filtered_fifa_21.
fifa_21 = pd.DataFrame(
    scaled_fifa_21,
    columns=features_21.columns,
    index=features_21.index,
)
fifa_21

Unnamed: 0,potential,value_eur,wage_eur,release_clause_eur,passing,dribbling,attacking_short_passing,movement_reactions,power_shot_power,mentality_vision,mentality_composure
0,3.586563,13.071234,27.845078,13.676627,3.482519,3.410670,2.216206,3.554438,2.119026,3.000047,3.137573
1,3.422893,7.810099,10.660644,7.310503,2.451148,2.778572,1.598583,3.664174,2.719163,2.052781,3.055051
2,3.259222,14.045519,11.671493,13.024735,2.141737,2.357173,1.735833,3.444701,2.344077,1.834181,2.477402
3,3.259222,16.773515,13.187767,16.538836,2.966834,3.305320,1.941707,3.225227,1.668922,2.635714,2.890009
4,3.259222,16.383801,18.242012,15.978617,3.688793,2.673222,2.422080,3.225227,2.494112,2.927180,2.724966
...,...,...,...,...,...,...,...,...,...,...,...
18939,-3.123941,-0.364795,-0.408153,-0.414712,-3.221388,-3.753112,-1.901280,-1.493455,-1.706851,-2.100615,-1.483625
18940,-2.960270,-0.364795,-0.408153,-0.413184,-0.849236,-1.646118,-0.254285,-1.273981,-1.481799,-0.060350,-1.896232
18941,-3.942295,-0.368043,-0.357611,-0.415730,-0.849236,-1.751467,-0.185661,-1.932402,-0.656610,-0.716150,-1.896232
18942,-0.668878,-0.357001,-0.433425,-0.403711,-1.777469,-1.014019,-0.940533,-0.944771,-0.581593,-0.351817,-1.071018


In [None]:
# Summarise the scaled feature frame (fifa_21 now holds the standardised predictors)
fifa_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   potential                18944 non-null  float64
 1   value_eur                18944 non-null  float64
 2   wage_eur                 18944 non-null  float64
 3   release_clause_eur       18944 non-null  float64
 4   passing                  18944 non-null  float64
 5   dribbling                18944 non-null  float64
 6   attacking_short_passing  18944 non-null  float64
 7   movement_reactions       18944 non-null  float64
 8   power_shot_power         18944 non-null  float64
 9   mentality_vision         18944 non-null  float64
 10  mentality_composure      18944 non-null  float64
dtypes: float64(11)
memory usage: 1.6 MB


# **Question 3 & 4.**
---
**Training & Cross Validation**

---
Includes:
- Train 5 Regressor Models (RandomForest, XGBoost, Gradient Boost, Decision Tree Regressor, Support Vector Machine)
- Predict values for each Regressor Model
- Fine-tuning the model
- Optimization

In [None]:
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# Target: the overall rating; features: the scaled predictor frame
y = filtered_fifa_21.loc[:, 'overall']
X = fifa_21

In [None]:
# Display the feature matrix
X

Unnamed: 0,potential,value_eur,wage_eur,release_clause_eur,passing,dribbling,attacking_short_passing,movement_reactions,power_shot_power,mentality_vision,mentality_composure
0,3.586563,13.071234,27.845078,13.676627,3.482519,3.410670,2.216206,3.554438,2.119026,3.000047,3.137573
1,3.422893,7.810099,10.660644,7.310503,2.451148,2.778572,1.598583,3.664174,2.719163,2.052781,3.055051
2,3.259222,14.045519,11.671493,13.024735,2.141737,2.357173,1.735833,3.444701,2.344077,1.834181,2.477402
3,3.259222,16.773515,13.187767,16.538836,2.966834,3.305320,1.941707,3.225227,1.668922,2.635714,2.890009
4,3.259222,16.383801,18.242012,15.978617,3.688793,2.673222,2.422080,3.225227,2.494112,2.927180,2.724966
...,...,...,...,...,...,...,...,...,...,...,...
18939,-3.123941,-0.364795,-0.408153,-0.414712,-3.221388,-3.753112,-1.901280,-1.493455,-1.706851,-2.100615,-1.483625
18940,-2.960270,-0.364795,-0.408153,-0.413184,-0.849236,-1.646118,-0.254285,-1.273981,-1.481799,-0.060350,-1.896232
18941,-3.942295,-0.368043,-0.357611,-0.415730,-0.849236,-1.751467,-0.185661,-1.932402,-0.656610,-0.716150,-1.896232
18942,-0.668878,-0.357001,-0.433425,-0.403711,-1.777469,-1.014019,-0.940533,-0.944771,-0.581593,-0.351817,-1.071018


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

18775    50
17559    55
7667     67
2638     73
18148    53
         ..
11284    64
11964    63
5390     70
860      78
15795    59
Name: overall, Length: 15155, dtype: int64

***Decision Tree Regressor***

In [None]:
# Fit a single decision tree on the training split and score it on the held-out rows
dt = DecisionTreeRegressor(random_state=48).fit(Xtrain, ytrain)
dt_y_pred = dt.predict(Xtest)
dt_mae = mean_absolute_error(y_true=ytest, y_pred=dt_y_pred)
print("Mean Absolute Error:", dt_mae)

Mean Absolute Error: 0.7408287147004486


***RandomForest Regressor***

In [None]:
# Fit a random forest on the training split and score it on the held-out rows.
# The forest is seeded so the reported MAE is reproducible across re-runs
# (an unseeded forest gives a slightly different score every time).
rf = RandomForestRegressor(random_state=42)
rf.fit(Xtrain, ytrain)
rf_y_pred = rf.predict(Xtest)
rf_mae = mean_absolute_error(ytest, rf_y_pred)
print("Mean Absolute Error:", rf_mae)

Mean Absolute Error: 0.6193454737397731


**XGBoost Regressor**

In [None]:
# Hyper-parameters of the XGBoost regressor, collected in one place
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 100,
    'random_state': 42,
}

# Fit on the training split and score on the held-out rows
xgb_reg = xgb.XGBRegressor(**xgb_params)
xgb_reg.fit(Xtrain, ytrain)
xgb_y_pred = xgb_reg.predict(Xtest)
xgb_mae = mean_absolute_error(y_true=ytest, y_pred=xgb_y_pred)
print("Mean Absolute Error:", xgb_mae)

Mean Absolute Error: 1.0720424878625576


***Gradient Boost***

In [None]:
# Fit a gradient-boosting regressor on the training split and score it on the held-out rows
gb = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
).fit(Xtrain, ytrain)
gb_y_pred = gb.predict(Xtest)
gb_mae = mean_absolute_error(y_true=ytest, y_pred=gb_y_pred)
print("Mean Absolute Error:", gb_mae)

Mean Absolute Error: 1.1121404980569096


***Support Vector Machine (SVM)***

In [None]:
# Fit an RBF-kernel support vector regressor and score it on the held-out rows
svr = SVR(kernel='rbf').fit(Xtrain, ytrain)
svr_y_pred = svr.predict(Xtest)
svr_mae = mean_absolute_error(y_true=ytest, y_pred=svr_y_pred)
print("Mean Absolute Error:", svr_mae)

Mean Absolute Error: 1.2616923345982927


CROSS VALIDATION FOR BEST MODEL

In [None]:
from sklearn.model_selection import KFold,GridSearchCV

In [None]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 5, 10, 20],
}

# Create a RandomForestRegressor.
# It is seeded so that candidate scores, the selected hyperparameters and the
# refitted best model are reproducible across re-runs.
rf_reg = RandomForestRegressor(random_state=42)

# Perform GridSearchCV with 5-fold cross-validation.
# NOTE(review): candidates are ranked by (negated) mean squared error while the
# test-set report below uses mean absolute error — confirm the mismatch is intended.
grid_search = GridSearchCV(estimator=rf_reg, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=2)

# Fit the model to the training split only; the test split stays unseen
grid_search.fit(Xtrain, ytrain)

# Print the best hyperparameters found by GridSearchCV
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the best model (refitted on the whole training split) on the test set
best_rf_reg = grid_search.best_estimator_
y_pred = best_rf_reg.predict(Xtest)
mae = mean_absolute_error(ytest, y_pred)
print("Mean Absolute Error on Test Set:", mae)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Hyperparameters: {'max_depth': 20, 'n_estimators': 200}
Mean Absolute Error on Test Set: 0.6114095243987908


# **Question 5.**

---
Includes:
- Preparing the players_22 dataset
- Testing the model with players_22 dataset
- Saving the model to a pickle file

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Work on a copy so the raw players_22 table stays untouched, then preview it
fifa_22 = players_22_data.copy()
fifa_22.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [None]:
# Dropping all the duplicate data in the players_22 dataset
fifa_22 = fifa_22.drop_duplicates()

# Numeric Values
numeric_fifa_22 = fifa_22.select_dtypes(include=['number'])

# Non-numeric values
categorical_fifa_22 = fifa_22.select_dtypes(include=['object', 'category'])

# Imputation for Numeric Values (column medians of players_22)
imp_numeric_fifa_22 = numeric_fifa_22.fillna(numeric_fifa_22.median())

# Imputation for Categorical Values.
# `imp` is the most-frequent imputer created earlier in the notebook; it is
# re-fitted here on the players_22 columns.
imp_categorical_fifa_22 = pd.DataFrame(imp.fit_transform(categorical_fifa_22))

# Encoding the categorical values.
# `label_encoder` (created earlier) is re-fitted per column. The source row
# index is kept so the column-wise concat below aligns row-for-row even when
# drop_duplicates() removed rows; a fresh 0..N-1 index would misalign silently.
encoded_22_df = pd.DataFrame(
    {
        column: label_encoder.fit_transform(imp_categorical_fifa_22[column])
        for column in imp_categorical_fifa_22.columns
    },
    index=categorical_fifa_22.index,
)

# encoded_22_df contains the integer encoded values of imp_categorical_fifa_22
encoded_22_df.columns = categorical_fifa_22.columns

# Combine numeric and encoded columns, then keep only the model's feature columns
new_fifa_22 = pd.concat([imp_numeric_fifa_22, encoded_22_df], axis=1)
new_fifa_22 = new_fifa_22[values]

In [None]:
# Print the de-duplicated players_22 frame for inspection
print(fifa_22)

       sofifa_id                                         player_url  \
0         158023  https://sofifa.com/player/158023/lionel-messi/...   
1         188545  https://sofifa.com/player/188545/robert-lewand...   
2          20801  https://sofifa.com/player/20801/c-ronaldo-dos-...   
3         190871  https://sofifa.com/player/190871/neymar-da-sil...   
4         192985  https://sofifa.com/player/192985/kevin-de-bruy...   
...          ...                                                ...   
19234     261962  https://sofifa.com/player/261962/defu-song/220002   
19235     262040  https://sofifa.com/player/262040/caoimhin-port...   
19236     262760  https://sofifa.com/player/262760/nathan-logue/...   
19237     262820  https://sofifa.com/player/262820/luke-rudden/2...   
19238     264540  https://sofifa.com/player/264540/emanuel-lalch...   

               short_name                            long_name  \
0                L. Messi       Lionel Andrés Messi Cuccittini   
1          R. L

In [None]:
# Scaling the data
# NOTE(review): this fits a new scaler on the players_22 features and rebinds
# `sc`, so any later use of `sc` carries players_22 statistics rather than the
# players_21 ones — confirm this is intended.
sc = StandardScaler()
scaled_fifa_22 = sc.fit_transform(new_fifa_22)

# Pass the flat list of names: wrapping it in another list (`[values]`) makes
# pandas build MultiIndex columns, so a lookup such as
# scaled_fifa_22['potential'] returns a frame instead of a Series.
# The index is kept so rows stay aligned with the target taken from fifa_22.
scaled_fifa_22 = pd.DataFrame(
    scaled_fifa_22,
    columns=values,
    index=new_fifa_22.index,
)

In [None]:
# Print the standardised players_22 feature frame for inspection
print(scaled_fifa_22)

      potential  value_eur   wage_eur release_clause_eur   passing dribbling  \
0      3.601780   9.889397  15.996581           9.589016  3.539198  3.542596   
1      3.437470  15.350116  13.424792          13.234253  2.275622  2.554865   
2      3.273160   5.547138  13.424792           5.385624  2.380920  2.774360   
3      3.273160  16.600160  13.424792          16.093937  3.012708  3.432848   
4      3.273160  16.139617  17.539654          15.646035  3.749794  2.774360   
...         ...        ...        ...                ...       ...       ...   
19234 -3.134932  -0.364913  -0.411430          -0.346563 -1.199211 -1.615558   
19235 -1.984762  -0.359649  -0.437148          -0.341119 -0.778020 -1.835054   
19236 -2.642002  -0.360965  -0.437148          -0.342360 -1.304509 -1.505810   
19237 -1.820452  -0.359649  -0.437148          -0.337949 -2.252191 -1.615558   
19238 -1.820452  -0.359649  -0.437148          -0.339465 -1.304509 -1.615558   

      attacking_short_passing movement_

In [None]:
# Target: overall rating from players_22; features: the scaled players_22 predictors
y = fifa_22['overall']
X = scaled_fifa_22

# Hold out 20% of the players_22 rows for testing (fixed seed for a repeatable split)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Best Parameters:
'max_depth': 30, 'max_features': 'log2', 'n_estimators': 500

In [None]:
# Fit a random forest on the players_22 training split and score it on the
# players_22 held-out rows. The forest is seeded so the MAE, and the model that
# is saved afterwards, are reproducible across re-runs.
# NOTE(review): this trains a new forest on players_22 rather than scoring the
# model tuned on players_21 — confirm that matches the goal of this section.
rf_reg = RandomForestRegressor(max_depth=20, n_estimators=200, random_state=42)
rf_reg.fit(Xtrain, ytrain)
y_pred = rf_reg.predict(Xtest)
mae = mean_absolute_error(ytest, y_pred)
print("MAE:", mae)

MAE: 0.6737908392191473


Saving the model and scaler to a .pkl file

In [None]:
import pickle

# Name of the file that will hold the serialised random-forest model
pickle_filename = "rnf_model.pkl"

# Serialise rf_reg into that file (binary write mode)
with open(pickle_filename, mode='wb') as model_file:
    pickle.dump(rf_reg, model_file)

In [None]:
# Name of the file that will hold the serialised scaler
pickle_filename = "scaler.pkl"

# Serialise the StandardScaler `sc` into that file (binary write mode)
with open(pickle_filename, mode='wb') as model_file:
    pickle.dump(sc, model_file)