In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# 01 - Problem (Case Study)
* Data Description.
* Goal.

In [43]:
#We try to predict a player's "Overall Rating" by analysing data from fifa21_train.csv

# 02 - Getting the data

In [44]:
data=pd.read_csv('fifa21_train.csv')

# 03 - Cleaning/Wrangling/EDA
* Change headers names.
* Deal with NaN values.
* Categorical Features.
* Numerical Features.
* Exploration.

In [45]:
# (Dana) we drop: identity data (nationality, names, club, position, Height, Weight, Foot, team & contract, joined, Loan Date End, Wage, Release Clause, Contract..) including ID since this is already indexed
# (Dana) Remove Nan rows
# (AS) Remove \ convert money symbols from row 'value'
# (AS) Convert columns W/F, SM, IR to a labeled system by deleting the star sign
# (Kevin) columns [LS	ST	RS	LW	LF	CF	RF	RW	LAM	CAM	RAM	LM	LCM	CM	RCM	RM	LWB	LDM	CDM	RDM	RWB	LB	LCB	CB	RCB	RB	GK] sum the numbers within

In [46]:
#Dana
print(data.shape)
data= data.drop(['ID', 'Nationality', 'Name', 'Club', 'BP', 'Position', 'Height', 'Weight', 'foot', 'Team & Contract', 'Joined', 'Loan Date End', 'Wage', 'Release Clause', 'Contract'],axis=1)
data=data.dropna()
print(data.shape)
#279 rows and 15 columns were dropped

(11701, 101)
(11422, 86)


In [47]:
#Dana
data['Hits'] = pd.to_numeric(data['Hits'], errors='coerce')
data.dropna(subset=['Hits'], inplace=True)
pd.set_option('display.max_rows', 200)
print(data.dtypes)
print(data.shape)

Age                   int64
Growth                int64
Value                object
Attacking             int64
Crossing              int64
Finishing             int64
Heading Accuracy      int64
Short Passing         int64
Volleys             float64
Skill                 int64
Dribbling             int64
Curve               float64
FK Accuracy           int64
Long Passing          int64
Ball Control          int64
Movement              int64
Acceleration          int64
Sprint Speed          int64
Agility             float64
Reactions             int64
Balance             float64
Power                 int64
Shot Power            int64
Jumping             float64
Stamina               int64
Strength              int64
Long Shots            int64
Mentality             int64
Aggression            int64
Interceptions       float64
Positioning         float64
Vision              float64
Penalties             int64
Composure           float64
Defending             int64
Marking             

In [48]:
#AnneSo
# Remove \ convert money symbols from row 'value'
def convert_currency(value):
    multiplier = 1
    if 'M' in value:
        multiplier = 1e6  # 1 million
    elif 'K' in value:
        multiplier = 1e3  # 1 thousand
#remove money symbol and letters
    return float(value.replace('€', '').replace('M', '').replace('K', '')) * multiplier

# Apply the custom function to the 'Value' column
data['Value'] = data['Value'].map(convert_currency)
#FYI The map function is similar to apply but is particularly useful when you want to apply a function element-wise to a Series.

In [49]:
#AnneSo
# Convert columns W/F, SM, IR to a labeled system by deleting the star sign
star_column = ['W/F', 'SM', 'IR']
for column in star_column:
    data[column] = data[column].str.replace('★', '')
data[star_column] = data[star_column].apply(pd.to_numeric)

In [50]:
#Kevin
data.loc[:, 'LS':'GK'] = data.loc[:, 'LS':'GK'].astype(str)
for cols in data.loc[:, 'LS':'GK'].columns:
    data[cols] = data[cols].apply(eval)
# display(data.loc[:, 'LS':'GK'])

In [51]:
pd.set_option('display.max_rows', 200)
print(data.isnull().sum())

Age                 0
Growth              0
Value               0
Attacking           0
Crossing            0
Finishing           0
Heading Accuracy    0
Short Passing       0
Volleys             0
Skill               0
Dribbling           0
Curve               0
FK Accuracy         0
Long Passing        0
Ball Control        0
Movement            0
Acceleration        0
Sprint Speed        0
Agility             0
Reactions           0
Balance             0
Power               0
Shot Power          0
Jumping             0
Stamina             0
Strength            0
Long Shots          0
Mentality           0
Aggression          0
Interceptions       0
Positioning         0
Vision              0
Penalties           0
Composure           0
Defending           0
Marking             0
Standing Tackle     0
Sliding Tackle      0
Goalkeeping         0
GK Diving           0
GK Handling         0
GK Kicking          0
GK Positioning      0
GK Reflexes         0
Total Stats         0
Base Stats

# 04 - Processing Data
* Dealing with outliers.
* Normalization.
* Encoding Categorical Data.
* Splitting into train set and test set.

In [52]:
# we skip the outliers for now because there's too much data
# (Kevin) Convert columns A/W, D/W to a labled system - low,medium,high >> 1,2,3
# (Dana)normalizing the data
# (AS)Splitting into X_train and y_train

In [53]:
#convert 'Low', 'Medium', 'High' to 1,2,3
conv_mapper = {'Low':1,'Medium':2,'High':3}
data.loc[:, ['A/W', 'D/W']] = data.loc[:, ['A/W', 'D/W']].replace(conv_mapper)
# display(data.loc[:,['A/W', 'D/W']])

In [54]:
correlations_matrix = data.corr()
# display(correlations_matrix['OVA'])

corr_ova = correlations_matrix.loc['OVA']
corr_ova = corr_ova[corr_ova > 0.55]
corr_ova = corr_ova.index
corr_ova = corr_ova.tolist()
corr_ova.pop()

'OVA'

In [55]:
#Normalizing


y = data['OVA']
# X = data.loc[:, corr_ova]
X = data.drop(['OVA'], axis=1)
# X = data.drop(['OVA', 'Total Stats', 'Base Stats'], axis=1)

# X1 = data.loc[:, "Total Stats": "Base Stats"]
# X2 = data.loc[:, 'LS':'GK']
# X = pd.concat([X1,X2], axis=1)

# X = data.loc[:, ["Total Stats"]]

In [56]:
#Selecting
# from sklearn.feature_selection import SelectKBest, f_regression

# selector = SelectKBest(score_func=f_regression, k=20)

# selector.fit(X,y)

# selected_features = X.columns[selector.get_support()]

# X = X[selected_features]


In [57]:
#Scaling
from sklearn.preprocessing import MinMaxScaler

transformer = MinMaxScaler().fit(X)

x_normalized = transformer.transform(X)
print(x_normalized.shape)
x_normalized
X = pd.DataFrame(x_normalized, columns=X.columns)

(11380, 85)


In [58]:
#Splitting into train set and test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)#dont know if random_state requires value
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)


# 05 - Modeling

In [59]:
#apply model
# corr_data = pd.concat([X,y],axis=1)
# # #heatmap
# correlations_matrix = corr_data.corr()
# display(correlations_matrix['OVA'])
# sns.heatmap(correlations_matrix, annot=True)
# plt.show()

# 06 - Model Validation
* R2.
* MSE.
* RMSE.
* MAE.

In [60]:
# Validation and Scoring
predict_result = lm.predict(X_test)

r2 = r2_score(y_test, predict_result)
mse = mean_squared_error(y_test, predict_result)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, predict_result)

print('R2 score:', r2)
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

R2 score: 0.9126685725787749
MSE: 3.969092057170003
RMSE: 1.9922580297667274
MAE: 1.5428031827527735


In [61]:
# we can try to validate the score based on a different dataset by dropping more columns and reruning the model

# 07 - Reporting
* Present results.

In [62]:
#we decided to ignore outliers.
#removing the scaling doesn't change the Model result
#R2 has a suprisingly high score. The Average Score Value OVA is directly derived from our dataset
#We could improve the model removing some variables

# 08 - Validation Data

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
#read
datav=pd.read_csv('fifa21_validate.csv')

In [64]:
def convert_currency(value):
    multiplier = 1
    if 'M' in value:
        multiplier = 1e6  # 1 million
    elif 'K' in value:
        multiplier = 1e3  # 1 thousand
#remove money symbol and letters
    return float(value.replace('€', '').replace('M', '').replace('K', '')) * multiplier

In [65]:
#clean datav
def clean_datav(datav):
    datav = datav.drop(['ID', 'Nationality', 'Name', 'Club', 'BP', 'Position', 'Height', 'Weight', 'foot', 'Team & Contract', 'Joined', 'Loan Date End', 'Wage', 'Release Clause', 'Contract'],axis=1)
    datav.dropna(inplace=True) #cleans unesseary column and drop missing values

    datav['Hits'] = pd.to_numeric(datav['Hits'], errors='coerce')
    datav.dropna(subset=['Hits'], inplace=True) #cleans Hits column by dropping rows with non-num values

    datav['Value'] = datav['Value'].map(convert_currency) #convert currency records to actural numeric values

    star_column = ['W/F', 'SM', 'IR']
    for column in star_column:
        datav[column] = datav[column].str.replace('★', '')
    datav[star_column] = datav[star_column].apply(pd.to_numeric) #removes stars and changing star attribute to num

    datav.loc[:, 'LS':'GK'] = datav.loc[:, 'LS':'GK'].astype(str)
    for cols in datav.loc[:, 'LS':'GK'].columns:
        datav[cols] = datav[cols].apply(eval) #evaluates columns with calculation formula

    conv_mapper = {'Low':1,'Medium':2,'High':3}
    datav.loc[:, ['A/W', 'D/W']] = datav.loc[:, ['A/W', 'D/W']].replace(conv_mapper) #converts L/M/H to 1/2/3
    return datav

datav=clean_datav(datav)

In [66]:
#selects yv Xv
yv = datav['OVA']
Xv = datav.drop(['OVA'], axis=1)

In [67]:
#normalize
Xv_normalized = transformer.transform(Xv)
Xv = pd.DataFrame(Xv_normalized, columns=Xv.columns)

In [68]:
#predict and score
vali_predict = lm.predict(Xv)

r2v = r2_score(yv, vali_predict)
msev = mean_squared_error(yv, vali_predict)
rmsev = np.sqrt(mse)
maev = mean_absolute_error(yv, vali_predict)

print('R2 score:', r2v)
print('MSE:', msev)
print('RMSE:', rmsev)
print('MAE:', maev)

R2 score: 0.9120263290107803
MSE: 3.9378124038736844
RMSE: 1.9922580297667274
MAE: 1.5539758426626933
