Create a Model that Predicts Overall rating of the players.

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:

# Load the training CSV; the feature-building cells below all start from this frame.
data = pd.read_csv('fifa21_train.csv')


# Rename Columns

In [3]:
# Normalise the header names: lower-case them, then turn spaces and slashes
# into underscores so every column can be addressed with one naming scheme.
normalized_columns = data.columns.str.lower()
for separator in (' ', '/'):
    normalized_columns = normalized_columns.str.replace(separator, '_')
data.columns = normalized_columns

In [4]:
# Expand the abbreviated headers into descriptive names.
column_aliases = {
    'bp': 'best_position',
    'team_&_contract': 'team_contract',
    'w_f': 'weak_foot',
    'sm': 'skill_moves',
    'a_w': 'attacking_work_rate',
    'd_w': 'defensive_work_rate',
    'ir': 'international_reputation',
    'wf': 'weak_foot',
}
data = data.rename(columns=column_aliases)

# Checking

In [5]:
# data.head()
# data.columns.tolist()
# data.dtypes.tolist()
# data.isna().sum().tolist()
# data['height']

# Decisions.
- OVR = ATT + IR (Overall Rating equals attributes plus international reputation).

- We know that the variables used to calculate OVR are the ones of the **attributes** of the player (crossing, dribbling, positioning, etc.) plus the **international reputation**.

- Our categorical column is **"best position"**.

- We also have to take into account how the OVR changes when a player does not play in their "best position".

- So we need to use the information in the **last columns as well** (the ones describing the player's performance in each position).

- For these columns (the ones with "+"), we're going to keep **the second number**. We think the first number can be calculated from the attribute columns.

- We know some columns are the sum of other columns. We decided to keep just the columns of attributes that are not a summation of other columns.

- We cleaned and treated this data and obtained our first model.

- We tried to improve our first model.

- We built a second model in which we decided to include all of our selected features (without discarding any of them based on correlation figures).

- Then, in our third and last model, we included a new feature: **"wage"**. This gave us our best results.


# New DF

#### Clean Wage

In [6]:
icon = '€'

# Parse wage strings such as '€560K' or '€500' into integer euros.
# - `astype(str)` keeps the cell re-runnable: after the first run the column is
#   already numeric and the `.str` accessor would otherwise raise.
# - The K/M suffix is applied as a multiplier instead of substituting 'K' with
#   '000', so a fractional amount like '€1.5K' becomes 1500 rather than the
#   unparsable '1.5000'.
wage_text = data['wage'].astype(str).str.replace(icon, '', regex=False).str.strip()
wage_multiplier = wage_text.str[-1].str.upper().map({'K': 1_000, 'M': 1_000_000}).fillna(1)
wage_amount = pd.to_numeric(wage_text.str.rstrip('KkMm'))

data['wage'] = (wage_amount * wage_multiplier).round().astype('int')

#### Divide Columns, Choose Right

In [7]:
# The per-position columns ('ls' ... 'gk') hold strings of the form 'A+B'.
# Keep only the number after the '+' as an integer `<position>_right` column.
# Building a fresh frame (instead of adding columns to a slice of `data`)
# avoids SettingWithCopyWarning and leaves `data` untouched.
position_ratings = data.loc[:, 'ls':'gk']

data_pos = pd.DataFrame(
    {
        col + '_right': position_ratings[col].str.split('+', expand=True)[1]
        for col in position_ratings.columns
    },
    index=position_ratings.index,
).astype(int)

#### Drop Columns

In [8]:
# Keep the individual attribute columns only; the aggregate columns listed
# here are dropped (see the "Decisions" notes: they are sums of other columns).
aggregate_columns = ['skill', 'movement', 'power', 'mentality', 'defending', 'goalkeeping']
data_att = data.loc[:, 'crossing':'gk_reflexes'].drop(columns=aggregate_columns)

#### Clean Star

In [9]:
icon = '★'

# Strip the star glyph and any spaces, then cast the remaining digits to int.
# `astype(str)` first makes the cell safe to re-run: after the first run the
# column is already numeric and the `.str` accessor would raise.
reputation_text = data['international_reputation'].astype(str)
reputation_text = reputation_text.str.replace(icon, '', regex=False)
reputation_text = reputation_text.str.replace(' ', '', regex=False)

data['international_reputation'] = reputation_text.astype('int')

#### Categorical to Numerical

In [10]:
# One-hot encode the only categorical feature.
data_cat = data[['best_position']]

# The fitted encoder is reused later on other files. With
# handle_unknown='ignore', a position that never appeared here is encoded as
# all zeros instead of making `transform` raise. Output for the data the
# encoder is fitted on is unchanged.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(data_cat)
encoded = encoder.transform(data_cat).toarray()

data_cat = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

#### Clean NaN

In [11]:
# data_att.describe()

In [12]:
# Impute missing attribute values with the mean of their own column.
attribute_means = {col: data_att[col].mean() for col in data_att.columns}
for col, col_mean in attribute_means.items():
    data_att[col] = data_att[col].fillna(col_mean)

# data_att.describe()

#### Concatenate Numerical

In [13]:
# Assemble the numeric feature matrix side by side: attributes, per-position
# values, international reputation and wage.
numeric_parts = [
    data_att,
    data_pos,
    data['international_reputation'],
    data['wage'],
]
data_num = pd.concat(numeric_parts, axis=1)
data_num.columns.tolist()
# data_nor.isna().sum().tolist()

['crossing',
 'finishing',
 'heading_accuracy',
 'short_passing',
 'volleys',
 'dribbling',
 'curve',
 'fk_accuracy',
 'long_passing',
 'ball_control',
 'acceleration',
 'sprint_speed',
 'agility',
 'reactions',
 'balance',
 'shot_power',
 'jumping',
 'stamina',
 'strength',
 'long_shots',
 'aggression',
 'interceptions',
 'positioning',
 'vision',
 'penalties',
 'composure',
 'marking',
 'standing_tackle',
 'sliding_tackle',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes',
 'ls_right',
 'st_right',
 'rs_right',
 'lw_right',
 'lf_right',
 'cf_right',
 'rf_right',
 'rw_right',
 'lam_right',
 'cam_right',
 'ram_right',
 'lm_right',
 'lcm_right',
 'cm_right',
 'rcm_right',
 'rm_right',
 'lwb_right',
 'ldm_right',
 'cdm_right',
 'rdm_right',
 'rwb_right',
 'lb_right',
 'lcb_right',
 'cb_right',
 'rcb_right',
 'rb_right',
 'gk_right',
 'international_reputation',
 'wage']

In [14]:
# Display the dtype of every column in the assembled numeric frame.
data_num.dtypes

crossing                      int64
finishing                     int64
heading_accuracy              int64
short_passing                 int64
volleys                     float64
                             ...   
rcb_right                     int32
rb_right                      int32
gk_right                      int32
international_reputation      int32
wage                          int32
Length: 63, dtype: object

# Check Correlation

In [15]:
# Put the target ('ova') next to the numeric features and compute the pairwise
# correlation matrix. The former mid-cell `data1_target.dtypes` expression was
# removed: its value was discarded, so it had no effect.
data1_target = pd.concat([data['ova'], data_num], axis=1)
correlation_matrix = data1_target.corr()
correlation_matrix.columns

Index(['ova', 'crossing', 'finishing', 'heading_accuracy', 'short_passing',
       'volleys', 'dribbling', 'curve', 'fk_accuracy', 'long_passing',
       'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions',
       'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
       'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
       'composure', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes', 'ls_right', 'st_right', 'rs_right', 'lw_right',
       'lf_right', 'cf_right', 'rf_right', 'rw_right', 'lam_right',
       'cam_right', 'ram_right', 'lm_right', 'lcm_right', 'cm_right',
       'rcm_right', 'rm_right', 'lwb_right', 'ldm_right', 'cdm_right',
       'rdm_right', 'rwb_right', 'lb_right', 'lcb_right', 'cb_right',
       'rcb_right', 'rb_right', 'gk_right', 'international_reputation',
       'wage'],
      dtype='object')

For this model, we don't use correlation figures to discard any of the features.

# Normalize

In [16]:
# Min-max scale every numeric feature. The fitted scaler stays bound to `tra`
# so the same scaling can be re-applied to other data later in the notebook.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so held-out rows contribute to the scaling range.
tra = MinMaxScaler()
nor = tra.fit_transform(data_num)

data_nor = pd.DataFrame(data=nor, columns=data_num.columns)

data_nor

Unnamed: 0,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,...,rdm_right,rwb_right,lb_right,lcb_right,cb_right,rcb_right,rb_right,gk_right,international_reputation,wage
0,0.545455,0.478261,0.431818,0.720930,0.465116,0.615385,0.444444,0.561798,0.642857,0.637363,...,0.750,0.666667,0.666667,0.750,0.750,0.750,0.666667,0.333333,0.00,0.007143
1,0.681818,0.826087,0.806818,0.697674,0.837209,0.857143,0.822222,0.752809,0.642857,0.813187,...,0.875,0.833333,0.833333,0.875,0.875,0.875,0.833333,0.666667,0.25,0.041071
2,0.761364,0.793478,0.329545,0.813953,0.825581,0.879121,0.944444,0.966292,0.773810,0.879121,...,0.875,0.833333,0.833333,0.875,0.875,0.875,0.833333,0.666667,0.25,0.087500
3,0.431818,0.423913,0.602273,0.627907,0.372093,0.538462,0.411111,0.460674,0.571429,0.615385,...,0.875,0.833333,0.833333,0.875,0.875,0.875,0.833333,0.666667,0.00,0.007143
4,0.488636,0.369565,0.636364,0.697674,0.348837,0.648352,0.444444,0.449438,0.619048,0.670330,...,0.875,0.833333,0.833333,0.875,0.875,0.875,0.833333,0.666667,0.00,0.003571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11696,0.636364,0.228261,0.522727,0.593023,0.279070,0.571429,0.500000,0.269663,0.535714,0.593407,...,0.875,0.833333,0.833333,0.875,0.875,0.875,0.833333,0.666667,0.00,0.005357
11697,0.068182,0.119565,0.090909,0.244186,0.139535,0.131868,0.122222,0.157303,0.214286,0.142857,...,0.875,0.833333,0.833333,0.875,0.875,0.875,0.833333,0.666667,0.00,0.007143
11698,0.659091,0.684783,0.522727,0.651163,0.430233,0.725275,0.588889,0.370787,0.583333,0.681319,...,0.875,0.833333,0.833333,0.875,0.875,0.875,0.833333,0.666667,0.00,0.003571
11699,0.659091,0.684783,0.522727,0.755814,0.477527,0.758242,0.506865,0.651685,0.750000,0.769231,...,0.625,0.500000,0.500000,0.625,0.625,0.625,0.500000,0.000000,0.50,0.000000


# Concatenate

In [17]:
# Final feature matrix: one-hot best-position columns followed by the scaled numeric features.
data1 = pd.concat([data_cat, data_nor], axis=1)
# data1.describe().T
# data1.dtypes

# Train-Test Split

In [18]:
# Features and target, then a 90/10 split with a fixed seed for reproducibility.
X, y = data1, data['ova']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Linear Regression

In [19]:
# Linear regression fitted on the training split.
lm = linear_model.LinearRegression()
lm.fit(X=X_train, y=y_train)

In [20]:
# R2 on the training rows, as a baseline for the held-out score below.
predictions = lm.predict(X_train)
r2_score(y_true=y_train, y_pred=predictions)

0.899349316710787

# R2

In [21]:
# R2 on the held-out test rows. It should be close to the training R2;
# similar scores indicate the model generalizes rather than overfits.
predictions_test = lm.predict(X_test)
r2_score(y_true=y_test, y_pred=predictions_test)

0.8931452810350685

# MAE

In [22]:
# Mean absolute error on the test split. A value under 2 is considered good
# here; it can also be judged against the range of the target. The RMSE is
# expected to be somewhat larger because squaring weights big errors more.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions_test)
mae

1.6947456715631577

# MSE

In [23]:
# Mean squared error on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=predictions_test)
mse

4.842045385324614

# RMSE

In [24]:
# Root mean squared error on the test split.
test_mse = mean_squared_error(y_test, predictions_test)
rmse = np.sqrt(test_mse)
rmse

2.2004648111989007

# Create a DEF Function

In [25]:
def _parse_money(values):
    """Convert wage strings such as '€560K' or '€500' into integer amounts."""
    text = values.astype(str).str.replace('€', '', regex=False).str.strip()
    # Apply the K/M suffix as a multiplier so '€1.5K' parses to 1500 instead of
    # becoming the unparsable '1.5000'.
    multiplier = text.str[-1].str.upper().map({'K': 1_000, 'M': 1_000_000}).fillna(1)
    amount = pd.to_numeric(text.str.rstrip('KkMm'))
    return (amount * multiplier).round().astype(int)


def _parse_stars(values):
    """Strip the '★' glyph and spaces from a rating column and cast it to int."""
    text = values.astype(str).str.replace('★', '', regex=False)
    return text.str.replace(' ', '', regex=False).astype(int)


def clean(df, fitted_encoder=None, fitted_scaler=None, fill_values=None):
    """Apply the training-time feature pipeline to a raw FIFA frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the same layout as the training CSV.
    fitted_encoder : optional
        Already-fitted one-hot encoder for ``best_position``. Defaults to the
        notebook-level ``encoder``.
    fitted_scaler : optional
        Already-fitted scaler for the numeric features. Defaults to the
        notebook-level ``tra``.
    fill_values : mapping or pandas.Series, optional
        Per-column values used to impute missing attributes. When omitted,
        each column is filled with its own mean in ``df`` (the original
        behaviour). Columns missing from the mapping are not filled.
        NOTE(review): passing the training means keeps imputation consistent
        with the fitted scaler.

    Returns
    -------
    pandas.DataFrame
        One-hot position columns followed by the scaled numeric features,
        indexed 0..n-1.

    Side effects
    ------------
    The column names of the caller's ``df`` are normalised in place
    (lower-case, spaces and '/' replaced by '_'). Code after the call reads
    the target through the normalised name, so this is kept on purpose. Cell
    values of the caller's frame are not modified.
    """
    # Fall back to the notebook-level fitted objects only when none is passed.
    enc = encoder if fitted_encoder is None else fitted_encoder
    scaler = tra if fitted_scaler is None else fitted_scaler

    # Rename columns (the first assignment intentionally touches the caller's frame).
    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('/', '_')
    df = df.rename(columns={'bp': 'best_position',
                            'team_&_contract': 'team_contract',
                            'w_f': 'weak_foot', 'sm': 'skill_moves',
                            'a_w': 'attacking_work_rate', 'd_w': 'defensive_work_rate',
                            'ir': 'international_reputation', 'wf': 'weak_foot'})

    # Scalar features parsed from decorated strings.
    wage = _parse_money(df['wage']).rename('wage')
    reputation = _parse_stars(df['international_reputation']).rename('international_reputation')

    # Positions: keep the number after '+' as integer '<position>_right' columns.
    # A fresh frame is built so nothing is written into a slice of `df`.
    positions = df.loc[:, 'ls':'gk']
    data_pos = pd.DataFrame(
        {
            col + '_right': positions[col].str.split('+', expand=True)[1]
            for col in positions.columns
        },
        index=df.index,
    ).astype(int)

    # Attributes: drop the aggregate columns, then impute missing values.
    data_att = df.loc[:, 'crossing':'gk_reflexes'].drop(
        columns=['skill', 'movement', 'power', 'mentality', 'defending', 'goalkeeping']
    )
    imputation = data_att.mean() if fill_values is None else pd.Series(fill_values)
    data_att = data_att.fillna(imputation)

    # Numeric block in the same column order the scaler was fitted on.
    data_num = pd.concat([data_att, data_pos, reputation, wage], axis=1)
    data_nor = pd.DataFrame(scaler.transform(data_num), columns=data_num.columns)

    # Categorical block; the encoder is already fitted, so only transform.
    encoded = enc.transform(df[['best_position']])
    if hasattr(encoded, 'toarray'):  # sparse output -> dense array
        encoded = encoded.toarray()
    data_cat = pd.DataFrame(encoded, columns=enc.get_feature_names_out())

    return pd.concat([data_cat, data_nor], axis=1)

In [26]:
# Load the validation CSV that is scored with the already-fitted model below.
data_validate = pd.read_csv('fifa21_validate.csv')

In [27]:
# Run the validation frame through clean(); note that clean() also rewrites
# data_validate's column names in place.
data_validate_clean = clean(data_validate)

In [28]:
# R2 of the fitted model on the validation file.
# NOTE(review): the 'ova' lookup presumably relies on clean() having
# normalised data_validate's column names in place - confirm against the raw
# CSV header before reordering these cells.
X_val, y_val = data_validate_clean, data_validate['ova']

predictions_test_val = lm.predict(X_val)
r2_score(y_true=y_val, y_pred=predictions_test_val)

0.8975637259605032

In [29]:
# Mean absolute error on the validation file.
mae = mean_absolute_error(y_true=y_val, y_pred=predictions_test_val)
mae

1.6766299339340625

In [30]:
# Mean squared error on the validation file.
mse = mean_squared_error(y_true=y_val, y_pred=predictions_test_val)
mse

4.687635612335356

In [31]:
# Root mean squared error on the validation file, derived from the MSE computed in the previous cell.
rmse = np.sqrt(mse)
rmse

2.1650948275619144