In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder
pd.set_option('display.max_columns', None)
plt.rcParams["figure.figsize"] = (20,20)

In [15]:
file = pd.read_csv('fifa21_train.csv') #Read file

In [16]:
#function to clean and process data
def new_clean(file):
    file=file.rename(columns={
    'FK Accuracy':'Free kick Accuracy',
    'GK Diving':'Goalkeeper Diving',
    'GK Handling':'Goalkeeper Handling',
    'GK Kicking':'Goalkeeper Kicking',
    'GK Positioning':'Goalkeeper Position',
    'GK Reflexes':'Goalkeeper Reflexes',
    'W/F':'Weak foot',
    'SM':'Skill Moves',
    'A/W':'Attacking Workrate',
    'D/W':'Defensive Workrate',
    'IR':'International Reputation',
    'PAC':'Pace total',
    'SHO':'Shooting total',
    'PAS':'Passing total',
    'DRI':'Dribbling total',
    'DEF':'Defending total',
    'PHY':'Physicality total',
    'LS':'LS: Left Striker',
    'ST':'ST: Striker',
    'RS':'RS: Right Striker',
    'LW':'LW: Left Winger',
    'LF':'LF: Left Forward',
    'CF':'CF: Center Forward',
    'RF':'RF: Right Forward',
    'RW':'RW: Right Winger',
    'LAM':'LAM: Left Attacking Midfielder',
    'CAM':'CAM: Center Attacking Midfielder',
    'RAM':'RAM: Right Attacking Midfielder',
    'LM':'LM: Left Midfielder',
    'LCM':'LCM: Left Center Midfielder',
    'CM':'CM: Center Midfielder',
    'RCM':'RCM: Right Center Midfielder',
    'RM':'RM: Right Midfielder',
    'LWB':'LWB: Left Wing Back',
    'LDM':'LDM: Left Defensive Midfielder',
    'CDM':'CDM: Center Defensive Midfielder',
    'RDM':'RDM: Right Defensive Midfielder',
    'RWB':'RWB: Right Wing Back',
    'LB':'LB: Left Back',
    'LCB':'LCB: Left Center Back',
    'CB':'CB: Center Back',
    'RCB':'RCB: Right Center Back',
    'RB':'RB: Right Back',
    'GK':'GK: Goalkeeper'
    })
    file = file[file['Club'].isna() == False]
    file['Position'] = file['Position'].fillna(file['BP']) 
    file['Other position'] = file['Position'].apply(lambda x: len(x.split(' '))) 
    file.drop('Position', axis= 1, inplace = True)
    file.drop('ID', axis= 1, inplace = True)
    file.drop('Loan Date End',axis=1, inplace= True)
    file = file[file['Joined'].isna() == False] 
    file['Composure'] = file['Composure'].fillna(file['Composure'].mean()) 
    file['Attacking Workrate'] = file['Attacking Workrate'].str.replace("High", '3')
    file['Attacking Workrate'] = file['Attacking Workrate'].str.replace("Medium", '2')
    file['Attacking Workrate'] = file['Attacking Workrate'].str.replace("Low", '1')
    file['Defensive Workrate'] = file['Defensive Workrate'].str.replace("High", '3')
    file['Defensive Workrate'] = file['Defensive Workrate'].str.replace("Medium", '2')
    file['Defensive Workrate'] = file['Defensive Workrate'].str.replace("Low", '1')
    file['Attacking Workrate'] = file['Attacking Workrate'].fillna(file['Attacking Workrate'].median()).astype('int')
    file['Defensive Workrate'] = file['Defensive Workrate'].fillna(file['Defensive Workrate'].median()).astype('int')
    file['Wage']=file['Wage'].str.replace("€",'')
    file['Wage']=file['Wage'].str.replace("K",'000')
    file['Wage']=file['Wage'].str.replace("M",'000000')
    file['Wage']=file['Wage'].str.replace(".",'')
    file['Value']=file['Value'].str.replace("€",'')
    file['Value']=file['Value'].str.replace("K",'000')
    file['Value']=file['Value'].str.replace("M",'000000')
    file['Value']=file['Value'].str.replace(".",'')
    file['Release Clause']=file['Release Clause'].str.replace("€",'')
    file['Release Clause']=file['Release Clause'].str.replace("K",'000')
    file['Release Clause']=file['Release Clause'].str.replace("M",'000000')
    file['Release Clause']=file['Release Clause'].str.replace(".",'')
    coltoremove = file.columns[-30:-2]
    for columns in coltoremove:
        file.drop(columns, axis= 1, inplace= True)
    def change_height(ht):
        ftin = ht.split("'")
        feet = int(ftin[0])
        inches = float(ftin[1].split('"')[0])
        return (12*feet) + inches
    file['Height'] = file['Height'].apply(lambda x: change_height(x))
    file['Weight'] = file['Weight'].apply(lambda x: x.replace('lbs', '')).astype('int')
    file['Joined'] = pd.to_datetime(file['Joined']).dt.year
    file['Wage'] = pd.to_numeric(file['Wage'])
    file['Value'] = pd.to_numeric(file['Value'])
    file['Release Clause'] = pd.to_numeric(file['Release Clause'])
    file['Weak foot'] = file['Weak foot'].str.rstrip('★').astype('int')
    file['Skill Moves'] = file['Skill Moves'].str.rstrip('★').astype('int')
    file['International Reputation'] = file['International Reputation'].str.rstrip('★').astype('int')
    file.drop('Contract', axis= 1, inplace= True)
    file.drop('Name', axis= 1, inplace= True)
    file.drop('Team & Contract', axis = 1, inplace= True)
    file.drop('Club', axis = 1, inplace= True)
    file.drop('Nationality', axis = 1, inplace= True)
    categoricals = file.select_dtypes('object')
    numericals = file._get_numeric_data().reset_index()
    min_max_transformer = MinMaxScaler().fit(numericals.drop('OVA', axis=1)) # Fiting MinMaxScaler to data then storing as variable min_max_transformer
    X_normalized = min_max_transformer.transform(numericals.drop('OVA', axis=1)) #Creating X_normalized variable using transform method on min_max_transformer for X
    min_max_X = pd.DataFrame(X_normalized, columns= numericals.drop('OVA', axis=1).columns)
    min_max_X['OVA'] = numericals['OVA']
    encoder = OneHotEncoder(drop='first').fit(categoricals) 
    encoded = encoder.transform(categoricals).toarray() 
    encoded_columns = OneHotEncoder(drop='first').fit(categoricals).get_feature_names(input_features=categoricals.columns) # this allows us to get the columns names for our encoded array
    onehot_encoded_X = pd.DataFrame(encoded, columns = encoded_columns) 
    onehot_encoded_X.reset_index()
    file = pd.concat([onehot_encoded_X, min_max_X], axis=1)
    file.drop('index', axis= 1, inplace=True)
    file.drop('Crossing', axis= 1, inplace= True)
    file.drop('Finishing', axis= 1, inplace= True)
    file.drop('Heading Accuracy', axis= 1, inplace= True)
    file.drop('Short Passing', axis= 1, inplace= True)
    file.drop('Volleys', axis= 1, inplace= True)
    file.drop('Dribbling', axis= 1, inplace= True)
    file.drop('Curve', axis= 1, inplace= True)
    file.drop('Free kick Accuracy', axis= 1, inplace= True)
    file.drop('Long Passing', axis= 1, inplace= True)
    file.drop('Ball Control', axis= 1, inplace= True)
    file.drop('Acceleration', axis= 1, inplace= True)
    file.drop('Sprint Speed', axis= 1, inplace= True)
    file.drop('Agility', axis= 1, inplace= True)
    file.drop('Reactions', axis= 1, inplace= True)
    file.drop('Balance', axis= 1, inplace= True)
    file.drop('Shot Power', axis= 1, inplace= True)
    file.drop('Jumping', axis= 1, inplace= True)
    file.drop('Stamina', axis= 1, inplace= True)
    file.drop('Strength', axis= 1, inplace= True)
    file.drop('Long Shots', axis= 1, inplace= True)
    file.drop('Aggression', axis= 1, inplace= True)
    file.drop('Interceptions', axis= 1, inplace= True)
    file.drop('Positioning', axis= 1, inplace= True)
    file.drop('Vision', axis= 1, inplace= True)
    file.drop('Penalties', axis= 1, inplace= True)
    file.drop('Composure', axis= 1, inplace= True)
    file.drop('Marking', axis= 1, inplace= True)
    file.drop('Standing Tackle', axis= 1, inplace= True)
    file.drop('Sliding Tackle', axis= 1, inplace= True)
    file.drop('Goalkeeper Diving', axis= 1, inplace= True)
    file.drop('Goalkeeper Handling', axis= 1, inplace= True)
    file.drop('Goalkeeper Kicking', axis= 1, inplace= True)
    file.drop('Goalkeeper Position', axis= 1, inplace= True)
    file.drop('Goalkeeper Reflexes', axis= 1, inplace= True)
    return file

In [17]:
train = new_clean(file) #Renaming cleaned file to train and using it all for training the model below

  file['Wage']=file['Wage'].str.replace(".",'')
  file['Value']=file['Value'].str.replace(".",'')
  file['Release Clause']=file['Release Clause'].str.replace(".",'')


In [18]:
#X-Y split
y = train['OVA'] #Target
X = train.drop('OVA', axis= 1) #Features

lm = linear_model.LinearRegression() #linear regression model
lm.fit(X,y) 
print(lm.intercept_) 
print(lm.coef_) 
print( 'r2_score is ', r2_score(lm.predict(X),y))
print('MSE is ', mean_squared_error(lm.predict(X),y))

43.218017472702726
[ 5.18769593  1.01200416  0.30055571 -0.46209231 -5.51208617  1.63587744
  0.17066767  0.37042884  1.02437704  1.75153904 -0.04802005 -0.05922485
  0.97472652  0.75456158  0.0686882   1.00103325  0.6438383   1.04017331
 -3.5977272  -1.69723229  7.83487462  6.90585478 12.66862968 11.47669954
 -8.35125431  0.30720433 -3.88016237 -3.72262115 -1.05943713  6.70223673
  0.71974702  9.49891762  0.12324286  3.09373493 -0.51646298 -0.35446669
  1.97493661 -1.27509291  1.54875304  6.87909633 18.76816518  3.28585147
  7.30320469 -1.22077597]
r2_score is  0.8529664191767368
MSE is  6.005418049513938


# Validating model

In [19]:
test = pd.read_csv('fifa21_validate.csv') #file for validation
clean_test = new_clean(test) #Cleaned and processed test

  file['Wage']=file['Wage'].str.replace(".",'')
  file['Value']=file['Value'].str.replace(".",'')
  file['Release Clause']=file['Release Clause'].str.replace(".",'')


## Metrics

In [20]:
print(r2_score(lm.predict(clean_test.drop('OVA', axis=1)), clean_test['OVA'])) #R2
print(mean_squared_error(lm.predict(clean_test.drop('OVA', axis=1)), clean_test['OVA'])) #MSE
print(np.sqrt(mean_squared_error(lm.predict(clean_test.drop('OVA', axis=1)), clean_test['OVA']))) #RMSE

0.8627871196401479
6.7294640330824125
2.594121052125828
