In [544]:
import pandas as pd   # import pandas
import numpy as np    # import numpy

pd.set_option('display.max_columns', None)
file = pd.read_csv('fifa21_train.csv')    # import the document

In [545]:
# rename columns and lower case them to handle them easier

file = file.rename(columns={'Heading Accuracy':'heading_accuracy','Short Passing':'short_passing', 
                           'FK Accuracy':'fk_accuracy', 'Long Passing':'long_passing', 'Ball Control':'ball_control',
                           'Sprint Speed':'sprint_speed', 'Shot Power':'shot_power', 'Long Shots':'long_shots',
                           'Standing Tackle':'Standing_Tackle', 'Sliding Tackle':'Sliding_Tackle',
                           'GK Diving':'GK_Diving', 'GK Handling':'GK_Handling', 'GK Kicking':'GK_Kicking',
                            'GK Positioning':'GK_Positioning', 'GK Reflexes':'GK_Reflexes'})

colu = []
for colname in file.columns:
    colu.append(colname.lower())
file.columns = colu

In [546]:
pd.options.display.max_rows = 5   # To display all the NaN values and deal with them
file.isnull().sum().sum()         # they are 0 now as we already worked with them (.sum().sum() gives us the total sum)

12117

# Deal with NaN values.

In [547]:
file['position'] = file['position'].fillna(file['bp'])   # position will be filled with the base position

In [548]:
file['composure'] = file['composure'].fillna(np.mean(file['composure']))   # composure is filled with the mean of composure as there were not so many players and we did not want to drop them

In [549]:
# fill the rest of NaN with a simple substraction because we notices the 'totals' include subcategories but we prefer using the subcategories rather than the totals.
file['volleys'] = file['attacking'] - file['crossing'] - file['finishing'] - file['heading_accuracy'] - file['short_passing']
file['curve'] = file['skill'] - file['dribbling'] - file['fk_accuracy'] - file['long_passing'] - file['ball_control']
file['jumping'] = file['power'] - file['shot_power'] - file['stamina'] - file['strength'] - file['long_shots']
file['sliding_tackle'] = file['defending'] - file['marking'] - file['standing_tackle']

In [550]:
# This values are part of a bigger column that will be dropped, but needed to obtain them. 
# We decided to divide the values of 'agility' & 'Balance' as well as 'interceptions', 'vision' &
# 'positioning' equally in their respective 'groups' (as seen in the original DF)
file['agility'] = (file['movement'] - file['acceleration'] - file['sprint_speed'] - file['reactions'])/2
file['balance'] = file['agility']

file['interceptions'] = (file['mentality'] - file['aggression'] - file['penalties'])/3
file['vision'] = file['interceptions']
file['positioning'] = file['interceptions']

In [551]:
# Finally deal with nulls of a/w and d/w, we chose to fill with the mode that also happens to be the Median
print(file['a/w'].mode()) ; print(file['d/w'].mode())
file['a/w'] = file['a/w'].fillna('Medium')
file['d/w'] = file['d/w'].fillna('Medium')

0    Medium
Name: a/w, dtype: object
0    Medium
Name: d/w, dtype: object


In [552]:
# We checked for duplicates with ID and had 0 duplicates
# We also checked for 'name' duplicates and found 469; but after analyzing the data realized they are all different people
print(file.duplicated('id').sum())
print(file.duplicated('name').sum())
dupli = file[file.duplicated('name',keep=False)==True]   # keep=false - default setting will show the duplicates after the original (so not the original) this makes it show
dupli.tail()

0
469


Unnamed: 0,id,name,age,nationality,club,bp,position,team & contract,height,weight,foot,growth,joined,loan date end,value,wage,release clause,contract,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total stats,base stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
11656,205598,André Sousa,29,Portugal,Gazişehir Gaziantep F.K.,CM,CM,Gazişehir Gaziantep F.K. 2020 ~ 2021,"5'11""",172lbs,Left,0,"Jan 25, 2020",,€850K,€6K,€1.7M,2020 ~ 2021,321,68,61,55,70,67,360,69,78,74,68,71,325,67,57,67.0,67,67.0,367,80,67,72,74,74,328,70,66.333333,66.333333,66.333333,59,65.0,172,57,60,55,45,15,8,6,8,8,1918,400,2 ★,3★,Medium,Medium,1 ★,62,68,70,69,59,72,3,67+1,67+1,67+1,67+0,68+0,68+0,68+0,67+0,68+0,68+0,68+0,68+0,68+0,68+0,68+0,68+0,65+2,66+2,66+2,66+2,65+2,63+2,63+2,63+2,63+2,63+2,16+2,68
11662,221220,J. Quintero,23,Colombia,Deportivo Cali,CB,CB,Deportivo Cali 2013 ~ 2018,"6'0""",174lbs,Right,8,"Jul 1, 2013",,€1.3M,€2K,€2.1M,2013 ~ 2018,204,26,19,71,64,24,192,36,25,22,54,55,286,58,64,51.0,62,51.0,269,40,68,55,84,22,233,66,42.666667,42.666667,42.666667,39,57.0,202,75,66,61,45,7,7,14,9,8,1431,318,2 ★,2★,Low,Medium,1 ★,61,25,46,45,69,72,5,44+1,44+1,44+1,42+0,43+0,43+0,43+0,42+0,45+1,45+1,45+1,46+1,51+1,51+1,51+1,46+1,57+1,63+1,63+1,63+1,57+1,60+1,68+1,68+1,68+1,60+1,14+1,69
11673,251990,Luquinhas,19,Brazil,Portimonense SC,RM,CAM RW,Portimonense SC 2019 ~ 2024,"5'6""",143lbs,Right,8,"Jul 1, 2019",,€525K,€1K,€1.2M,2019 ~ 2024,267,63,51,30,63,60,306,68,55,59,57,67,395,79,71,90.5,64,90.5,253,59,53,53,38,50,242,42,46.666667,46.666667,46.666667,60,58.0,100,37,31,32,41,6,8,15,6,6,1604,335,3 ★,3★,Medium,Low,1 ★,75,54,61,71,31,43,10,57+2,57+2,57+2,64+0,62+0,62+0,62+0,64+0,63+2,63+2,63+2,64+2,57+2,57+2,57+2,64+2,51+2,47+2,47+2,47+2,51+2,48+2,39+2,39+2,39+2,48+2,13+2,63
11693,214343,D. Moreno,28,Colombia,Junior FC,CDM,CDM CM,Junior FC 2020 ~ 2021,"5'11""",170lbs,Right,0,"Jan 1, 2020",,€800K,€2K,€1.1M,2020 ~ 2021,266,47,30,61,64,64,256,57,39,35,61,64,346,71,70,69.0,67,69.0,302,59,34,94,79,36,288,69,60.666667,60.666667,60.666667,37,64.0,193,68,62,63,54,7,7,12,16,12,1705,372,2 ★,2★,Medium,High,1 ★,70,40,57,61,65,79,7,56+2,56+2,56+2,57+0,57+0,57+0,57+0,57+0,58+2,58+2,58+2,60+2,62+2,62+2,62+2,60+2,66+2,67+1,67+1,67+1,66+2,66+2,66+2,66+2,66+2,66+2,17+2,68
11698,221489,J. Flores,22,Chile,CD Antofagasta,RM,LM CAM RM,CD Antofagasta 2019 ~ 2024,"5'6""",143lbs,Right,8,"Jan 23, 2019",,€1.1M,€2K,€1.7M,2019 ~ 2024,286,64,66,51,64,41,291,71,57,38,58,67,388,77,76,86.5,62,86.5,278,61,71,64,37,45,241,27,49.666667,49.666667,49.666667,65,58.0,62,37,12,13,57,13,10,11,7,16,1603,337,3 ★,3★,High,Medium,1 ★,76,60,60,72,26,43,6,62+2,62+2,62+2,67+0,65+0,65+0,65+0,67+0,65+2,65+2,65+2,66+2,58+2,58+2,58+2,66+2,49+2,44+2,44+2,44+2,49+2,45+2,35+2,35+2,35+2,45+2,17+2,67


In [553]:
file = file.drop(['id', 'nationality', 'club', 'team & contract',
                 'height', 'weight', 'foot', 'joined', 'loan date end',
                 'value', 'wage', 'release clause', 'contract', 'attacking',
                 'skill', 'movement', 'power', 'mentality', 'defending',
                 'goalkeeping', 'total stats', 'base stats', 'w/f', 'ir' ],axis=1)

In [554]:
pd.options.display.max_rows = 5   # To display all the NaN values and deal with them
file.isnull().sum().sum()

0

In [555]:
file

Unnamed: 0,name,age,bp,position,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,sm,a/w,d/w,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
0,A. Pasche,26,CM,CM CDM,1,54,47,43,70,44,61,44,55,63,63,64,73,71.5,66,71.5,62,73,71,55,45,54,60.666667,60.666667,60.666667,54,54.000000,49,56,43,7,12,14,9,6,2★,High,Medium,69,51,63,63,51,60,3,58+1,58+1,58+1,61+0,62+0,62+0,62+0,61+0,63+1,63+1,63+1,63+1,63+1,63+1,63+1,63+1,59+1,59+1,59+1,59+1,59+1,58+1,54+1,54+1,54+1,58+1,15+1,64
1,Alan Carvalho,30,ST,ST LW LM,0,66,79,76,68,76,83,78,72,63,79,83,83,81.5,75,81.5,74,81,75,74,68,54,61.000000,61.000000,61.000000,76,70.000000,35,20,22,11,7,14,7,16,4★,High,Low,83,75,68,82,33,71,44,77+0,77+0,77+0,77+0,77+0,77+0,77+0,77+0,76+1,76+1,76+1,76+1,68+2,68+2,68+2,76+1,57+2,53+2,53+2,53+2,57+2,53+2,48+2,48+2,48+2,53+2,18+2,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11699,Anderson Silva,26,CM,CM,7,64,66,51,73,0,74,0,63,72,75,71,72,0.0,55,0.0,61,0,71,64,62,64,41.333333,41.333333,41.333333,53,59.947732,55,58,0,9,24,72,24,24,1★,Medium,Medium,72,63,70,72,57,66,3,68+0,68+0,68+0,71+0,68+0,68+0,68+0,71+0,72+0,72+0,72+0,71+0,71+0,71+0,71+0,71+0,66+0,68+0,68+0,68+0,66+0,64+0,60+0,60+0,60+0,64+0,25+0,68
11700,T. Conechny,22,CAM,ST CAM LM,9,43,64,65,64,53,64,41,43,55,64,81,78,85.5,61,85.5,62,85,65,42,58,34,49.666667,49.666667,49.666667,61,56.000000,20,18,18,10,8,14,8,8,3★,High,Low,79,62,56,67,24,48,8,64+2,64+2,64+2,65+0,65+0,65+0,65+0,65+0,65+2,65+2,65+2,63+2,58+2,58+2,58+2,63+2,46+2,43+2,43+2,43+2,46+2,44+2,36+2,36+2,36+2,44+2,15+2,64


In [556]:
pd.options.display.max_rows = 50
file['hits'] = file['hits'].str.replace('K','000')
file['hits'] = file['hits'].str.replace('.','')
file['hits'] = pd.to_numeric(file['hits'], errors='coerce')
file['hits'].unique

  file['hits'] = file['hits'].str.replace('.','')


<bound method Series.unique of 0         3
1        44
2        73
3         7
4         4
         ..
11696     3
11697     3
11698     6
11699     3
11700     8
Name: hits, Length: 11701, dtype: int64>

In [557]:
file.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,11701.0,25.27049,4.95764,16.0,21.0,25.0,29.0,43.0
growth,11701.0,5.534655,5.810903,-1.0,0.0,4.0,10.0,26.0
crossing,11701.0,51.593795,17.872747,6.0,41.0,56.0,65.0,94.0
finishing,11701.0,48.048116,19.399617,3.0,33.0,52.0,64.0,95.0
heading_accuracy,11701.0,53.49406,16.956195,5.0,46.0,56.0,65.0,93.0
short_passing,11701.0,60.4368,13.971811,8.0,56.0,63.0,69.0,94.0
volleys,11701.0,44.909409,17.901015,0.0,32.0,47.0,59.0,90.0
dribbling,11701.0,57.852491,18.050938,5.0,53.0,63.0,70.0,96.0
curve,11701.0,49.443979,18.383852,0.0,37.0,52.0,64.0,94.0
fk_accuracy,11701.0,44.35826,17.484142,5.0,32.0,43.0,58.0,94.0


In [558]:
data = file.copy()
data

Unnamed: 0,name,age,bp,position,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,sm,a/w,d/w,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
0,A. Pasche,26,CM,CM CDM,1,54,47,43,70,44,61,44,55,63,63,64,73,71.5,66,71.5,62,73,71,55,45,54,60.666667,60.666667,60.666667,54,54.000000,49,56,43,7,12,14,9,6,2★,High,Medium,69,51,63,63,51,60,3,58+1,58+1,58+1,61+0,62+0,62+0,62+0,61+0,63+1,63+1,63+1,63+1,63+1,63+1,63+1,63+1,59+1,59+1,59+1,59+1,59+1,58+1,54+1,54+1,54+1,58+1,15+1,64
1,Alan Carvalho,30,ST,ST LW LM,0,66,79,76,68,76,83,78,72,63,79,83,83,81.5,75,81.5,74,81,75,74,68,54,61.000000,61.000000,61.000000,76,70.000000,35,20,22,11,7,14,7,16,4★,High,Low,83,75,68,82,33,71,44,77+0,77+0,77+0,77+0,77+0,77+0,77+0,77+0,76+1,76+1,76+1,76+1,68+2,68+2,68+2,76+1,57+2,53+2,53+2,53+2,57+2,53+2,48+2,48+2,48+2,53+2,18+2,77
2,S. Giovinco,33,CAM,CAM CF,0,73,76,34,78,75,85,89,91,74,85,84,76,93.0,78,93.0,79,34,75,42,78,75,61.333333,61.333333,61.333333,73,82.000000,23,29,28,6,3,6,3,3,4★,High,Medium,80,77,78,86,27,56,73,73+2,73+2,73+2,80+0,79+0,79+0,79+0,80+0,80+0,80+0,80+0,79+1,74+2,74+2,74+2,79+1,59+2,56+2,56+2,56+2,59+2,53+2,41+2,41+2,41+2,53+2,12+2,80
3,J. Evans,22,CDM,CDM CM,13,44,42,58,62,36,54,41,46,57,61,54,59,57.0,55,57.0,57,60,64,58,38,61,47.333333,47.333333,47.333333,54,48.000000,55,58,55,8,9,6,7,12,2★,Medium,Medium,57,44,54,57,57,60,7,50+2,50+2,50+2,51+0,51+0,51+0,51+0,51+0,53+2,53+2,53+2,53+2,56+2,56+2,56+2,53+2,56+2,58+2,58+2,58+2,56+2,57+2,58+2,58+2,58+2,57+2,14+2,59
4,Y. Demoncy,23,CDM,CDM CM,8,49,37,61,68,34,64,44,45,61,66,66,66,65.0,62,65.0,61,34,81,61,43,66,59.666667,59.666667,59.666667,49,58.000000,58,61,66,8,9,15,5,15,3★,Low,Medium,66,44,60,64,60,66,4,56+2,56+2,56+2,59+0,59+0,59+0,59+0,59+0,61+2,61+2,61+2,62+2,63+2,63+2,63+2,62+2,64+2,64+2,64+2,64+2,64+2,63+2,61+2,61+2,61+2,63+2,15+2,65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11696,B. Böðvarsson,25,LB,LB,5,62,24,51,59,28,57,49,29,54,59,78,73,64.5,53,64.5,31,72,77,68,25,63,54.666667,54.666667,54.666667,39,46.000000,59,64,61,13,14,14,6,14,2★,Medium,Medium,75,28,56,59,60,69,3,48+2,48+2,48+2,55+0,52+0,52+0,52+0,55+0,54+2,54+2,54+2,58+2,56+2,56+2,56+2,58+2,63+2,60+2,60+2,60+2,63+2,63+2,61+2,61+2,61+2,63+2,16+2,65
11697,G. Gallon,27,GK,GK,4,12,14,13,29,16,17,15,19,27,18,40,40,46.0,65,46.0,50,59,29,64,15,30,17.333333,17.333333,17.333333,19,56.000000,12,16,13,72,67,66,70,72,1★,Medium,Medium,72,67,66,72,40,70,3,28+2,28+2,28+2,24+0,26+0,26+0,26+0,24+0,24+2,24+2,24+2,25+2,25+2,25+2,25+2,25+2,25+2,26+2,26+2,26+2,25+2,24+2,26+2,26+2,26+2,24+2,69+2,70
11698,J. Flores,22,RM,LM CAM RM,8,64,66,51,64,41,71,57,38,58,67,77,76,86.5,62,86.5,61,71,64,37,45,27,49.666667,49.666667,49.666667,65,58.000000,37,12,13,13,10,11,7,16,3★,High,Medium,76,60,60,72,26,43,6,62+2,62+2,62+2,67+0,65+0,65+0,65+0,67+0,65+2,65+2,65+2,66+2,58+2,58+2,58+2,66+2,49+2,44+2,44+2,44+2,49+2,45+2,35+2,35+2,35+2,45+2,17+2,67
11699,Anderson Silva,26,CM,CM,7,64,66,51,73,0,74,0,63,72,75,71,72,0.0,55,0.0,61,0,71,64,62,64,41.333333,41.333333,41.333333,53,59.947732,55,58,0,9,24,72,24,24,1★,Medium,Medium,72,63,70,72,57,66,3,68+0,68+0,68+0,71+0,68+0,68+0,68+0,71+0,72+0,72+0,72+0,71+0,71+0,71+0,71+0,71+0,66+0,68+0,68+0,68+0,66+0,64+0,60+0,60+0,60+0,64+0,25+0,68


In [559]:
X = data.drop(['ova'],axis=1)   # OVA is our target
y = data['ova']

X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object)

In [560]:
from sklearn.preprocessing import MinMaxScaler 

transformer = MinMaxScaler().fit(X_num)
x_normalized = transformer.transform(X_num)
print(x_normalized.shape)
data_normalized = pd.DataFrame(x_normalized, columns = X_num.columns)
pd.DataFrame(x_normalized, columns=X_num.columns)

print(type(data_normalized))

(11701, 43)
<class 'pandas.core.frame.DataFrame'>


In [561]:
X_cat1=X_cat.drop(columns=['position', 'ls', 'st', 'rs', 'lw', 'lf',
                           'cf', 'rf', 'rw' ,'lam', 'cam','ram', 'lm',
                            'lcm', 'cm','rcm', 'rm', 'lwb', 'ldm' , 'cdm',
                            'rdm' , 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
                            'gk'],axis=1)

In [562]:
X_num.corr()

Unnamed: 0,age,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,pac,sho,pas,dri,def,phy,hits
age,1.0,-0.854136,0.15928,0.110223,0.178446,0.185496,0.17934,0.04414,0.173597,0.231401,0.230267,0.129641,-0.183392,-0.175948,-0.077099,0.504708,-0.077099,0.307396,0.168928,0.069533,0.350674,0.198765,0.27928,0.242512,0.242512,0.242512,0.201801,0.413964,0.158495,0.10987,0.079784,0.116316,0.119441,0.125299,0.129068,0.116089,-0.176232,0.311175,0.402361,0.242675,0.225285,0.434448,-0.044662
growth,-0.854136,1.0,-0.252015,-0.189302,-0.228983,-0.271381,-0.251627,-0.153463,-0.253594,-0.275789,-0.294287,-0.224358,0.026837,0.013074,-0.039872,-0.562717,-0.039872,-0.368967,-0.22105,-0.22623,-0.371032,-0.27713,-0.317922,-0.324221,-0.324221,-0.324221,-0.231044,-0.465516,-0.199809,-0.145174,-0.120153,-0.04952,-0.04773,-0.047191,-0.055057,-0.045935,0.019677,-0.334891,-0.439781,-0.320569,-0.240244,-0.484123,0.040944
crossing,0.15928,-0.252015,1.0,0.645621,0.43557,0.800162,0.664806,0.854544,0.818334,0.751382,0.740585,0.834774,0.633555,0.601544,0.660918,0.372938,0.660918,0.533293,0.08146,0.634695,-0.055725,0.733978,0.457167,0.820503,0.820503,0.820503,0.629131,0.590896,0.417238,0.404626,0.378677,-0.647155,-0.644804,-0.628339,-0.645034,-0.649351,0.303331,0.205157,0.50426,0.445988,0.22619,-0.000844,0.036906
finishing,0.110223,-0.189302,0.645621,1.0,0.455388,0.650934,0.875771,0.820629,0.747704,0.695429,0.485792,0.783835,0.567471,0.550783,0.572061,0.330846,0.572061,0.726205,0.054275,0.472761,0.003737,0.883367,0.240675,0.633437,0.633437,0.633437,0.839377,0.560906,-0.033656,-0.073428,-0.116006,-0.578322,-0.576235,-0.560607,-0.574026,-0.57548,0.279905,0.608192,0.321891,0.468894,-0.265831,-0.061217,0.05013
heading_accuracy,0.178446,-0.228983,0.43557,0.455388,1.0,0.630159,0.478915,0.531864,0.403207,0.366814,0.478891,0.639741,0.255302,0.31627,0.15151,0.331619,0.15151,0.372332,0.36923,0.570417,0.504878,0.480324,0.68574,0.594774,0.594774,0.594774,0.537305,0.516048,0.535808,0.513173,0.476353,-0.713628,-0.711295,-0.693076,-0.705803,-0.711708,-0.170158,-0.115646,-0.092903,-0.146766,0.376336,0.401288,0.019749
short_passing,0.185496,-0.271381,0.800162,0.650934,0.630159,1.0,0.673312,0.839028,0.752634,0.719229,0.886005,0.913722,0.508665,0.496319,0.553398,0.490238,0.553398,0.585329,0.162841,0.686334,0.141147,0.752279,0.614463,0.883164,0.883164,0.883164,0.665128,0.716718,0.550348,0.531136,0.488904,-0.717932,-0.715735,-0.697825,-0.712055,-0.719149,0.095696,0.153317,0.466105,0.348521,0.360711,0.167888,0.049795
volleys,0.17934,-0.251627,0.664806,0.875771,0.478915,0.673312,1.0,0.789894,0.810603,0.729989,0.534099,0.773288,0.509738,0.489444,0.591051,0.379871,0.591051,0.739453,0.135098,0.464982,0.037756,0.857321,0.311398,0.667062,0.667062,0.667062,0.820655,0.610954,0.054833,0.016005,-0.004173,-0.564526,-0.570284,-0.577161,-0.567359,-0.569596,0.21148,0.538622,0.366331,0.44755,-0.159121,-0.00838,0.045972
dribbling,0.04414,-0.153463,0.854544,0.820629,0.531864,0.839028,0.789894,1.0,0.829886,0.744447,0.704079,0.938222,0.713093,0.686167,0.713499,0.353965,0.713499,0.617583,0.09419,0.657167,-0.040269,0.841421,0.43674,0.819521,0.819521,0.819521,0.758656,0.631071,0.299085,0.2764,0.243403,-0.753101,-0.750538,-0.732805,-0.748854,-0.752566,0.330614,0.2831,0.361337,0.493366,0.050767,-0.047277,0.055826
curve,0.173597,-0.253594,0.818334,0.747704,0.403207,0.752634,0.810603,0.829886,1.0,0.835364,0.682583,0.810013,0.556903,0.51664,0.674736,0.39326,0.674736,0.671723,0.115029,0.539393,-0.049607,0.820317,0.381058,0.774804,0.774804,0.774804,0.726952,0.633895,0.249561,0.226667,0.215055,-0.581268,-0.587455,-0.596169,-0.586761,-0.590282,0.24695,0.398113,0.511517,0.494821,0.054465,-0.024401,0.047926
fk_accuracy,0.231401,-0.275789,0.751382,0.695429,0.366814,0.719229,0.729989,0.744447,0.835364,1.0,0.689447,0.746248,0.446731,0.405067,0.538774,0.376534,0.538774,0.662166,0.018933,0.481112,-0.032598,0.805765,0.371314,0.721136,0.721136,0.721136,0.730167,0.598287,0.251366,0.236422,0.195164,-0.528536,-0.527341,-0.508734,-0.526201,-0.529288,0.142495,0.40251,0.527765,0.440015,0.083423,-0.011374,0.036962


In [563]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [564]:
# OneHotEncoder
X_cat_encoded = X_cat1['bp']
X_cat_encoded = pd.DataFrame(X_cat_encoded, columns=['bp'])

encoder = OneHotEncoder(drop='first').fit(X_cat_encoded)
encoded = encoder.transform(X_cat_encoded).toarray()
encoded
cols = encoder.get_feature_names(input_features=X_cat_encoded.columns)
onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded.head()



Unnamed: 0,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [565]:
# Label Encoder for the values: A/W, D/W, SM

# we want to know what are the values given
print(X_cat1['a/w'].unique())
print(X_cat1['d/w'].unique())
print(X_cat1['sm'].unique())

['High' 'Medium' 'Low']
['Medium' 'Low' 'High']
['2★' '4★' '3★' '1★' '5★']


In [566]:
# Create a list with the values for the label encoder

# just for a/w
X_cat_aw = X_cat1['a/w']
X_cat_aw = pd.DataFrame(X_cat_aw, columns=['a/w'])

# just for d/w
X_cat_dw = X_cat1['d/w']
X_cat_dw = pd.DataFrame(X_cat_dw, columns=['d/w'])

# # just for sm
X_cat_sm = X_cat1['sm']
X_cat_sm = pd.DataFrame(X_cat_sm, columns=['sm'])

# just for a/w
label_encoded_aw = LabelEncoder().fit(['Low', 'Medium', 'High']).transform(X_cat_aw) # give a value to each type
label_encoded_aw = pd.DataFrame(label_encoded_aw,columns=X_cat_aw.columns)
display(label_encoded_aw.head())

# just for d/w
label_encoded_dw = LabelEncoder().fit(['Low', 'Medium', 'High']).transform(X_cat_dw) # give a value to each type
label_encoded_dw = pd.DataFrame(label_encoded_dw,columns=X_cat_dw.columns)
display(label_encoded_dw.head())

# just for sm
label_encoded_sm = LabelEncoder().fit(['1★','2★', '3★', '4★','5★']).transform(X_cat_sm) # give a value to each type
label_encoded_sm = pd.DataFrame(label_encoded_sm,columns=X_cat_sm.columns)
display(label_encoded_sm.head())

  y = column_or_1d(y, warn=True)


Unnamed: 0,a/w
0,0
1,0
2,0
3,2
4,1


  y = column_or_1d(y, warn=True)


Unnamed: 0,d/w
0,2
1,1
2,2
3,2
4,2


  y = column_or_1d(y, warn=True)


Unnamed: 0,sm
0,1
1,3
2,3
3,1
4,2


In [567]:
# Concat DataFrames

X = pd.concat([data_normalized,label_encoded_aw, label_encoded_dw, label_encoded_sm, onehot_encoded], axis=1)
X

Unnamed: 0,age,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,pac,sho,pas,dri,def,phy,hits,a/w,d/w,sm,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST
0,0.370370,0.074074,0.545455,0.478261,0.431818,0.720930,0.488889,0.615385,0.468085,0.561798,0.642857,0.637363,0.607143,0.729412,0.748691,0.583333,0.748691,0.602410,0.768421,0.694118,0.454545,0.455556,0.517241,0.722222,0.722222,0.722222,0.534884,0.500000,0.516854,0.595238,0.477778,0.067416,0.116279,0.139535,0.076923,0.056180,0.614286,0.447368,0.558824,0.522388,0.506494,0.500000,0.000000,0,2,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.518519,0.037037,0.681818,0.826087,0.806818,0.697674,0.844444,0.857143,0.829787,0.752809,0.642857,0.813187,0.833333,0.847059,0.853403,0.708333,0.853403,0.746988,0.852632,0.741176,0.701299,0.711111,0.517241,0.726190,0.726190,0.726190,0.790698,0.690476,0.359551,0.166667,0.244444,0.112360,0.058140,0.139535,0.054945,0.168539,0.814286,0.763158,0.632353,0.805970,0.272727,0.666667,0.000446,0,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.629630,0.037037,0.761364,0.793478,0.329545,0.813953,0.833333,0.879121,0.946809,0.966292,0.773810,0.879121,0.845238,0.764706,0.973822,0.750000,0.973822,0.807229,0.357895,0.741176,0.285714,0.822222,0.758621,0.730159,0.730159,0.730159,0.755814,0.833333,0.224719,0.273810,0.311111,0.056180,0.011628,0.046512,0.010989,0.022472,0.771429,0.789474,0.779412,0.865672,0.194805,0.439394,0.000761,0,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.222222,0.518519,0.431818,0.423913,0.602273,0.627907,0.400000,0.538462,0.436170,0.460674,0.571429,0.615385,0.488095,0.564706,0.596859,0.430556,0.596859,0.542169,0.631579,0.611765,0.493506,0.377778,0.597701,0.563492,0.563492,0.563492,0.534884,0.428571,0.584270,0.619048,0.611111,0.078652,0.081395,0.046512,0.054945,0.123596,0.442857,0.355263,0.426471,0.432836,0.584416,0.500000,0.000043,2,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.259259,0.333333,0.488636,0.369565,0.636364,0.697674,0.377778,0.648352,0.468085,0.449438,0.619048,0.670330,0.630952,0.647059,0.680628,0.527778,0.680628,0.590361,0.357895,0.811765,0.532468,0.433333,0.655172,0.710317,0.710317,0.710317,0.476744,0.547619,0.617978,0.654762,0.733333,0.078652,0.081395,0.151163,0.032967,0.157303,0.571429,0.355263,0.514706,0.537313,0.623377,0.590909,0.000011,1,2,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11696,0.333333,0.222222,0.636364,0.228261,0.522727,0.593023,0.311111,0.571429,0.521277,0.269663,0.535714,0.593407,0.773810,0.729412,0.675393,0.402778,0.675393,0.228916,0.757895,0.764706,0.623377,0.233333,0.620690,0.650794,0.650794,0.650794,0.360465,0.404762,0.629213,0.690476,0.677778,0.134831,0.139535,0.139535,0.043956,0.146067,0.700000,0.144737,0.455882,0.462687,0.623377,0.636364,0.000000,2,2,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11697,0.407407,0.185185,0.068182,0.119565,0.090909,0.244186,0.177778,0.131868,0.159574,0.157303,0.214286,0.142857,0.321429,0.341176,0.481675,0.569444,0.481675,0.457831,0.621053,0.200000,0.571429,0.122222,0.241379,0.206349,0.206349,0.206349,0.127907,0.523810,0.101124,0.119048,0.144444,0.797753,0.755814,0.744186,0.747253,0.797753,0.657143,0.657895,0.602941,0.656716,0.363636,0.651515,0.000000,2,2,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11698,0.222222,0.333333,0.659091,0.684783,0.522727,0.651163,0.455556,0.725275,0.606383,0.370787,0.583333,0.681319,0.761905,0.764706,0.905759,0.527778,0.905759,0.590361,0.747368,0.611765,0.220779,0.455556,0.206897,0.591270,0.591270,0.591270,0.662791,0.547619,0.382022,0.071429,0.144444,0.134831,0.093023,0.104651,0.054945,0.168539,0.714286,0.565789,0.514706,0.656716,0.181818,0.242424,0.000033,0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
11699,0.370370,0.296296,0.659091,0.684783,0.522727,0.755814,0.000000,0.758242,0.000000,0.651685,0.750000,0.769231,0.690476,0.717647,0.000000,0.430556,0.000000,0.590361,0.000000,0.694118,0.571429,0.644444,0.632184,0.492063,0.492063,0.492063,0.523256,0.570806,0.584270,0.619048,0.000000,0.089888,0.255814,0.813953,0.241758,0.258427,0.657143,0.605263,0.661765,0.656716,0.584416,0.590909,0.000000,2,2,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [568]:
# Apply linear regression & Model Validation
# We defined a function to do all the next steps together:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import math

In [569]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import math
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('X_train.shape: ', X_train.shape)
print('X_test.shape: ',X_test.shape)
print('y_train.shape: ',y_train.shape)
print('y_test.shape: ',y_test.shape)
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)
predictions = lm.predict(X_train)
print('r2_score,y_train, predictions: ',r2_score(y_train, predictions))
predictions_test = lm.predict(X_test)
print('y train prediction: ',r2_score(y_train, predictions))
print('y test prediction: ',r2_score(y_test, predictions_test))
mse=mean_squared_error(y_test,predictions_test)
print('mse: ',mse)
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
print('rmse: ',rmse)
mae = mean_absolute_error(y_test, predictions_test)
print('mae: ',mae)
print('y test mean: ',y_test.mean())

X_train.shape:  (9360, 60)
X_test.shape:  (2341, 60)
y_train.shape:  (9360,)
y_test.shape:  (2341,)
r2_score,y_train, predictions:  0.9085801230059186
y train prediction:  0.9085801230059186
y test prediction:  0.9122196516921816
mse:  4.102988027787845
rmse:  2.0255833796187814
mae:  1.562529428225514
y test mean:  66.81973515591628


In [570]:
pd.set_option('display.max_columns', None)
file_validate = pd.read_csv('fifa21_validate.csv')    # import the document
file_validate

Unnamed: 0,ID,Name,Age,Nationality,Club,BP,Position,Team & Contract,Height,Weight,foot,Growth,Joined,Loan Date End,Value,Wage,Release Clause,Contract,Attacking,Crossing,Finishing,Heading Accuracy,Short Passing,Volleys,Skill,Dribbling,Curve,FK Accuracy,Long Passing,Ball Control,Movement,Acceleration,Sprint Speed,Agility,Reactions,Balance,Power,Shot Power,Jumping,Stamina,Strength,Long Shots,Mentality,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Defending,Marking,Standing Tackle,Sliding Tackle,Goalkeeping,GK Diving,GK Handling,GK Kicking,GK Positioning,GK Reflexes,Total Stats,Base Stats,W/F,SM,A/W,D/W,IR,PAC,SHO,PAS,DRI,DEF,PHY,Hits,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,OVA
0,219461,E. Palmer-Brown,23,United States,FK Austria Wien,CB,CB,"FK Austria Wien Jun 30, 2021 On Loan","6'2""",194lbs,Right,7,"Feb 8, 2018","Jun 30, 2021",€975K,€5K,€0,"Jun 30, 2021 On Loan",230,47,21,62,60,40.0,228,44,43.0,36,51,54,303,60,68,63.0,63,49.0,288,48,77.0,51,87,25,246,68,62.0,38.0,39.0,39,49.0,200,68,67,65.0,56,11,8,15,13,9,1551,334,2 ★,2★,Low,High,1 ★,64,30,50,50,66,74,34,48+2,48+2,48+2,48+0,47+0,47+0,47+0,48+0,48+2,48+2,48+2,50+2,51+2,51+2,51+2,50+2,59+2,61+2,61+2,61+2,59+2,61+2,67+2,67+2,67+2,61+2,16+2,67
1,221896,D. Avdijaj,22,Kosovo,Heart of Midlothian,CAM,LM CAM,Heart of Midlothian 2020 ~ 2020,"5'8""",154lbs,Right,5,"Jan 20, 2020",,€1.2M,€3K,€2.2M,2020 ~ 2020,298,62,60,44,62,70.0,330,76,68.0,56,60,70,375,77,72,83.0,64,79.0,323,76,62.0,63,47,75,286,72,26.0,64.0,64.0,60,65.0,61,19,23,19.0,53,14,13,9,9,8,1726,358,4 ★,3★,High,Low,1 ★,74,67,62,74,24,57,12,64+2,64+2,64+2,68+0,68+0,68+0,68+0,68+0,68+2,68+2,68+2,67+2,61+2,61+2,61+2,67+2,49+2,47+2,47+2,47+2,49+2,45+2,38+2,38+2,38+2,45+2,17+2,68
2,247428,D. Ochoa,19,United States,Real Salt Lake,GK,GK,Real Salt Lake 2018 ~ 2020,"6'2""",176lbs,Right,17,"Nov 28, 2018",,€120K,€500,€249K,2018 ~ 2020,48,7,5,11,21,4.0,52,6,8.0,8,20,10,165,28,25,33.0,41,38.0,171,40,49.0,22,54,6,76,20,9.0,7.0,26.0,14,31.0,27,8,9,10.0,269,56,52,53,53,55,808,295,2 ★,1★,Medium,Medium,1 ★,56,52,53,55,26,53,3,18+2,18+2,18+2,15+0,17+0,17+0,17+0,15+0,17+2,17+2,17+2,16+2,18+2,18+2,18+2,16+2,16+2,18+2,18+2,18+2,16+2,16+2,18+2,18+2,18+2,16+2,53+2,54
3,255120,N. Kenneh,16,England,Leeds United,CDM,CB CDM RB,Leeds United 2020 ~ 2022,"6'3""",170lbs,Right,23,"Jan 10, 2020",,€160K,€500,€464K,2020 ~ 2022,215,38,31,55,59,32.0,224,51,34.0,38,47,54,275,59,58,56.0,48,54.0,242,48,48.0,60,58,28,230,61,55.0,33.0,40.0,41,59.0,159,53,52,54.0,36,7,5,13,5,6,1381,303,3 ★,2★,Medium,Medium,1 ★,58,34,47,52,53,59,6,46+2,46+2,46+2,47+0,46+0,46+0,46+0,47+0,47+2,47+2,47+2,49+2,49+2,49+2,49+2,49+2,53+2,54+2,54+2,54+2,53+2,53+2,54+2,54+2,54+2,53+2,11+2,55
4,215556,E. Fernandes,24,Switzerland,1. FSV Mainz 05,CDM,CM CDM,1. FSV Mainz 05 2019 ~ 2023,"6'2""",170lbs,Right,5,"Jul 1, 2019",,€2.3M,€13K,€4.3M,2019 ~ 2023,295,57,59,45,78,56.0,327,71,57.0,51,74,74,320,68,66,66.0,64,56.0,337,73,56.0,74,72,62,314,66,78.0,53.0,62.0,55,63.0,211,72,68,71.0,60,12,7,13,15,13,1864,407,4 ★,2★,Medium,Medium,1 ★,67,62,68,70,69,71,45,63+2,63+2,63+2,66+0,66+0,66+0,66+0,66+0,68+2,68+2,68+2,67+2,70+2,70+2,70+2,67+2,70+2,72+2,72+2,72+2,70+2,69+2,68+2,68+2,68+2,69+2,18+2,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,239074,S. Aw,21,Senegal,Gil Vicente FC,LB,LB,Gil Vicente FC 2020 ~ 2023,"5'8""",143lbs,Left,11,"Aug 9, 2020",,€325K,€1K,€731K,2020 ~ 2023,197,59,23,42,51,22.0,207,51,30.0,29,41,56,344,74,78,63.0,50,79.0,245,31,66.0,68,50,30,232,59,52.0,51.0,34.0,36,59.0,167,52,57,58.0,51,6,15,7,10,13,1443,315,3 ★,2★,High,Medium,1 ★,76,28,46,55,53,57,4,44+2,44+2,44+2,51+0,48+0,48+0,48+0,51+0,48+2,48+2,48+2,53+2,48+2,48+2,48+2,53+2,58+2,52+2,52+2,52+2,58+2,58+2,54+2,54+2,54+2,58+2,15+2,60
1995,241223,S. Mogi,21,Japan,Cerezo Osaka,GK,GK,Cerezo Osaka 2017 ~ 2021,"6'5""",176lbs,Right,9,"Jan 1, 2017",,€190K,€700,€285K,2017 ~ 2021,77,13,9,14,34,7.0,70,9,13.0,10,25,13,192,31,30,36.0,59,36.0,177,43,44.0,23,60,7,88,21,8.0,4.0,36.0,19,33.0,36,14,11,11.0,288,60,55,57,54,62,928,318,2 ★,1★,Medium,Medium,1 ★,60,55,57,62,30,54,3,22+2,22+2,22+2,20+0,22+0,22+0,22+0,20+0,23+2,23+2,23+2,22+2,24+2,24+2,24+2,22+2,20+2,23+2,23+2,23+2,20+2,20+2,22+2,22+2,22+2,20+2,58+2,59
1996,210930,Carles Gil,27,Spain,New England Revolution,RM,RM CAM CM,New England Revolution 2019 ~ 2024,"5'7""",146lbs,Left,0,"Feb 13, 2019",,€8M,€9K,€12M,2019 ~ 2024,332,76,72,34,79,71.0,373,77,76.0,65,77,78,351,64,65,74.0,73,75.0,329,63,60.0,83,57,66,306,41,40.0,75.0,81.0,69,75.0,118,36,43,39.0,58,12,15,16,9,6,1867,388,4 ★,4★,High,Medium,2 ★,65,69,78,77,39,60,15,67+2,67+2,67+2,74+0,73+0,73+0,73+0,74+0,76+0,76+0,76+0,75+1,74+2,74+2,74+2,75+1,62+2,60+2,60+2,60+2,62+2,57+2,47+2,47+2,47+2,57+2,18+2,76
1997,162993,J. Perch,34,England,Mansfield Town,CDM,CDM RB CM,Mansfield Town 2020 ~ 2021,"5'11""",176lbs,Right,0,"Aug 13, 2020",,€140K,€4K,€245K,2020 ~ 2021,268,58,44,61,62,43.0,261,56,47.0,37,60,61,288,54,52,57.0,61,64.0,300,53,65.0,68,68,46,297,74,62.0,54.0,56.0,51,60.0,182,61,62,59.0,43,10,7,9,9,8,1639,346,3 ★,2★,Medium,Medium,1 ★,53,47,58,58,61,69,4,55+2,55+2,55+2,56+0,55+0,55+0,55+0,56+0,57+2,57+2,57+2,57+2,59+2,59+2,59+2,57+2,60+2,62+1,62+1,62+1,60+2,60+2,63+0,63+0,63+0,60+2,14+2,63


In [571]:
file_validate = file_validate.rename(columns={'Heading Accuracy':'heading_accuracy','Short Passing':'short_passing', 
                           'FK Accuracy':'fk_accuracy', 'Long Passing':'long_passing', 'Ball Control':'ball_control',
                           'Sprint Speed':'sprint_speed', 'Shot Power':'shot_power', 'Long Shots':'long_shots',
                           'Standing Tackle':'Standing_Tackle', 'Sliding Tackle':'Sliding_Tackle',
                           'GK Diving':'GK_Diving', 'GK Handling':'GK_Handling', 'GK Kicking':'GK_Kicking',
                            'GK Positioning':'GK_Positioning', 'GK Reflexes':'GK_Reflexes'})

colu2 = []
for colname2 in file_validate.columns:
    colu2.append(colname2.lower())
file_validate.columns = colu2
file_validate

Unnamed: 0,id,name,age,nationality,club,bp,position,team & contract,height,weight,foot,growth,joined,loan date end,value,wage,release clause,contract,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total stats,base stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
0,219461,E. Palmer-Brown,23,United States,FK Austria Wien,CB,CB,"FK Austria Wien Jun 30, 2021 On Loan","6'2""",194lbs,Right,7,"Feb 8, 2018","Jun 30, 2021",€975K,€5K,€0,"Jun 30, 2021 On Loan",230,47,21,62,60,40.0,228,44,43.0,36,51,54,303,60,68,63.0,63,49.0,288,48,77.0,51,87,25,246,68,62.0,38.0,39.0,39,49.0,200,68,67,65.0,56,11,8,15,13,9,1551,334,2 ★,2★,Low,High,1 ★,64,30,50,50,66,74,34,48+2,48+2,48+2,48+0,47+0,47+0,47+0,48+0,48+2,48+2,48+2,50+2,51+2,51+2,51+2,50+2,59+2,61+2,61+2,61+2,59+2,61+2,67+2,67+2,67+2,61+2,16+2,67
1,221896,D. Avdijaj,22,Kosovo,Heart of Midlothian,CAM,LM CAM,Heart of Midlothian 2020 ~ 2020,"5'8""",154lbs,Right,5,"Jan 20, 2020",,€1.2M,€3K,€2.2M,2020 ~ 2020,298,62,60,44,62,70.0,330,76,68.0,56,60,70,375,77,72,83.0,64,79.0,323,76,62.0,63,47,75,286,72,26.0,64.0,64.0,60,65.0,61,19,23,19.0,53,14,13,9,9,8,1726,358,4 ★,3★,High,Low,1 ★,74,67,62,74,24,57,12,64+2,64+2,64+2,68+0,68+0,68+0,68+0,68+0,68+2,68+2,68+2,67+2,61+2,61+2,61+2,67+2,49+2,47+2,47+2,47+2,49+2,45+2,38+2,38+2,38+2,45+2,17+2,68
2,247428,D. Ochoa,19,United States,Real Salt Lake,GK,GK,Real Salt Lake 2018 ~ 2020,"6'2""",176lbs,Right,17,"Nov 28, 2018",,€120K,€500,€249K,2018 ~ 2020,48,7,5,11,21,4.0,52,6,8.0,8,20,10,165,28,25,33.0,41,38.0,171,40,49.0,22,54,6,76,20,9.0,7.0,26.0,14,31.0,27,8,9,10.0,269,56,52,53,53,55,808,295,2 ★,1★,Medium,Medium,1 ★,56,52,53,55,26,53,3,18+2,18+2,18+2,15+0,17+0,17+0,17+0,15+0,17+2,17+2,17+2,16+2,18+2,18+2,18+2,16+2,16+2,18+2,18+2,18+2,16+2,16+2,18+2,18+2,18+2,16+2,53+2,54
3,255120,N. Kenneh,16,England,Leeds United,CDM,CB CDM RB,Leeds United 2020 ~ 2022,"6'3""",170lbs,Right,23,"Jan 10, 2020",,€160K,€500,€464K,2020 ~ 2022,215,38,31,55,59,32.0,224,51,34.0,38,47,54,275,59,58,56.0,48,54.0,242,48,48.0,60,58,28,230,61,55.0,33.0,40.0,41,59.0,159,53,52,54.0,36,7,5,13,5,6,1381,303,3 ★,2★,Medium,Medium,1 ★,58,34,47,52,53,59,6,46+2,46+2,46+2,47+0,46+0,46+0,46+0,47+0,47+2,47+2,47+2,49+2,49+2,49+2,49+2,49+2,53+2,54+2,54+2,54+2,53+2,53+2,54+2,54+2,54+2,53+2,11+2,55
4,215556,E. Fernandes,24,Switzerland,1. FSV Mainz 05,CDM,CM CDM,1. FSV Mainz 05 2019 ~ 2023,"6'2""",170lbs,Right,5,"Jul 1, 2019",,€2.3M,€13K,€4.3M,2019 ~ 2023,295,57,59,45,78,56.0,327,71,57.0,51,74,74,320,68,66,66.0,64,56.0,337,73,56.0,74,72,62,314,66,78.0,53.0,62.0,55,63.0,211,72,68,71.0,60,12,7,13,15,13,1864,407,4 ★,2★,Medium,Medium,1 ★,67,62,68,70,69,71,45,63+2,63+2,63+2,66+0,66+0,66+0,66+0,66+0,68+2,68+2,68+2,67+2,70+2,70+2,70+2,67+2,70+2,72+2,72+2,72+2,70+2,69+2,68+2,68+2,68+2,69+2,18+2,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,239074,S. Aw,21,Senegal,Gil Vicente FC,LB,LB,Gil Vicente FC 2020 ~ 2023,"5'8""",143lbs,Left,11,"Aug 9, 2020",,€325K,€1K,€731K,2020 ~ 2023,197,59,23,42,51,22.0,207,51,30.0,29,41,56,344,74,78,63.0,50,79.0,245,31,66.0,68,50,30,232,59,52.0,51.0,34.0,36,59.0,167,52,57,58.0,51,6,15,7,10,13,1443,315,3 ★,2★,High,Medium,1 ★,76,28,46,55,53,57,4,44+2,44+2,44+2,51+0,48+0,48+0,48+0,51+0,48+2,48+2,48+2,53+2,48+2,48+2,48+2,53+2,58+2,52+2,52+2,52+2,58+2,58+2,54+2,54+2,54+2,58+2,15+2,60
1995,241223,S. Mogi,21,Japan,Cerezo Osaka,GK,GK,Cerezo Osaka 2017 ~ 2021,"6'5""",176lbs,Right,9,"Jan 1, 2017",,€190K,€700,€285K,2017 ~ 2021,77,13,9,14,34,7.0,70,9,13.0,10,25,13,192,31,30,36.0,59,36.0,177,43,44.0,23,60,7,88,21,8.0,4.0,36.0,19,33.0,36,14,11,11.0,288,60,55,57,54,62,928,318,2 ★,1★,Medium,Medium,1 ★,60,55,57,62,30,54,3,22+2,22+2,22+2,20+0,22+0,22+0,22+0,20+0,23+2,23+2,23+2,22+2,24+2,24+2,24+2,22+2,20+2,23+2,23+2,23+2,20+2,20+2,22+2,22+2,22+2,20+2,58+2,59
1996,210930,Carles Gil,27,Spain,New England Revolution,RM,RM CAM CM,New England Revolution 2019 ~ 2024,"5'7""",146lbs,Left,0,"Feb 13, 2019",,€8M,€9K,€12M,2019 ~ 2024,332,76,72,34,79,71.0,373,77,76.0,65,77,78,351,64,65,74.0,73,75.0,329,63,60.0,83,57,66,306,41,40.0,75.0,81.0,69,75.0,118,36,43,39.0,58,12,15,16,9,6,1867,388,4 ★,4★,High,Medium,2 ★,65,69,78,77,39,60,15,67+2,67+2,67+2,74+0,73+0,73+0,73+0,74+0,76+0,76+0,76+0,75+1,74+2,74+2,74+2,75+1,62+2,60+2,60+2,60+2,62+2,57+2,47+2,47+2,47+2,57+2,18+2,76
1997,162993,J. Perch,34,England,Mansfield Town,CDM,CDM RB CM,Mansfield Town 2020 ~ 2021,"5'11""",176lbs,Right,0,"Aug 13, 2020",,€140K,€4K,€245K,2020 ~ 2021,268,58,44,61,62,43.0,261,56,47.0,37,60,61,288,54,52,57.0,61,64.0,300,53,65.0,68,68,46,297,74,62.0,54.0,56.0,51,60.0,182,61,62,59.0,43,10,7,9,9,8,1639,346,3 ★,2★,Medium,Medium,1 ★,53,47,58,58,61,69,4,55+2,55+2,55+2,56+0,55+0,55+0,55+0,56+0,57+2,57+2,57+2,57+2,59+2,59+2,59+2,57+2,60+2,62+1,62+1,62+1,60+2,60+2,63+0,63+0,63+0,60+2,14+2,63


In [572]:
# Check NaN
pd.options.display.max_rows = 500   # To display all the NaN values and deal with them
file_validate.isnull().sum()

id                     0
name                   0
age                    0
nationality            0
club                   3
bp                     0
position              55
team & contract        0
height                 0
weight                 0
foot                   0
growth                 0
joined                 3
loan date end       1889
value                  0
wage                   0
release clause         0
contract               0
attacking              0
crossing               0
finishing              0
heading_accuracy       0
short_passing          0
volleys                3
skill                  0
dribbling              0
curve                  3
fk_accuracy            0
long_passing           0
ball_control           0
movement               0
acceleration           0
sprint_speed           0
agility                3
reactions              0
balance                3
power                  0
shot_power             0
jumping                3
stamina                0


In [573]:
# We decided to use the original Composure mean becuase it is more accurate due to the number of players in the Data Frame
composure_mean = np.mean(file['composure'])
composure_mean

59.94773244615622

In [574]:
# Fill NaN values
file_validate['position'] = file_validate['position'].fillna(file_validate['bp'])
file_validate['composure'] = file_validate['composure'].fillna(composure_mean)

file_validate['volleys'] = file_validate['attacking'] - file_validate['crossing'] - file_validate['finishing'] - file_validate['heading_accuracy'] - file_validate['short_passing']
file_validate['curve'] = file_validate['skill'] - file_validate['dribbling'] - file_validate['fk_accuracy'] - file_validate['long_passing'] - file_validate['ball_control']
file_validate['jumping'] = file_validate['power'] - file_validate['shot_power'] - file_validate['stamina'] - file_validate['strength'] - file_validate['long_shots']
file_validate['sliding_tackle'] = file_validate['defending'] - file_validate['marking'] - file_validate['standing_tackle']
file_validate['vision'] = (file_validate['mentality'] - file_validate['aggression'] - file_validate['penalties'] - file_validate['positioning'] - file_validate['interceptions'])

file_validate['agility'] = (file_validate['movement'] - file_validate['acceleration'] - file_validate['sprint_speed'] - file_validate['reactions'])/2
file_validate['balance'] = file_validate['agility']

# print(file['a/w'].mode()) ; print(file['d/w'].mode())
file_validate['a/w'] = file_validate['a/w'].fillna('Medium')
file_validate['d/w'] = file_validate['d/w'].fillna('Medium')

In [575]:
file_validate = file_validate.drop(['id', 'nationality', 'club', 'team & contract',
                 'height', 'weight', 'foot', 'joined', 'loan date end',
                 'value', 'wage', 'release clause', 'contract', 'attacking',
                 'skill', 'movement', 'power', 'mentality', 'defending',
                 'goalkeeping', 'total stats', 'base stats', 'w/f', 'ir','position', 'ls', 'st',
                'rs', 'lw', 'lf','cf', 'rf', 'rw' ,'lam', 'cam','ram', 'lm',
                'lcm', 'cm','rcm', 'rm', 'lwb', 'ldm' , 'cdm', 'rdm' , 'rwb', 'lb', 'lcb', 'cb',
                    'rcb', 'rb', 'gk' ],axis=1)

In [576]:
file_validate

Unnamed: 0,name,age,bp,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,sm,a/w,d/w,pac,sho,pas,dri,def,phy,hits,ova
0,E. Palmer-Brown,23,CB,7,47,21,62,60,40,44,43,36,51,54,60,68,56.0,63,56.0,48,77,51,87,25,68,62.0,38.0,39.0,39,49.0,68,67,65,11,8,15,13,9,2★,Low,High,64,30,50,50,66,74,34,67
1,D. Avdijaj,22,CAM,5,62,60,44,62,70,76,68,56,60,70,77,72,81.0,64,81.0,76,62,63,47,75,72,26.0,64.0,64.0,60,65.0,19,23,19,14,13,9,9,8,3★,High,Low,74,67,62,74,24,57,12,68
2,D. Ochoa,19,GK,17,7,5,11,21,4,6,8,8,20,10,28,25,35.5,41,35.5,40,49,22,54,6,20,9.0,7.0,26.0,14,31.0,8,9,10,56,52,53,53,55,1★,Medium,Medium,56,52,53,55,26,53,3,54
3,N. Kenneh,16,CDM,23,38,31,55,59,32,51,34,38,47,54,59,58,55.0,48,55.0,48,48,60,58,28,61,55.0,33.0,40.0,41,59.0,53,52,54,7,5,13,5,6,2★,Medium,Medium,58,34,47,52,53,59,6,55
4,E. Fernandes,24,CDM,5,57,59,45,78,56,71,57,51,74,74,68,66,61.0,64,61.0,73,56,74,72,62,66,78.0,53.0,62.0,55,63.0,72,68,71,12,7,13,15,13,2★,Medium,Medium,67,62,68,70,69,71,45,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,S. Aw,21,LB,11,59,23,42,51,22,51,30,29,41,56,74,78,71.0,50,71.0,31,66,68,50,30,59,52.0,51.0,34.0,36,59.0,52,57,58,6,15,7,10,13,2★,High,Medium,76,28,46,55,53,57,4,60
1995,S. Mogi,21,GK,9,13,9,14,34,7,9,13,10,25,13,31,30,36.0,59,36.0,43,44,23,60,7,21,8.0,4.0,36.0,19,33.0,14,11,11,60,55,57,54,62,1★,Medium,Medium,60,55,57,62,30,54,3,59
1996,Carles Gil,27,RM,0,76,72,34,79,71,77,76,65,77,78,64,65,74.5,73,74.5,63,60,83,57,66,41,40.0,75.0,81.0,69,75.0,36,43,39,12,15,16,9,6,4★,High,Medium,65,69,78,77,39,60,15,76
1997,J. Perch,34,CDM,0,58,44,61,62,43,56,47,37,60,61,54,52,60.5,61,60.5,53,65,68,68,46,74,62.0,54.0,56.0,51,60.0,61,62,59,10,7,9,9,8,2★,Medium,Medium,53,47,58,58,61,69,4,63


In [577]:
# Check NaN again
pd.options.display.max_rows = 500   # To display all the NaN values and deal with them
file_validate.isnull().sum().sum()

0

In [578]:
pd.options.display.max_rows = 50
file_validate['hits'] = file_validate['hits'].str.replace('K','000')
file_validate['hits'] = file_validate['hits'].str.replace('.','')
file_validate['hits'] = pd.to_numeric(file_validate['hits'], errors='coerce')
file_validate['hits'].unique

  file_validate['hits'] = file_validate['hits'].str.replace('.','')


<bound method Series.unique of 0       34
1       12
2        3
3        6
4       45
        ..
1994     4
1995     3
1996    15
1997     4
1998     5
Name: hits, Length: 1999, dtype: int64>

In [579]:
data_validate = file_validate.copy()
data_validate.head()

Unnamed: 0,name,age,bp,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,sm,a/w,d/w,pac,sho,pas,dri,def,phy,hits,ova
0,E. Palmer-Brown,23,CB,7,47,21,62,60,40,44,43,36,51,54,60,68,56.0,63,56.0,48,77,51,87,25,68,62.0,38.0,39.0,39,49.0,68,67,65,11,8,15,13,9,2★,Low,High,64,30,50,50,66,74,34,67
1,D. Avdijaj,22,CAM,5,62,60,44,62,70,76,68,56,60,70,77,72,81.0,64,81.0,76,62,63,47,75,72,26.0,64.0,64.0,60,65.0,19,23,19,14,13,9,9,8,3★,High,Low,74,67,62,74,24,57,12,68
2,D. Ochoa,19,GK,17,7,5,11,21,4,6,8,8,20,10,28,25,35.5,41,35.5,40,49,22,54,6,20,9.0,7.0,26.0,14,31.0,8,9,10,56,52,53,53,55,1★,Medium,Medium,56,52,53,55,26,53,3,54
3,N. Kenneh,16,CDM,23,38,31,55,59,32,51,34,38,47,54,59,58,55.0,48,55.0,48,48,60,58,28,61,55.0,33.0,40.0,41,59.0,53,52,54,7,5,13,5,6,2★,Medium,Medium,58,34,47,52,53,59,6,55
4,E. Fernandes,24,CDM,5,57,59,45,78,56,71,57,51,74,74,68,66,61.0,64,61.0,73,56,74,72,62,66,78.0,53.0,62.0,55,63.0,72,68,71,12,7,13,15,13,2★,Medium,Medium,67,62,68,70,69,71,45,70


In [580]:
X1 = data_validate.drop(['ova'],axis=1)   # OVA is our target
y1 = data_validate['ova']

X1_num = X1.select_dtypes(np.number)
X1_cat = X1.select_dtypes(object)

transformer1 = MinMaxScaler().fit(X1_num)
x_normalized1 = transformer1.transform(X1_num)
print(x_normalized1.shape)
data_validate_normalized = pd.DataFrame(x_normalized1, columns = X1_num.columns)
pd.DataFrame(x_normalized1, columns=X_num.columns)

print(type(data_validate_normalized))

(1999, 43)
<class 'pandas.core.frame.DataFrame'>


In [581]:
y1.shape

(1999,)

In [582]:
y.shape

(11701,)

In [583]:
X1.shape

(1999, 48)

In [584]:
# OneHotencoder1

X1_cat_encoded = X1_cat['bp']
X1_cat_encoded = pd.DataFrame(X1_cat_encoded, columns=['bp'])
encoder1 = OneHotEncoder(drop='first').fit(X1_cat_encoded)
encoded1 = encoder1.transform(X1_cat_encoded).toarray()
cols1 = encoder1.get_feature_names(input_features=X1_cat_encoded.columns)
onehot_encoded1 = pd.DataFrame(encoded1, columns=cols)
onehot_encoded1.head()

# Create a list with the values for the label encoder1

# just for a/w
X_cat_aw1 = X1_cat['a/w']
X_cat_aw1 = pd.DataFrame(X_cat_aw1, columns=['a/w'])

# just for d/w
X_cat_dw1 = X1_cat['d/w']
X_cat_dw1 = pd.DataFrame(X_cat_dw1, columns=['d/w'])

# # just for sm
X_cat_sm1 = X1_cat['sm']
X_cat_sm1 = pd.DataFrame(X_cat_sm1, columns=['sm'])

# just for a/w
label_encoded1_aw = LabelEncoder().fit(['Low', 'Medium', 'High']).transform(X_cat_aw1) # give a value to each type
label_encoded1_aw = pd.DataFrame(label_encoded1_aw,columns=X_cat_aw1.columns)
display(label_encoded1_aw.head())

# just for d/w
label_encoded1_dw = LabelEncoder().fit(['Low', 'Medium', 'High']).transform(X_cat_dw1) # give a value to each type
label_encoded1_dw = pd.DataFrame(label_encoded1_dw,columns=X_cat_dw1.columns)
display(label_encoded1_dw.head())

# just for sm
label_encoded1_sm = LabelEncoder().fit(['1★','2★', '3★', '4★','5★']).transform(X_cat_sm1) # give a value to each type
label_encoded1_sm = pd.DataFrame(label_encoded1_sm,columns=X_cat_sm1.columns)
display(label_encoded1_sm.head())

  y = column_or_1d(y, warn=True)


Unnamed: 0,a/w
0,1
1,0
2,2
3,2
4,2


  y = column_or_1d(y, warn=True)


Unnamed: 0,d/w
0,0
1,1
2,2
3,2
4,2


  y = column_or_1d(y, warn=True)


Unnamed: 0,sm
0,1
1,2
2,0
3,1
4,1


In [585]:
X1 = pd.concat([data_validate_normalized,label_encoded1_aw, label_encoded1_dw, label_encoded1_sm, onehot_encoded1], axis=1)

In [586]:
X1.shape

(1999, 60)

In [587]:
X1.corr()

Unnamed: 0,age,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,pac,sho,pas,dri,def,phy,hits,a/w,d/w,sm,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST
age,1.000000,-0.847279,0.128084,0.077483,0.168714,0.163375,0.175141,0.017475,0.165960,0.218996,0.216199,0.096651,-0.203754,-0.197350,-0.103246,0.503592,-0.103246,0.303372,0.192164,0.072315,0.350474,0.167229,0.269711,0.199524,0.087477,0.214840,0.169520,0.393834,0.167820,0.115871,0.091746,0.122985,0.130638,0.128984,0.136974,0.127802,-0.190757,0.288417,0.388771,0.216944,0.235851,0.443681,-0.053900,-0.047627,-0.117701,0.067944,0.065970,0.029115,0.077619,0.041125,0.092014,-0.014347,-0.030456,-0.027643,-0.052712,0.015623,-0.067006,0.055514,-0.034090,-0.022513
growth,-0.847279,1.000000,-0.238087,-0.171555,-0.220878,-0.261099,-0.261025,-0.137375,-0.261625,-0.279317,-0.292086,-0.202073,0.032199,0.020035,-0.031545,-0.565648,-0.031545,-0.365754,-0.252921,-0.234247,-0.371938,-0.257818,-0.304321,-0.228002,-0.187122,-0.302075,-0.217937,-0.453752,-0.200602,-0.156840,-0.137167,-0.043649,-0.048622,-0.033814,-0.049271,-0.049277,0.031670,-0.314518,-0.430657,-0.298014,-0.251498,-0.486094,0.050338,0.092870,0.132285,-0.185634,-0.041346,-0.021034,-0.037891,-0.043987,-0.006138,-0.003015,-0.002347,0.004598,0.031202,-0.011017,0.019176,-0.088928,0.010113,0.021774
crossing,0.128084,-0.238087,1.000000,0.673771,0.447780,0.803333,0.689397,0.863283,0.836166,0.756667,0.738984,0.842456,0.648057,0.610583,0.693037,0.344306,0.693037,0.546096,0.053566,0.646068,-0.075869,0.748208,0.458120,0.416515,0.805101,0.661235,0.660008,0.581071,0.423657,0.411715,0.390473,-0.669428,-0.663609,-0.661843,-0.664562,-0.661470,0.284709,0.198744,0.472073,0.408679,0.216395,-0.027699,0.051162,-0.300796,-0.116922,0.734215,-0.229564,0.042781,0.034105,0.153049,-0.687393,0.176125,0.161041,0.056266,0.075705,0.146715,0.196505,0.117837,0.096342,-0.058742
finishing,0.077483,-0.171555,0.673771,1.000000,0.436494,0.662336,0.886301,0.826771,0.761831,0.700072,0.501434,0.784798,0.580843,0.550472,0.611753,0.301929,0.611753,0.713263,0.022170,0.489534,-0.057886,0.881553,0.218280,-0.023808,0.893657,0.685086,0.835540,0.554016,0.009724,-0.044543,-0.079675,-0.583821,-0.585256,-0.580444,-0.581543,-0.582285,0.262949,0.583609,0.320949,0.447039,-0.244036,-0.117444,0.068517,-0.379935,-0.107844,0.737360,-0.382280,-0.002965,0.093503,0.118042,-0.607462,-0.091525,0.137816,0.097763,-0.052915,-0.091107,0.191815,0.135246,-0.020050,0.418786
heading_accuracy,0.168714,-0.220878,0.447780,0.436494,1.000000,0.640696,0.475048,0.535781,0.409685,0.365941,0.487017,0.652465,0.277964,0.336529,0.162541,0.314915,0.162541,0.370494,0.376093,0.586509,0.493937,0.483273,0.708127,0.559203,0.513273,0.227720,0.529798,0.504171,0.592359,0.557130,0.530304,-0.727690,-0.729195,-0.720415,-0.727509,-0.727759,-0.189480,-0.168068,-0.127422,-0.195818,0.416532,0.388323,0.030672,-0.166602,-0.244961,0.387219,0.392789,0.078182,0.033222,0.036992,-0.755469,0.045972,-0.074856,-0.026817,-0.027790,0.037417,-0.084396,-0.018696,0.014516,0.284112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bp_RB,0.015623,-0.011017,0.146715,-0.091107,0.037417,0.032706,-0.073256,0.047906,0.028209,-0.031399,0.002750,0.039456,0.140795,0.145256,0.097517,-0.000679,0.097517,-0.147867,0.052204,0.130396,-0.034039,-0.068562,0.085438,0.163194,0.047566,-0.048898,-0.056850,-0.032359,0.165208,0.179312,0.192331,-0.073893,-0.064996,-0.070106,-0.069512,-0.074001,0.133721,-0.182905,-0.027233,-0.019450,0.167488,0.019782,0.033859,-0.039843,0.013095,0.001780,-0.106319,-0.062873,-0.016245,-0.056403,-0.072817,-0.050902,-0.049585,-0.023561,-0.025554,1.000000,-0.065288,-0.031602,-0.026957,-0.089781
bp_RM,-0.067006,0.019176,0.196505,0.191815,-0.084396,0.086901,0.145751,0.178704,0.157385,0.122136,0.093007,0.125968,0.254852,0.246690,0.234625,-0.070041,0.234625,0.073964,-0.088473,0.088984,-0.199214,0.115290,-0.081797,-0.121538,0.159978,0.125966,0.114666,0.024482,-0.102386,-0.102855,-0.092076,-0.097445,-0.103305,-0.105012,-0.095780,-0.097876,0.254514,0.097053,0.081171,0.132898,-0.171866,-0.187533,-0.019667,-0.099528,0.015714,0.198976,-0.145540,-0.086066,-0.022238,-0.077210,-0.099679,-0.069679,-0.067878,-0.032253,-0.034981,-0.065288,1.000000,-0.043260,-0.036901,-0.122901
bp_RW,0.055514,-0.088928,0.117837,0.135246,-0.018696,0.050321,0.115094,0.118023,0.108489,0.089578,0.035862,0.095742,0.156057,0.152156,0.150225,0.030884,0.150225,0.093501,-0.041867,0.024236,-0.100099,0.113269,-0.023366,-0.052646,0.106170,0.101370,0.088921,0.048563,-0.062148,-0.078271,-0.083514,-0.049074,-0.050161,-0.042514,-0.045128,-0.050025,0.166736,0.108236,0.065757,0.121750,-0.100147,-0.094538,-0.009175,-0.069525,-0.006317,0.128056,-0.070447,-0.041660,-0.010764,-0.037373,-0.048249,-0.033728,-0.032855,-0.015612,-0.016932,-0.031602,-0.043260,1.000000,-0.017862,-0.059489
bp_RWB,-0.034090,0.010113,0.096342,-0.020050,0.014516,0.045476,-0.032832,0.062811,0.041880,-0.024192,0.029596,0.050188,0.079080,0.084484,0.055117,0.022191,0.055117,-0.028319,0.021835,0.093973,-0.004040,-0.011393,0.068439,0.095133,0.045690,-0.004038,-0.048611,0.010394,0.098049,0.102436,0.110120,-0.042528,-0.040169,-0.044965,-0.036537,-0.044792,0.077078,-0.064676,0.017696,0.036354,0.096101,0.038008,-0.005319,-0.058423,0.026547,0.031738,-0.060092,-0.035536,-0.009182,-0.031879,-0.041156,-0.028770,-0.028026,-0.013317,-0.014443,-0.026957,-0.036901,-0.017862,1.000000,-0.050744


In [588]:
X.shape

(11701, 60)

In [589]:
X1.shape

(1999, 60)

In [594]:
X1.reset_index(drop=True)

Unnamed: 0,age,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,pac,sho,pas,dri,def,phy,hits,a/w,d/w,sm,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST
0,0.225806,0.28,0.493976,0.195402,0.635294,0.604938,0.470588,0.448276,0.467391,0.337209,0.5125,0.543210,0.578313,0.679012,0.602151,0.524590,0.602151,0.418919,0.827957,0.481928,0.885714,0.247059,0.690476,0.690476,0.390805,0.428571,0.376471,0.4625,0.707865,0.719512,0.722222,0.095238,0.066667,0.152174,0.125000,0.079545,0.531250,0.138889,0.328125,0.327869,0.688312,0.719298,0.001148,1,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.193548,0.20,0.674699,0.643678,0.423529,0.629630,0.823529,0.816092,0.739130,0.569767,0.6250,0.740741,0.783133,0.728395,0.870968,0.540984,0.870968,0.797297,0.666667,0.626506,0.314286,0.835294,0.738095,0.261905,0.689655,0.703297,0.623529,0.6625,0.157303,0.182927,0.211111,0.130952,0.122222,0.086957,0.079545,0.068182,0.687500,0.652778,0.515625,0.721311,0.142857,0.421053,0.000333,0,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.096774,0.68,0.012048,0.011494,0.035294,0.123457,0.047059,0.011494,0.086957,0.011628,0.1250,0.000000,0.192771,0.148148,0.381720,0.163934,0.381720,0.310811,0.526882,0.132530,0.414286,0.023529,0.119048,0.059524,0.034483,0.285714,0.082353,0.2375,0.033708,0.012195,0.111111,0.630952,0.555556,0.565217,0.579545,0.602273,0.406250,0.444444,0.375000,0.409836,0.168831,0.350877,0.000000,2,2,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.92,0.385542,0.310345,0.552941,0.592593,0.376471,0.528736,0.369565,0.360465,0.4625,0.543210,0.566265,0.555556,0.591398,0.278689,0.591398,0.418919,0.516129,0.590361,0.471429,0.282353,0.607143,0.607143,0.333333,0.439560,0.400000,0.5875,0.539326,0.536585,0.600000,0.047619,0.033333,0.130435,0.034091,0.045455,0.437500,0.194444,0.281250,0.360656,0.519481,0.456140,0.000111,2,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.258065,0.20,0.614458,0.632184,0.435294,0.827160,0.658824,0.758621,0.619565,0.511628,0.8000,0.790123,0.674699,0.654321,0.655914,0.540984,0.655914,0.756757,0.602151,0.759036,0.671429,0.682353,0.666667,0.880952,0.563218,0.681319,0.564706,0.6375,0.752809,0.731707,0.788889,0.107143,0.055556,0.130435,0.147727,0.125000,0.578125,0.583333,0.609375,0.655738,0.727273,0.666667,0.001556,2,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,0.161290,0.44,0.638554,0.218391,0.400000,0.493827,0.258824,0.528736,0.326087,0.255814,0.3875,0.567901,0.746988,0.802469,0.763441,0.311475,0.763441,0.189189,0.709677,0.686747,0.357143,0.305882,0.583333,0.571429,0.540230,0.373626,0.341176,0.5875,0.528090,0.597561,0.644444,0.035714,0.144444,0.065217,0.090909,0.125000,0.718750,0.111111,0.265625,0.409836,0.519481,0.421053,0.000037,0,2,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1995,0.161290,0.36,0.084337,0.057471,0.070588,0.283951,0.082353,0.045977,0.141304,0.034884,0.1875,0.037037,0.228916,0.209877,0.387097,0.459016,0.387097,0.351351,0.473118,0.144578,0.500000,0.035294,0.130952,0.047619,0.000000,0.395604,0.141176,0.2625,0.101124,0.036585,0.122222,0.678571,0.588889,0.608696,0.590909,0.681818,0.468750,0.486111,0.437500,0.524590,0.220779,0.368421,0.000000,2,2,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.354839,0.00,0.843373,0.781609,0.305882,0.839506,0.835294,0.827586,0.826087,0.674419,0.8375,0.839506,0.626506,0.641975,0.801075,0.688525,0.801075,0.621622,0.645161,0.867470,0.457143,0.729412,0.369048,0.428571,0.816092,0.890110,0.729412,0.7875,0.348315,0.426829,0.433333,0.107143,0.144444,0.163043,0.079545,0.045455,0.546875,0.680556,0.765625,0.770492,0.337662,0.473684,0.000444,0,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1997,0.580645,0.00,0.626506,0.459770,0.623529,0.629630,0.505882,0.586207,0.510870,0.348837,0.6250,0.629630,0.506024,0.481481,0.650538,0.491803,0.650538,0.486486,0.698925,0.686747,0.614286,0.494118,0.761905,0.690476,0.574713,0.615385,0.517647,0.6000,0.629213,0.658537,0.655556,0.083333,0.055556,0.086957,0.079545,0.068182,0.359375,0.375000,0.453125,0.459016,0.623377,0.631579,0.000037,2,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [595]:
y1.reset_index(drop=True)

0       67
1       68
2       54
3       55
4       70
        ..
1994    60
1995    59
1996    76
1997    63
1998    60
Name: ova, Length: 1999, dtype: int64

In [590]:
# Original information 

print('r2score / y train prediction: ',r2_score(y_train, predictions))
print('r2score / y test prediction: ',r2_score(y_test, predictions_test))
print('mse: ',mse)
print('rmse: ',rmse)

# y train prediction:  0.9109614973408771
# y test prediction:  0.9023903598322012
# mse:  4.645956272771824
# rmse:  2.1554480445540376

r2score / y train prediction:  0.9085801230059186
r2score / y test prediction:  0.9122196516921816
mse:  4.102988027787845
rmse:  2.0255833796187814


In [596]:
# Final Results
# Why Ooohh whyyyy OOHHHAAAAIIOOOOOO

predictions2 = lm.predict(X1)
print('r2_score predictions: ',r2_score(y1, predictions2))
rmse1 = np.sqrt(mean_squared_error(y1,predictions2))
print('rmse: ',rmse1)
mse1=mean_squared_error(y1,predictions2)
print('mse: ',mse1)

r2_score predictions:  -1.458632859075726e+24
rmse:  8170018355297.172
mse:  6.674919992589271e+25


In [592]:
X.columns

Index(['age', 'growth', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'fk_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'composure', 'marking', 'standing_tackle',
       'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking',
       'gk_positioning', 'gk_reflexes', 'pac', 'sho', 'pas', 'dri', 'def',
       'phy', 'hits', 'a/w', 'd/w', 'sm', 'bp_CB', 'bp_CDM', 'bp_CF', 'bp_CM',
       'bp_GK', 'bp_LB', 'bp_LM', 'bp_LW', 'bp_LWB', 'bp_RB', 'bp_RM', 'bp_RW',
       'bp_RWB', 'bp_ST'],
      dtype='object')

In [593]:
X1.columns

Index(['age', 'growth', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'fk_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'composure', 'marking', 'standing_tackle',
       'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking',
       'gk_positioning', 'gk_reflexes', 'pac', 'sho', 'pas', 'dri', 'def',
       'phy', 'hits', 'a/w', 'd/w', 'sm', 'bp_CB', 'bp_CDM', 'bp_CF', 'bp_CM',
       'bp_GK', 'bp_LB', 'bp_LM', 'bp_LW', 'bp_LWB', 'bp_RB', 'bp_RM', 'bp_RW',
       'bp_RWB', 'bp_ST'],
      dtype='object')