# Cleaning

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import re
import ast
import tqdm
%matplotlib inline

In [52]:
# Load the scraped sofifa export and parse `date` into timezone-aware (UTC) timestamps.
sofifa = (
    pd.read_csv('../data/sofifa_data.csv')
    .assign(date=lambda raw: pd.to_datetime(raw['date'], utc=True))
)

sofifa.head()

Unnamed: 0,team,date,attack,defence,ext_stats,full_age,midfield,name,overall,player_overall,potential,prestige,start_age,value,wage
0,Brazil,2018-11-15 00:00:00+00:00,84.0,80.0,"['52', '74', '51', '61', '44', '68', '75', '54...",29.48,80.0,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...",81.0,"['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...",10.0,29.64,"['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."
1,Brazil,2018-11-13 00:00:00+00:00,84.0,80.0,"['52', '74', '51', '61', '44', '68', '75', '54...",29.48,80.0,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...",81.0,"['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...",10.0,29.64,"['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."
2,Brazil,2018-11-05 00:00:00+00:00,84.0,80.0,"['52', '74', '51', '61', '44', '68', '75', '54...",29.48,80.0,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...",81.0,"['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...",10.0,29.64,"['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."
3,Brazil,2018-11-01 00:00:00+00:00,84.0,80.0,"['52', '74', '51', '61', '44', '68', '75', '54...",29.48,80.0,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...",81.0,"['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...",10.0,29.64,"['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."
4,Brazil,2018-10-29 00:00:00+00:00,84.0,80.0,"['52', '74', '51', '61', '44', '68', '75', '54...",29.48,80.0,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...",81.0,"['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...",10.0,29.64,"['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."


The sofifa data that we have scraped consists of both team-based and individual player-based data. Let us split them accordingly for now.

In [53]:
# Columns describing the team as a whole (including the raw `ext_stats` string).
team_columns = ['team', 'date', 'overall', 'attack',
                'midfield', 'defence', 'prestige',
                'start_age', 'full_age', 'ext_stats']
# Columns holding one stringified per-player list per (team, date) row.
player_columns = ['team', 'date', 'name', 'player_overall',
                  'potential', 'value', 'wage']

sofifa_team = sofifa.loc[:, team_columns]
sofifa_player = sofifa.loc[:, player_columns]

## Team Statistics

We then start by cleaning up the team-based data. Specifically, let us extract `ext_stats`, a stringified list of additional team-based statistics, into separate columns.

In [54]:
# Number of values expected in a stringified `ext_stats` list.
_N_EXT_STATS = 9

# Compiled once instead of on every row. Built from raw strings so the escaped
# brackets are not treated as (invalid) Python string escapes. Equivalent to:
#   \['(.*)', '(.*)', '(.*)', '(.*)', '(.*)', '(.*)', '(.*)', '(.*)', '(.*)'\]
_EXT_STATS_PATTERN = re.compile(
    r"\[" + ", ".join([r"'(.*)'"] * _N_EXT_STATS) + r"\]"
)


def extract_team_stats(row):
    """Split a row's stringified `ext_stats` list into its nine values.

    Parameters
    ----------
    row : object with an ``ext_stats`` attribute (e.g. a DataFrame row)
        ``row.ext_stats`` is expected to look like
        ``"['52', '74', '51', '61', '44', '68', '75', '54', '64']"``.

    Returns
    -------
    tuple
        Nine strings (the quoted values, possibly empty) when the pattern
        matches; otherwise nine ``np.nan``. Non-string input such as a missing
        (NaN) cell also yields nine ``np.nan`` instead of raising ``TypeError``.
    """
    raw = row.ext_stats
    # Only strings can be searched; anything else is treated as "no stats".
    match = _EXT_STATS_PATTERN.search(raw) if isinstance(raw, str) else None
    if match is None:
        return (np.nan,) * _N_EXT_STATS
    return match.groups()

In [55]:
# Replace each raw `ext_stats` string with the tuple produced by
# `extract_team_stats` (the values are still strings at this point).
sofifa_team = sofifa_team.assign(
    ext_stats=sofifa_team.apply(extract_team_stats, axis='columns')
)

In [56]:
# 'flatten' to dataframe: build the stat columns (labelled 0..8) in a single
# constructor call from the list of tuples, rather than creating one
# pd.Series per row inside `apply`, which is far slower on ~20k rows.
team_stats = pd.DataFrame(sofifa_team['ext_stats'].tolist(),
                          index=sofifa_team.index)
# Swap the tuple column for the expanded columns, aligned on the row index.
sofifa_team = pd.concat([sofifa_team.drop('ext_stats', axis=1), team_stats], axis=1)

In [57]:
# rename columns: positional labels 0..8 -> descriptive stat names
ext_stat_names = ['bup_speed', 'bup_dribbling', 'bup_passing',
                  'cc_passing', 'cc_crossing', 'cc_shooting',
                  'd_pressure', 'd_aggresion', 'd_width']
sofifa_team = sofifa_team.rename(columns=dict(enumerate(ext_stat_names)))

display(sofifa_team.head())
sofifa_team.dtypes

Unnamed: 0,team,date,overall,attack,midfield,defence,prestige,start_age,full_age,bup_speed,bup_dribbling,bup_passing,cc_passing,cc_crossing,cc_shooting,d_pressure,d_aggresion,d_width
0,Brazil,2018-11-15 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52,74,51,61,44,68,75,54,64
1,Brazil,2018-11-13 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52,74,51,61,44,68,75,54,64
2,Brazil,2018-11-05 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52,74,51,61,44,68,75,54,64
3,Brazil,2018-11-01 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52,74,51,61,44,68,75,54,64
4,Brazil,2018-10-29 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52,74,51,61,44,68,75,54,64


team                          object
date             datetime64[ns, UTC]
overall                      float64
attack                       float64
midfield                     float64
defence                      float64
prestige                     float64
start_age                    float64
full_age                     float64
bup_speed                     object
bup_dribbling                 object
bup_passing                   object
cc_passing                    object
cc_crossing                   object
cc_shooting                   object
d_pressure                    object
d_aggresion                   object
d_width                       object
dtype: object

In [58]:
# https://stackoverflow.com/questions/36814100/pandas-to-numeric-for-multiple-columns
# Convert every object-dtype column except `team` to numeric. `team` is
# excluded by name rather than by position, so the conversion no longer
# depends on `team` being the first object column in the frame.
object_cols = sofifa_team.columns[sofifa_team.dtypes.eq('object')]
cols = [col for col in object_cols if col != 'team']
sofifa_team[cols] = sofifa_team[cols].apply(pd.to_numeric)

sofifa_team.dtypes

team                          object
date             datetime64[ns, UTC]
overall                      float64
attack                       float64
midfield                     float64
defence                      float64
prestige                     float64
start_age                    float64
full_age                     float64
bup_speed                    float64
bup_dribbling                float64
bup_passing                  float64
cc_passing                   float64
cc_crossing                  float64
cc_shooting                  float64
d_pressure                   float64
d_aggresion                  float64
d_width                      float64
dtype: object

In [59]:
# Shape of the subset of rows where each statistic is missing.
for stat in ('bup_dribbling', 'bup_speed'):
    print(sofifa_team.loc[sofifa_team[stat].isna()].shape)

(2058, 18)
(250, 18)


We can see that `bup_dribbling` has 2058 missing values. This is because the statistic was not collected for earlier versions of the game. The rest of the team-based strategy statistics (checked here via `bup_speed`) have 250 missing values, because those statistics were only collected from FIFA 10 onwards. 

In [60]:
# Does the team frame contain at least one missing value anywhere?
sofifa_team.isnull().values.any()

True

## Individual Player Statistics

After dealing with the extended team statistics, we can now look at the individual player data that we have.

In [61]:
# Preview the raw player columns (each cell is still a stringified list).
sofifa_player.head(n=5)

Unnamed: 0,team,date,name,player_overall,potential,value,wage
0,Brazil,2018-11-15 00:00:00+00:00,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...","['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...","['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."
1,Brazil,2018-11-13 00:00:00+00:00,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...","['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...","['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."
2,Brazil,2018-11-05 00:00:00+00:00,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...","['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...","['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."
3,Brazil,2018-11-01 00:00:00+00:00,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...","['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...","['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."
4,Brazil,2018-10-29 00:00:00+00:00,"['Raphaelito Anjos', 'Maikel Catarino', 'Juian...","['82', '79', '82', '83', '80', '80', '82', '79...","['82', '79', '82', '83', '80', '80', '82', '79...","['€14M', '€3.6M', '€6.5M', '€22.5M', '€10.5M',...","['€32K', '€38K', '€39K', '€43K', '€34K', '€34K..."


In [62]:
# loop through each row: parse the stringified per-player lists of every
# (team, date) row into one small DataFrame, stored under that key.
# `itertuples` walks the rows directly, so this does not rely on the frame
# having a default 0..n-1 index and avoids repeated scalar `.loc` lookups.
list_cols = ['name', 'player_overall', 'potential', 'value', 'wage']

player_stats = {}
for row in tqdm.tqdm(sofifa_player.itertuples(index=False),
                     total=sofifa_player.shape[0]):
    # literal_eval turns "['a', 'b', ...]" into a real Python list.
    parsed = {col: ast.literal_eval(getattr(row, col)) for col in list_cols}
    # One row per player; a later row with the same (team, date) overwrites
    # the earlier one, as dict assignment always does.
    player_stats[(row.team, row.date)] = pd.DataFrame(parsed)

100%|████████████████████████████████████████████████████████████████████| 20594/20594 [01:52<00:00, 183.65it/s]


In [63]:
# Stack the per-(team, date) frames into one long table. The dict keys become
# index levels 0 and 1 (team, date); level 2 is the within-frame row number,
# which is dropped.
sofifa_player = (
    pd.concat(player_stats, axis=0)
    .reset_index()
    .drop(columns='level_2')
    .rename(columns={'level_0': 'team', 'level_1': 'date'})
)
sofifa_player.head()

Unnamed: 0,team,date,name,player_overall,potential,value,wage
0,Argentina,2006-08-30 00:00:00+00:00,R. Abbondanzieri,84,85,€0,€0
1,Argentina,2006-08-30 00:00:00+00:00,N. Burdisso,72,76,€0,€0
2,Argentina,2006-08-30 00:00:00+00:00,R. Ayala,86,86,€0,€0
3,Argentina,2006-08-30 00:00:00+00:00,G. Heinze,85,84,€0,€0
4,Argentina,2006-08-30 00:00:00+00:00,J. Sorín,76,78,€0,€0


Now that we have our data in tabular format, we should clean up some of the columns.

In [64]:
# edit value/wage columns into numeric data
def _parse_euros(amount):
    """Convert a money string such as '€14M', '€32K' or '€0' to euros.

    A trailing 'K' multiplies by 1e3 and a trailing 'M' by 1e6. A string
    without a suffix is taken at face value, so an unsuffixed amount is no
    longer silently scaled as thousands or millions.
    """
    multipliers = {'K': 1e3, 'M': 1e6}
    text = amount.replace('€', '').strip()
    if text and text[-1] in multipliers:
        return float(text[:-1]) * multipliers[text[-1]]
    return float(text)


# Wages are reported in thousands of euros and values in millions of euros.
# `na_action='ignore'` leaves missing entries as NaN instead of raising.
sofifa_player['wage'] = sofifa_player['wage'].map(_parse_euros, na_action='ignore') / 1e3
sofifa_player['value'] = sofifa_player['value'].map(_parse_euros, na_action='ignore') / 1e6

# Make the units explicit in the column names.
sofifa_player = sofifa_player.rename(columns={
    'wage': 'wage_euros_thousands',
    'value': 'value_euros_millions'
})

sofifa_player.head()

Unnamed: 0,team,date,name,player_overall,potential,value_euros_millions,wage_euros_thousands
0,Argentina,2006-08-30 00:00:00+00:00,R. Abbondanzieri,84,85,0.0,0.0
1,Argentina,2006-08-30 00:00:00+00:00,N. Burdisso,72,76,0.0,0.0
2,Argentina,2006-08-30 00:00:00+00:00,R. Ayala,86,86,0.0,0.0
3,Argentina,2006-08-30 00:00:00+00:00,G. Heinze,85,84,0.0,0.0
4,Argentina,2006-08-30 00:00:00+00:00,J. Sorín,76,78,0.0,0.0


Let us come up with some team statistics from these player data.

In [65]:
# Ratings were parsed as strings; make them numeric before doing arithmetic.
for rating_col in ('player_overall', 'potential'):
    sofifa_player[rating_col] = pd.to_numeric(sofifa_player[rating_col])

# growth = potential rating minus current overall rating
sofifa_player['growth'] = sofifa_player['potential'].sub(sofifa_player['player_overall'])

In [66]:
# Collapse the player rows to one row per (team, date).
# String aliases ('mean', 'first') are used instead of NumPy callables:
# passing np.mean to `agg` is deprecated in recent pandas, and the alias
# selects the same groupby mean.
sofifa_player_grp = (
    sofifa_player
    .groupby(['team', 'date'], as_index=False)
    .agg({'wage_euros_thousands': 'mean',
          'value_euros_millions': 'mean',
          'growth': 'mean',
          # first listed player of each group, taken to be the goalkeeper
          'player_overall': 'first'})
    .rename(columns={'player_overall': 'goalkeeper_overall'})
)
sofifa_player_grp.head()

Unnamed: 0,team,date,wage_euros_thousands,value_euros_millions,growth,goalkeeper_overall
0,Argentina,2006-08-30 00:00:00+00:00,0.0,0.0,4.88,84
1,Argentina,2007-02-22 00:00:00+00:00,0.0,0.0,5.32,84
2,Argentina,2007-08-30 00:00:00+00:00,0.0,0.0,3.84,79
3,Argentina,2008-02-22 00:00:00+00:00,0.0,0.0,3.84,79
4,Argentina,2008-08-30 00:00:00+00:00,0.0,0.0,4.0,82


## Merge Team and Individual Player Group Statistics

We can then merge our datasets together.

In [67]:
# Left join: keep every team row and attach the aggregated player statistics
# that share the same (team, date); unmatched team rows get NaN.
merge_keys = ['team', 'date']
sofifa = pd.merge(sofifa_team, sofifa_player_grp, how='left', on=merge_keys)

In [68]:
# Preview the merged frame, then list the dtype of every column.
display(sofifa.head(n=5))
sofifa.dtypes

Unnamed: 0,team,date,overall,attack,midfield,defence,prestige,start_age,full_age,bup_speed,...,cc_passing,cc_crossing,cc_shooting,d_pressure,d_aggresion,d_width,wage_euros_thousands,value_euros_millions,growth,goalkeeper_overall
0,Brazil,2018-11-15 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52.0,...,61.0,44.0,68.0,75.0,54.0,64.0,43.869565,15.795652,0.043478,82.0
1,Brazil,2018-11-13 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52.0,...,61.0,44.0,68.0,75.0,54.0,64.0,43.869565,15.795652,0.043478,82.0
2,Brazil,2018-11-05 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52.0,...,61.0,44.0,68.0,75.0,54.0,64.0,43.869565,15.795652,0.043478,82.0
3,Brazil,2018-11-01 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52.0,...,61.0,44.0,68.0,75.0,54.0,64.0,43.869565,15.795652,0.043478,82.0
4,Brazil,2018-10-29 00:00:00+00:00,81.0,84.0,80.0,80.0,10.0,29.64,29.48,52.0,...,61.0,44.0,68.0,75.0,54.0,64.0,43.869565,15.795652,0.043478,82.0


team                                 object
date                    datetime64[ns, UTC]
overall                             float64
attack                              float64
midfield                            float64
defence                             float64
prestige                            float64
start_age                           float64
full_age                            float64
bup_speed                           float64
bup_dribbling                       float64
bup_passing                         float64
cc_passing                          float64
cc_crossing                         float64
cc_shooting                         float64
d_pressure                          float64
d_aggresion                         float64
d_width                             float64
wage_euros_thousands                float64
value_euros_millions                float64
growth                              float64
goalkeeper_overall                  float64
dtype: object

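We note that the aggregated individual player data is missing for the Netherlands on 22 Feb 2009, 30 Aug 2008, 22 Feb 2008 and 30 Aug 2007, because sofifa.com does not have the player data for those dates. We impute them as zero for now. (Note: no imputation step is visible in this notebook, so these values are still NaN when the data is saved below. TODO: confirm whether a `fillna(0)` cell was dropped.)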

In [70]:
# Report whether the merged frame still contains any missing value.
print(sofifa.isnull().values.any())

True


In [71]:
# save data: write the merged frame to CSV without the row index
sofifa.to_csv(path_or_buf='../data/sofifa_final.csv', index=False)