In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import re
import ast
import tqdm
%matplotlib inline

In [2]:
# Load the raw CSV and parse its `date` column as timezone-aware (UTC) timestamps.
personal = pd.read_csv('data/player_stats.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'], utc=True)
)


Team statistics are mixed in with this per-player data, so we extract the team statistics first. Note that these team statistics are only available from FIFA 10 onwards.

In [3]:
# restrict dates: only rows dated after this cut-off are kept for the team table
# (the notebook text states team statistics exist from FIFA 10 onwards).
earliest_date = '2010-01-01'

# Select rows and columns in a single .loc call and take an explicit copy, so that
# team_ext is an independent frame: columns assigned to it afterwards do not raise
# SettingWithCopyWarning and never write through to `personal`.
is_recent = personal['date'] > earliest_date
team_ext = personal.loc[is_recent, ['team', 'date', 'team_stats']].copy()
team_ext.head()

Unnamed: 0,team,date,team_stats
0,Brazil,2018-07-15 00:00:00+00:00,"['51', '73', '50', '39', '33', '67', '76', '73..."
1,England,2018-07-15 00:00:00+00:00,"['36', '20', '39', '31', '41', '41', '43', '55..."
2,Italy,2018-07-15 00:00:00+00:00,"['73', '60', '67', '64', '80', '78', '29', '26..."
3,Spain,2018-07-15 00:00:00+00:00,"['32', '41', '28', '27', '32', '20', '75', '62..."
4,France,2018-07-15 00:00:00+00:00,"['35', '52', '30', '24', '53', '35', '47', '47..."


In [4]:
def extract_team_stats(row):
    """Split a row's stringified nine-element list into a tuple of nine strings.

    Parameters
    ----------
    row : object with a ``team_stats`` attribute (e.g. a DataFrame row)
        ``row.team_stats`` must be text shaped like
        "['51', '73', '50', '39', '33', '67', '76', '73', '40']".

    Returns
    -------
    tuple of str
        The nine quoted elements, in order. Elements stay as text (an empty
        element comes back as ''); no numeric conversion is done here.

    Raises
    ------
    IndexError
        If ``row.team_stats`` does not contain a nine-element list of that shape.
    """
    # Raw string: the backslashes before the brackets are regex escapes, and in a
    # normal string literal they are invalid escape sequences that Python warns about.
    pattern = r"\['(.*)', '(.*)', '(.*)', '(.*)', '(.*)', '(.*)', '(.*)', '(.*)', '(.*)'\]"
    match = re.search(pattern, row.team_stats)
    if match is None:
        # Same exception type as indexing an empty findall() result, but with a
        # message that shows the offending value.
        raise IndexError(
            "team_stats is not a nine-element list string: %r" % (row.team_stats,)
        )
    return match.groups()

In [None]:
# Parse each row's stringified stats list into a tuple of nine strings.
# The elements are still text at this point; they are made numeric in a later cell.
parsed_team_stats = team_ext.apply(extract_team_stats, axis=1)
team_ext['team_stats'] = parsed_team_stats

In [None]:
# 'flatten' to dataframe: expand every nine-element tuple into nine columns (0..8).
# The frame is built once from the list of tuples instead of creating one
# pd.Series per row through a row-wise apply; the resulting columns are the same.
team_stats = pd.DataFrame(team_ext['team_stats'].tolist(), index=team_ext.index)
# Swap the tuple column for the expanded columns, keeping row alignment by index.
team_ext = pd.concat([team_ext.drop('team_stats', axis=1), team_stats], axis=1)

In [None]:
# Label the two identifier columns followed by the nine expanded attributes.
# NOTE(review): prefixes presumably mean bup_ = build-up play, cc_ = chance
# creation, d_ = defence — confirm against the data source.
team_stat_names = [
    'bup_speed', 'bup_dribbling', 'bup_passing',
    'cc_passing', 'cc_crossing', 'cc_shooting',
    'd_pressure', 'd_aggresion', 'd_width',
]
team_ext.columns = ['team', 'date'] + team_stat_names
display(team_ext.head())
team_ext.dtypes

In [None]:
# Convert the nine attribute columns from text to numbers, one column at a time.
# Note that some rows have no data for an attribute.
# NOTE(review): pd.to_numeric is expected to turn those blanks into NaN — confirm.
stat_columns = [
    'bup_speed', 'bup_dribbling', 'bup_passing',
    'cc_passing', 'cc_crossing', 'cc_shooting',
    'd_pressure', 'd_aggresion', 'd_width',
]
for stat_column in stat_columns:
    team_ext[stat_column] = pd.to_numeric(team_ext[stat_column])

team_ext.dtypes

In [None]:
# Shape of the subset of rows whose bup_dribbling is missing (first entry = row count).
team_ext.loc[team_ext['bup_dribbling'].isna()].shape

The shape above shows that `bup_dribbling` has missing values — 2193 rows in total. We replace them with zeros for now.

In [None]:
# Replace every missing entry with 0, then check that no missing values remain
# (the displayed result should be False).
team_ext = team_ext.fillna(0)
team_ext.isna().values.any()

In [None]:
# Persist the cleaned team-level table; index=False leaves the row index out of the file.
team_ext.to_csv('data/team_stats_ext.csv', index=False)

After dealing with the extended team statistics, we can now look at the individual player data that we have.

In [5]:
# The team statistics were handled above, so remove that column from the player table.
personal = personal.drop(columns='team_stats')
personal.head()

Unnamed: 0,team,date,name,overall,potential,value,wage
0,Brazil,2018-07-15 00:00:00+00:00,"['Alisson', 'Fagner', 'Thiago Silva', 'Miranda...","['84', '77', '86', '87', '87', '84', '87', '88...","['88', '77', '86', '87', '87', '84', '87', '91...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
1,England,2018-07-15 00:00:00+00:00,"['J. Pickford', 'K. Trippier', 'K. Walker', 'J...","['80', '80', '83', '80', '79', '79', '80', '80...","['87', '81', '84', '86', '84', '79', '83', '82...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
2,Italy,2018-07-15 00:00:00+00:00,"['G. Buffon', 'A. Florenzi', 'G. Chiellini', '...","['88', '82', '86', '81', '77', '80', '81', '85...","['88', '82', '86', '88', '81', '80', '84', '88...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
3,Spain,2018-07-15 00:00:00+00:00,"['De Gea', 'Nacho Fernández', 'Piqué', 'Sergio...","['91', '82', '87', '90', '85', '85', '87', '87...","['93', '84', '87', '90', '85', '88', '87', '87...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
4,France,2018-07-15 00:00:00+00:00,"['H. Lloris', 'B. Pavard', 'R. Varane', 'S. Um...","['87', '78', '85', '85', '79', '85', '86', '88...","['87', '84', '90', '89', '89', '94', '91', '90...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."


In [6]:
# Each row of `personal` is one (team, date) snapshot whose player attributes are
# stored as stringified Python lists. Build one small DataFrame per snapshot,
# keyed by (team, date).
#
# The columns are iterated directly with zip instead of being looked up through
# personal.loc[i, ...] with a 0..n-1 counter, so the loop does not depend on
# `personal` having a default integer index and avoids five scalar lookups per row.
list_columns = ['name', 'overall', 'potential', 'value', 'wage']
player_stats = {}
snapshots = zip(personal['team'], personal['date'],
                *(personal[column] for column in list_columns))
for team, date, *raw_lists in tqdm.tqdm(snapshots, total=len(personal)):
    # ast.literal_eval only evaluates literals, so the list text is parsed
    # without executing arbitrary code.
    parsed = {column: ast.literal_eval(raw)
              for column, raw in zip(list_columns, raw_lists)}
    # A repeated (team, date) key overwrites the earlier snapshot.
    player_stats[(team, date)] = pd.DataFrame(parsed)

100%|███████████████████████████████████████████████████████████████████████████| 26908/26908 [02:15<00:00, 199.22it/s]


In [7]:
# Stack the per-snapshot frames into one long table. The dict keys become the
# first two index levels (team, date) and each frame's own row number the third;
# reset_index turns them into columns and the row-number column is discarded.
player_stats_df = (
    pd.concat(player_stats, axis=0)
    .reset_index()
    .drop(columns='level_2')
)
player_stats_df.columns = [
    'team', 'date',
    'name', 'overall', 'potential', 'value', 'wage',
]
player_stats_df.head()

Unnamed: 0,team,date,name,overall,potential,value,wage
0,Argentina,2006-08-30 00:00:00+00:00,R. Abbondanzieri,84,85,€0,€0
1,Argentina,2006-08-30 00:00:00+00:00,N. Burdisso,72,76,€0,€0
2,Argentina,2006-08-30 00:00:00+00:00,R. Ayala,86,86,€0,€0
3,Argentina,2006-08-30 00:00:00+00:00,G. Heinze,85,84,€0,€0
4,Argentina,2006-08-30 00:00:00+00:00,J. Sorín,76,78,€0,€0


Now that we have our data in tabular format, we should clean up some of the columns.

In [53]:
def _euro_amount(text, unit):
    """Convert a price string such as '€110.5M', '€500K' or '€0' to a float.

    A trailing 'K' means thousands of euros and a trailing 'M' means millions;
    a string without a suffix is read as plain euros.

    Parameters
    ----------
    text : str
        The price string, with or without the leading euro sign.
    unit : float
        Number of euros per unit of the result (1e3 for thousands, 1e6 for millions).

    Returns
    -------
    float
        The amount expressed in `unit` euros.

    Raises
    ------
    ValueError
        If the numeric part of `text` cannot be parsed as a float.
    """
    number = text.replace('€', '')
    # number[-1:] is '' for an empty string, so this lookup never raises.
    multiplier = {'K': 1e3, 'M': 1e6}.get(number[-1:], 1.0)
    if multiplier != 1.0:
        number = number[:-1]
    return float(number) * multiplier / unit


# edit value/wage columns into numeric data.
# Both columns go through the same suffix-aware parser, so an amount without a
# K/M suffix is treated as plain euros instead of being read as thousands (wage)
# or millions (value), and a wage given in millions no longer fails to parse.
player_stats_df['wage'] = (player_stats_df['wage']
                           .map(lambda wage: _euro_amount(wage, 1e3))
                           .astype(float))
player_stats_df['value'] = (player_stats_df['value']
                            .map(lambda value: _euro_amount(value, 1e6))
                            .astype(float))

# Put the units into the column names.
player_stats_df = player_stats_df.rename(columns={
    'wage': 'wage_euros_thousands',
    'value': 'value_euros_millions'
})

player_stats_df.head()

Unnamed: 0,team,date,name,overall,potential,value_euros_millions,wage_euros_thousands
0,Argentina,2006-08-30 00:00:00+00:00,R. Abbondanzieri,84,85,0.0,0.0
1,Argentina,2006-08-30 00:00:00+00:00,N. Burdisso,72,76,0.0,0.0
2,Argentina,2006-08-30 00:00:00+00:00,R. Ayala,86,86,0.0,0.0
3,Argentina,2006-08-30 00:00:00+00:00,G. Heinze,85,84,0.0,0.0
4,Argentina,2006-08-30 00:00:00+00:00,J. Sorín,76,78,0.0,0.0


In [54]:
# Persist the cleaned per-player table; index=False leaves the row index out of the file.
player_stats_df.to_csv('data/player_stats_clean.csv', index=False)

Let us derive some team-level statistics from this player data.

In [55]:
# Make both rating columns numeric, then derive `growth` as the gap between a
# player's potential and current overall rating.
for rating_column in ('overall', 'potential'):
    player_stats_df[rating_column] = pd.to_numeric(player_stats_df[rating_column])
player_stats_df['growth'] = player_stats_df['potential'].sub(player_stats_df['overall'])

In [57]:
# One row per (team, date) snapshot: the mean wage, value and growth of the
# listed players, plus the `overall` rating of the first listed player.
# The string alias 'mean' replaces np.mean: pandas already routed np.mean to its
# own grouped mean here, and recent releases warn when the NumPy callable is passed.
team_stats_players = (
    player_stats_df
    .groupby(['team', 'date'], as_index=False)
    .agg({'wage_euros_thousands': 'mean',
          'value_euros_millions': 'mean',
          'growth': 'mean',
          'overall': 'first'})  # goalkeeper's overall — assumes the keeper is listed first
    # The output column name is kept exactly as spelled, since it is written to the CSV.
    .rename(columns={'overall': 'goalkeeeper_overall'})
)

In [58]:
# Preview the first rows of the aggregated per-team, per-date table.
team_stats_players.head()

Unnamed: 0,team,date,wage_euros_thousands,value_euros_millions,growth,goalkeeeper_overall
0,Argentina,2006-08-30 00:00:00+00:00,0.0,0.0,4.88,84
1,Argentina,2007-02-22 00:00:00+00:00,0.0,0.0,5.32,84
2,Argentina,2007-08-30 00:00:00+00:00,0.0,0.0,3.84,79
3,Argentina,2008-02-22 00:00:00+00:00,0.0,0.0,3.84,79
4,Argentina,2008-08-30 00:00:00+00:00,0.0,0.0,4.0,82


In [59]:
# Persist the aggregated team-level table; index=False leaves the row index out of the file.
team_stats_players.to_csv('data/team_stats_players.csv', index=False)