In [2]:
import numpy as np
import pandas as pd
import requests
from datetime import datetime
import json

#Set Display options
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width' , 1000)

In [3]:
# Variables
season = "2020-21"

In [4]:
all_players = pd.read_csv (r'./data/' + season + '/Raw_data/all_players.csv', parse_dates= ["news_added"])
all_teams = pd.read_csv (r'./data/' + season + '/Raw_data/all_teams.csv')
all_events = pd.read_csv (r'./data/' + season + '/Raw_data/all_events.csv', parse_dates= ["deadline_time"])
player_types = pd.read_csv(r'./data/' + season + '/Raw_data/player_types.csv')
game_phases = pd.read_csv(r'./data/' + season + '/Raw_data/game_phases.csv')

player_season_history = pd.read_csv(r'./data/' + season + '/Raw_data/player_season_history.csv')
#player_gameweek_history = pd.read_csv(r'./data/' + season + '/player_past_history/player_gameweek_history_raw.csv')
player_gameweek_history = pd.read_csv(r'./data/2019-20/player_past_history/player_gameweek_history_raw.csv')
player_future_fixture = pd.read_csv(r'./data/' + season + '/Raw_data/player_future_fixture.csv', parse_dates = ["kickoff_time"])



## Rename columns

In [5]:
player_gameweek_history.head()

Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_saved,penalties_missed,yellow_cards,red_cards,saves,bonus,bps,influence,creativity,threat,ict_index,value,transfers_balance,selected,transfers_in,transfers_out
0,1,10,13,0,False,2019-08-11T13:00:00Z,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,55,0,33117,0,0
1,1,11,5,0,True,2019-08-17T11:30:00Z,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,55,-5280,36709,2868,8148
2,1,24,10,0,False,2019-08-24T16:30:00Z,3,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,54,-6882,30975,534,7416
3,1,31,17,0,True,2019-09-01T15:30:00Z,2,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,54,-3872,28096,346,4218
4,1,49,18,0,False,2019-09-15T15:30:00Z,2,2,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,53,-2073,26902,581,2654


In [6]:
player_gameweek_history.rename(columns = {"element" : "player_id" , "opponent_team" : "opponent_team_id" , "fixture" : "fixture_id" } , inplace = True)
all_players.rename(columns = {"code" : "player_code", "element_type" : "player_type_id" , "id" : "player_id" , "team" : "team_id" , "points_per_game" : "PPG"} , inplace=True)
all_teams.rename(columns = {"id" : "team_id" , "code" : "team_code" , "name" : "team_name" , "short_name" : "team_short_name"} , inplace = True)
all_events.rename(columns = {"id" : "gameweek_id"} , inplace = True)
player_types.rename(columns = {"id" : "player_type_id" , "singular_name" : "position_name" , "singular_name_short" : "position_name_short"} , inplace = True)
game_phases.rename(columns = {"id" : "phase_id"} , inplace = True)
player_season_history.rename(columns = {"element_code" : "player_code"} , inplace = True)
player_future_fixture.rename(columns = {"id" : "fixture_id" , "team_h" : "home_team_id" , "team_a" : "away_team_id" , "events" : "gameweek_id" , "code" : "fixture_code"} , inplace = True)
#player_gameweek_history.rename(columns = {} , inplace = True)


## Drop columns

In [7]:
all_events.head(5)

Unnamed: 0,gameweek_id,name,deadline_time,average_entry_score,finished,data_checked,highest_scoring_entry,deadline_time_epoch,deadline_time_game_offset,highest_score,is_previous,is_current,is_next,chip_plays,most_selected,most_transferred_in,top_element,top_element_info,transfers_made,most_captained,most_vice_captained
0,1,Gameweek 1,2020-09-12 12:30:00+00:00,0,False,False,,1599913800,0,,False,False,True,[],,,,,0,,
1,2,Gameweek 2,2020-09-19 12:30:00+00:00,0,False,False,,1600518600,0,,False,False,False,[],,,,,0,,
2,3,Gameweek 3,2020-09-26 12:30:00+00:00,0,False,False,,1601123400,0,,False,False,False,[],,,,,0,,
3,4,Gameweek 4,2020-10-03 12:30:00+00:00,0,False,False,,1601728200,0,,False,False,False,[],,,,,0,,
4,5,Gameweek 5,2020-10-17 12:30:00+00:00,0,False,False,,1602937800,0,,False,False,False,[],,,,,0,,


In [8]:
all_events.drop(columns = ["deadline_time_epoch"] , inplace = True)
player_types.drop(columns = ["plural_name" , "plural_name_short"] , inplace = True)
all_players.drop(columns = ["squad_number" , "special" , "transfers_in" , "transfers_out" , "first_name" , "second_name"] , inplace = True)

## Change date columns

In [9]:
all_players['news_added'] = all_players['news_added'].dt.tz_localize(None)
all_events['deadline_time'] = all_events['deadline_time'].dt.tz_localize(None)
player_future_fixture['kickoff_time'] = player_future_fixture['kickoff_time'].dt.tz_localize(None)

player_gameweek_history['kickoff_time'] = pd.to_datetime(player_gameweek_history['kickoff_time'])
player_gameweek_history['kickoff_time'] = player_gameweek_history['kickoff_time'].dt.tz_localize(None)


## Add Columns

In [10]:
player_future_fixture.head()

Unnamed: 0,fixture_id,fixture_code,home_team_id,team_h_score,away_team_id,team_a_score,event,finished,minutes,provisional_start_time,kickoff_time,event_name,is_home,difficulty,player_id
0,2,2128288,8,,1,,1.0,False,0,False,2020-09-12 14:00:00,Gameweek 1,False,2,1
1,9,2128296,1,,19,,2.0,False,0,False,2020-09-19 14:00:00,Gameweek 2,True,2,1
2,23,2128310,11,,1,,3.0,False,0,False,2020-09-26 14:00:00,Gameweek 3,False,5,1
3,29,2128316,1,,15,,4.0,False,0,False,2020-10-03 14:00:00,Gameweek 4,True,3,1
4,44,2128331,12,,1,,5.0,False,0,False,2020-10-17 14:00:00,Gameweek 5,False,5,1


In [11]:
def opponent_team(row):
    if row['is_home'] == True:
        val = row['away_team_id']
    elif row['is_home'] == False:
        val = row['home_team_id']
    return val

def team(row):
    if row['is_home'] == False:
        val = row['away_team_id']
    elif row['is_home'] == True:
        val = row['home_team_id']
    return val


player_future_fixture['opponent_team_id'] = player_future_fixture.apply(opponent_team,axis=1)
player_future_fixture['team_id'] = player_future_fixture.apply(team,axis=1)


## Calculate New Columns

In [12]:
all_players.columns

Index(['chance_of_playing_next_round', 'chance_of_playing_this_round', 'player_code', 'cost_change_event', 'cost_change_event_fall', 'cost_change_start', 'cost_change_start_fall', 'dreamteam_count', 'player_type_id', 'ep_next', 'ep_this', 'event_points', 'form', 'player_id', 'in_dreamteam', 'news', 'news_added', 'now_cost', 'photo', 'PPG', 'selected_by_percent', 'status', 'team_id', 'team_code', 'total_points', 'transfers_in_event', 'transfers_out_event', 'value_form', 'value_season', 'web_name', 'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded', 'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity', 'threat', 'ict_index', 'influence_rank', 'influence_rank_type', 'creativity_rank', 'creativity_rank_type', 'threat_rank', 'threat_rank_type', 'ict_index_rank', 'ict_index_rank_type'], dtype='object')

In [13]:
all_players.head()

Unnamed: 0,chance_of_playing_next_round,chance_of_playing_this_round,player_code,cost_change_event,cost_change_event_fall,cost_change_start,cost_change_start_fall,dreamteam_count,player_type_id,ep_next,ep_this,event_points,form,player_id,in_dreamteam,news,news_added,now_cost,photo,PPG,selected_by_percent,status,team_id,team_code,total_points,...,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_saved,penalties_missed,yellow_cards,red_cards,saves,bonus,bps,influence,creativity,threat,ict_index,influence_rank,influence_rank_type,creativity_rank,creativity_rank_type,threat_rank,threat_rank_type,ict_index_rank,ict_index_rank_type
0,,,37605,0,0,0,0,0,3,3.4,,0,0.0,1,False,,NaT,70,37605.jpg,2.9,2.1,a,1,3,53,...,1439,1,3,5,20,0,0,0,1,0,0,1,256,223.6,582.9,190.0,99.8,253,109,35,29,174,95,135,72
1,,,39476,0,0,0,0,0,2,3.1,,0,0.0,2,False,,NaT,50,39476.jpg,3.0,0.6,a,1,3,57,...,1696,2,0,4,25,0,0,0,6,0,0,5,305,436.2,36.8,110.0,58.5,144,53,293,105,222,64,233,74
2,,,41270,0,0,0,0,0,2,3.5,,0,0.0,3,False,,NaT,55,41270.jpg,2.8,1.7,a,1,3,94,...,2809,2,1,8,42,0,0,0,5,2,0,10,494,701.6,106.7,211.0,102.1,50,18,221,61,160,32,130,35
3,,,54694,0,0,0,0,0,3,5.0,,0,0.0,4,False,,NaT,120,54694.jpg,5.7,32.9,a,1,3,205,...,3136,22,5,10,44,0,0,0,3,1,0,37,807,1006.0,479.6,1369.0,285.2,8,4,54,42,9,3,11,7
4,,,58822,0,0,0,0,0,2,3.1,,0,0.0,5,False,,NaT,50,58822.jpg,2.9,0.4,a,1,3,61,...,1553,1,1,4,20,0,0,0,1,0,0,3,286,349.0,218.9,118.0,68.7,182,68,158,34,221,63,201,59


In [14]:
all_players['PP90'] = np.divide(all_players['total_points'] , all_players['minutes'])*90 

In [15]:
all_players['PPMM'] = np.divide(all_players['PPG'] , all_players['now_cost']/10)

In [16]:
all_players['VAPM'] = np.divide(all_players['PPG']-2 , all_players['now_cost']/10)

In [17]:
all_players[['web_name' , 'player_type_id', 'total_points' , 'minutes' , 'PP90' , 'PPG' , 'PPMM' , 'VAPM']][all_players['minutes']>1000].sort_values(by='VAPM' , ascending = False).head()

Unnamed: 0,web_name,player_type_id,total_points,minutes,PP90,PPG,PPMM,VAPM
302,Fernandes,3,117,1187,8.871104,8.4,0.8,0.609524
104,Alonso,2,100,1429,6.298111,5.6,0.933333,0.6
387,Lloris,1,98,1808,4.878319,4.7,0.854545,0.490909
259,Alexander-Arnold,2,210,3173,5.956508,5.5,0.733333,0.466667
303,Martial,4,200,2625,6.857143,6.2,0.688889,0.466667


## Save cleaned data

In [18]:
all_players.to_csv (r'./data/' + season + '/Cleaned_data/all_players.csv', index = False, header=True)
all_teams.to_csv (r'./data/' + season + '/Cleaned_data/all_teams.csv', index = False, header = True)
all_events.to_csv (r'./data/' + season + '/Cleaned_data/all_events.csv', index = False, header = True)
player_types.to_csv(r'./data/' + season + '/Cleaned_data/player_types.csv', index = False, header = True)
game_phases.to_csv(r'./data/' + season + '/Cleaned_data/game_phases.csv', index = False, header = True)

player_season_history.to_csv (r'./data/' + season + '/Cleaned_data/player_season_history.csv', index = False, header=True)
player_gameweek_history.to_csv (r'./data/' + season + '/Cleaned_data/player_gameweek_history.csv', index = False, header=True)
player_future_fixture.to_csv (r'./data/' + season + '/Cleaned_data/player_future_fixture.csv', index = False, header=True)