**Importing libraries**

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

**Function to build a combined player dataframe across all seasons**

In [2]:
def build_players(path, season_paths, season_names, teams):
    """Combine the per-season ``players_raw.csv`` files into one wide player table.

    Parameters
    ----------
    path : any
        Not used by this function; kept so existing calls keep working.
    season_paths : iterable of path-like
        One directory per season. Each must support the ``/`` operator
        (e.g. ``pathlib.Path``) and contain a ``players_raw.csv`` file.
    season_names : iterable of str
        Labels paired positionally with ``season_paths``; each label is
        appended to that season's column names (``id_<season>``, ...).
    teams : any
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A ``full_name`` column (``first_name + '_' + second_name``) followed by
        one group of season-suffixed columns per season. Players missing from a
        season get NaN in that season's columns. The index holds string labels
        (``'0'``, ``'1'``, ...) because the rename step maps the index through
        ``str``.

    Notes
    -----
    NOTE(review): a name that occurs more than once inside a single season
    file produces one output row per occurrence after the left merge --
    confirm whether that is acceptable for the data being loaded.
    """
    wanted_columns = ['id', 'first_name', 'second_name',
                      'element_type', 'team_code',
                      'bps', 'chance_of_playing_next_round',
                      'now_cost', 'selected_by_percent',
                      'transfers_in', 'transfers_in_event',
                      'transfers_out', 'transfers_out_event']

    # Raw column name -> prefix of the season-specific column it becomes.
    column_prefixes = {
        'id': 'id_',
        'element_type': 'position_',
        'team_code': 'team_',
        'bps': 'bps_',
        'chance_of_playing_next_round': 'play_proba_',
        'now_cost': 'cost_',
        'selected_by_percent': 'selected_by_percent_',
        'transfers_in': 'transfers_in_',
        'transfers_in_event': 'transfers_in_event_',
        'transfers_out': 'transfers_out_',
        'transfers_out_event': 'transfers_out_event_',
    }

    # Load each season and collapse first/second name into a single join key.
    per_season = []
    for season_dir in season_paths:
        raw = pd.read_csv(season_dir / 'players_raw.csv', usecols=wanted_columns)
        raw['full_name'] = raw['first_name'] + '_' + raw['second_name']
        per_season.append(raw.drop(columns=['first_name', 'second_name']))

    # One row per distinct name seen in any season.
    stacked = pd.concat(per_season, axis=0, ignore_index=True, sort=False)
    roster = stacked['full_name'].drop_duplicates().to_frame()

    # Attach every season's attributes side by side, tagging the columns with
    # the season label so that successive merges never collide.
    for season_df, season in zip(per_season, season_names):
        suffixed = {raw_name: prefix + season
                    for raw_name, prefix in column_prefixes.items()}
        roster = (roster
                  .merge(season_df, on='full_name', how='left')
                  .rename(index=str, columns=suffixed))

    return roster

**Function to build season dataframe for one season**

In [3]:
def build_season(season_path,season_name, teams, all_players, gw = range(1,39)):
    """Assemble one season's gameweek files into a single player-by-gameweek table.

    Parameters
    ----------
    season_path : path-like
        Season directory; must support ``/`` (e.g. ``pathlib.Path``) and hold
        the files ``gws/gw<N>.csv``.
    season_name : str
        Season label. Used to pick the ``id_<season>``, ``team_<season>`` and
        ``position_<season>`` columns and written to the ``season`` column.
    teams : pandas.DataFrame
        Team lookup. The merges use its ``team_<season>`` and ``team_code``
        columns; presumably it also carries the team name in a ``team``
        column, which is what yields ``team_x`` / ``team_y`` -- confirm
        against the caller.
    all_players : pandas.DataFrame
        Player lookup with at least ``full_name``, ``id_<season>``,
        ``team_<season>`` and ``position_<season>``.
    gw : iterable of int, optional
        Gameweek numbers to load (default 1..38).

    Returns
    -------
    pandas.DataFrame
        One row per gameweek-file row with the columns ``player``, ``gw``,
        ``minutes``, ``team``, ``position``, ``opponent_team``, the per-match
        statistics, ``kickoff_time`` and ``season``. ``gw`` and ``position``
        are cast to ``int``.

    Notes
    -----
    NOTE(review): the final ``int`` cast of ``position`` will fail if any row
    found no match in ``all_players`` (NaN position) -- confirm inputs are
    complete.
    """
    player_id_col = 'id_' + season_name
    team_col = 'team_' + season_name
    position_col = 'position_' + season_name

    # Read every requested gameweek and tag its rows with the gameweek number.
    weekly_frames = []
    for week in gw:
        week_file = 'gws/gw' + str(week) + '.csv'
        week_df = pd.read_csv(season_path / week_file, encoding='latin')
        weekly_frames.append(week_df.assign(gw=str(week)))

    stacked = pd.concat(weekly_frames, axis=0, sort=False)

    # 1) attach player attributes via the season-specific player id
    with_players = stacked.merge(all_players, how='left',
                                 left_on='element', right_on=player_id_col)
    # 2) attach the opponent's team row; both sides own a team_<season> column,
    #    so pandas suffixes them (_x = from the player table, _y = from teams)
    with_opponent = with_players.merge(teams, how='left',
                                       left_on='opponent_team', right_on=team_col)
    # 3) attach the player's own team row; the overlapping teams columns now
    #    split into *_x (from merge 2, opponent) and *_y (from this merge)
    merged = with_opponent.merge(teams, how='left',
                                 left_on=team_col + '_x', right_on='team_code')

    match_stats = ['was_home', 'total_points',
                   'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
                   'goals_conceded', 'goals_scored', 'ict_index', 'influence',
                   'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
                   'saves', 'selected', 'team_a_score', 'team_h_score', 'threat',
                   'transfers_balance', 'transfers_in', 'transfers_out',
                   'yellow_cards', 'kickoff_time']
    kept = ['full_name', 'gw', 'minutes', 'team_y', position_col, 'team_x'] + match_stats
    friendly_names = {'full_name': 'player',
                      'team_y': 'team',
                      position_col: 'position',
                      'team_x': 'opponent_team'}

    season_table = merged[kept].rename(columns=friendly_names)
    season_table['season'] = season_name
    season_table = season_table.astype({'gw': int, 'position': int})

    return season_table

**Function to label positions for each player**

In [4]:
def label_player_positions(df):
    """Insert a ``position_name`` column translating numeric position codes.

    Codes map as 1 -> 'GK', 2 -> 'DEF', 3 -> 'MID', 4 -> 'FWD'. Any other
    value is labelled ``'0'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``position`` column and at least five columns in total,
        because the new column is inserted at column index 5.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    conditions = [df['position'] == code for code in (1, 2, 3, 4)]
    outputs = ['GK', 'DEF', 'MID', 'FWD']

    # Give np.select an explicit *string* default. The implicit default is the
    # integer 0, which older NumPy silently coerced to '0' but newer NumPy
    # rejects because it shares no common dtype with the string choices.
    labels = np.select(conditions, outputs, default='0')

    # Insert the raw array rather than a freshly indexed Series: a Series would
    # be aligned on index labels and mislabel any frame whose index is not
    # exactly 0..n-1 (e.g. several seasons concatenated together).
    df.insert(5, 'position_name', labels, allow_duplicates=True)

    return df

**Function to clean up the team xG data for one season**

In [5]:
def xg_data_cleanup(df,season):
    """Trim a team xG table to its key columns and normalise the team names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``team``, ``xG``, ``xGA``, ``npxG`` and
        ``npxGA``. It is not modified.
    season : any
        Season label; stored as ``str(season)`` in a new ``season`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``team``, ``xG``, ``xGA``, ``npxG``,
        ``npxGA`` and ``season`` and a fresh 0..n-1 index. Short team names
        listed in the mapping below are replaced by their long form; all other
        names are left untouched.
    """
    team_names = {'Tottenham': 'Tottenham Hotspur',
                  'Leicester': 'Leicester City',
                  'West Ham': 'West Ham United',
                  'Stoke': 'Stoke City',
                  'Swansea': 'Swansea City',
                  'Hull': 'Hull City',
                  'Huddersfield': 'Huddersfield Town',
                  'Brighton': 'Brighton and Hove Albion',
                  'Cardiff': 'Cardiff City'}

    # Explicit copy so the assignments below never touch the caller's frame.
    cleaned = df.loc[:, ['team', 'xG', 'xGA', 'npxG', 'npxGA']].copy()

    # Replace within the team column only; assigning the result of replacing
    # over the whole frame to a single column is an error.
    cleaned['team'] = cleaned['team'].replace(team_names)
    cleaned['season'] = str(season)

    # reset_index returns a new frame, so its result has to be the one returned.
    return cleaned.reset_index(drop=True)