Some unimplemented ideas:
- Instead of the plain mean, use a weighted update (with weight alpha) when calculating stats.
- Instead of keeping identical variables separately for home and away, try combining each pair into one variable, e.g. their difference.
- Only use the top 11 players by rating (the others are considered substitutes who did not play).
- Lineup stats currently use the sum. Try the mean?

New implementation:
- split the lineup into per-player features, by position
- select the top 11 players by player rating?
- use weighted statistics for team and player stats
- predict the goal difference
- predict each team's goals in a match

In [1]:
import json
import pandas as pd
import collections
import numpy as np

In [2]:
#GLOBAL VAR
# Stat columns read from each FPL player row (see seasonalPlayerStats).
# The spelling 'assits' is used as a lookup key, so it presumably matches the
# CSV header -- do not correct it here alone.
fpl_data_cols = ['minutes', 'goals', 'assits', 'cs', 'points', 'bonus', 'price']
# Names under which those values are stored after merging. Paired with
# fpl_data_cols by position, so both lists must keep the same length and order.
fpl_merge_cols = ['fpl_minutes', 'fpl_goals', 'fpl_assits', 'fpl_cs', 'fpl_points', 'fpl_bonus', 'fpl_cprice']
# Season labels; they are spliced into file paths such as ".../season14-15/...".
available_season = ["14-15", "15-16", "16-17", "17-18"]
# Weight given to the newest observation in the running update
#   new = (1 - w) * old + w * latest
# for team stats, player stats and FPL player stats respectively.
# NOTE: the In [9] cell reassigns all three to 0.7 before using them.
teamWeightUpdate = 0.4
playerWeightUpdate = 0.4
fplPlayerWeightUpdate = 0.6
# Appended to the output CSV filename in the In [9] cell. "040406" presumably
# encodes the three weights above -- confirm.
suffix = "_weight_update_040406"

In [3]:
def getAllStatKeys(season_list=None):
    """Collect every team-aggregate and player match-stat key found in the season files.

    For each season, reads ``datafilev2/datafile/season<season>/season_stats.json``
    and walks match -> team, gathering the keys of ``aggregate_stats`` and of
    every player's ``Match_stats``.

    Args:
        season_list: Iterable of season labels to scan. When omitted, the four
            seasons "14-15" to "17-18" are scanned (the previous hard-coded
            behaviour).

    Returns:
        A tuple ``(aggregate_stats_keys, player_stats_keys)`` of two lists,
        each holding unique keys in order of first appearance.

    Raises:
        OSError: if a season file cannot be opened.
        KeyError: if a team entry lacks ``aggregate_stats``/``Player_stats`` or
            a player entry lacks ``Match_stats``.
    """
    if season_list is None:
        season_list = ["14-15", "15-16", "16-17", "17-18"]

    aggregate_stats_keys = []
    player_stats_keys = []
    # Companion sets give O(1) membership tests; the lists keep the order.
    seen_aggregate = set()
    seen_player = set()

    for season in season_list:
        season_stats_path = "datafilev2/datafile/season"+season+"/season_stats.json"
        with open(season_stats_path, encoding="utf8") as json_file:
            print("processing season", season)
            data = json.load(json_file)
        for teams in data.values():
            for team in teams.values():
                for aggStatKey in team['aggregate_stats']:
                    if aggStatKey not in seen_aggregate:
                        seen_aggregate.add(aggStatKey)
                        aggregate_stats_keys.append(aggStatKey)
                for player in team['Player_stats'].values():
                    for playerStatKey in player['Match_stats']:
                        if playerStatKey not in seen_player:
                            seen_player.add(playerStatKey)
                            player_stats_keys.append(playerStatKey)
    return aggregate_stats_keys, player_stats_keys

def _winner_from_score(full_time_score):
    """Map a "<home> : <away>" score string to 1 (home win), -1 (away win) or 0 (draw)."""
    score = full_time_score.split(" : ")
    # Compare as integers: compared as strings, "10" sorts before "9".
    home_goals, away_goals = int(score[0]), int(score[1])
    if home_goals > away_goals:
        return 1
    if home_goals < away_goals:
        return -1
    return 0


def _summed_lineup_stats(player_df, lineup):
    """Sum the stat columns (every column after the first two) over the named players."""
    stat_rows = [player_df[player_df['playerName'] == name].values[0,2:] for name in lineup]
    return np.array(stat_rows).sum(axis=0)


def constructSeasonalMatchTable(season = "14-15", save = False):
    """Build one row per match of a season with team stats and summed lineup stats.

    Inputs read from disk:
      * ``players_data/fpl_<season>_COMPLETED_CLEAN.csv`` - per-player stats;
        the first two columns are skipped, the rest are summed over a lineup.
      * ``teams_data/fpl_team_<season>_COMPLETED.csv`` - per-team stats; the
        first column is skipped.
      * ``datafilev2/datafile/season<season>/season_match_stats.json`` - date,
        team names and full-time score per match.
      * ``datafilev2/datafile/season<season>/season_stats.json`` - the players
        listed for each team in each match.

    Args:
        season: Season label used in the file paths.
        save: When true, also write the table to
            ``season_matches/season<season>_matches_clean.csv``.

    Returns:
        A DataFrame with columns ``match_id, date, home_team_name,
        away_team_name, winner`` followed by ``home_team_*``,
        ``home_lineup_*``, ``away_team_*`` and ``away_lineup_*``. ``winner`` is
        1 for a home win, -1 for an away win and 0 for a draw.

    Raises:
        IndexError: if a team or a listed player has no row in the CSV tables.
        ValueError: if a score part is not an integer.
    """
    player_data_path = "players_data/fpl_"+season+"_COMPLETED_CLEAN.csv"
    team_data_path = "teams_data/fpl_team_"+season+"_COMPLETED.csv"
    player_df = pd.read_csv(player_data_path)
    team_df = pd.read_csv(team_data_path)

    player_cols = list(player_df.columns[2:])
    team_cols = list(team_df.columns[1:])
    df_cols = (
        ['match_id', 'date', 'home_team_name', 'away_team_name', 'winner']
        + ["home_team_" + col for col in team_cols]
        + ["home_lineup_" + col for col in player_cols]
        + ["away_team_" + col for col in team_cols]
        + ["away_lineup_" + col for col in player_cols]
    )

    season_match_path = "datafilev2/datafile/season"+season+"/season_match_stats.json"
    with open(season_match_path, encoding="utf8") as json_file:
        season_match_dict = json.load(json_file)

    match_stat_path = "datafilev2/datafile/season"+season+"/season_stats.json"
    with open(match_stat_path, encoding="utf8") as json_file:
        match_stat_data = json.load(json_file)

    # match id -> team name -> names of every player listed for that team
    match_lineup_dict = {}
    for matchId, matchUp in match_stat_data.items():
        match_lineup_dict[matchId] = {}
        for teamStat in matchUp.values():
            teamName = teamStat['team_details']['team_name']
            match_lineup_dict[matchId][teamName] = list(teamStat['Player_stats'])

    # Collect plain rows and build the frame once; growing a DataFrame row by
    # row re-copies it on every insert.
    rows = []
    for matchId, match in season_match_dict.items():
        homeTeam = match['home_team_name']
        awayteam = match['away_team_name']
        newRow = [
            matchId,
            match['date_string'],
            homeTeam,
            awayteam,
            _winner_from_score(match['full_time_score']),
        ]
        # Home block first, then away block, matching the order of df_cols.
        for teamName in (homeTeam, awayteam):
            teamStat = team_df[team_df['teamName'] == teamName]
            newRow.extend(teamStat.values[0,1:])
            newRow.extend(_summed_lineup_stats(player_df, match_lineup_dict[matchId][teamName]))
        rows.append(newRow)

    master_df = pd.DataFrame(rows, columns=df_cols)
    if save:
        master_df.to_csv ("season_matches/season"+season+"_matches_clean.csv", index = None, header=True)
    return master_df
def constructAllSeason(save = False):
    """Build the match table of every season in ``available_season`` and stack them.

    Each season is built with ``constructSeasonalMatchTable`` (without saving
    the per-season file) and the frames are concatenated in season order. Row
    labels are kept as produced per season, so they repeat across seasons.

    Args:
        save: When true, also write the combined table to
            ``season_matches/all_season_matches_clean.csv``.

    Returns:
        The combined DataFrame.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack frames and keeps the same index/column-union behaviour.
    season_frames = [constructSeasonalMatchTable(season) for season in available_season]
    df = pd.concat(season_frames)
    if save:
        df.to_csv ("season_matches/all_season_matches_clean.csv", index = None, header=True)
    return df
def seasonalTeamStats(aggregate_stats_keys, season = "14-15", constructDataFrame = False, saveCsv = False):
    """Average each team's rating and aggregate stats over one season.

    Reads ``datafilev2/datafile/season<season>/season_stats.json``, gathers one
    sample per team per match for ``team_rating`` and for every key of
    ``aggregate_stats``, then replaces each sample list with its mean (``0``
    when a stat never occurred for that team).

    Args:
        aggregate_stats_keys: List of stat names to track for every team.
        season: Season label used in the file paths.
        constructDataFrame: When true, return a DataFrame instead of a dict.
        saveCsv: When true, write the table to disk.

    Returns:
        A DataFrame (columns ``teamName``, ``team_rating`` and the stat keys)
        when ``constructDataFrame`` is true; otherwise an ``OrderedDict``
        mapping team name -> {stat: mean}, sorted by team name.

    Note:
        ``teams_data/fpl_team_<season>_COMPLETED.csv`` is written whenever
        either flag is true, not only when ``saveCsv`` is set.
    """
    stats_file = "datafilev2/datafile/season"+season+"/season_stats.json"
    with open(stats_file, encoding="utf8") as handle:
        season_data = json.load(handle)

    # team name -> stat name -> list of per-match samples
    samples = {}
    for match in season_data.values():
        for team in match.values():
            details = team['team_details']
            name = details['team_name']
            rating = details['team_rating']
            if name not in samples:
                fresh = {'team_rating': []}
                for stat_key in aggregate_stats_keys:
                    fresh[stat_key] = []
                samples[name] = fresh
            match_stats = team['aggregate_stats']
            team_samples = samples[name]
            team_samples['team_rating'].append(float(rating))
            for stat_key, raw_value in match_stats.items():
                team_samples[stat_key].append(float(raw_value))

    # collapse every sample list into its mean
    teamStats = {}
    for name, team_samples in samples.items():
        averaged = {}
        for stat_key, values in team_samples.items():
            averaged[stat_key] = sum(values)/len(values) if len(values) > 0 else 0
        teamStats[name] = averaged

    if constructDataFrame or saveCsv:
        df_cols = ['teamName', 'team_rating'] + aggregate_stats_keys
        complete_team_df = pd.DataFrame(columns=df_cols)
        for row_no, (name, averaged) in enumerate(teamStats.items()):
            complete_team_df.loc[row_no] = [name] + [averaged[col] for col in df_cols[1:]]
        complete_team_df.to_csv ("teams_data/fpl_team_"+season+"_COMPLETED.csv", index = None, header=True)
        if constructDataFrame:
            return complete_team_df
    return collections.OrderedDict(sorted(teamStats.items()))

def seasonalPlayerStats(player_stats_keys, season = "14-15", constructDataFrame = False, saveCsv = False):
    """Average each player's match stats over one season and attach FPL data.

    Steps:
      1. Read ``datafilev2/datafile/season<season>/season_stats.json`` and, per
         player, collect the rating, every ``Match_stats`` value and the list
         of teams the player appeared for.
      2. Replace each list of samples with its mean (``0`` when empty);
         ``team_name`` stays a list.
      3. Read ``fpl_players_data/fpl_<season>_CLEAN.csv`` and match players on
         (last word of the name, team). A key seen twice in the FPL file is
         dropped from the lookup and never re-added by later rows. FPL columns
         of unmatched players keep the placeholder ``"NA"``.
      4. If either flag is set, build a DataFrame and write
         ``fpl_players_data/fpl_<season>_COMPLETED.csv``.

    Args:
        player_stats_keys: List of match-stat names to track for every player.
        season: Season label used in the file paths.
        constructDataFrame: When true, return the DataFrame.
        saveCsv: When true, write the table to disk (the file is in fact
            written whenever either flag is true).

    Returns:
        The DataFrame when ``constructDataFrame`` is true; otherwise an
        ``OrderedDict`` of player name -> stats, sorted by player name.
    """
    print("Processing season", season)
    stats_file = "datafilev2/datafile/season"+season+"/season_stats.json"
    with open(stats_file, encoding="utf8") as handle:
        season_data = json.load(handle)

    # player name -> stat name -> list of per-match samples
    playerStats = {}
    for match in season_data.values():
        for team in match.values():
            lineup = team['Player_stats']
            club = team['team_details']['team_name']
            for name, entry in lineup.items():
                if name not in playerStats:
                    fresh = {'team_name': [], 'player_rating': []}
                    for stat in player_stats_keys:
                        fresh[stat] = []
                    playerStats[name] = fresh
                record = playerStats[name]

                rating = entry['player_details']['player_rating']
                record['player_rating'].append(float(rating))
                if club not in record['team_name']:
                    record['team_name'].append(club)
                for stat, raw_value in entry['Match_stats'].items():
                    record[stat].append(float(raw_value))

    # collapse sample lists into means; the team list is left untouched
    for record in playerStats.values():
        for stat, values in record.items():
            if stat == 'team_name':
                continue
            record[stat] = sum(values)/len(values) if len(values) > 0 else 0

    #aggregate with fpl and team abbr
    missing_player_fpl = []
    fpl_player_df = pd.read_csv("fpl_players_data/fpl_"+season+"_CLEAN.csv")
    for record in playerStats.values():
        for merge_col in fpl_merge_cols:
            record[merge_col] = "NA"

    # (last name, team) -> (row position, full name, team) for unambiguous keys
    fplLastNameDict = {}
    seen_keys = []
    for row_pos, fpl_row in fpl_player_df.iterrows():
        full_name = fpl_row['player']
        lookup_key = (full_name.split()[-1], fpl_row['team'])
        if lookup_key in fplLastNameDict:
            print("FOUND DUPLICATE ON NAME AND TEAM FOR", full_name, "and",fplLastNameDict[lookup_key][1])
            fplLastNameDict.pop(lookup_key)
        elif lookup_key not in seen_keys:
            # seen_keys remembers dropped keys so a third occurrence is ignored
            seen_keys.append(lookup_key)
            fplLastNameDict[lookup_key] = (row_pos, full_name, lookup_key[1])

    for real_name, record in playerStats.items():
        last_name = real_name.split()[-1]
        clubs = record['team_name']
        for club in clubs:
            wanted = (last_name, club)
            if wanted not in fplLastNameDict:
                missing_player_fpl.append((real_name, clubs))
                continue
            fpl_stat = fpl_player_df.iloc[fplLastNameDict[wanted][0]]
            for position, source_col in enumerate(fpl_data_cols):
                record[fpl_merge_cols[position]] = fpl_stat[source_col]
    print("NEED MANUAL LOOKUP", season,"NUMBER:", len(missing_player_fpl), missing_player_fpl)

    #make new pd and saves to CSV
    if constructDataFrame or saveCsv:
        pd_cols = ['playerName', 'team_name'] + fpl_merge_cols + ['player_rating'] + player_stats_keys
        complete_player_df = pd.DataFrame(columns=pd_cols)
        for row_no, (name, record) in enumerate(playerStats.items()):
            new_row = [name]
            for col in pd_cols[1:]:
                if col == "team_name":
                    # several teams are flattened into one comma-separated cell
                    new_row.append(",".join(record[col]))
                else:
                    new_row.append(record[col])
            complete_player_df.loc[row_no] = new_row
        complete_player_df.to_csv ("fpl_players_data/fpl_"+season+"_COMPLETED.csv", index = None, header=True)
        if constructDataFrame:
            return complete_player_df
    return collections.OrderedDict(sorted(playerStats.items()))

# def seasonalTeamStatsVer2(aggregate_stats_keys, matchId, season, constructDataFrame = False, saveCsv = False, updateRate = 0.7):
#     seasonIteration = available_season.index(season)
#     teamStats = {}
#     teamInfoTracker = {}
#     currSeason_stats_path = "datafilev2/datafile/season"+season+"/season_stats.json"
#     with open(currSeason_stats_path, encoding="utf8") as json_file:
#         data = json.load(json_file)
#         for key in data.keys():
#             teams = data[key]
#             for team_key in teams.keys():
#                 team = teams[team_key]
#                 teamName = team['team_details']['team_name']
#                 if not teamName in teamInfoTracker:
#                     teamInfoTracker[teamName] = {}
#                     teamInfoTracker[teamName]['team_rating'] = False
#                     for stat_keys in aggregate_stats_keys:
#                         teamStats[teamName][stat_keys] = False
                        
#                     teamStats[teamName] = {}
#                     teamStats[teamName]['team_rating'] = 'NA'
#                     for stat_keys in aggregate_stats_keys:
#                         teamStats[teamName][stat_keys] = 'NA'
#     for selectedSeason in available_season[:seasonIteration + 1]:
#         season_stats_path = "datafilev2/datafile/season"+selectedSeason+"/season_stats.json"
#         with open(season_stats_path, encoding="utf8") as json_file:
#             data = json.load(json_file)
#             for key in data.keys():
#                 teams = data[key]
#                 for team_key in teams.keys():
#                     team = teams[team_key]
#                     teamName = team['team_details']['team_name']
#                     teamRating = team['team_details']['team_rating']
#                     if teamName in teamInfoTracker:
#                         if not teamInfoTracker[teamName]['team_rating']:
#                             teamInfoTracker[teamName]['team_rating'] = True
#                             teamStats[teamName]['team_rating'] = teamRating
#                         else:
#                             teamStats[teamName]['team_rating'] = ((1-updateRate) * teamStats[teamName]['team_rating']) + (updateRate * teamRating)
#                         matchStatData = team['aggregate_stats']
#                         for k in matchStatData.keys():
#                             if not teamInfoTracker[teamName][k]:
#                                 teamInfoTracker[teamName][k] = True
#                                 teamStats[teamName][k] = float(matchStatData[k])
#                             else:
#                                 teamStats[teamName][k] = ((1-updateRate) * teamStats[teamName][k]) + (updateRate * float(matchStatData[k]))    
#         if constructDataFrame or saveCsv:
#             df_cols = ['teamName', 'team_rating'] + aggregate_stats_keys
#             complete_team_df = pd.DataFrame(columns=df_cols)
#             for i, teamName in enumerate(teamStats.keys()):
#                 newRow = []
#                 newRow.append(teamName)
#                 stat = teamStats[teamName]
#                 for statKey in df_cols[1:]:
#                     newRow.append(stat[statKey])
#                 complete_team_df.loc[i] = newRow
#             complete_team_df.to_csv ("teams_data/fpl_team_"+selectedSeason+"_COMPLETED.csv", index = None, header=True)
#             if constructDataFrame:
#                 return complete_team_df
#         result = collections.OrderedDict(sorted(teamStats.items()))
#         return result


In [4]:
# Scan the season files once for every aggregate/player stat key; the two
# lists are reused by the cells below.
aggregate_stats_keys, player_stats_keys = getAllStatKeys()

processing season 14-15
processing season 15-16
processing season 16-17
processing season 17-18


In [9]:
#Include each player goal
# Builds one row per match across all seasons: result labels, goals scored in
# this match by each counted player, and team/player features, then writes the
# table to CSV. Team and player features are the running weighted values from
# earlier matches; the running values are updated only after the row's
# features have been read.
#
# NOTE(review): the three weights below override the values from the global
# variable cell, but `suffix` (used in the output filename at the bottom) is
# not updated, so the filename may not reflect the weights actually used.
teamWeightUpdate = 0.7
playerWeightUpdate = 0.7
fplPlayerWeightUpdate = 0.7

homeTeamStatCol = []
awayTeamStatCol = []
homePlayerStatCol = []
awayPlayerStatCol = []
home_player_goals = []
away_player_goals = []

# *Boolean: has a running value been seeded yet for (team|player, stat)?
# *Base: the running weighted value itself ('NA' until seeded).
teamBoolean = {}
playerBoolean = {}
teamBase = {}
playerBase = {}

# The 14-15 tables are only used here to derive the column names.
player_data_path = "players_data/fpl_14-15_COMPLETED_CLEAN.csv"
team_data_path = "teams_data/fpl_team_14-15_COMPLETED.csv"
player_df = pd.read_csv(player_data_path)
team_df = pd.read_csv(team_data_path)

team_stat_cols = team_df.columns[1:]
player_stat_cols = ['player_rating'] + player_stats_keys

# 11 player slots per side; each slot gets a match-goal column plus one column
# per player stat.
for i in range(1,12):
    home_player_goals.append("home_player_"+ str(i) + "_match_goal")
    away_player_goals.append("away_player_"+ str(i) + "_match_goal")
    for col in player_df.columns[2:]:
        homePlayerStatCol.append("home_player_"+ str(i) + "_" + col)
        awayPlayerStatCol.append("away_player_"+ str(i) + "_" + col)
for col in team_df.columns[1:]:
    homeTeamStatCol.append("home_team_" + col)
    awayTeamStatCol.append("away_team_" + col)


# (player, season) -> {fpl column: value}. For each season the player's value is
# seeded from the first earlier season in which the player has an entry, then
# blended with every later season up to and including the current one.
fpl_player_dict = {}
for outerSeason in available_season:
    seasonIndex = available_season.index(outerSeason)
    fpl_data_path = "fpl_players_data/fpl_"+outerSeason+"_lookedup_clean_complete.csv"
    fpl_df = pd.read_csv(fpl_data_path)
    players = fpl_df['playerName'].values
    for player in players:
        # Rows of this player in the current season's file, looked up once
        # instead of once per column.
        player_rows = fpl_df[fpl_df['playerName'] == player]
        current_key = (player, outerSeason)
        exist = False
        for season in available_season[:seasonIndex+1]:
            if season != outerSeason:
                if (player, season) in fpl_player_dict:
                    earlier = fpl_player_dict[(player, season)]
                    if not exist:
                        exist = True
                        fpl_player_dict[current_key] = {}
                        for col in fpl_merge_cols:
                            fpl_player_dict[current_key][col] = earlier[col]
                    else:
                        for col in fpl_merge_cols:
                            fpl_player_dict[current_key][col] = (1-fplPlayerWeightUpdate)*fpl_player_dict[current_key][col] + fplPlayerWeightUpdate * earlier[col]
            else:
                if not exist:
                    fpl_player_dict[current_key] = {}
                    for col in fpl_merge_cols:
                        fpl_player_dict[current_key][col] = player_rows[col].values[0]
                else:
                    for col in fpl_merge_cols:
                        fpl_player_dict[current_key][col] = (1-fplPlayerWeightUpdate)*fpl_player_dict[current_key][col] + fplPlayerWeightUpdate * player_rows[col].values[0]

df_cols = ['match_id', 'date', 'home_team_name', 'away_team_name', 'winner', 'home_goal', 'away_goal', 'goal_difference'] + home_player_goals + away_player_goals + homeTeamStatCol + homePlayerStatCol + awayTeamStatCol + awayPlayerStatCol
# Rows are collected as plain lists and turned into a DataFrame once at the
# end; growing a DataFrame row by row re-copies it on every insert.
rows = []
for season in available_season:
    # Per-season tables used as a fallback while a team/player has no running
    # value yet.
    player_sub_path = "players_data/fpl_"+season+"_COMPLETED_CLEAN.csv"
    player_sub_df = pd.read_csv(player_sub_path)
    team_sub_path = "teams_data/fpl_team_"+season+"_COMPLETED.csv"
    team_sub_df = pd.read_csv(team_sub_path)
    season_match_path = "datafilev2/datafile/season"+season+"/season_match_stats.json"
    with open(season_match_path, encoding="utf8") as json_file:
        season_match_dict = json.load(json_file)

    # season_stats.json feeds both the registration pass and the row-building
    # pass below, so it is parsed only once.
    season_stats_path = "datafilev2/datafile/season"+season+"/season_stats.json"
    with open(season_stats_path, encoding="utf8") as json_file:
        data = json.load(json_file)

    # Pass 1: make sure every team and player of this season has seed flags.
    # match_lineup_dict is not read in this cell; it is kept in case other
    # cells rely on it -- TODO confirm.
    match_lineup_dict = {}
    for matchId, matchUp in data.items():
        match_lineup_dict[matchId] = {}
        for teamId, teamStat in matchUp.items():
            teamName = teamStat['team_details']['team_name']
            match_lineup_dict[matchId][teamName] = list(teamStat['Player_stats'].keys())
            if not teamName in teamBoolean:
                teamBoolean[teamName] = {}
                for stat in team_stat_cols:
                    teamBoolean[teamName][stat] = False
            for player in teamStat['Player_stats'].keys():
                if not player in playerBoolean:
                    playerBoolean[player] = {}
                    for stat in player_stat_cols:
                        playerBoolean[player][stat] = False

    # Pass 2: build one row per match, in file order.
    for matchId in data:
        newRow = []

        match = season_match_dict[matchId]
        score = match['full_time_score'].split(" : ")
        # Compare as integers: compared as strings, "10" sorts before "9".
        home_goal = int(score[0])
        away_goal = int(score[1])
        if home_goal > away_goal:
            winner = 1
        elif home_goal < away_goal:
            winner = -1
        else:
            winner = 0

        newRow.append(matchId)
        newRow.append(match['date_string'])
        newRow.append(match['home_team_name'])
        newRow.append(match['away_team_name'])
        newRow.append(winner)
        newRow.append(home_goal)
        newRow.append(away_goal)
        newRow.append(home_goal - away_goal)

        #goal scorer
        # Home team first, then away team, to match the column order.
        teamAllignment = [match['home_team_id'], match['away_team_id']]
        teams = data[matchId]
        for teamId in teamAllignment:
            teamInfos = teams[teamId]
            lineup = teamInfos['Player_stats']
            for playerName, playerInfo in lineup.items():
                playerRoleValue = playerInfo['player_details']['player_position_value']
                # Position value '5' is skipped -- presumably substitutes; confirm.
                if playerRoleValue != '5':
                    player_stats = playerInfo['Match_stats']
                    if 'goals' in player_stats:
                        newRow.append(int(player_stats['goals']))
                    else:
                        newRow.append(0)


        #team and players stat processing
        for teamId in teamAllignment:
            teamInfos = teams[teamId]
            teamName = teamInfos['team_details']['team_name']
            teamRating = teamInfos['team_details']['team_rating']
            if not teamName in teamBase:
                teamBase[teamName] = {}
                for stat in team_stat_cols:
                    teamBase[teamName][stat] = 'NA'
                teamBase[teamName]['team_rating'] = 'NA'

            # Features first: running value if seeded, else the season table.
            for stato in team_stat_cols:
                if teamBase[teamName][stato] != 'NA':
                    newRow.append(teamBase[teamName][stato])
                else:
                    newRow.append(team_sub_df[team_sub_df["teamName"] == teamName][stato].values[0])
            #TEAM STAT PROCESSING
            # Only now fold this match into the running values.
            agg_stats = teamInfos['aggregate_stats']
            if not teamBoolean[teamName]['team_rating']:
                teamBoolean[teamName]['team_rating'] = True
                teamBase[teamName]['team_rating'] = float(teamRating)
            else:
                teamBase[teamName]['team_rating'] = (1-teamWeightUpdate)*teamBase[teamName]['team_rating'] + teamWeightUpdate*float(teamRating)
            for ag_k, ag_v in agg_stats.items():
                if not teamBoolean[teamName][ag_k]:
                    teamBoolean[teamName][ag_k] = True
                    teamBase[teamName][ag_k] = float(ag_v)
                else:
                    teamBase[teamName][ag_k] = (1-teamWeightUpdate)*teamBase[teamName][ag_k] + teamWeightUpdate*float(ag_v)

            #PLAYERSTATPROCESSING
            lineup = teamInfos['Player_stats']
            for playerName, playerInfo in lineup.items():
                playerRoleValue = playerInfo['player_details']['player_position_value']
                if playerRoleValue != '5':
                    for fpl_col in fpl_merge_cols:
                        newRow.append(fpl_player_dict[(playerName, season)][fpl_col])

                    playerRating = playerInfo['player_details']['player_rating']
                    if not playerName in playerBase:
                        playerBase[playerName] = {}
                        for stat in player_stat_cols:
                            playerBase[playerName][stat] = 'NA'
                        playerBase[playerName]['player_rating'] = 'NA'

                    # Features first: running value if seeded, else the season table.
                    for stato in player_stat_cols:
                        if playerBase[playerName][stato] != 'NA':
                            newRow.append(playerBase[playerName][stato])
                        else:
                            newRow.append(player_sub_df[player_sub_df["playerName"] == playerName][stato].values[0])

                    # Then fold this match into the player's running values.
                    if not playerBoolean[playerName]['player_rating']:
                        playerBase[playerName]['player_rating'] = float(playerRating)
                        playerBoolean[playerName]['player_rating'] = True
                    else:
                        playerBase[playerName]['player_rating'] = (1-playerWeightUpdate)*playerBase[playerName]['player_rating'] + playerWeightUpdate*float(playerRating)
                    player_stats = playerInfo['Match_stats']
                    for pl_k, pl_v in player_stats.items():
                        if not playerBoolean[playerName][pl_k]:
                            playerBoolean[playerName][pl_k] = True
                            playerBase[playerName][pl_k] = float(pl_v)
                        else:
                            playerBase[playerName][pl_k] = (1-playerWeightUpdate)*playerBase[playerName][pl_k] + playerWeightUpdate*float(pl_v)

        # The column layout assumes exactly 11 counted players per side; fail
        # with the offending match instead of a generic shape error.
        if len(newRow) != len(df_cols):
            raise ValueError("match %s (season %s) produced %d values for %d columns" % (matchId, season, len(newRow), len(df_cols)))
        rows.append(newRow)
    print('DONE SEASON', season)
master_df = pd.DataFrame(rows, columns=df_cols)
master_df.to_csv ("master_data_ver2/all_season_master_data_each_player_goals" + suffix +".csv", index = None, header=True)
master_df

DONE SEASON 14-15
DONE SEASON 15-16
DONE SEASON 16-17
DONE SEASON 17-18


Unnamed: 0,match_id,date,home_team_name,away_team_name,winner,home_goal,away_goal,goal_difference,home_player_1_match_goal,home_player_2_match_goal,...,away_player_11_six_yard_block,away_player_11_post_scoring_att,away_player_11_att_pen_target,away_player_11_penalty_save,away_player_11_penalty_conceded,away_player_11_clearance_off_line,away_player_11_att_pen_goal,away_player_11_att_pen_miss,away_player_11_own_goals,away_player_11_att_pen_post
0,829513,16/08/2014 17:30:00,Arsenal,Crystal Palace,1,2,1,1,0,0,...,1.00000,0.000000,0,0,0,0.0,0.0,0,0,0
1,829514,18/08/2014 20:00:00,Burnley,Chelsea,-1,1,3,-2,0,0,...,1.00000,1.000000,0,0,0,0.0,1.0,0,0,0
2,829515,16/08/2014 15:00:00,Leicester,Everton,0,2,2,0,0,0,...,1.00000,1.000000,0,0,0,0.0,1.0,0,1,0
3,829516,17/08/2014 13:30:00,Liverpool,Southampton,1,2,1,1,0,0,...,0.00000,1.000000,0,0,1,0.0,1.0,0,0,0
4,829517,16/08/2014 12:45:00,Manchester United,Swansea,-1,1,2,-1,0,0,...,1.00000,1.000000,0,0,0,0.0,1.0,0,0,0
5,829518,17/08/2014 16:00:00,Newcastle United,Manchester City,-1,0,2,-2,0,0,...,0.00000,0.000000,0,0,0,0.0,0.0,0,0,0
6,829519,16/08/2014 15:00:00,Queens Park Rangers,Hull,-1,0,1,-1,0,0,...,0.00000,0.000000,0,0,0,0.0,0.0,0,0,0
7,829520,16/08/2014 15:00:00,Stoke,Aston Villa,-1,0,1,-1,0,0,...,1.00000,1.000000,0,0,0,0.0,0.0,0,0,0
8,829521,16/08/2014 15:00:00,West Bromwich Albion,Sunderland,0,2,2,0,0,0,...,0.00000,1.000000,0,0,0,0.0,0.0,0,0,0
9,829522,16/08/2014 15:00:00,West Ham,Tottenham,-1,0,1,-1,0,0,...,1.00000,0.000000,0,0,0,0.0,0.0,0,0,0


In [10]:


homeTeamStatCol = []
awayTeamStatCol = []
homePlayerStatCol = []
awayPlayerStatCol = []

teamBoolean = {}
playerBoolean = {}
teamBase = {}
playerBase = {}

player_data_path = "players_data/fpl_14-15_COMPLETED_CLEAN.csv"
team_data_path = "teams_data/fpl_team_14-15_COMPLETED.csv"
player_df = pd.read_csv(player_data_path)
team_df = pd.read_csv(team_data_path)

team_stat_cols = team_df.columns[1:]
player_stat_cols = ['player_rating'] + player_stats_keys

for i in range(1,12):
    for col in player_df.columns[2:]:
        homePlayerStatCol.append("home_player_"+ str(i) + "_" + col)
        awayPlayerStatCol.append("away_player_"+ str(i) + "_" + col)
for col in team_df.columns[1:]:
    homeTeamStatCol.append("home_team_" + col)
    awayTeamStatCol.append("away_team_" + col)
    

fpl_player_dict = {}
for outerSeason in available_season:
    seasonIndex = available_season.index(outerSeason)
    fpl_data_path = "fpl_players_data/fpl_"+outerSeason+"_lookedup_clean_complete.csv"
    fpl_df = pd.read_csv(fpl_data_path)
    players = fpl_df['playerName'].values
    for player in players:
        exist = False
        for season in available_season[:seasonIndex+1]:
            if season != outerSeason:
                if (player, season) in fpl_player_dict:
                    if not exist:
                        exist = True
                        fpl_player_dict[(player, outerSeason)] = {}
                        for col in fpl_merge_cols:
                            fpl_player_dict[(player, outerSeason)][col] = fpl_player_dict[(player, season)][col]
                    else:
                        for col in fpl_merge_cols:
                            fpl_player_dict[(player, outerSeason)][col] = (1-fplPlayerWeightUpdate)*fpl_player_dict[(player, outerSeason)][col] + fplPlayerWeightUpdate * fpl_player_dict[(player, season)][col]
            else:
                if not exist:
                    fpl_player_dict[(player, outerSeason)] = {}
                    for col in fpl_merge_cols:
                        fpl_player_dict[(player, outerSeason)][col] = fpl_df[fpl_df['playerName'] == player][col].values[0]
                else:
                    for col in fpl_merge_cols:
                        fpl_player_dict[(player, outerSeason)][col] = (1-fplPlayerWeightUpdate)*fpl_player_dict[(player, outerSeason)][col] + fplPlayerWeightUpdate * fpl_df[fpl_df['playerName'] == player][col].values[0]

df_cols = ['match_id', 'date', 'home_team_name', 'away_team_name', 'winner', 'home_goal', 'away_goal', 'goal_difference'] + homeTeamStatCol + homePlayerStatCol + awayTeamStatCol + awayPlayerStatCol
master_df = pd.DataFrame(columns=df_cols)
indx = 0
for season in available_season:
    player_sub_path = "players_data/fpl_"+season+"_COMPLETED_CLEAN.csv"
    player_sub_df = pd.read_csv(player_sub_path)
    team_sub_path = "teams_data/fpl_team_"+season+"_COMPLETED.csv"
    team_sub_df = pd.read_csv(team_sub_path)
    season_match_path = "datafilev2/datafile/season"+season+"/season_match_stats.json"
    season_match_dict = {}
    with open(season_match_path, encoding="utf8") as json_file:
        season_match_data = json.load(json_file)
        for matchId in season_match_data.keys():
            season_match_dict[matchId] = {}
            for stat, value in season_match_data[matchId].items():
                season_match_dict[matchId][stat] = value

    match_stat_path = "datafilev2/datafile/season"+season+"/season_stats.json"
    match_lineup_dict = {}
    with open(match_stat_path, encoding="utf8") as json_file:
        match_stat_data = json.load(json_file)
        for matchId in match_stat_data.keys():
            matchUp = match_stat_data[matchId]
            match_lineup_dict[matchId] = {}
            for teamId, teamStat in matchUp.items():
    #                 print(matchUp)
                teamName = teamStat['team_details']['team_name']
                playerLineup = [player for player in teamStat['Player_stats'].keys()]
                match_lineup_dict[matchId][teamName] = playerLineup
                if not teamName in teamBoolean:
                    teamBoolean[teamName] = {}
                    for stat in team_stat_cols:
                        teamBoolean[teamName][stat] = False
                for player in teamStat['Player_stats'].keys():
                    if not player in playerBoolean:
                        playerBoolean[player] = {}
                        for stat in player_stat_cols:
                            playerBoolean[player][stat] = False

    season_stats_path = "datafilev2/datafile/season"+season+"/season_stats.json"
#     seasonIndex = available_season.index(season)
    with open(season_stats_path, encoding="utf8") as json_file:
        data = json.load(json_file)
        for _, matchId in enumerate(data.keys()):
            newRow = []
            
            match = season_match_dict[matchId]
            score = match['full_time_score'].split(" : ")
            winner = None
            if score[0] > score[1]:
                winner = 1
            elif score[0] < score[1]:
                winner = -1
            else:
                winner = 0
                
            newRow.append(matchId)
            newRow.append(match['date_string'])
            newRow.append(match['home_team_name'])
            newRow.append(match['away_team_name'])
            newRow.append(winner)
            newRow.append(int(score[0]))
            newRow.append(int(score[1]))
            newRow.append(int(score[0]) - int(score[1]))
            teamAllignment = [match['home_team_id'], match['away_team_id']]
            teams = data[matchId]
            for teamId in teamAllignment:
                teamInfos = teams[teamId]
                teamName = teamInfos['team_details']['team_name']
                teamRating = teamInfos['team_details']['team_rating']
                if not teamName in teamBase:
                    teamBase[teamName] = {}
                    for stat in team_stat_cols:
                        teamBase[teamName][stat] = 'NA'
                    teamBase[teamName]['team_rating'] = 'NA'
                    
                for stato in team_stat_cols:
                    if teamBase[teamName][stato] != 'NA':
                        newRow.append(teamBase[teamName][stato])
                    else:
                        newRow.append(team_sub_df[team_sub_df["teamName"] == teamName][stato].values[0])
                #TEAM STAT PROCESSING
                agg_stats = teamInfos['aggregate_stats']
                if not teamBoolean[teamName]['team_rating']:
                    teamBoolean[teamName]['team_rating'] = True
                    teamBase[teamName]['team_rating'] = float(teamRating)
                else:
                    oldTeamRating = teamBase[teamName]['team_rating']
                    teamBase[teamName]['team_rating'] = (1-teamWeightUpdate)*teamBase[teamName]['team_rating'] + teamWeightUpdate*float(teamRating)
#                     print("TEAM RATING UPDATE: ",matchId, teamName, "FROM", oldTeamRating, "TO", teamBase[teamName]['team_rating'])
                for ag_k, ag_v in agg_stats.items():
                    if not teamBoolean[teamName][ag_k]:
                        teamBoolean[teamName][ag_k] = True
                        teamBase[teamName][ag_k] = float(ag_v)
                    else:
                        old = teamBase[teamName][ag_k]
                        teamBase[teamName][ag_k] = (1-teamWeightUpdate)*teamBase[teamName][ag_k] + teamWeightUpdate*float(ag_v)
#                         print("TEAM STAT UPDATE: ", matchId, teamName, ag_k, "FROM", old, "TO", teamBase[teamName][ag_k])
               
                #PLAYERSTATPROCESSING
                lineup = teamInfos['Player_stats']
                player_counter = 0
                for playerName, playerInfo in lineup.items():
                    playerRoleValue = playerInfo['player_details']['player_position_value']
                    if playerRoleValue != '5':
                        player_counter += 1
                        for fpl_col in fpl_merge_cols:
                            newRow.append(fpl_player_dict[(playerName, season)][fpl_col])
                        
                        playerRating = playerInfo['player_details']['player_rating']
                        if not playerName in playerBase:
                            playerBase[playerName] = {}
                            for stat in player_stat_cols:
                                playerBase[playerName][stat] = 'NA'
                            playerBase[playerName]['player_rating'] = 'NA'
                        
                        
                        for stato in player_stat_cols:
                            if playerBase[playerName][stato] != 'NA':
                                newRow.append(playerBase[playerName][stato])
                            else:
                                newRow.append(player_sub_df[player_sub_df["playerName"] == playerName][stato].values[0])
                        
                        if not playerBoolean[playerName]['player_rating']:
                            playerBase[playerName]['player_rating'] = float(playerRating)
                            playerBoolean[playerName]['player_rating'] = True
                        else:
                            oldPlayerRating = playerBase[playerName]['player_rating']
                            playerBase[playerName]['player_rating'] = (1-playerWeightUpdate)*playerBase[playerName]['player_rating'] + playerWeightUpdate*float(playerRating)
#                             print("PLAYER RATING UPDATE: ",matchId, playerName, "FROM", oldPlayerRating, "TO", playerBase[playerName]['player_rating'])
                        player_stats = playerInfo['Match_stats']
                        for pl_k, pl_v in player_stats.items():
                            if not playerBoolean[playerName][pl_k]:
                                playerBoolean[playerName][pl_k] = True
                                playerBase[playerName][pl_k] = float(pl_v)
                            else:
                                playerBase[playerName][pl_k] = (1-playerWeightUpdate)*playerBase[playerName][pl_k] + playerWeightUpdate*float(pl_v)
#                                 print('UPDATED PLAYER', matchId, playerName, pl_k)
                        
#             print(len(newRow), len(df_cols), player_counter)
            master_df.loc[indx] = newRow
            indx += 1
    print('DONE SEASON', season)
master_df.to_csv ("master_data_ver2/all_season_master_data" + suffix +".csv", index = None, header=True)
master_df

DONE SEASON 14-15
DONE SEASON 15-16
DONE SEASON 16-17
DONE SEASON 17-18


Unnamed: 0,match_id,date,home_team_name,away_team_name,winner,home_goal,away_goal,goal_difference,home_team_team_rating,home_team_att_goal_low_left,...,away_player_11_six_yard_block,away_player_11_post_scoring_att,away_player_11_att_pen_target,away_player_11_penalty_save,away_player_11_penalty_conceded,away_player_11_clearance_off_line,away_player_11_att_pen_goal,away_player_11_att_pen_miss,away_player_11_own_goals,away_player_11_att_pen_post
0,829513,16/08/2014 17:30:00,Arsenal,Crystal Palace,1,2,1,1,7.149141,1.250000,...,1.00000,0.000000,0,0,0,0.0,0.0,0,0,0
1,829514,18/08/2014 20:00:00,Burnley,Chelsea,-1,1,3,-2,6.785846,1.000000,...,1.00000,1.000000,0,0,0,0.0,1.0,0,0,0
2,829515,16/08/2014 15:00:00,Leicester,Everton,0,2,2,0,6.843094,1.454545,...,1.00000,1.000000,0,0,0,0.0,1.0,0,1,0
3,829516,17/08/2014 13:30:00,Liverpool,Southampton,1,2,1,1,6.902160,1.153846,...,0.00000,1.000000,0,0,1,0.0,1.0,0,0,0
4,829517,16/08/2014 12:45:00,Manchester United,Swansea,-1,1,2,-1,6.990919,1.294118,...,1.00000,1.000000,0,0,0,0.0,1.0,0,0,0
5,829518,17/08/2014 16:00:00,Newcastle United,Manchester City,-1,0,2,-2,6.740591,1.272727,...,0.00000,0.000000,0,0,0,0.0,0.0,0,0,0
6,829519,16/08/2014 15:00:00,Queens Park Rangers,Hull,-1,0,1,-1,6.765569,1.333333,...,0.00000,0.000000,0,0,0,0.0,0.0,0,0,0
7,829520,16/08/2014 15:00:00,Stoke,Aston Villa,-1,0,1,-1,6.868352,1.083333,...,1.00000,1.000000,0,0,0,0.0,0.0,0,0,0
8,829521,16/08/2014 15:00:00,West Bromwich Albion,Sunderland,0,2,2,0,6.765031,1.076923,...,0.00000,1.000000,0,0,0,0.0,0.0,0,0,0
9,829522,16/08/2014 15:00:00,West Ham,Tottenham,-1,0,1,-1,6.854401,1.076923,...,1.00000,0.000000,0,0,0,0.0,0.0,0,0,0


In [11]:
#VERSION 2
# Weights for the running updates of the form (1 - w) * old + w * new that the
# rest of this cell applies to team and player statistics.
# fplPlayerWeightUpdate is assigned here but not read anywhere in this cell.
teamWeightUpdate = 0.7
playerWeightUpdate = 0.7
fplPlayerWeightUpdate = 0.7

# Running state, reset on every execution of the cell:
#   *Boolean[name][stat] -> True once a first observation has been stored
#   *Base[name][stat]    -> current running value, or 'NA' before any match
teamBoolean, playerBoolean = {}, {}
teamBase, playerBase = {}, {}

# The 14-15 tables are only used for their column layout here.
player_data_path = "players_data/fpl_14-15_COMPLETED_CLEAN.csv"
team_data_path = "teams_data/fpl_team_14-15_COMPLETED.csv"
player_df = pd.read_csv(player_data_path)
team_df = pd.read_csv(team_data_path)

team_stat_cols = team_df.columns[1:]
player_stat_cols = ['player_rating'] + player_stats_keys

# Output column names: 11 player slots per side, then one set of team columns.
# NOTE(review): columns[9:] presumably skips the identifier and FPL columns of
# the player table -- confirm against the CSV header.
player_feature_names = player_df.columns[9:]
homePlayerStatCol = [
    "home_player_" + str(slot) + "_" + name
    for slot in range(1, 12)
    for name in player_feature_names
]
awayPlayerStatCol = [
    "away_player_" + str(slot) + "_" + name
    for slot in range(1, 12)
    for name in player_feature_names
]
homeTeamStatCol = ["home_team_" + name for name in team_stat_cols]
awayTeamStatCol = ["away_team_" + name for name in team_stat_cols]

# Build the "NO_FPL" master table: one row per match holding the result
# columns followed by, for the home side and then the away side, the team's
# running statistics and the running statistics of every player whose
# position value is not '5'.
#
# The feature values written for a match are the running values *before* that
# match is folded in; the update happens only after the row entries have been
# appended.  A team/player with no running value yet ('NA') falls back to the
# per-season CSV tables (team_sub_df / player_sub_df).
df_cols = ['match_id', 'date', 'home_team_name', 'away_team_name', 'winner', 'home_goal', 'away_goal', 'goal_difference'] + homeTeamStatCol + homePlayerStatCol + awayTeamStatCol + awayPlayerStatCol
master_df = pd.DataFrame(columns=df_cols)
indx = 0
for season in available_season:
    # Fallback tables for this season.
    player_sub_path = "players_data/fpl_"+season+"_COMPLETED_CLEAN.csv"
    player_sub_df = pd.read_csv(player_sub_path)
    team_sub_path = "teams_data/fpl_team_"+season+"_COMPLETED.csv"
    team_sub_df = pd.read_csv(team_sub_path)

    # Match-level metadata (score, date, team names and ids) keyed by match id.
    season_match_path = "datafilev2/datafile/season"+season+"/season_match_stats.json"
    with open(season_match_path, encoding="utf8") as json_file:
        season_match_data = json.load(json_file)
    season_match_dict = {matchId: dict(stats) for matchId, stats in season_match_data.items()}

    # Per-match team/player statistics.  The file is parsed once and used for
    # both passes below (it used to be opened and parsed twice).
    season_stats_path = "datafilev2/datafile/season"+season+"/season_stats.json"
    with open(season_stats_path, encoding="utf8") as json_file:
        data = json.load(json_file)

    # Pass 1: make sure every team and player of the season has a "seen" flag
    # for every statistic.  match_lineup_dict is filled but not read in this
    # cell; it is kept in case other cells rely on it -- TODO confirm.
    match_lineup_dict = {}
    for matchId, matchUp in data.items():
        match_lineup_dict[matchId] = {}
        for teamStat in matchUp.values():
            teamName = teamStat['team_details']['team_name']
            lineup_names = list(teamStat['Player_stats'])
            match_lineup_dict[matchId][teamName] = lineup_names
            if teamName not in teamBoolean:
                teamBoolean[teamName] = {stat: False for stat in team_stat_cols}
            for player in lineup_names:
                if player not in playerBoolean:
                    playerBoolean[player] = {stat: False for stat in player_stat_cols}

    # Pass 2: emit one row per match, then update the running statistics.
    for matchId, teams in data.items():
        match = season_match_dict[matchId]
        score = match['full_time_score'].split(" : ")
        # Compare goals as integers: comparing the raw strings is
        # lexicographic and mislabels the winner for double-digit scores
        # (e.g. "10" < "9").
        home_goal = int(score[0])
        away_goal = int(score[1])
        if home_goal > away_goal:
            winner = 1
        elif home_goal < away_goal:
            winner = -1
        else:
            winner = 0

        newRow = [
            matchId,
            match['date_string'],
            match['home_team_name'],
            match['away_team_name'],
            winner,
            home_goal,
            away_goal,
            home_goal - away_goal,
        ]

        # Home team first, then away team, matching the order of df_cols.
        for teamId in (match['home_team_id'], match['away_team_id']):
            teamInfos = teams[teamId]
            teamName = teamInfos['team_details']['team_name']
            teamRating = teamInfos['team_details']['team_rating']
            if teamName not in teamBase:
                teamBase[teamName] = {stat: 'NA' for stat in team_stat_cols}
                teamBase[teamName]['team_rating'] = 'NA'

            #TEAM STAT PROCESSING
            for stato in team_stat_cols:
                if teamBase[teamName][stato] != 'NA':
                    newRow.append(teamBase[teamName][stato])
                else:
                    newRow.append(team_sub_df[team_sub_df["teamName"] == teamName][stato].values[0])

            # First observation seeds the running value; later ones are
            # blended in with weight teamWeightUpdate.
            if not teamBoolean[teamName]['team_rating']:
                teamBoolean[teamName]['team_rating'] = True
                teamBase[teamName]['team_rating'] = float(teamRating)
            else:
                teamBase[teamName]['team_rating'] = (1-teamWeightUpdate)*teamBase[teamName]['team_rating'] + teamWeightUpdate*float(teamRating)
            for ag_k, ag_v in teamInfos['aggregate_stats'].items():
                if not teamBoolean[teamName][ag_k]:
                    teamBoolean[teamName][ag_k] = True
                    teamBase[teamName][ag_k] = float(ag_v)
                else:
                    teamBase[teamName][ag_k] = (1-teamWeightUpdate)*teamBase[teamName][ag_k] + teamWeightUpdate*float(ag_v)

            #PLAYERSTATPROCESSING
            for playerName, playerInfo in teamInfos['Player_stats'].items():
                details = playerInfo['player_details']
                # Position value '5' is excluded from the row and from the
                # running updates (presumably bench players -- TODO confirm).
                if details['player_position_value'] == '5':
                    continue
                playerRating = details['player_rating']
                if playerName not in playerBase:
                    playerBase[playerName] = {stat: 'NA' for stat in player_stat_cols}
                    playerBase[playerName]['player_rating'] = 'NA'

                for stato in player_stat_cols:
                    if playerBase[playerName][stato] != 'NA':
                        newRow.append(playerBase[playerName][stato])
                    else:
                        newRow.append(player_sub_df[player_sub_df["playerName"] == playerName][stato].values[0])

                if not playerBoolean[playerName]['player_rating']:
                    playerBase[playerName]['player_rating'] = float(playerRating)
                    playerBoolean[playerName]['player_rating'] = True
                else:
                    playerBase[playerName]['player_rating'] = (1-playerWeightUpdate)*playerBase[playerName]['player_rating'] + playerWeightUpdate*float(playerRating)
                for pl_k, pl_v in playerInfo['Match_stats'].items():
                    if not playerBoolean[playerName][pl_k]:
                        playerBoolean[playerName][pl_k] = True
                        playerBase[playerName][pl_k] = float(pl_v)
                    else:
                        playerBase[playerName][pl_k] = (1-playerWeightUpdate)*playerBase[playerName][pl_k] + playerWeightUpdate*float(pl_v)

        # The assignment fails if len(newRow) differs from len(df_cols), i.e.
        # if a side does not contribute exactly the 11 player slots.
        master_df.loc[indx] = newRow
        indx += 1
    print('DONE SEASON', season)
# NOTE(review): `suffix` is not set in this cell; if it still holds the value
# from the globals cell ("_weight_update_040406") the file name does not
# reflect the 0.7 weights used above.  Left unchanged because other code may
# read the file by this name.
master_df.to_csv ("master_data_ver2/all_season_master_data_NO_FPL" + suffix + ".csv", index = None, header=True)
master_df

DONE SEASON 14-15
DONE SEASON 15-16
DONE SEASON 16-17
DONE SEASON 17-18


Unnamed: 0,match_id,date,home_team_name,away_team_name,winner,home_goal,away_goal,goal_difference,home_team_team_rating,home_team_att_goal_low_left,...,away_player_11_six_yard_block,away_player_11_post_scoring_att,away_player_11_att_pen_target,away_player_11_penalty_save,away_player_11_penalty_conceded,away_player_11_clearance_off_line,away_player_11_att_pen_goal,away_player_11_att_pen_miss,away_player_11_own_goals,away_player_11_att_pen_post
0,829513,16/08/2014 17:30:00,Arsenal,Crystal Palace,1,2,1,1,7.149141,1.250000,...,1.00000,0.000000,0,0,0,0.0,0.0,0,0,0
1,829514,18/08/2014 20:00:00,Burnley,Chelsea,-1,1,3,-2,6.785846,1.000000,...,1.00000,1.000000,0,0,0,0.0,1.0,0,0,0
2,829515,16/08/2014 15:00:00,Leicester,Everton,0,2,2,0,6.843094,1.454545,...,1.00000,1.000000,0,0,0,0.0,1.0,0,1,0
3,829516,17/08/2014 13:30:00,Liverpool,Southampton,1,2,1,1,6.902160,1.153846,...,0.00000,1.000000,0,0,1,0.0,1.0,0,0,0
4,829517,16/08/2014 12:45:00,Manchester United,Swansea,-1,1,2,-1,6.990919,1.294118,...,1.00000,1.000000,0,0,0,0.0,1.0,0,0,0
5,829518,17/08/2014 16:00:00,Newcastle United,Manchester City,-1,0,2,-2,6.740591,1.272727,...,0.00000,0.000000,0,0,0,0.0,0.0,0,0,0
6,829519,16/08/2014 15:00:00,Queens Park Rangers,Hull,-1,0,1,-1,6.765569,1.333333,...,0.00000,0.000000,0,0,0,0.0,0.0,0,0,0
7,829520,16/08/2014 15:00:00,Stoke,Aston Villa,-1,0,1,-1,6.868352,1.083333,...,1.00000,1.000000,0,0,0,0.0,0.0,0,0,0
8,829521,16/08/2014 15:00:00,West Bromwich Albion,Sunderland,0,2,2,0,6.765031,1.076923,...,0.00000,1.000000,0,0,0,0.0,0.0,0,0,0
9,829522,16/08/2014 15:00:00,West Ham,Tottenham,-1,0,1,-1,6.854401,1.076923,...,1.00000,0.000000,0,0,0,0.0,0.0,0,0,0


In [12]:
#VERSION 2
# Weights for the running updates of the form (1 - w) * old + w * new applied
# later in this cell (team statistics and the FPL per-season blend).
teamWeightUpdate = 0.7
playerWeightUpdate = 0.7
fplPlayerWeightUpdate = 0.7

# Running state, reset on every execution of the cell:
#   *Boolean[name][stat] -> True once a first observation has been stored
#   *Base[name][stat]    -> current running value, or 'NA' before any match
teamBoolean, playerBoolean = {}, {}
teamBase, playerBase = {}, {}

# The 14-15 tables are only used for their column layout here.
player_data_path = "players_data/fpl_14-15_COMPLETED_CLEAN.csv"
team_data_path = "teams_data/fpl_team_14-15_COMPLETED.csv"
player_df = pd.read_csv(player_data_path)
team_df = pd.read_csv(team_data_path)

team_stat_cols = team_df.columns[1:]
player_stat_cols = ['player_rating'] + player_stats_keys

# Output column names: 11 player slots per side, then one set of team columns.
# NOTE(review): columns[2:9] is a 7-column slice, presumably the FPL columns
# matching fpl_merge_cols -- confirm against the CSV header.
player_feature_names = player_df.columns[2:9]
homePlayerStatCol = [
    "home_player_" + str(slot) + "_" + name
    for slot in range(1, 12)
    for name in player_feature_names
]
awayPlayerStatCol = [
    "away_player_" + str(slot) + "_" + name
    for slot in range(1, 12)
    for name in player_feature_names
]
homeTeamStatCol = ["home_team_" + name for name in team_stat_cols]
awayTeamStatCol = ["away_team_" + name for name in team_stat_cols]

# Build fpl_player_dict[(playerName, season)] -> {fpl column: value}.
#
# For each season file, a player's entry is formed by walking the earlier
# seasons in order: the first earlier entry found is copied, every further
# earlier entry is blended in as (1 - w) * acc + w * earlier, and finally the
# player's row from the current season file is blended in the same way
# (w = fplPlayerWeightUpdate).  A player with no earlier entry simply gets the
# raw values of the current season file.
fpl_player_dict = {}
for seasonIndex, outerSeason in enumerate(available_season):
    fpl_data_path = "fpl_players_data/fpl_"+outerSeason+"_lookedup_clean_complete.csv"
    fpl_df = pd.read_csv(fpl_data_path)
    players = fpl_df['playerName'].values
    for player in players:
        # Filter the frame once per player instead of once per column; the
        # first matching row supplies the current-season values.
        player_rows = fpl_df[fpl_df['playerName'] == player]
        current = {col: player_rows[col].values[0] for col in fpl_merge_cols}

        blended = None
        for season in available_season[:seasonIndex]:
            earlier = fpl_player_dict.get((player, season))
            if earlier is None:
                continue
            if blended is None:
                blended = {col: earlier[col] for col in fpl_merge_cols}
            else:
                for col in fpl_merge_cols:
                    blended[col] = (1-fplPlayerWeightUpdate)*blended[col] + fplPlayerWeightUpdate * earlier[col]

        if blended is None:
            blended = current
        else:
            for col in fpl_merge_cols:
                blended[col] = (1-fplPlayerWeightUpdate)*blended[col] + fplPlayerWeightUpdate * current[col]
        fpl_player_dict[(player, outerSeason)] = blended

# Build the "fpl_ONLY" master table: one row per match holding the result
# columns followed by, for the home side and then the away side, the team's
# running statistics and the blended FPL values (fpl_player_dict) of every
# player whose position value is not '5'.
#
# Team feature values written for a match are the running values *before*
# that match is folded in.  A team with no running value yet ('NA') falls back
# to the per-season team CSV (team_sub_df).
df_cols = ['match_id', 'date', 'home_team_name', 'away_team_name', 'winner', 'home_goal', 'away_goal', 'goal_difference'] + homeTeamStatCol + homePlayerStatCol + awayTeamStatCol + awayPlayerStatCol
master_df = pd.DataFrame(columns=df_cols)
indx = 0
for season in available_season:
    # player_sub_df is loaded but not read in this cell's row construction.
    player_sub_path = "players_data/fpl_"+season+"_COMPLETED_CLEAN.csv"
    player_sub_df = pd.read_csv(player_sub_path)
    team_sub_path = "teams_data/fpl_team_"+season+"_COMPLETED.csv"
    team_sub_df = pd.read_csv(team_sub_path)

    # Match-level metadata (score, date, team names and ids) keyed by match id.
    season_match_path = "datafilev2/datafile/season"+season+"/season_match_stats.json"
    with open(season_match_path, encoding="utf8") as json_file:
        season_match_data = json.load(json_file)
    season_match_dict = {matchId: dict(stats) for matchId, stats in season_match_data.items()}

    # Per-match team/player statistics.  The file is parsed once and used for
    # both passes below (it used to be opened and parsed twice).
    season_stats_path = "datafilev2/datafile/season"+season+"/season_stats.json"
    with open(season_stats_path, encoding="utf8") as json_file:
        data = json.load(json_file)

    # Pass 1: make sure every team and player of the season has a "seen" flag
    # for every statistic.  match_lineup_dict and playerBoolean are filled but
    # not read in this cell; kept in case other cells rely on them -- TODO
    # confirm.
    match_lineup_dict = {}
    for matchId, matchUp in data.items():
        match_lineup_dict[matchId] = {}
        for teamStat in matchUp.values():
            teamName = teamStat['team_details']['team_name']
            lineup_names = list(teamStat['Player_stats'])
            match_lineup_dict[matchId][teamName] = lineup_names
            if teamName not in teamBoolean:
                teamBoolean[teamName] = {stat: False for stat in team_stat_cols}
            for player in lineup_names:
                if player not in playerBoolean:
                    playerBoolean[player] = {stat: False for stat in player_stat_cols}

    # Pass 2: emit one row per match, then update the running team statistics.
    for matchId, teams in data.items():
        match = season_match_dict[matchId]
        score = match['full_time_score'].split(" : ")
        # Compare goals as integers: comparing the raw strings is
        # lexicographic and mislabels the winner for double-digit scores
        # (e.g. "10" < "9").
        home_goal = int(score[0])
        away_goal = int(score[1])
        if home_goal > away_goal:
            winner = 1
        elif home_goal < away_goal:
            winner = -1
        else:
            winner = 0

        newRow = [
            matchId,
            match['date_string'],
            match['home_team_name'],
            match['away_team_name'],
            winner,
            home_goal,
            away_goal,
            home_goal - away_goal,
        ]

        # Home team first, then away team, matching the order of df_cols.
        for teamId in (match['home_team_id'], match['away_team_id']):
            teamInfos = teams[teamId]
            teamName = teamInfos['team_details']['team_name']
            teamRating = teamInfos['team_details']['team_rating']
            if teamName not in teamBase:
                teamBase[teamName] = {stat: 'NA' for stat in team_stat_cols}
                teamBase[teamName]['team_rating'] = 'NA'

            #TEAM STAT PROCESSING
            for stato in team_stat_cols:
                if teamBase[teamName][stato] != 'NA':
                    newRow.append(teamBase[teamName][stato])
                else:
                    newRow.append(team_sub_df[team_sub_df["teamName"] == teamName][stato].values[0])

            # First observation seeds the running value; later ones are
            # blended in with weight teamWeightUpdate.
            if not teamBoolean[teamName]['team_rating']:
                teamBoolean[teamName]['team_rating'] = True
                teamBase[teamName]['team_rating'] = float(teamRating)
            else:
                teamBase[teamName]['team_rating'] = (1-teamWeightUpdate)*teamBase[teamName]['team_rating'] + teamWeightUpdate*float(teamRating)
            for ag_k, ag_v in teamInfos['aggregate_stats'].items():
                if not teamBoolean[teamName][ag_k]:
                    teamBoolean[teamName][ag_k] = True
                    teamBase[teamName][ag_k] = float(ag_v)
                else:
                    teamBase[teamName][ag_k] = (1-teamWeightUpdate)*teamBase[teamName][ag_k] + teamWeightUpdate*float(ag_v)

            #PLAYERSTATPROCESSING
            for playerName, playerInfo in teamInfos['Player_stats'].items():
                # Position value '5' is excluded from the row (presumably
                # bench players -- TODO confirm).
                if playerInfo['player_details']['player_position_value'] == '5':
                    continue
                # Raises KeyError if the player has no FPL entry for this season.
                for fpl_col in fpl_merge_cols:
                    newRow.append(fpl_player_dict[(playerName, season)][fpl_col])

        # The assignment fails if len(newRow) differs from len(df_cols), i.e.
        # if a side does not contribute exactly the 11 player slots.
        master_df.loc[indx] = newRow
        indx += 1
    print('DONE SEASON', season)
# NOTE(review): `suffix` is not set in this cell; if it still holds the value
# from the globals cell ("_weight_update_040406") the file name does not
# reflect the 0.7 weights used above.  Left unchanged because other code may
# read the file by this name.
master_df.to_csv ("master_data_ver2/all_season_master_data_fpl_ONLY" + suffix + ".csv", index = None, header=True)
master_df

DONE SEASON 14-15
DONE SEASON 15-16
DONE SEASON 16-17
DONE SEASON 17-18


Unnamed: 0,match_id,date,home_team_name,away_team_name,winner,home_goal,away_goal,goal_difference,home_team_team_rating,home_team_att_goal_low_left,...,away_player_10_fpl_points,away_player_10_fpl_bonus,away_player_10_fpl_cprice,away_player_11_fpl_minutes,away_player_11_fpl_goals,away_player_11_fpl_assits,away_player_11_fpl_cs,away_player_11_fpl_points,away_player_11_fpl_bonus,away_player_11_fpl_cprice
0,829513,16/08/2014 17:30:00,Arsenal,Crystal Palace,1,2,1,1,7.149141,1.250000,...,45,4,5.200000,1174,4,1,2,51,5,5.000000
1,829514,18/08/2014 20:00:00,Burnley,Chelsea,-1,1,3,-2,6.785846,1.000000,...,233,42,10.800000,2069,20,3,12,150,18,10.600000
2,829515,16/08/2014 15:00:00,Leicester,Everton,0,2,2,0,6.843094,1.454545,...,18,1,5.700000,2872,10,6,9,135,12,8.700000
3,829516,17/08/2014 13:30:00,Liverpool,Southampton,1,2,1,1,6.902160,1.153846,...,117,13,7.400000,1476,1,8,6,72,2,4.800000
4,829517,16/08/2014 12:45:00,Manchester United,Swansea,-1,1,2,-1,6.990919,1.294118,...,85,5,5.200000,1598,11,3,8,108,14,8.100000
5,829518,17/08/2014 16:00:00,Newcastle United,Manchester City,-1,0,2,-2,6.740591,1.272727,...,61,3,7.900000,784,5,3,4,62,9,7.800000
6,829519,16/08/2014 15:00:00,Queens Park Rangers,Hull,-1,0,1,-1,6.765569,1.333333,...,84,6,5.600000,39,0,0,0,1,0,5.600000
7,829520,16/08/2014 15:00:00,Stoke,Aston Villa,-1,0,1,-1,6.868352,1.083333,...,58,0,5.200000,2693,6,1,8,93,12,5.000000
8,829521,16/08/2014 15:00:00,West Bromwich Albion,Sunderland,0,2,2,0,6.765031,1.076923,...,105,6,5.300000,1784,5,2,6,76,6,5.000000
9,829522,16/08/2014 15:00:00,West Ham,Tottenham,-1,0,1,-1,6.854401,1.076923,...,62,4,6.200000,813,2,2,3,40,6,7.900000


In [51]:
# Load the 14-15 team table for ad-hoc inspection.
team_sub_path = "teams_data/fpl_team_{}_COMPLETED.csv".format("14-15")
team_sub_df = pd.read_csv(team_sub_path)


2.25

In [46]:
# Build fpl_player_dict[(playerName, season)] -> {fpl column: value}.
#
# For each season file, a player's entry is formed by walking the earlier
# seasons in order: the first earlier entry found is copied, every further
# earlier entry is blended in as (1 - w) * acc + w * earlier, and finally the
# player's row from the current season file is blended in the same way
# (w = fplPlayerWeightUpdate).  A player with no earlier entry simply gets the
# raw values of the current season file.
#
# Fixes relative to the previous version of this cell, which now agrees with
# the other copies of this routine in the notebook:
#   * values are stored as scalars (`.values[0]`), not as whole arrays;
#   * for a player seen in an earlier season, the current season's row is
#     actually blended in -- previously the entry was blended with itself and
#     the current season's data was ignored.
fpl_player_dict = {}
for seasonIndex, outerSeason in enumerate(available_season):
    fpl_data_path = "fpl_players_data/fpl_"+outerSeason+"_lookedup_clean_complete.csv"
    fpl_df = pd.read_csv(fpl_data_path)
    players = fpl_df['playerName'].values
    for player in players:
        # Filter the frame once per player; the first matching row supplies
        # the current-season values.
        player_rows = fpl_df[fpl_df['playerName'] == player]
        current = {col: player_rows[col].values[0] for col in fpl_merge_cols}

        blended = None
        for season in available_season[:seasonIndex]:
            earlier = fpl_player_dict.get((player, season))
            if earlier is None:
                continue
            if blended is None:
                blended = {col: earlier[col] for col in fpl_merge_cols}
            else:
                for col in fpl_merge_cols:
                    blended[col] = (1-fplPlayerWeightUpdate)*blended[col] + fplPlayerWeightUpdate * earlier[col]

        if blended is None:
            blended = current
        else:
            for col in fpl_merge_cols:
                blended[col] = (1-fplPlayerWeightUpdate)*blended[col] + fplPlayerWeightUpdate * current[col]
        fpl_player_dict[(player, outerSeason)] = blended

UPDATED Aaron Ramsey 15-16
UPDATED Alex Oxlade Chamberlain 15-16
UPDATED Alexis Sánchez 15-16
UPDATED Calum Chambers 15-16
UPDATED Danny Welbeck 15-16
UPDATED David Ospina 15-16
UPDATED Francis Coquelin 15-16
UPDATED Gabriel Paulista 15-16
UPDATED Héctor Bellerín 15-16
UPDATED Jack Wilshere 15-16
UPDATED Joel Campbell 15-16
UPDATED Kieran Gibbs 15-16
UPDATED Laurent Koscielny 15-16
UPDATED Mathieu Debuchy 15-16
UPDATED Mathieu Flamini 15-16
UPDATED Matt Macey 15-16
UPDATED Mesut Özil 15-16
UPDATED Mikel Arteta 15-16
UPDATED Nacho Monreal 15-16
UPDATED Olivier Giroud 15-16
UPDATED Per Mertesacker 15-16
UPDATED Petr Cech 15-16
UPDATED Santi Cazorla 15-16
UPDATED Theo Walcott 15-16
UPDATED Alan Hutton 15-16
UPDATED Aly Cissokho 15-16
UPDATED Andre Green 15-16
UPDATED Ashley Westwood 15-16
UPDATED Brad Guzan 15-16
UPDATED Carles Gil 15-16
UPDATED Carlos Sánchez 15-16
UPDATED Charles N'Zogbia 15-16
UPDATED Ciaran Clark 15-16
UPDATED Gabriel Agbonlahor 15-16
UPDATED Jack Grealish 15-16
UPDAT

UPDATED Nathan Aké 15-16
UPDATED Alex Palmer 15-16
UPDATED Anders Lindegaard 15-16
UPDATED Ben Foster 15-16
UPDATED Boaz Myhill 15-16
UPDATED Brown Ideye 15-16
UPDATED Callum McManaman 15-16
UPDATED Chris Brunt 15-16
UPDATED Claudio Yacob 15-16
UPDATED Craig Dawson 15-16
UPDATED Craig Gardner 15-16
UPDATED Cristian Gamboa 15-16
UPDATED Darren Fletcher 15-16
UPDATED Gareth McAuley 15-16
UPDATED Jack Rose 15-16
UPDATED James Chester 15-16
UPDATED James Morrison 15-16
UPDATED Jonas Olsson 15-16
UPDATED Jonny Evans 15-16
UPDATED Rickie Lambert 15-16
UPDATED Sébastien Pocognoli 15-16
UPDATED Saido Berahino 15-16
UPDATED Sandro 15-16
UPDATED Stéphane Sessegnon 15-16
UPDATED Tyler Roberts 15-16
UPDATED Victor Anichebe 15-16
UPDATED Joleon Lescott 15-16
UPDATED Aaron Cresswell 15-16
UPDATED Adrián 15-16
UPDATED Alexandre Song 15-16
UPDATED Andy Carroll 15-16
UPDATED Carl Jenkinson 15-16
UPDATED Cheikhou Kouyaté 15-16
UPDATED Diafra Sakho 15-16
UPDATED Diego Poyet 15-16
UPDATED Elliot Lee 15-16

UPDATED Shinji Okazaki 16-17
UPDATED Wes Morgan 16-17
UPDATED Wes Morgan 16-17
UPDATED Yohan Benalouane 16-17
UPDATED Adam Lallana 16-17
UPDATED Adam Lallana 16-17
UPDATED Alberto Moreno 16-17
UPDATED Alberto Moreno 16-17
UPDATED Connor Randall 16-17
UPDATED Daniel Sturridge 16-17
UPDATED Daniel Sturridge 16-17
UPDATED Dejan Lovren 16-17
UPDATED Dejan Lovren 16-17
UPDATED Divock Origi 16-17
UPDATED Emre Can 16-17
UPDATED Emre Can 16-17
UPDATED Georginio Wijnaldum 16-17
UPDATED James Milner 16-17
UPDATED James Milner 16-17
UPDATED Jordan Henderson 16-17
UPDATED Jordan Henderson 16-17
UPDATED Joseph Gomez 16-17
UPDATED Kevin Stewart 16-17
UPDATED Lucas Leiva 16-17
UPDATED Lucas Leiva 16-17
UPDATED Nathaniel Clyne 16-17
UPDATED Nathaniel Clyne 16-17
UPDATED Philippe Coutinho 16-17
UPDATED Philippe Coutinho 16-17
UPDATED Roberto Firmino 16-17
UPDATED Sadio Mané 16-17
UPDATED Sadio Mané 16-17
UPDATED Simon Mignolet 16-17
UPDATED Simon Mignolet 16-17
UPDATED Aleix García 16-17
UPDATED Aleksa

UPDATED Kyle Walker 16-17
UPDATED Michel Vorm 16-17
UPDATED Michel Vorm 16-17
UPDATED Mousa Dembélé 16-17
UPDATED Mousa Dembélé 16-17
UPDATED Moussa Sissoko 16-17
UPDATED Moussa Sissoko 16-17
UPDATED Son Heung-Min 16-17
UPDATED Toby Alderweireld 16-17
UPDATED Toby Alderweireld 16-17
UPDATED Victor Wanyama 16-17
UPDATED Victor Wanyama 16-17
UPDATED Ryan Mason 16-17
UPDATED Ryan Mason 16-17
UPDATED Adrian Mariappa 16-17
UPDATED Adrian Mariappa 16-17
UPDATED Ben Watson 16-17
UPDATED Costel Pantilimon 16-17
UPDATED Costel Pantilimon 16-17
UPDATED Craig Cathcart 16-17
UPDATED Daryl Janmaat 16-17
UPDATED Daryl Janmaat 16-17
UPDATED Etienne Capoue 16-17
UPDATED Etienne Capoue 16-17
UPDATED Giedrius Arlauskis 16-17
UPDATED Heurelho Gomes 16-17
UPDATED Ikechi Anya 16-17
UPDATED Jerome Sinclair 16-17
UPDATED Jerome Sinclair 16-17
UPDATED José Holebas 16-17
UPDATED Matej Vydra 16-17
UPDATED Mauro Zárate 16-17
UPDATED Mauro Zárate 16-17
UPDATED Miguel Britos 16-17
UPDATED Nordin Amrabat 16-17
UPDA

UPDATED Leighton Baines 17-18
UPDATED Luke Garbutt 17-18
UPDATED Maarten Stekelenburg 17-18
UPDATED Maarten Stekelenburg 17-18
UPDATED Mason Holgate 17-18
UPDATED Mason Holgate 17-18
UPDATED Michael Keane 17-18
UPDATED Michael Keane 17-18
UPDATED Morgan Schneiderlin 17-18
UPDATED Morgan Schneiderlin 17-18
UPDATED Morgan Schneiderlin 17-18
UPDATED Muhamed Besic 17-18
UPDATED Muhamed Besic 17-18
UPDATED Muhamed Besic 17-18
UPDATED Oumar Niasse 17-18
UPDATED Oumar Niasse 17-18
UPDATED Phil Jagielka 17-18
UPDATED Phil Jagielka 17-18
UPDATED Phil Jagielka 17-18
UPDATED Ramiro Funes Mori 17-18
UPDATED Ramiro Funes Mori 17-18
UPDATED Seamus Coleman 17-18
UPDATED Seamus Coleman 17-18
UPDATED Seamus Coleman 17-18
UPDATED Tom Davies 17-18
UPDATED Tom Davies 17-18
UPDATED Wayne Rooney 17-18
UPDATED Wayne Rooney 17-18
UPDATED Wayne Rooney 17-18
UPDATED Yannick Bolasie 17-18
UPDATED Yannick Bolasie 17-18
UPDATED Yannick Bolasie 17-18
UPDATED Aaron Lennon 17-18
UPDATED Aaron Lennon 17-18
UPDATED Aar

UPDATED Charlie Adam 17-18
UPDATED Charlie Adam 17-18
UPDATED Charlie Adam 17-18
UPDATED Darren Fletcher 17-18
UPDATED Darren Fletcher 17-18
UPDATED Darren Fletcher 17-18
UPDATED Erik Pieters 17-18
UPDATED Erik Pieters 17-18
UPDATED Erik Pieters 17-18
UPDATED Geoff Cameron 17-18
UPDATED Geoff Cameron 17-18
UPDATED Geoff Cameron 17-18
UPDATED Glen Johnson 17-18
UPDATED Glen Johnson 17-18
UPDATED Glen Johnson 17-18
UPDATED Ibrahim Afellay 17-18
UPDATED Ibrahim Afellay 17-18
UPDATED Jack Butland 17-18
UPDATED Jack Butland 17-18
UPDATED Jack Butland 17-18
UPDATED Jakob Haugaard 17-18
UPDATED Jakob Haugaard 17-18
UPDATED Joe Allen 17-18
UPDATED Joe Allen 17-18
UPDATED Joe Allen 17-18
UPDATED Josh Tymon 17-18
UPDATED Julien Ngoy 17-18
UPDATED Kurt Zouma 17-18
UPDATED Kurt Zouma 17-18
UPDATED Kurt Zouma 17-18
UPDATED Lee Grant 17-18
UPDATED Mame Biram Diouf 17-18
UPDATED Mame Biram Diouf 17-18
UPDATED Mame Biram Diouf 17-18
UPDATED Peter Crouch 17-18
UPDATED Peter Crouch 17-18
UPDATED Peter C

In [92]:
# Display the dictionary keyed by (player name, season); each value maps the
# fpl_* column names to that player's numbers.
fpl_player_dict

{('Aaron Ramsey', '14-15'): {'fpl_minutes': 2007,
  'fpl_goals': 6,
  'fpl_assits': 6,
  'fpl_cs': 10,
  'fpl_points': 116,
  'fpl_bonus': 13,
  'fpl_cprice': 8.8},
 ('Abou Diaby', '14-15'): {'fpl_minutes': 0,
  'fpl_goals': 0,
  'fpl_assits': 0,
  'fpl_cs': 0,
  'fpl_points': 0,
  'fpl_bonus': 0,
  'fpl_cprice': 4.7},
 ('Ainsley Maitland-Niles', '14-15'): {'fpl_minutes': 1,
  'fpl_goals': 0,
  'fpl_assits': 0,
  'fpl_cs': 0,
  'fpl_points': 1,
  'fpl_bonus': 0,
  'fpl_cprice': 4.4},
 ('Alex Oxlade Chamberlain', '14-15'): {'fpl_minutes': 1491,
  'fpl_goals': 1,
  'fpl_assits': 2,
  'fpl_cs': 7,
  'fpl_points': 57,
  'fpl_bonus': 3,
  'fpl_cprice': 6.2},
 ('Alexis Sánchez', '14-15'): {'fpl_minutes': 2944,
  'fpl_goals': 16,
  'fpl_assits': 8,
  'fpl_cs': 12,
  'fpl_points': 207,
  'fpl_bonus': 29,
  'fpl_cprice': 11.6},
 ('Calum Chambers', '14-15'): {'fpl_minutes': 1572,
  'fpl_goals': 1,
  'fpl_assits': 1,
  'fpl_cs': 7,
  'fpl_points': 68,
  'fpl_bonus': 8,
  'fpl_cprice': 4.5},
 ('Ch

In [35]:
# Print the integers 1 through 11, one per line.
print(*range(1, 12), sep="\n")

1
2
3
4
5
6
7
8
9
10
11


In [33]:
# Survey every team lineup of every season and collect:
#   allPlayerPosition      - distinct (position label, position value) pairs
#   playerValueCombination - distinct per-lineup head counts for values '1'..'4'
# Position value '5' (paired with 'Sub' in the recorded output) is not counted.
allPlayerPosition = []
playerValueCombination = []
for season in available_season:
    season_stats_path = "datafilev2/datafile/season"+season+"/season_stats.json"
    with open(season_stats_path, encoding="utf8") as json_file:
        data = json.load(json_file)
    for teams in data.values():
        for team in teams.values():
            combi = [0, 0, 0, 0]
            # tank is the 1-based order of the player inside the lineup mapping.
            for tank, entry in enumerate(team['Player_stats'].values(), start=1):
                details = entry['player_details']
                position_pair = (
                    details['player_position_info'],
                    details['player_position_value'],
                )
                is_value_five = position_pair[1] == '5'
                # Flag a '5'-valued player listed among the first 11 entries.
                if is_value_five and tank <= 11:
                    print('pepeHands')
                if position_pair not in allPlayerPosition:
                    allPlayerPosition.append(position_pair)
                if not is_value_five:
                    combi[int(position_pair[1]) - 1] += 1
            if tuple(combi) not in playerValueCombination:
                playerValueCombination.append(tuple(combi))
allPlayerPosition, playerValueCombination

([('GK', '1'),
  ('DC', '2'),
  ('DR', '2'),
  ('DL', '2'),
  ('DMC', '3'),
  ('ML', '3'),
  ('MR', '3'),
  ('MC', '3'),
  ('FW', '4'),
  ('Sub', '5'),
  ('AMC', '4'),
  ('AMC', '3'),
  ('AMR', '3'),
  ('AML', '3'),
  ('FWL', '4'),
  ('FWR', '4'),
  ('DMR', '3'),
  ('DML', '3'),
  ('FWR', '3'),
  ('FWL', '3'),
  ('FW', '3')],
 [(1, 4, 5, 1),
  (1, 4, 4, 2),
  (1, 4, 3, 3),
  (1, 3, 4, 3),
  (1, 3, 5, 2),
  (1, 5, 3, 2),
  (1, 5, 4, 1),
  (1, 4, 6, 0)])

In [21]:
# Tabulate the distinct role-value head counts: one row per tuple in
# playerValueCombination, one column per role value 1..4.
pd_cols = ['player_with_role_value_1', 'player_with_role_value_2', 'player_with_role_value_3', 'player_with_role_value_4']
# Build the frame in one constructor call instead of enlarging an empty frame
# row by row with .loc. That re-allocates on every insert and starts from
# object-typed empty columns; this way the counts get a numeric dtype.
lulw = pd.DataFrame([list(v) for v in playerValueCombination], columns=pd_cols)
lulw

Unnamed: 0,player_with_role_value_1,player_with_role_value_2,player_with_role_value_3,player_with_role_value_4
0,1,4,5,1
1,1,4,4,2
2,1,4,3,3
3,1,3,4,3
4,1,3,5,2
5,1,5,3,2
6,1,5,4,1
7,1,4,6,0


In [11]:
# Scratch check: slice available_season up to and including `season`.
season = "17-18"
seasonIteration = available_season.index(season)
# +1 so the slice includes the selected season itself.
available_season[:seasonIteration+1]

['14-15', '15-16', '16-17', '17-18']

In [52]:
# Display the list of per-player match stat names
# (presumably gathered by getAllStatKeys - confirm against that cell).
player_stats_keys

['touches',
 'saves',
 'total_pass',
 'aerial_won',
 'formation_place',
 'accurate_pass',
 'total_tackle',
 'aerial_lost',
 'fouls',
 'yellow_card',
 'total_scoring_att',
 'man_of_the_match',
 'goals',
 'won_contest',
 'blocked_scoring_att',
 'goal_assist',
 'good_high_claim',
 'second_yellow',
 'red_card',
 'error_lead_to_goal',
 'last_man_tackle',
 'six_yard_block',
 'post_scoring_att',
 'att_pen_target',
 'penalty_save',
 'penalty_conceded',
 'clearance_off_line',
 'att_pen_goal',
 'att_pen_miss',
 'own_goals',
 'att_pen_post']

In [5]:
# Compute the team statistics for each season, in chronological order, and
# bind each result to its own per-season name.
(seasonalTeamStats1415,
 seasonalTeamStats1516,
 seasonalTeamStats1617,
 seasonalTeamStats1718) = [
    seasonalTeamStats(aggregate_stats_keys, season_label)
    for season_label in ('14-15', '15-16', '16-17', '17-18')
]
# Alternate invocation kept for reference (two extra positional True flags):
# seasonalTeamStats1415 = seasonalTeamStats(aggregate_stats_keys, '14-15', True, True)
# seasonalTeamStats1516 = seasonalTeamStats(aggregate_stats_keys, '15-16', True, True)
# seasonalTeamStats1617 = seasonalTeamStats(aggregate_stats_keys, '16-17', True, True)
# seasonalTeamStats1718 = seasonalTeamStats(aggregate_stats_keys, '17-18', True, True)

In [6]:
#1415 cleaning team name
# Rename the FPL team labels so they match the keys of seasonalTeamStats1415.
fpl_player_df = pd.read_csv("fpl_players_data/fpl_14-15.csv")
known_teams = seasonalTeamStats1415.keys()
# FPL labels with no matching team-stats key before any renaming.
missing1415 = [
    fplTeam for fplTeam in np.unique(fpl_player_df['team'])
    if fplTeam not in known_teams
]
team_name_fixes = {
    'Man City': 'Manchester City',
    'Man Utd': 'Manchester United',
    'Newcastle': 'Newcastle United',
    'QPR': 'Queens Park Rangers',
    'Spurs': 'Tottenham',
    'West Brom': 'West Bromwich Albion',
}
fpl_player_df['team'] = fpl_player_df['team'].replace(team_name_fixes)
fplTeamNameList = np.unique(fpl_player_df['team'])
# One warning line per label that still has no team-stats key.
for fplTeam in fplTeamNameList:
    if fplTeam in known_teams:
        continue
    print("PROBLEM PERSISTS!!")
# fpl_player_df.to_csv ("fpl_players_data/fpl_14-15_CLEAN.csv", index = None, header=True)

In [7]:
#1516 cleaning team name
# Rename the FPL team labels so they match the keys of seasonalTeamStats1516.
fpl_player_df = pd.read_csv("fpl_players_data/fpl_15-16.csv")
known_teams = seasonalTeamStats1516.keys()
# FPL labels with no matching team-stats key before any renaming.
missing1516 = [
    fplTeam for fplTeam in np.unique(fpl_player_df['team'])
    if fplTeam not in known_teams
]
team_name_fixes = {
    'Man City': 'Manchester City',
    'Man Utd': 'Manchester United',
    'Newcastle': 'Newcastle United',
    'QPR': 'Queens Park Rangers',
    'Spurs': 'Tottenham',
    'West Brom': 'West Bromwich Albion',
}
fpl_player_df['team'] = fpl_player_df['team'].replace(team_name_fixes)
fplTeamNameList = np.unique(fpl_player_df['team'])
# One warning line per label that still has no team-stats key.
for fplTeam in fplTeamNameList:
    if fplTeam in known_teams:
        continue
    print("PROBLEM PERSISTS!!")
# fpl_player_df.to_csv ("fpl_players_data/fpl_15-16_CLEAN.csv", index = None, header=True)

In [8]:
#1617 cleaning team name
# Expand the team abbreviations, patch the names the abbreviation table gets
# wrong or lacks, then report any mismatch against seasonalTeamStats1617.
fpl_player_df = pd.read_csv("fpl_players_data/fpl_16-17.csv")
abbr_df = pd.read_csv("team_abbreviations.csv")
# Expand abbreviations in the team column only. Replacing across the whole
# frame would also rewrite any other column that happens to hold a value equal
# to an abbreviation.
fpl_player_df['team'] = fpl_player_df['team'].replace(
    dict(zip(abbr_df['abb'], abbr_df['name']))
)
# Second pass: map the expanded (or still abbreviated) labels to the names
# used as team-stats keys.
team_name_fixes = {
    'Arsenal Tula': 'Arsenal',
    'Hull City': 'Hull',
    'Leicester City': 'Leicester',
    'Swansea City': 'Swansea',
    'Tottenham Hotspur': 'Tottenham',
    'West Ham United': 'West Ham',
    'STK': 'Stoke',
    'CRY': 'Crystal Palace',
    'MCI': 'Manchester City',
    'Bayern Munich': 'Manchester United',
    'Bursaspor': 'Burnley',
    'Chesterfield': 'Chelsea',
    'Southend United': 'Southampton',
}
fpl_player_df['team'] = fpl_player_df['team'].replace(team_name_fixes)
fplTeamNameList = np.unique(fpl_player_df['team'])
# FPL labels with no team-stats key, and team-stats keys with no FPL label.
missing1617 = [
    fplTeam for fplTeam in fplTeamNameList
    if fplTeam not in seasonalTeamStats1617.keys()
]
opt = [k for k in seasonalTeamStats1617.keys() if k not in fplTeamNameList]
missing1617, opt
# fpl_player_df.to_csv ("fpl_players_data/fpl_16-17_CLEAN.csv", index = None, header=True)

([], [])

In [9]:
#1718 cleaning team name
# Expand the team abbreviations, patch the names the abbreviation table gets
# wrong or lacks, then collect any mismatch against seasonalTeamStats1718.
fpl_player_df = pd.read_csv("fpl_players_data/fpl_17-18.csv")
abbr_df = pd.read_csv("team_abbreviations.csv")
# Expand abbreviations in the team column only. Replacing across the whole
# frame would also rewrite any other column that happens to hold a value equal
# to an abbreviation.
fpl_player_df['team'] = fpl_player_df['team'].replace(
    dict(zip(abbr_df['abb'], abbr_df['name']))
)
# Second pass: map the expanded (or still abbreviated) labels to the names
# used as team-stats keys.
team_name_fixes = {
    'Arsenal Tula': 'Arsenal',
    'Leicester City': 'Leicester',
    'Swansea City': 'Swansea',
    'Tottenham Hotspur': 'Tottenham',
    'West Ham United': 'West Ham',
    'STK': 'Stoke',
    'CRY': 'Crystal Palace',
    'MCI': 'Manchester City',
    'Bayern Munich': 'Manchester United',
    'Bursaspor': 'Burnley',
    'Chesterfield': 'Chelsea',
    'Southend United': 'Southampton',
    'Newport': 'Newcastle United',
}
fpl_player_df['team'] = fpl_player_df['team'].replace(team_name_fixes)
fplTeamNameList = np.unique(fpl_player_df['team'])
# FPL labels with no team-stats key, and team-stats keys with no FPL label.
missing1718 = [
    fplTeam for fplTeam in fplTeamNameList
    if fplTeam not in seasonalTeamStats1718.keys()
]
opt = [k for k in seasonalTeamStats1718.keys() if k not in fplTeamNameList]
# missing1718, opt
# fpl_player_df.to_csv ("fpl_players_data/fpl_17-18_CLEAN.csv", index = None, header=True)

In [10]:
# Alternate invocation kept for reference (two extra positional True flags):
# seasonalPlayerStats1415 = seasonalPlayerStats(player_stats_keys, '14-15', True, True)
# seasonalPlayerStats1516 = seasonalPlayerStats(player_stats_keys, '15-16', True, True)
# seasonalPlayerStats1617 = seasonalPlayerStats(player_stats_keys, '16-17', True, True)
# seasonalPlayerStats1718 = seasonalPlayerStats(player_stats_keys, '17-18', True, True)
# Compute the player statistics for each season, in chronological order, and
# bind each result to its own per-season name.
(seasonalPlayerStats1415,
 seasonalPlayerStats1516,
 seasonalPlayerStats1617,
 seasonalPlayerStats1718) = [
    seasonalPlayerStats(player_stats_keys, season_label)
    for season_label in ('14-15', '15-16', '16-17', '17-18')
]

Processing season 14-15
FOUND DUPLICATE ON NAME AND TEAM FOR Jonás Gutiérrez and Ayoze Pérez Gutiérrez
FOUND DUPLICATE ON NAME AND TEAM FOR Luke Daniels and Donervon Daniels
FOUND DUPLICATE ON NAME AND TEAM FOR Steven Davis and Kelvin Davis
FOUND DUPLICATE ON NAME AND TEAM FOR Steven Taylor and Ryan Taylor
NEED MANUAL LOOKUP 14-15 NUMBER: 92 [('Alexis Sánchez', ['Arsenal']), ('Yaya Sanogo', ['Arsenal', 'Crystal Palace']), ('Alex Oxlade Chamberlain', ['Arsenal']), ('Emiliano Martínez', ['Arsenal']), ('Oscar', ['Chelsea']), ('André Schürrle', ['Chelsea']), ('Cesc Fàbregas', ['Chelsea']), ('Willian', ['Chelsea']), ('Filipe Luis', ['Chelsea']), ('Lucas Leiva', ['Liverpool']), ('Kolo Touré', ['Liverpool']), ('Steven Davis', ['Southampton']), ('Graziano Pellè', ['Southampton']), ('Jack Cork', ['Southampton', 'Swansea']), ('Saphir Taïder', ['Southampton']), ('Kelvin Davis', ['Southampton']), ('Darren Fletcher', ['Manchester United', 'West Bromwich Albion']), ('Chicharito', ['Manchester United

FOUND DUPLICATE ON NAME AND TEAM FOR Brad Smith and Adam Smith
FOUND DUPLICATE ON NAME AND TEAM FOR David Silva and Bernardo Silva
FOUND DUPLICATE ON NAME AND TEAM FOR Jordan Ayew and André Ayew
FOUND DUPLICATE ON NAME AND TEAM FOR Murphy and Murphy
FOUND DUPLICATE ON NAME AND TEAM FOR Pereira and Joel Pereira
FOUND DUPLICATE ON NAME AND TEAM FOR Steve Cook and Lewis Cook
NEED MANUAL LOOKUP 17-18 NUMBER: 72 [('Jonas Lössl', ['Huddersfield']), ('Chris Löwe', ['Huddersfield']), ('Sadio Mané', ['Liverpool']), ('Virgil van Dijk', ['Liverpool', 'Southampton']), ('Alex Oxlade Chamberlain', ['Liverpool', 'Arsenal']), ('Davy Pröpper', ['Brighton']), ('Pascal Groß', ['Brighton']), ('Adam Smith', ['Bournemouth']), ('Steve Cook', ['Bournemouth']), ('Lewis Cook', ['Bournemouth']), ('Nathan Aké', ['Bournemouth']), ('Antonio Rüdiger', ['Chelsea']), ("N'Golo Kanté", ['Chelsea']), ('Cesc Fàbregas', ['Chelsea']), ('Islam Slimani', ['Leicester', 'Newcastle United']), ('Federico Fernández', ['Swansea']),

In [11]:
# #SAVE AGGREGATED DATA
# for season in available_season:
#     constructSeasonalMatchTable(season, True)
# constructAllSeason(True)

In [12]:
# Load the combined all-season match table and show its (rows, columns) shape.
combined_data_path = "all_season_matches_clean.csv"
# NOTE: the name `combineed_df` is misspelled; it is left as is because other
# cells may refer to it.
combineed_df = pd.read_csv(combined_data_path)
combineed_df.shape

(1520, 163)

In [6]:
for season in available_season:
    players_path = "players_data/fpl_"+season+"_COMPLETED_CLEAN.csv"
    player_df = pd.read_csv(players_path)
#     player_df.iloc[:,:9].to_csv ("fpl_players_data/fpl_" + season + "_lookedup_clean_complete.csv", index = None, header=True)