# Games data

This file takes the player_seasons data and the player_matches_data, aggregates them, and creates a new dataframe in a form that can be easily input to a model

In [2]:
import pandas as pd
import numpy as np

In [3]:
# How many players are kept per (game, team); passed to create_games_data as players_per_team.
NUM_PLAYERS_CONSIDERED_PER_TEAM = 7
# Rolling window size. It is turned into the "_l<N>" column suffix, so it must match the
# window used when the player_matches data was generated.
# TODO: define this in one shared place instead of duplicating it across notebooks.
ROLLING_GAMES_WINDOW = 5 # must match player_matches_data (should clean this up, non-dup)
# Input CSVs, relative to the notebook's working directory.
PLAYER_MATCHES_FILENAME = "../data/20100101_20191118_player_matches.csv"
PLAYER_SEASONS_FILENAME = "../data/2009_2019_player_seasons_unique.csv"

In [4]:
# Load the two input tables: one row per player per match, and one row per player per season.
# NOTE(review): row granularity is inferred from the file names — confirm against the CSVs.
player_matches = pd.read_csv(PLAYER_MATCHES_FILENAME)
player_seasons = pd.read_csv(PLAYER_SEASONS_FILENAME)

In [40]:
def limit_players_per_game(player_matches, players_per_team, recent_stats_suffix):
    """Keep only the top `players_per_team` rows of every (game_id, home_game) group.

    Rows are ranked by the recent 'fp' column (`'fp' + recent_stats_suffix`), highest first.

    Parameters
    ----------
    player_matches : DataFrame with at least `game_id`, `home_game` and the recent fp column.
    players_per_team : maximum number of rows kept per (game_id, home_game) group.
    recent_stats_suffix : suffix of the rolling-window columns, e.g. "_l5".

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index; the input frame is not modified.

    NOTE(review): within a game the kept rows stay ordered by recent fp across BOTH
    teams, so one team's players are not necessarily contiguous. Confirm this is what
    downstream code that treats slots 1-7 / 8-14 as the two teams expects.
    """
    recent_fp_col = 'fp' + recent_stats_suffix

    # Game by game, best recent fp first.
    ranked = player_matches.sort_values(
        by=['game_id', recent_fp_col],
        ascending=[True, False],
    )

    # head() filters without reordering, so the ranking above is preserved.
    top_rows = ranked.groupby(by=['game_id', 'home_game']).head(players_per_team)
    return top_rows.reset_index(drop=True)

In [32]:
def merge_match_and_season_data(matches, seasons, season_fields):
    """Attach season-level columns to every match row with a left join.

    A match row is paired with the season row of the same `player_id` whose
    `end_year` equals the match's `season_start_year`. Match rows without a
    matching season row are kept.

    Parameters
    ----------
    matches : DataFrame with `player_id` and `season_start_year`.
    seasons : DataFrame with `player_id`, `end_year` and the columns in `season_fields`.
    season_fields : columns of `seasons` to bring over; must include the join keys
        `player_id` and `end_year`.

    Returns
    -------
    The merged DataFrame, with every missing value (in any column) replaced by 0.
    """
    season_subset = seasons[season_fields]
    merged = matches.merge(
        season_subset,
        how='left',
        left_on=['player_id', 'season_start_year'],
        right_on=['player_id', 'end_year'],
    )
    return merged.fillna(0)

In [33]:
def move_players_to_columns(games_data, stats):
    """Pivot player rows into one wide row per game.

    For every field in `stats`, the values of all rows sharing a `game_id` are laid
    out left to right, in their current row order, in columns `<field>_p1`,
    `<field>_p2`, ... Games with fewer rows than the widest game get 0 in the
    unused slots.

    Parameters
    ----------
    games_data : DataFrame with a `game_id` column and every column named in `stats`.
    stats : iterable of column names to spread across player slots.

    Returns
    -------
    DataFrame with one row per distinct `game_id` (in groupby order) and a fresh
    0..n-1 index; `game_id` itself is not kept. An empty DataFrame when `stats`
    is empty.
    """
    game_group = games_data.groupby('game_id')

    wide_blocks = []
    for field in stats:
        # Select the column before applying: avoids handing the grouping column to
        # the callback (deprecated for DataFrameGroupBy.apply in recent pandas).
        values_per_game = game_group[field].apply(lambda col: col.tolist())
        block = pd.DataFrame(values_per_game.tolist(), index=values_per_game.index)
        # Positional columns 0, 1, ... become 1-based player slots.
        block.columns = [field + "_p" + str(slot + 1) for slot in block.columns]
        wide_blocks.append(block.reset_index(drop=True))

    if not wide_blocks:
        # pd.concat raises on an empty list; keep the "no stats -> empty frame" result.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(wide_blocks, axis=1, sort=False).fillna(0)

In [41]:
def create_games_data(player_matches, player_seasons, players_per_team, rolling_games_window):
    """Build the wide, one-row-per-game frame from match and season data.

    Steps:
      1. keep the top `players_per_team` rows per (game, team) by recent fp;
      2. left-join the season columns onto those rows;
      3. spread `player_id`, the season columns (`<stat>_seas_avg`), the rolling
         columns (`<stat>_l<window>`) and the raw `fp` column across player slots.

    Parameters
    ----------
    player_matches : per-player, per-match DataFrame.
    player_seasons : per-player, per-season DataFrame.
    players_per_team : number of rows kept per (game, team).
    rolling_games_window : window size used to build the "_l<N>" column suffix.

    Returns
    -------
    The DataFrame produced by move_players_to_columns.
    """
    base_stats = ['fgm', 'fga', '3pm', '3pa', 'ftm', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tvr', 'pf', 'fp']
    suffix = "_l" + str(rolling_games_window)

    # player_id travels with the season columns so it is spread across slots too.
    season_cols = ['player_id']
    season_cols.extend(name + "_seas_avg" for name in base_stats)
    recent_cols = [name + suffix for name in base_stats]

    top_players = limit_players_per_game(player_matches, players_per_team, suffix)
    with_seasons = merge_match_and_season_data(
        top_players,
        player_seasons,
        ['end_year'] + season_cols,
    )

    return move_players_to_columns(with_seasons, season_cols + recent_cols + ['fp'])

In [35]:
# Build the wide one-row-per-game frame from the loaded match and season tables.
per_game_df = create_games_data(player_matches, player_seasons, NUM_PLAYERS_CONSIDERED_PER_TEAM, ROLLING_GAMES_WINDOW)

In [36]:
# Display the resulting frame (pandas truncates long/wide frames in the rendered output).
per_game_df

Unnamed: 0,player_id_p1,player_id_p2,player_id_p3,player_id_p4,player_id_p5,player_id_p6,player_id_p7,player_id_p8,player_id_p9,player_id_p10,...,fp_p5,fp_p6,fp_p7,fp_p8,fp_p9,fp_p10,fp_p11,fp_p12,fp_p13,fp_p14
0,robinna01,horfoal01,johnsjo02,chandwi01,smithjo03,bibbymi01,gallida01,harrial01,leeda02,duhonch01,...,49.5,31.9,30.7,27.8,25.4,24.4,16.9,14.2,12.2,11.6
1,bryanko01,hawessp01,udrihbe01,gasolpa01,casspom01,odomla01,bynuman01,udokaim01,fishede01,rodrise01,...,43.7,43.5,34.3,25.5,19.1,15.5,10.2,10.1,9.9,7.9
2,flynnjo01,howardw01,anderry01,nelsoja01,lewisra02,barnema02,cartevi01,jeffeal01,loveke01,ellinwa01,...,37.9,34.7,34.4,33.5,30.5,23.3,21.1,18.2,16.7,14.2
3,wallara01,boshch01,allenra02,allento01,turkohe01,jackja01,bargnan01,perkike01,houseed01,davisgl01,...,35.9,35.9,34.3,33.5,29.8,24.0,14.5,14.1,12.4,11.2
4,howardw01,anderry01,nelsoja01,lewisra02,barnema02,cartevi01,rosede01,denglu01,gibsota01,salmojo01,...,45.3,20.6,47.7,39.8,36.4,27.8,22.8,22.7,20.1,19.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12600,youngtr01,jamesle01,davisan02,parkeja01,rondora01,mcgeeja01,huntede01,crabbal01,kuzmaky01,howardw01,...,28.9,22.4,21.9,16.2,33.5,23.3,21.3,29.1,40.4,11.8
12601,jokicni01,murraja01,crowdja01,jacksja02,millspa01,moranja01,valanjo01,anderky01,clarkbr01,morrimo01,...,47.1,26.4,32.0,19.8,30.5,21.4,31.3,27.8,30.9,32.6
12602,holidjr01,redicjj01,pascher01,greendr01,willike04,alexani01,burksal01,mooreet01,mellini01,bowmaky01,...,26.0,44.0,19.3,22.4,36.5,36.4,27.3,26.2,27.4,34.0
12603,bealbr01,vucevni01,gordoaa01,fournev01,bryanth01,bertada01,wagnemo01,milescj01,rosste01,thomais02,...,32.2,42.1,24.1,37.5,29.3,26.5,27.1,32.6,16.2,22.3


In [37]:
# Write the per-game frame to ../data, with the team size encoded in the file name.
path = "".join(["../data/games_", str(NUM_PLAYERS_CONSIDERED_PER_TEAM), "_players", ".csv"])
per_game_df.to_csv(path, index=False)

## Duplicate the rows (for each game, each player gets to be player_1)

In [93]:
def swap_colums_orders(colnames, i, j):
    """Return `colnames` with the player-slot suffixes `_p<i>` and `_p<j>` exchanged.

    A name ending in `_p<i>` gets that ending replaced by `_p<j>` and vice versa;
    every other name is returned unchanged. Only the trailing suffix is rewritten,
    so an identical substring earlier in the name is left alone.

    Parameters
    ----------
    colnames : iterable of column-name strings.
    i, j : player slot numbers whose suffixes are exchanged.

    Returns
    -------
    A new list of names, in the same order as `colnames`.
    """
    suffix_i = "_p" + str(i)
    suffix_j = "_p" + str(j)

    swapped_colnames = []
    for col in colnames:
        if col.endswith(suffix_i):
            new_col = col[:len(col) - len(suffix_i)] + suffix_j
        elif col.endswith(suffix_j):
            new_col = col[:len(col) - len(suffix_j)] + suffix_i
        else:
            new_col = col
        swapped_colnames.append(new_col)
    return swapped_colnames
def swap_team_columns(colnames, players_per_team=7):
    """Return `colnames` with the two teams' player-slot suffixes exchanged.

    Slot k of the first block (1..players_per_team) is swapped with slot
    players_per_team + k of the second block, one pair at a time.

    Parameters
    ----------
    colnames : iterable of column-name strings ending in `_p<slot>`.
    players_per_team : number of slots per team; defaults to 7, the value this
        function previously hard-coded.

    Returns
    -------
    A list of names in the same order as `colnames`, with suffixes exchanged.
    """
    swaped_teams = colnames
    for slot in range(1, players_per_team + 1):
        swaped_teams = swap_colums_orders(swaped_teams, slot, players_per_team + slot)
    return swaped_teams

In [None]:
# Build one extra copy of every game for each player that is not already player 1,
# with that player's columns moved into the `_p1` slots.
# NOTE(review): this assumes slots 1-7 hold one team and slots 8-14 the other —
# confirm against the row order produced by limit_players_per_game.
colnames_all = list(per_game_df.columns)
team_size = NUM_PLAYERS_CONSIDERED_PER_TEAM
# swap_team_columns(colnames_all) exchanges slots 1-7 with slots 8-14, so the slot
# arithmetic below is only valid for 7 players per team.
assert team_size == 7, "swap_team_columns(colnames_all) swaps slots 1-7 with 8-14"

frames = [per_game_df]
for team in range(2):
    first_slot = 1 + team_size * team
    # Team 0's first slot is already player 1 in the original rows, so skip it.
    # Team 1's first slot still needs a copy (reached through the team swap alone).
    start_slot = (first_slot + 1) if team == 0 else first_slot
    for i in range(start_slot, first_slot + team_size):
        temps_df = per_game_df[swap_colums_orders(colnames_all, first_slot, i)]
        # Rename immediately: a later selection by name would otherwise undo the
        # positional reordering done by the line above.
        temps_df.columns = colnames_all
        if team == 1:
            temps_df = temps_df[swap_team_columns(colnames_all)]
            temps_df.columns = colnames_all
        frames.append(temps_df)

# Concatenate once instead of growing the frame inside the loop.
all_per_game_df = pd.concat(frames)

In [113]:
# Save the augmented frame built in the duplication cell (all_per_game_df),
# not the original per_game_df that was already written to games_<N>_players.csv.
path = "../data/all_games_" + str(NUM_PLAYERS_CONSIDERED_PER_TEAM) + "_players" + ".csv"
all_per_game_df.to_csv(path, index = False)