# Games data

This notebook takes the `player_seasons` data and the `player_matches` data, combines them, and reshapes the result into a new dataframe (one row per game, one group of columns per player) that can be fed directly to a model.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Locations of the two input CSV files, relative to this notebook.
PLAYER_SEASONS_FILENAME = "../data/2009_2019_player_seasons_unique.csv"
PLAYER_MATCHES_FILENAME = "../data/20100101_20191118_player_matches.csv"

# Size of the rolling window behind the "_l<N>" recent-stat columns.
# Must match the value used when player_matches_data was generated
# (TODO: define it in one place instead of duplicating it here).
ROLLING_GAMES_WINDOW = 5

# How many players are kept per team for every game.
NUM_PLAYERS_CONSIDERED_PER_TEAM = 7

In [4]:
# Load the two input CSVs (matches first, then seasons) into dataframes.
player_matches, player_seasons = (
    pd.read_csv(csv_path)
    for csv_path in (PLAYER_MATCHES_FILENAME, PLAYER_SEASONS_FILENAME)
)

In [18]:
def limit_players_per_game(player_matches, players_per_team, recent_stats_suffix):
    """Keep only the top players of each team in each game, grouped by team.

    Players are ranked by their recent fantasy-points column
    ('fp' + recent_stats_suffix), highest first, and at most
    `players_per_team` rows are kept for every (game_id, home_game) pair.

    The result is ordered by game_id, then by home_game (larger value
    first), then by the ranking column (descending). Keeping each team's
    rows contiguous matters because later steps assign player slots by row
    order within a game and treat the first and second half of the slots as
    the two teams; sorting on game_id and the ranking column alone would
    interleave the two teams.

    Args:
        player_matches: DataFrame with at least the columns 'game_id',
            'home_game' and 'fp' + recent_stats_suffix.
        players_per_team: maximum number of rows kept per team and game.
        recent_stats_suffix: suffix of the recent-stat columns, e.g. "_l5".

    Returns:
        A new DataFrame with a fresh 0..n-1 index; the input is not modified.
    """
    ranking_column = 'fp' + recent_stats_suffix

    # sort_values returns a new frame, so the caller's data is left untouched.
    ordered = player_matches.sort_values(
        by=['game_id', 'home_game', ranking_column],
        ascending=[True, False, False],
    )

    # head() keeps the first rows of each group in their current (sorted)
    # order, i.e. the best-ranked players of every team.
    top_players = ordered.groupby(by=['game_id', 'home_game']).head(players_per_team)

    return top_players.reset_index(drop=True)

In [19]:
def merge_match_and_season_data(matches, seasons, season_fields):
    """Attach season-level columns to every match row.

    Performs a left join of `matches` with the `season_fields` columns of
    `seasons`, pairing rows whose 'player_id' is equal and whose
    'season_start_year' (matches side) equals 'end_year' (seasons side).
    Every missing value in the joined frame, including those of match rows
    without a season counterpart, is replaced by 0.
    """
    season_subset = seasons[season_fields]
    joined = matches.merge(
        season_subset,
        how='left',
        left_on=['player_id', 'season_start_year'],
        right_on=['player_id', 'end_year'],
    )
    return joined.fillna(0)

In [20]:
def move_players_to_columns(games_data, stats):
    """Pivot per-player rows into one wide row per game.

    For every field in `stats`, the values of all rows sharing a 'game_id'
    are spread, in row order, over columns named '<field>_p1',
    '<field>_p2', ... Games with fewer rows than the largest game are
    padded with 0. Rows of the result follow the sorted game ids; the
    game id itself is not kept as a column.

    Args:
        games_data: DataFrame with a 'game_id' column and every column
            listed in `stats`.
        stats: names of the columns to spread across player slots.

    Returns:
        A DataFrame with one row per game and the columns of all fields
        side by side, in the order of `stats`. Empty if `stats` is empty.
    """
    game_group = games_data.groupby('game_id')

    wide_frames = []
    for field in stats:
        # Select the column before applying so only that column is handed
        # to the callback (not the whole group frame, grouping key included).
        values_per_game = game_group[field].apply(lambda column: column.tolist())
        wide = pd.DataFrame(values_per_game.tolist(), index=values_per_game.index)
        # Positional columns 0, 1, ... become <field>_p1, <field>_p2, ...
        wide.columns = [field + "_p" + str(slot + 1) for slot in wide.columns]
        wide_frames.append(wide.reset_index(drop=True))

    if not wide_frames:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(wide_frames, axis=1, sort=False).fillna(0)

In [23]:
def create_games_data(player_matches, player_seasons, players_per_team, rolling_games_window):
    """Build the wide, one-row-per-game table from the player-level inputs.

    Steps:
      1. keep at most `players_per_team` players per team and game,
      2. attach each player's season-average columns,
      3. spread the players of every game over '<stat>_p<N>' columns.

    Args:
        player_matches: per-player, per-game DataFrame holding the recent
            stat columns ('<stat>_l<rolling_games_window>') and 'fp'.
        player_seasons: DataFrame holding 'player_id', 'end_year' and the
            '<stat>_seas_avg' columns.
        players_per_team: number of players kept per team and game.
        rolling_games_window: window size used to name the recent-stat
            columns (suffix "_l" + window).

    Returns:
        DataFrame produced by move_players_to_columns for the season
        columns (including 'player_id'), the recent columns and 'fp'.
    """
    recent_stats_suffix = "_l" + str(rolling_games_window)
    stats = ['fgm', 'fga', '3pm', '3pa', 'ftm', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tvr', 'pf', 'fp']
    season_stats = ['player_id'] + [s + "_seas_avg" for s in stats]
    recent_stats = [s + recent_stats_suffix for s in stats]

    adj_player_matches = limit_players_per_game(player_matches, players_per_team, recent_stats_suffix)

    season_fields = ['end_year'] + season_stats
    # Merge the *limited* rows: merging the raw player_matches here would
    # silently discard the per-team player limit computed above.
    games_data = merge_match_and_season_data(adj_player_matches, player_seasons, season_fields)

    all_stats = season_stats + recent_stats + ['fp']

    return move_players_to_columns(games_data, all_stats)

In [24]:
# Build the per-game table from the loaded inputs and the notebook settings.
per_game_df = create_games_data(
    player_matches=player_matches,
    player_seasons=player_seasons,
    players_per_team=NUM_PLAYERS_CONSIDERED_PER_TEAM,
    rolling_games_window=ROLLING_GAMES_WINDOW,
)

In [26]:
# Write the per-game table to ../data/games_<N>_players.csv without the index.
path = "".join(["../data/games_", str(NUM_PLAYERS_CONSIDERED_PER_TEAM), "_players", ".csv"])
per_game_df.to_csv(path, index=False)

## Duplicate the rows (for each game, each player gets to be player_1)

In [93]:
def swap_colums_orders(colnames, i, j):
    """Return `colnames` with the player suffixes '_p<i>' and '_p<j>' exchanged.

    A name ending in '_p<i>' gets the ending '_p<j>' and vice versa; every
    other name is returned unchanged. Only the trailing suffix is rewritten,
    so a name that happens to contain the same text earlier on (e.g.
    'a_p1_p1') keeps that inner part intact.

    Args:
        colnames: iterable of column-name strings.
        i: first player slot.
        j: second player slot.

    Returns:
        A new list of names in the same order as `colnames`.
    """
    pi = "_p" + str(i)
    pj = "_p" + str(j)

    swapped_colnames = []
    for col in colnames:
        if col.endswith(pj):
            new_col = col[:len(col) - len(pj)] + pi
        elif col.endswith(pi):
            new_col = col[:len(col) - len(pi)] + pj
        else:
            new_col = col
        swapped_colnames.append(new_col)
    return swapped_colnames
def swap_team_columns(colnames, players_per_team=7):
    """Return `colnames` with the player slots of the two teams exchanged.

    Slot k (1 <= k <= players_per_team) is exchanged with slot
    k + players_per_team: a name ending in '_p<k>' gets the ending
    '_p<k + players_per_team>' and vice versa. Names without such an
    ending are returned unchanged.

    Args:
        colnames: iterable of column-name strings.
        players_per_team: number of player slots per team. Defaults to 7,
            the team size this function was originally hard-wired to.

    Returns:
        A new list of names in the same order as `colnames`.
    """
    # Map every slot number (as text) to its counterpart in the other team,
    # so all names are rewritten in one pass.
    counterpart = {}
    for slot in range(1, players_per_team + 1):
        counterpart[str(slot)] = str(slot + players_per_team)
        counterpart[str(slot + players_per_team)] = str(slot)

    swaped_teams = []
    for col in colnames:
        # Split on the last "_p": whatever follows it is the slot number.
        stem, marker, slot_text = col.rpartition("_p")
        if marker and slot_text in counterpart:
            swaped_teams.append(stem + marker + counterpart[slot_text])
        else:
            swaped_teams.append(col)
    return swaped_teams

In [None]:
# Augment the data so that, for every game, each player appears once in the
# player-1 slot. The unmodified frame covers the player already in slot 1.
colnames_all = per_game_df.columns
augmented_frames = [per_game_df]

for team in range(2):
    # Slot 1 leads the first team; slot NUM_PLAYERS_CONSIDERED_PER_TEAM + 1
    # leads the second team.
    team_first_slot = 1 + NUM_PLAYERS_CONSIDERED_PER_TEAM * team
    team_last_slot = NUM_PLAYERS_CONSIDERED_PER_TEAM * (team + 1)

    # First team: start at slot 2, slot 1 is the unmodified frame itself.
    # Second team: also include its first slot (a no-op player swap) so that
    # this player reaches slot 1 through the team swap below.
    start_slot = team_first_slot + 1 if team == 0 else team_first_slot

    for i in range(start_slot, team_last_slot + 1):
        # Move player i into the leading slot of its own team.
        temps_df = per_game_df[swap_colums_orders(colnames_all, team_first_slot, i)]
        # Relabel right away: column selection is label-based, so without
        # this the team swap below would start again from the original
        # layout and throw the player swap away.
        temps_df.columns = colnames_all
        if team == 1:
            # Exchange the two teams so the chosen player lands in slot 1.
            # NOTE: swap_team_columns must use the same team size as
            # NUM_PLAYERS_CONSIDERED_PER_TEAM.
            temps_df = temps_df[swap_team_columns(colnames_all)]
            temps_df.columns = colnames_all
        augmented_frames.append(temps_df)

# Concatenate once instead of growing the frame on every iteration.
all_per_game_df = pd.concat(augmented_frames)

In [113]:
# Write the augmented table (every player rotated into the player-1 slot).
# This must be all_per_game_df: writing per_game_df here would just store
# the un-augmented table again under the "all_games" file name.
path = "../data/all_games_" + str(NUM_PLAYERS_CONSIDERED_PER_TEAM) + "_players" + ".csv"
all_per_game_df.to_csv(path, index=False)