# Games data

This file takes the player_seasons data and the player_matches_data, aggregates them, and creates a new dataframe (one row per game) in a form that can easily be fed to a model.

In [53]:
import pandas as pd
import numpy as np

In [54]:
# Number of player rows kept per (game_id, home_game) group; passed below as
# `num_players` to create_games_data.
NUM_PLAYERS_ON_TEAM = 7
# Window size used to build the "_l<N>" suffix of the recent-stat columns.
ROLLING_GAMES_WINDOW = 5 # must match player_matches_data (should clean this up, non-dup)
# Paths of the two CSV inputs loaded with pd.read_csv in the next cell.
PLAYER_MATCHES_FILENAME = "../data/raw_data/20100101_20191129_player_matches.csv"
PLAYER_SEASONS_FILENAME = "../data/raw_data/2009_2019_player_seasons_unique.csv"

In [55]:
# Load both raw CSV inputs into DataFrames; they are the first two arguments
# passed to create_games_data further down.
player_matches = pd.read_csv(PLAYER_MATCHES_FILENAME)
player_seasons = pd.read_csv(PLAYER_SEASONS_FILENAME)

## Duplicate the rows (for each game, each player gets to be player_1)

In [56]:
def swap_colums_orders(colnames, i, j):
    """Return *colnames* with the trailing ``_p<i>`` and ``_p<j>`` suffixes exchanged.

    Only the suffix at the very end of a name is rewritten; any other
    occurrence of the same token earlier in the name is left untouched.
    Names that end with neither suffix are returned unchanged.

    Args:
        colnames: Iterable of column-name strings.
        i: Number of the first player slot.
        j: Number of the second player slot.

    Returns:
        A new list with the same length and order as *colnames*.
    """
    pi = "_p" + str(i)
    pj = "_p" + str(j)
    swapped_colnames = []
    for col in colnames:
        if col.endswith(pi):
            # Slice off the suffix instead of str.replace, which would also
            # rewrite matches in the middle of the name.
            new_col = col[: -len(pi)] + pj
        elif col.endswith(pj):
            new_col = col[: -len(pj)] + pi
        else:
            new_col = col
        swapped_colnames.append(new_col)
    return swapped_colnames

def swap_team_columns(colnames, nb_players):
    """Exchange the column names of slots 1..nb_players with the next nb_players slots.

    Slot ``k`` is swapped with slot ``nb_players + k`` for every ``k`` from 1
    to ``nb_players``, by applying ``swap_colums_orders`` once per pair.

    Args:
        colnames: Column names to rewrite.
        nb_players: Number of player slots per team.

    Returns:
        The rewritten column names (``colnames`` itself when ``nb_players``
        is less than 1, since no swap is applied).
    """
    first_team_slots = range(1, nb_players + 1)
    second_team_slots = range(nb_players + 1, 2 * nb_players + 1)

    result = colnames
    for slot_a, slot_b in zip(first_team_slots, second_team_slots):
        result = swap_colums_orders(result, slot_a, slot_b)
    return result

In [59]:
def duplicate_rows(per_game_df, players_per_team):
    """Expand a per-game frame so every player appears once in the ``_p1`` slot.

    The output stacks, in this order:

    1. the input frame as given;
    2. for each slot ``i`` in ``2..players_per_team``, a copy with slots 1
       and ``i`` exchanged;
    3. a copy with the two teams' slots exchanged (``swap_team_columns``);
    4. for that team-swapped copy, the same slot-1 exchanges as in step 2.

    Afterwards the ``fp_p<k>`` and ``player_id_p<k>`` columns for
    ``k`` in ``2..2 * players_per_team`` are dropped, leaving only the
    slot-1 versions of those two fields.

    Args:
        per_game_df: Wide frame whose columns end in ``_p<k>``.
        players_per_team: Number of player slots per team.

    Returns:
        The stacked frame; row index labels of the input are repeated, not
        reset.

    Raises:
        KeyError: If any of the columns to drop is missing (raised by
            ``DataFrame.drop``).
    """
    colnames_all = per_game_df.columns
    frames = [per_game_df]
    current = per_game_df
    for team in range(2):
        if team == 1:
            # Second pass: put the other team's players in slots 1..n.
            current = per_game_df[swap_team_columns(colnames_all, players_per_team)]
            current.columns = colnames_all
            frames.append(current)
        # Use the actual team size, not a hard-coded 7-player bound.
        for i in range(2, players_per_team + 1):
            # Select columns in swapped order, then restore the original
            # labels so slot i's data ends up under the slot-1 names.
            temps_df = current[swap_colums_orders(colnames_all, 1, i)]
            temps_df.columns = colnames_all
            frames.append(temps_df)

    # Concatenate once instead of growing the frame inside the loop.
    all_per_game_df = pd.concat(frames)

    other_slots = range(2, players_per_team * 2 + 1)
    to_drop = ["fp_p" + str(i) for i in other_slots]
    to_drop += ["player_id_p" + str(i) for i in other_slots]
    return all_per_game_df.drop(to_drop, axis=1)

## Final function

In [58]:
def create_games_data(player_matches, player_seasons, num_players, rolling_games_window, duplicate=False):
    """Build one wide row per game from per-player match rows and season rows.

    For each ``(game_id, home_game)`` group the ``num_players`` rows with the
    highest ``fp_l<rolling_games_window>`` value are kept. Each kept row is
    left-joined to the season row whose ``end_year`` equals the match row's
    ``season_start_year`` (missing values become 0). The rows of a game are
    then spread into columns named ``<field>_p<k>``, where ``k`` is the
    1-based position of the player within the game.

    Rows are ordered so that each team's players occupy contiguous slots,
    which is what ``swap_team_columns`` / ``duplicate_rows`` assume: rows with
    the larger ``home_game`` value come first, and within a team players are
    ordered by descending recent ``fp``.

    Args:
        player_matches: DataFrame with columns ``game_id``, ``home_game``,
            ``player_id``, ``season_start_year``, ``fp`` and
            ``<stat>_l<rolling_games_window>`` for every stat listed below.
        player_seasons: DataFrame with columns ``player_id``, ``end_year``
            and ``<stat>_seas_avg`` for every stat listed below.
        num_players: Maximum number of rows kept per ``(game_id, home_game)``.
        rolling_games_window: Window size used in the ``_l<N>`` column suffix.
        duplicate: When true, pass the result through ``duplicate_rows``.

    Returns:
        DataFrame with one row per ``game_id`` (in ascending ``game_id``
        order, index reset) and NaN replaced by 0; or the output of
        ``duplicate_rows`` on that frame when ``duplicate`` is true.
    """
    recent_stats_suffix = "_l" + str(rolling_games_window)

    # Sort by team as well as by recent fp: without `home_game` in the key the
    # two teams' players are interleaved and slots 1..n / n+1..2n no longer
    # correspond to one team each.
    # NOTE(review): putting the larger `home_game` value first is a choice —
    # confirm which team is expected in slots 1..num_players.
    player_matches = player_matches.sort_values(
        by=['game_id', 'home_game', 'fp' + recent_stats_suffix],
        ascending=[True, False, False],
    )

    # head() keeps the sorted row order, so each team stays contiguous.
    team_game_group = player_matches.groupby(by=['game_id', 'home_game'])
    player_matches = team_game_group.head(num_players).reset_index(drop=True)

    stats = ['fgm', 'fga', '3pm', '3pa', 'ftm', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tvr', 'pf', 'fp']
    season_stats = ['player_id'] + [s + "_seas_avg" for s in stats]
    recent_stats = [s + recent_stats_suffix for s in stats]

    merge_season_fields = ['end_year'] + season_stats
    to_merge_season_data = player_seasons[merge_season_fields]

    # Left join keeps every match row (and its order); players without a
    # matching season row get 0 for all season averages.
    merged_data = pd.merge(
        player_matches,
        to_merge_season_data,
        how='left',
        left_on=['player_id', 'season_start_year'],
        right_on=['player_id', 'end_year']
    ).fillna(0)

    y = ['fp']
    all_data = season_stats + recent_stats + y

    game_group = merged_data.groupby('game_id')

    # NOTE(review): if a team has fewer than `num_players` rows in a game, the
    # following team's players shift into its slots — confirm whether such
    # games occur in the input.
    field_frames = []
    for field in all_data:
        # One list of values per game, in the sorted player order.
        per_game_values = game_group[field].apply(lambda values: values.tolist())
        wide = (
            pd.DataFrame(per_game_values.tolist(), index=per_game_values.index)
            .rename(columns=lambda position: position + 1)
            .add_prefix(field + "_p")
            .reset_index(drop=True)
        )
        field_frames.append(wide)

    # Single column-wise concat instead of growing the frame field by field.
    per_game_df = pd.concat(field_frames, axis=1, sort=False)

    if duplicate:
        per_game_df = duplicate_rows(per_game_df, num_players)

    return per_game_df.fillna(0)

### Generate the per-game data

In [48]:
# Build the per-game table without row duplication and write it to CSV.
per_game_df = create_games_data(
    player_matches=player_matches,
    player_seasons=player_seasons,
    num_players=NUM_PLAYERS_ON_TEAM,
    rolling_games_window=ROLLING_GAMES_WINDOW,
)
# The team size is embedded in the output file name.
path = "".join(["../data/raw_data/games_", str(NUM_PLAYERS_ON_TEAM), "_players", ".csv"])
per_game_df.to_csv(path, index=False)

### Generate the per-game data with "duplicated" rows

In [51]:
# Build the per-game table with duplicate=True (rows expanded by
# duplicate_rows) and write it to CSV.
all_per_game_df = create_games_data(
    player_matches=player_matches,
    player_seasons=player_seasons,
    num_players=NUM_PLAYERS_ON_TEAM,
    rolling_games_window=ROLLING_GAMES_WINDOW,
    duplicate=True,
)
# The team size is embedded in the output file name.
path = "".join(["../data/raw_data/all_games_", str(NUM_PLAYERS_ON_TEAM), "_players", ".csv"])
all_per_game_df.to_csv(path, index=False)