In [8]:
import pandas as pd
from datetime import datetime
from datetime import timedelta
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import Location, Outcome
import json

# Player matches data

## Grab all of the daily data

In [9]:
def get_player_matches_over_dates(start_date, end_date):
    player_match_df = pd.DataFrame()
    curr_date = start_date

    while curr_date <= end_date:
        day = curr_date.day
        month = curr_date.month
        year = curr_date.year

        daily_matches = client.player_box_scores(day=day, month=month, year=year)
        daily_df = pd.DataFrame(daily_matches)

        if len(daily_df) > 0:
            daily_df.insert(0, 'date', datetime(year, month, day))
            daily_df.insert(1, 'year', year)
            daily_df.insert(2, 'month', month)
            daily_df.insert(3, 'day', day)
            player_match_df = pd.concat([player_match_df, daily_df])

        curr_date = curr_date + timedelta(days=1)
    
    return player_match_df.reset_index()

## Clean up the data

In [11]:
with open('../data/team_name_mapping.json', 'r') as f:
    team_name_mapping = json.load(f)

### Clean up enums and rename columns

In [12]:
def remove_enum_columns(player_match_df):
    df = player_match_df.assign(
        team_key = lambda df: df['team'].map(lambda team: team_name_mapping[team.value]),
        opponent_key = lambda df: df['opponent'].map(lambda team: team_name_mapping[team.value]),
        home_game = lambda df: df['location'].map(lambda loc: loc == Location.HOME),
        win = lambda df: df['outcome'].map(lambda outcome: outcome == Outcome.WIN)
    )

    return df.drop(columns=['team', 'opponent', 'location', 'outcome'])

In [13]:
def rename_columns(player_match_df):
    df = player_match_df.rename(columns={
        'slug': 'player_id',
        'seconds_played': 'secs',
        'made_field_goals': 'fgm',
        'attempted_field_goals': 'fga',
        'made_three_point_field_goals': '3pm',
        'attempted_three_point_field_goals': '3pa',
        'made_free_throws': 'ftm',
        'attempted_free_throws': 'fta',
        'offensive_rebounds': 'orb',
        'defensive_rebounds': 'drb',
        'assists': 'ast',
        'steals': 'stl',
        'blocks': 'blk',
        'turnovers': 'tvr',
        'personal_fouls': 'pf'
    })

    return df

### Add game ID, season start year

In [14]:
def format_date(date):
    return datetime.strftime(date, format = "%Y%m%d")

In [15]:
def get_game_id(date, home, away):
    return format_date(date) + "_" + home + "_" + away

In [16]:
def get_game_id_for_row(row):
    home = row['team_key'] if row['home_game'] else row['opponent_key']
    away = row['opponent_key'] if row['home_game'] else row['team_key']

    return get_game_id(row['date'], home, away)

In [17]:
def get_season_start_year_for_row(row):
    year = row['year']
    month = row['month']

    if month < 7:
        return year - 1
    else:
        return year

In [18]:
def add_game_ids(player_match_df):
    df = player_match_df.copy()
    df.insert(0, 'game_id', df.apply(get_game_id_for_row, axis=1))

    return df

In [19]:
def add_season_start_year(player_match_df):
    df = player_match_df.copy()
    df.insert(5, 'season_start_year', df.apply(get_season_start_year_for_row, axis=1))

    return df

## Add fantasy points

In [20]:
def get_fantasy_points(row):
    shot_points = 3 * row['3pm'] + 2 * row['fgm'] + row['ftm']
    rebound_points = 1.2 * (row['orb'] + row['drb'])
    assist_points = 1.5 * row['ast']
    block_points = 3.0 * row['blk']
    steal_points = 3.0 * row['stl']

    return shot_points + rebound_points + assist_points + block_points + steal_points

In [21]:
def calculate_fantasy_points_for_df(df):
    return df.apply(lambda x: get_fantasy_points(x), axis = 1)

In [32]:
def add_fantasy_points(player_match_df):
    df = player_match_df.copy()
    df['fp'] = calculate_fantasy_points_for_df(df)
    
    return df

## Add "last-n-games" data

In [23]:
def add_rolling_stats(player_match_df, stats, n):
    df = player_match_df.copy()

    for stat in stats:
        newCol = stat + "_l" + str(n)
        rolStat = (df.groupby('player_id')[stat].apply(
            lambda x: x.rolling(n, min_periods = 1).mean().shift(1).fillna(0)
        )).reset_index(0, drop=True)
        df[newCol] = list(rolStat)
    
    return df

## Methods for adding new data

In [2]:
# TODO: make this first delete the old_data rows that conflict with the new data
# That way we can properly test old fanduel competitions
def roll_with_old_data(new_data, old_data, stats, n):
    combined_data = pd.concat([old_data, new_data], join="inner").reset_index()
    new_data_indices = combined_data.tail(len(new_data)).index
    combined_data.sort_values(by=['game_id'])

    df = add_rolling_stats(combined_data, stats, n)
    
    return df.loc[new_data_indices, :]

## Call functions on fetched data

In [44]:
start_date = datetime(2019, 11, 18) # datetime(2010, 1, 1)
end_date = datetime(2019, 11, 30)

player_match_df = get_player_matches_over_dates(start_date, end_date)

In [45]:
clean_pm_df = remove_enum_columns(player_match_df)
clean_pm_df = rename_columns(clean_pm_df)
clean_pm_df = add_game_ids(clean_pm_df)
clean_pm_df = add_season_start_year(clean_pm_df)
clean_pm_df = add_fantasy_points(clean_pm_df)

In [48]:
stats = ['secs', 'fgm', 'fga', '3pm', '3pa', 'ftm', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tvr', 'pf', 'fp']
ROLLING_GAMES_WINDOW = 5
# Choose one of the below
# 1. Use old data and roll with it
old_data = pd.read_csv("../data/raw_data/20100101_20191117_player_matches.csv")

clean_pm_df = roll_with_old_data(clean_pm_df, old_data, stats, ROLLING_GAMES_WINDOW)
complete_df = pd.concat([old_data, clean_pm_df])
complete_df.to_csv("../data/raw_data/20100101_20191129_player_matches.csv")

# 2. Use collected data in isolation
# clean_pm_df = add_rolling_stats(clean_pm_df, stats, ROLLING_GAMES_WINDOW)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


## Save to dataframe

In [28]:
path = "../data/raw_data/" + format_date(start_date) + "_" + format_date(end_date) + "_" + "player_matches.csv"

clean_pm_df.to_csv(path, index = False)