In [1]:
import os
import pandas as pd
import sys

from pathlib import Path

# Custom imports
sys.path.append(os.path.join(
    os.path.abspath('.'), 'notebooks'))

from commons import *

In [4]:
# File names of the raw Basketball-Reference lookup tables; they are joined
# with DIR_raw when the tables are read.
FILE_BBRef_players = 'BBRef_Ids_to_Players.csv'
FILE_BBRef_teams   = 'BBRef_Ids_to_Teams.csv'

In [5]:
# Read both raw lookup tables (players first, then teams) from DIR_raw.
# DIR_raw is not defined in this notebook; presumably it comes from `commons`.
df_BBRef_players, df_BBRef_teams = (
    pd.read_csv(DIR_raw / raw_name)
    for raw_name in (FILE_BBRef_players, FILE_BBRef_teams)
)

## Dataset processing

### Ids to Players

In [None]:
# Build the player lookup table as a single chain:
#   1. keep only the name / id / birth-date columns and drop incomplete rows,
#   2. normalise the id with str.capitalize (first character upper-case,
#      the rest lower-case),
#   3. pass every name through unicd2ascii (helper not defined in this
#      notebook; presumably from `commons`, and by its name it converts
#      Unicode to ASCII -- TODO confirm),
#   4. switch to the short column names and index the table by 'Id'.
df_BBRef_players = (
    df_BBRef_players[['BBRefName', 'BBRefID', 'BBRefBirthDate']]
    .dropna()
    .assign(
        BBRefID=lambda players: players['BBRefID'].str.capitalize(),
        BBRefName=lambda players: players['BBRefName'].apply(unicd2ascii),
    )
    .rename(columns={'BBRefName': 'Name',
                     'BBRefID': 'Id',
                     'BBRefBirthDate': 'DOB'})
    .set_index('Id')
)

# Persist the cleaned lookup table (the 'Id' index is written as the first column).
df_BBRef_players.to_csv(DIR_raw / 'Ids_to_Players.csv')

### Ids to Teams 

In [None]:
# Build the team lookup table: keep the name / abbreviation pair, drop
# incomplete rows and exact duplicates, switch to the short column names
# and index the table by 'Id' (the team abbreviation).
df_BBRef_teams = (
    df_BBRef_teams[['BBRef_Team_Name', 'BBRef_Team_Abbreviation']]
    .dropna()
    .drop_duplicates()
    .rename(columns={'BBRef_Team_Name': 'Name',
                     'BBRef_Team_Abbreviation': 'Id'})
    .set_index('Id')
)

# Persist the cleaned lookup table (the 'Id' index is written as the first column).
df_BBRef_teams.to_csv(DIR_raw / 'Ids_to_Teams.csv')

### Clean up NBA players, teams and league data

In [2]:
# Which kind of games to process: 'RS' or 'PO'. Any other value makes the
# directory-selection cell raise ValueError. Judging by that error message,
# 'RS' means regular season and 'PO' means playoffs.
TYPE_game = 'RS'

In [3]:
# Choose the raw (source) and processed (destination) directories that
# match TYPE_game. The DIR_* names are not defined in this notebook;
# presumably they come from `commons`.
if TYPE_game not in ('PO', 'RS'):
    raise ValueError('Only NBA regular seasons or playoffs can be processed.')

if TYPE_game == 'PO':
    src_dir_pace, src_dir_teams, src_dir_players = (
        DIR_raw_po_pace, DIR_raw_po_teams, DIR_raw_po_players)
    dst_dir_league, dst_dir_teams, dst_dir_players = (
        DIR_pro_po_league, DIR_pro_po_teams, DIR_pro_po_players)
else:
    # Only 'RS' can reach this branch because of the guard above.
    src_dir_pace, src_dir_teams, src_dir_players = (
        DIR_raw_rs_pace, DIR_raw_rs_teams, DIR_raw_rs_players)
    dst_dir_league, dst_dir_teams, dst_dir_players = (
        DIR_pro_rs_league, DIR_pro_rs_teams, DIR_pro_rs_players)

In [4]:
# Produce the processed team, player and league tables for every season
# from YEAR_start to YEAR_end (inclusive).
# YEAR_start, YEAR_end, GP_min and year2filename are not defined in this
# notebook; presumably they come from `commons` via the star import.
for y in range(YEAR_start, YEAR_end + 1):
    filename = year2filename(y)

    df_pace    = pd.read_csv(src_dir_pace / filename)
    df_teams   = pd.read_csv(src_dir_teams / filename)
    df_players = pd.read_csv(src_dir_players / filename)

    # Attach the pace columns to the team rows. The default inner join
    # keeps only teams that appear in both tables.
    df_teams   = pd.merge(df_teams, df_pace, on='Team')

    # Filter out data with less than min GP
    df_teams   = df_teams[df_teams['GP'] >= GP_min]
    df_players = df_players[df_players['GP'] >= GP_min]

    # League table = one row holding the column-wise mean over the
    # remaining teams.
    # * `columns='Team'` replaces the positional axis argument
    #   (`drop('Team', 1)`), which pandas 2.0 no longer accepts.
    # * `numeric_only=True` keeps the older behaviour of skipping any
    #   non-numeric column; pandas 2.0 raises on such columns by default.
    df_league  = (
        df_teams
        .drop(columns='Team')
        .mean(numeric_only=True)
        .to_frame()
        .T
    )

    df_teams.to_csv(dst_dir_teams / filename, index=False)
    df_players.to_csv(dst_dir_players / filename, index=False)
    df_league.to_csv(dst_dir_league / filename, index=False)