# Clean Data

- Historical odds and results data sourced from [here](https://www.sportsbookreviewsonline.com/scoresoddsarchives/nba/nbaoddsarchives.htm).
- This data includes some pre-season and exihibtion games; exclude those using results data from nba.com to get season start and end dates.

In [318]:
import datetime
import hashlib
import os
import pandas as pd

import utils

In [311]:
def parse(game_tuple):
    row = {
        'raw_date': game_tuple[0]['Date'],
        'team': game_tuple[0]['Team'],
        'VH': game_tuple[0]['VH'],
        'team_points': game_tuple[0]['Final'],
        'opponent': game_tuple[1]['Team'],
        'opponent_points': game_tuple[1]['Final'],
        'point_spread': get_point_spread(game_tuple),
    }

    return row

def get_game_spread(game_tuple):
    '''
    Data model here assigns the point spread to the favorite and the game total to the dog. 
    
    We need to untangle this unholy mess.
    '''
    game_spread = [clean_close(x['Close']) for x in game_tuple if clean_close(x['Close']) < 100][0]

    return game_spread

def get_point_spread(game_tuple):
    game_spread = get_game_spread(game_tuple)
    
    if game_spread == game_tuple[0]['Close']:
        return -1 * game_spread
    else:
        return game_spread
    
def clean_close(close):
    if close in ['pk', 'PK']:
        return 0
    elif close != close:
        return 0
    else:
        return close
    
def get_game_tuples(file_name):
    source_data = pd.read_excel('../data/sportsbookreview_nba/' + file_name)
    records = source_data.to_dict('records')
    game_tuples = [records[i:i+2] for i in range(0, len(records), 2)]
    
    return game_tuples

def get_game_date(row):
    raw_date = row['raw_date']
    
    year = int(row['season'].split('-')[0])
    if raw_date < 800:
        year+=1
        
    month = int(str(raw_date)[:-2])
    day = int(str(raw_date)[-2:])
    
    dt = datetime.date(year, month, day)
    
    return dt

def get_clean_name(input_string):
    if input_string == "Oklahoma City":
        return 'OklahomaCity'
    elif input_string == "LA Clippers":
        return 'LAClippers'
    elif input_string == "Golden State":
        return 'GoldenState'
    elif input_string in ['NewJersey', 'Brooklyn']:
        return 'Nets'
    else:
        return input_string

In [280]:
def get_game_id(row):
    raw_id = row['team'] + row['opponent'] + row['file_name'] + str(row['raw_date'])
    m = hashlib.md5()
    m.update(raw_id.encode())
    game_id = m.hexdigest()
    
    return game_id

In [289]:
files = os.listdir('../data/sportsbookreview_nba')
files

['nba odds 2017-18.xlsx',
 'nba odds 2009-10.xlsx',
 'nba odds 2021-22.xlsx',
 'nba odds 2010-11.xlsx',
 'nba odds 2019-20.xlsx',
 'nba odds 2007-08.xlsx',
 'nba odds 2014-15.xlsx',
 'nba odds 2022-23.xlsx',
 'nba odds 2018-19.xlsx',
 'nba odds 2012-13.xlsx',
 'nba odds 2015-16.xlsx',
 '.ipynb_checkpoints',
 'nba odds 2016-17.xlsx',
 'nba odds 2008-09.xlsx',
 'nba odds 2020-21.xlsx',
 'nba odds 2013-14.xlsx',
 'nba odds 2011-12.xlsx']

In [290]:
rows = []

for file_name in files:
    if 'xlsx' in file_name:
        print(file_name)

        game_tuples = get_game_tuples(file_name)

        for game_tuple in game_tuples:
            ## Get First Team
            row = parse(game_tuple)
            row.update({'file_name': file_name})
            game_id = get_game_id(row)
            row.update({'game_id': game_id})
            rows.append(row)

            ## Now, Reverse Order and Get Second Team
            game_tuple.reverse()

            row = parse(game_tuple)
            row.update({'file_name': file_name})
            row.update({'game_id': game_id})
            rows.append(row)

nba odds 2017-18.xlsx
nba odds 2009-10.xlsx
nba odds 2021-22.xlsx
nba odds 2010-11.xlsx
nba odds 2019-20.xlsx
nba odds 2007-08.xlsx
nba odds 2014-15.xlsx
nba odds 2022-23.xlsx
nba odds 2018-19.xlsx
nba odds 2012-13.xlsx
nba odds 2015-16.xlsx
nba odds 2016-17.xlsx
nba odds 2008-09.xlsx
nba odds 2020-21.xlsx
nba odds 2013-14.xlsx
nba odds 2011-12.xlsx


### Interlude: Get Season Start and End Dates

(We Scrape this data in Chapter 5)

In [325]:
stem = '../data/nba_stats/'

season_start_and_end_dates = []

files = [
    x for x in os.listdir(stem)
    if 'Base' in x
]

for file in files:
    row = (
        pd.read_csv(stem + file)
        .rename(columns={'season_year': 'season'})
        .assign(
            game_date = lambda x: x['game_date'].transform(lambda s: s.split('T')[0]),
            dt = lambda x: x['game_date'].apply(utils.clean_date)
        )
        .groupby('season')
        .agg(
            first_game = ('dt', 'min'),
            last_game = ('dt', 'max')
        )
        .reset_index()
    )
    
    season_start_and_end_dates.append(row)

## Finish Data

In [328]:
clean_data = (
    pd.DataFrame(rows)
    .assign(
        season = lambda x: x['file_name'].transform(lambda s: s.split(' ')[-1].split('.')[0]),
        dt = lambda x: x.apply(get_game_date, axis=1),
        home_away = lambda x: x['VH'].transform(lambda s: 'Home' if s == "H" else "Away"),
        mov = lambda x: x['team_points'] - x['opponent_points'],
        team = lambda x: x['team'].apply(get_clean_name),
        opponent = lambda x: x['opponent'].apply(get_clean_name)
    )
    .sort_values(
        by=['season', 'team', 'dt']
    )
    .reset_index()
    .reset_index()
    .assign(
        game_number = lambda x: x.groupby(['season', 'team'])['level_0'].rank(method="first", ascending=True)
    )
    .merge(
        pd.concat(season_start_and_end_dates),
        on='season',
        how='left'
    )
    .query('dt >= first_game')
    .query('dt <= last_game')
    .drop(['raw_date', 'VH', 'level_0', 'index', 'first_game', 'last_game'], axis=1)
)

clean_data.sample(10)

Unnamed: 0,team,team_points,opponent,opponent_points,point_spread,file_name,game_id,season,dt,home_away,mov,game_number
21692,LALakers,107,Memphis,100,3.0,nba odds 2015-16.xlsx,bb37910c4da353107b1ff2f30e241283,2015-16,2016-03-22,Home,7,70.0
14652,Philadelphia,89,Sacramento,80,-6.0,nba odds 2012-13.xlsx,6ddf955b7704003374a37a007eb3caf7,2012-13,2013-02-01,Home,9,46.0
11034,Detroit,82,Milwaukee,103,8.0,nba odds 2011-12.xlsx,3aebf3a6336e7520f9d87d0f1572fa26,2011-12,2012-01-30,Away,-21,22.0
650,Detroit,77,Houston,80,3.5,nba odds 2007-08.xlsx,3443b74795a145b4692e80a8a6868b0b,2007-08,2007-12-12,Away,-3,22.0
1537,Nets,87,Milwaukee,80,-6.0,nba odds 2007-08.xlsx,5fe2b05b8c17b96a29c7ae98c1a37009,2007-08,2008-01-29,Home,7,45.0
19935,Philadelphia,86,Cleveland,87,17.0,nba odds 2014-15.xlsx,a99974f7050a41afa8280abd19a397ab,2014-15,2015-03-29,Away,-1,74.0
24973,OklahomaCity,118,NewOrleans,110,-6.0,nba odds 2016-17.xlsx,e7f673a6c99dab67272410d4edbb6357,2016-17,2017-02-26,Home,8,59.0
11370,LALakers,90,Denver,99,5.5,nba odds 2011-12.xlsx,cce9a78b36f0038521f1f36d1df73115,2011-12,2012-01-01,Away,-9,6.0
5201,Washington,98,Indiana,118,-1.0,nba odds 2008-09.xlsx,8e5d3fe54184ec4c910fcd5604eb7639,2008-09,2008-12-15,Home,-20,22.0
833,Houston,83,Dallas,96,-3.5,nba odds 2007-08.xlsx,220a59c3fc64b19f2c99350ed3ff08ee,2007-08,2007-12-15,Home,-13,24.0


In [329]:
clean_data.to_csv('../data/NBA Scores and Point Spreads.csv', index=False)