In [43]:
import glob
import json
import os
from collections import defaultdict
import numpy as np
from fields import *
import h5py


In [44]:
games = glob.glob('data_collector/2015-16/regular_season/*')

In [45]:
print len(games)

1230


In [46]:
print games[50]

data_collector/2015-16/regular_season/0021500632.json


In [47]:
with open(games[350],'r') as f:
    sample_game = json.loads(f.read())

In [48]:

def get_message(event):
    if event[event_field["HOMEDESCRIPTION"]]:
        return "[H] {}".format(event[event_field["HOMEDESCRIPTION"]])
    elif event[event_field["NEUTRALDESCRIPTION"]]:
        return "[N] {}".format(event[event_field["NEUTRALDESCRIPTION"]])
    elif event[event_field["VISITORDESCRIPTION"]]:
        return "[V] {}".format(event[event_field["VISITORDESCRIPTION"]])
    else:
        return str(event[event_field['EVENTMSGTYPE']])
    
def print_event(event):
    print("[Q{}] [{:>5}] [E{:03d}] [T:{:02d}] [MT:{:03d}] {}".format(
      event[event_field["PERIOD"]], # quarter
      event[event_field["PCTIMESTRING"]], # timestamp for quarter
      event[event_field["EVENTNUM"]], # event number
      event[event_field["EVENTMSGTYPE"]], # event type
      event[event_field["EVENTMSGACTIONTYPE"]], # event sub type
      get_message(event)))

In [49]:
def parse_teams(game):
    home = None
    away = None
    for event in game:
        if event[event_field['EVENTMSGTYPE']] == 8: # sub
            if event[event_field["VISITORDESCRIPTION"]] is not None:
                away = event[event_field["PLAYER1_TEAM_ABBREVIATION"]]
            else:
                home = event[event_field["PLAYER1_TEAM_ABBREVIATION"]]
    return home, away

In [50]:
def parse_lineups(game):
    """
    Take rowSet of events from API call and return a list of equal size with the 5 man lineup on the floor for each team for each event
    Example output format:
    [
        {
            "Lakers": ["Lonzo Ball", "Brandon Ingram", "Corey Brewer", "Julius Randle", "Brook Lopez"],
            "Warriors": ["Stephen Curry", "Kevin Durant", "Klay Thompson", "Draymond Green", "Andre Iguodala"],
            "event_num": 2
        }, ...
    ]
    """
    player_ids = {}
    team_info = {}
    team_info['home_team'], team_info['away_team'] = parse_teams(game)
    home_team = team_info["home_team"]
    away_team = team_info["away_team"]

    lineups = {
        home_team: set(),
        away_team: set()
    }

    game_with_lineups = []
    current_quarter = -1

    for index, event in enumerate(game):
        #print(event_with_lineups_list)
        # reset lineups when quarter changes
        if current_quarter != event[event_field["PERIOD"]]:
            current_quarter = event[event_field["PERIOD"]]
            lineups[team_info["home_team"]] = set()
            lineups[team_info["away_team"]] = set()
            quarter_start_index = index

        # this piece of code is for a single empty event toward the end of this game: 
        # http://stats.nba.com/game/0020000883/playbyplay/ that causes problems otherwise
        if event[event_field["EVENTMSGTYPE"]] == event_type["SUB"] and get_message(event) is None:
            pass
        # substitution event
        # player 1 is being substituted out
        # player 2 is coming in
        elif event[event_field["EVENTMSGTYPE"]] == event_type["SUB"]:
            #print_event(event)
            team = event[event_field["PLAYER1_TEAM_ABBREVIATION"]]
            player_to_sub_out = (event[event_field["PLAYER1_NAME"]], event[event_field["PLAYER1_ID"]])
            player_to_sub_in  = (event[event_field["PLAYER2_NAME"]], event[event_field["PLAYER2_ID"]])

            if player_to_sub_out[0] not in player_ids:
                player_ids[player_to_sub_out[0]] = event[event_field["PLAYER1_ID"]]
            if player_to_sub_in[0] not in player_ids:
                player_ids[player_to_sub_in[0]] = event[event_field["PLAYER2_ID"]]

            if player_to_sub_out not in lineups[team]:
                # backfill
                for e in game_with_lineups[quarter_start_index:]:
                    e[team].add(player_to_sub_out)

            lineups[team].discard(player_to_sub_out)
            lineups[team].add(player_to_sub_in)

        game_with_lineups.append({
            team_info["home_team"] : lineups[team_info["home_team"]].copy(),
            team_info["away_team"] : lineups[team_info["away_team"]].copy(),
            "event_num": event[event_field["EVENTNUM"]],
        })

        # player1, player2, player3
        if event[event_field["EVENTMSGTYPE"]] != event_type["SUB"] \
                    and event[event_field["EVENTMSGTYPE"]] != event_type["TIM"]\
                    and event[event_field["EVENTMSGTYPE"]] != event_type["EJEC"]:
           
            if event[event_field["EVENTMSGTYPE"]] == event_type["PF"]:
                msg = get_message(event)
                if msg.isdigit() or "Technical" in msg or "T.FOUL" in msg:
                    continue
            
            #f = [event_field['PLAYER1_NAME'], event_field['PLAYER2_NAME'], event_field['PLAYER3_NAME']]
            players = ['PLAYER1','PLAYER2','PLAYER3']
            
            for player in players:
                # team code is always an offset of 4 from player name
                if event[event_field[player+"_NAME"]] and event[event_field[player+"_TEAM_ABBREVIATION"]]:
                        
                    team = event[event_field[player+"_TEAM_ABBREVIATION"]]
                    
                    player_name = event[event_field[player+"_NAME"]]
                    
                    if player_name not in player_ids:
                        player_ids[player_name] = event[event_field[player+"_ID"]]
                        
                    if (player_name, event[event_field[player+"_ID"]]) not in lineups[team]:
                        if len(lineups[team]) == 5:
                            print "OVER"
                            print player
                            print player_name
                            print_event(event)
                        lineups[team].add((player_name,event[event_field[player+"_ID"]]))
                        # backfill
                        for e in game_with_lineups[quarter_start_index:]:
                            e[team].add((player_name,event[event_field[player+"_ID"]]))


    return game_with_lineups

In [51]:
def parse_possession_info(game):
    """takes in a gameid from stats.nba.com and parses play by play to get data in the following format:
    [ {
        "home_lineup": [playerid1, ..., playerid5],
        "away_lineup": [playerid6, ..., playerid10],
        "score_margin_update": 2, #(0-4 range of points that can be scored on a possession, not accounting for weirdness that can happen with technicals, flagrants)
        "home_team_is_on_offense": True, # or false,
        "possession_metadata": 3 #any data we want to keep about what the event was, at least event number for reference
       }, {...}, ...
    ]
    """
    lineups = parse_lineups(game)
    
    
    team_info = {}
    team_info['home_team'], team_info['away_team'] = parse_teams(game)
    home_team = team_info["home_team"]
    away_team = team_info["away_team"]

    initial_free_throw_keys = ["1of2", "1of3", "tech"]
    subsequent_free_throws_keys = ["2of2", "2of3", "3of3"]

    initial_free_throws = [free_throw_event_type[key] for key in initial_free_throw_keys]
    subsequent_free_throws = [free_throw_event_type[key] for key in subsequent_free_throws_keys]

    possession_event_list = []


    for index, event in enumerate(game):
        team = event[event_field["PLAYER1_TEAM_ABBREVIATION"]]
        # made shot
        possession_event = {
            "home_lineup": list(lineups[index][home_team]),
            "away_lineup": list(lineups[index][away_team]),
            "scoring_margin_update": 0,
            "home_team_is_on_offense": team == home_team,
            "possession_metadata": {
                "event_num": event[event_field["EVENTNUM"]],
                "message": get_message(event),
                "event_type": event[event_field["EVENTMSGTYPE"]]
            }
        }

        if event[event_field["EVENTMSGTYPE"]] == event_type["FGM"]:
            if get_message(event):
                possession_event['scoring_margin_update'] = 3 if "3PT" in get_message(event) else 2
            possession_event_list.append(possession_event)


        # free throws
        # logic here is to add a scoring possession on the first free throw, then update that possession
        # for any further made free throws. Substitutions almost always come after the first free throw so this takes
        # care of that. In and one situation, update previous bucket with extra point if free throw made
        elif event[event_field["EVENTMSGTYPE"]] == event_type["FTA"]:

            free_throw_event_code = event[event_field["EVENTMSGACTIONTYPE"]]
            if free_throw_event_code in initial_free_throws:
                possession_event['scoring_margin_update'] = 1 if "MISS" not in get_message(event) else 0
                possession_event_list.append(possession_event)
            elif free_throw_event_code in subsequent_free_throws:
                back_iter = len(possession_event_list) - 1
                if "MISS" not in get_message(event):
                    # scan back to find previous free throw event
                    while back_iter > 0 and possession_event_list[back_iter]["possession_metadata"]["event_type"] != event_type["FTA"]:
                        back_iter += -1
                    if possession_event_list[back_iter]["possession_metadata"]["event_type"] == event_type["FTA"]:
                        possession_event_list[back_iter]["scoring_margin_update"] += 1

            elif free_throw_event_code == free_throw_event_type["1of1"]:
                back_iter = len(possession_event_list) - 1

                if "MISS" not in get_message(event):
                    # scan back to find previous made bucket
                    while back_iter > 0 and possession_event_list[back_iter]["possession_metadata"]["event_type"] != event_type["FGM"]:
                        back_iter += -1
                    if possession_event_list[back_iter]["possession_metadata"]["event_type"] == event_type["FGM"]:
                        possession_event_list[back_iter]["scoring_margin_update"] += 1

        # rebounds
        elif event[event_field["EVENTMSGTYPE"]] == event_type["REB"]:

            team_of_rebounder = event[event_field["PLAYER1_TEAM_ABBREVIATION"]]
            team_of_player_who_missed = game[index-1][event_field["PLAYER1_TEAM_ABBREVIATION"]]

            if team_of_rebounder != team_of_player_who_missed:
                possession_event['scoring_margin_update'] = 0
                possession_event['home_team_is_on_offense'] = home_team == team_of_player_who_missed
                possession_event_list.append(possession_event)

        # steals / turnovers
        elif event[event_field["EVENTMSGTYPE"]] == event_type["STL"]:
            if get_message(event):
                if "steal" in get_message(event).lower():
                    stealers_team = event[event_field["PLAYER1_TEAM_ABBREVIATION"]]
                    home_team_is_on_offense = stealers_team == home_team
                elif "turnover" in get_message(event).lower():
                    to_team = event[event_field["PLAYER1_TEAM_ABBREVIATION"]]
                    home_team_is_on_offense = to_team == home_team
                else:
                    raise RuntimeError("Confused by this event: {}".format(get_message(event)))
                
                possession_event['scoring_margin_update'] = 0
                possession_event['home_team_is_on_offense'] = home_team_is_on_offense
                possession_event_list.append(possession_event)
                


    return possession_event_list

In [52]:
def parse_box_stats(game):
    box_stats = {}
    for index, event in enumerate(game):
        player1 = (event[event_field["PLAYER1_NAME"]],event[event_field["PLAYER1_ID"]])
        if player1 and player1 not in box_stats:
            box_stats[player1] = defaultdict(float)
        player2 = (event[event_field["PLAYER2_NAME"]],event[event_field["PLAYER2_ID"]])
        if player2 and player2 not in box_stats:
            box_stats[player2] = defaultdict(float)
        player3 = (event[event_field["PLAYER3_NAME"]],event[event_field["PLAYER3_ID"]])
        if player3 and player3 not in box_stats:
            box_stats[player3] = defaultdict(float)
        
        if event[event_field["EVENTMSGTYPE"]] == event_type["FGM"]:
            box_stats[player1]['FGM'] += 1
            box_stats[player1]['FGA'] += 1
            
            if '3PT' in get_message(event):
                box_stats[player1]['3PM'] += 1
                box_stats[player1]['3PA'] += 1
                box_stats[player1]['PTS'] += 3
            else:
                box_stats[player1]['2PM'] += 1
                box_stats[player1]['2PA'] += 1
                box_stats[player1]['PTS'] += 2
            
            if player2:
                box_stats[player2]['AST'] += 1
                
        elif event[event_field["EVENTMSGTYPE"]] == event_type["FGA"]:
            box_stats[player1]['FGA'] += 1
            
            if '3PT' in get_message(event):
                box_stats[player1]['3PA'] += 1
            else:
                box_stats[player1]['2PA'] += 1
                
            if player3:
                box_stats[player3]['BLK'] += 1
                
        elif event[event_field["EVENTMSGTYPE"]] == event_type["FTA"]:
            box_stats[player1]['FTA'] += 1
            
            if "MISS" not in get_message(event):
                box_stats[player1]['FTM'] += 1
                box_stats[player1]['PTS'] += 1
                
        elif event[event_field["EVENTMSGTYPE"]] == event_type["REB"]:
            box_stats[player1]['REB'] += 1
            
            if event[event_field["PLAYER1_TEAM_ABBREVIATION"]] == game[index-1][event_field["PLAYER1_TEAM_ABBREVIATION"]]:
                box_stats[player1]['ORB'] += 1
            else:
                box_stats[player1]['DRB'] += 1
                
        elif event[event_field["EVENTMSGTYPE"]] == event_type["STL"]:
            box_stats[player1]['TOV'] += 1
            
            if player2:
                box_stats[player2]['STL'] += 1
                
    return box_stats

In [53]:
for k,v in parse_box_stats(sample_game).items():
    print k
    for k2,v2 in sorted(v.items(), key=lambda x: x[0]):
        print k2, v2

(u'Cole Aldrich', 202332)
2PA 5.0
2PM 4.0
AST 2.0
BLK 3.0
DRB 6.0
FGA 5.0
FGM 4.0
FTA 2.0
ORB 2.0
PTS 8.0
REB 8.0
TOV 1.0
(u'Tim Duncan', 1495)
2PA 6.0
2PM 1.0
AST 1.0
DRB 6.0
FGA 6.0
FGM 1.0
PTS 2.0
REB 6.0
STL 1.0
TOV 3.0
(None, 1610612746)
DRB 9.0
REB 9.0
TOV 2.0
(u'Ray McCallum', 203492)
2PA 2.0
2PM 1.0
AST 1.0
FGA 2.0
FGM 1.0
FTA 2.0
FTM 1.0
PTS 3.0
STL 1.0
(u'Marques Bragg', 66)
(u'Tony Parker', 2225)
2PA 10.0
2PM 5.0
3PA 3.0
3PM 1.0
AST 4.0
DRB 1.0
FGA 13.0
FGM 6.0
FTA 2.0
FTM 1.0
PTS 14.0
REB 1.0
STL 1.0
TOV 2.0
(u'DeAndre Jordan', 201599)
2PA 6.0
2PM 4.0
AST 1.0
BLK 2.0
DRB 15.0
FGA 6.0
FGM 4.0
FTA 2.0
FTM 1.0
ORB 2.0
PTS 9.0
REB 17.0
STL 3.0
TOV 3.0
(u'Dino Radja', 129)
(u'Jonathon Simmons', 203613)
2PA 4.0
2PM 2.0
3PA 1.0
BLK 1.0
DRB 4.0
FGA 5.0
FGM 2.0
FTA 2.0
FTM 1.0
PTS 5.0
REB 4.0
STL 2.0
TOV 4.0
(u'Paul Pierce', 1718)
2PA 4.0
2PM 2.0
3PA 5.0
3PM 2.0
AST 1.0
DRB 4.0
FGA 9.0
FGM 4.0
FTA 2.0
FTM 2.0
PTS 12.0
REB 4.0
TOV 1.0
(u'Chris Paul', 101108)
2PA 11.0
2PM 6.0
3PA 7.0


In [54]:
def preprocess_data(folder, playoffs = False):
    if playoffs:
        gametype = 'playoffs'
    else:
        gametype = 'regular_season'
    print "opening: " + folder + "/*" + gametype + "*.json"
    meta_file = glob.glob(folder + "/*" + gametype + "*.json")[0]

    with open(meta_file,'r') as f:
        games_metadata = json.loads(f.read())

    parsed_games = []
    i = 0
    for game_metadata in games_metadata:
        try:
            with open(folder + "/" + gametype + "/" + game_metadata['game_id'] + '.json','r') as f:
                game = json.loads(f.read())
            i += 1
            parsed_games.append({   "meta": game_metadata,
                                    "pbp": game,
                                    "pos": parse_possession_info(game),
                                    "box": parse_box_stats(game)                       
                                })
        except:
            print "could not find game " + game_metadata['game_id'] 
            
    return np.array(parsed_games)


In [55]:
def pos_to_numpy(pos):
    temp = [0] * 11
    if pos['home_team_is_on_offense']:
        offense = pos['home_lineup']
        defense = pos['away_lineup']
    else:
        offense = pos['away_lineup']
        defense = pos['home_lineup']
    i = 0
    for i in range(5):
        temp[i] = offense[i][1]
        temp[i+5] = defense[i][1]
    temp[10] = pos['scoring_margin_update']
    return np.array(temp)

def game_to_numpy(game):
    temp = []
    for pos in game['pos']:
        if(len(pos['away_lineup']) == 5 and len(pos['home_lineup']) == 5):
            temp.append(pos_to_numpy(pos))
    return np.stack(temp).astype("int32")
    
        

In [56]:
print game['box'].keys()
print game_to_numpy(game)

[(u'Adonal Foyle', 1502), (u'Michael Olowokandi', 1709), (u'Quentin Richardson', 2047), (None, 1610612746), (u'Dean Oliver', 2352), (u'Gilbert Arenas', 2240), (u'Darius Miles', 2032), (u'Eric Piatkowski', 15), (u'Antawn Jamison', 1712), (u'Dean Garrett', 1051), (u'Jason Richardson', 2202), (u'Jeff McInnis', 976), (u'Corey Maggette', 1894), (u'Harold Jamison', 1942), (u'Earl Boykins', 1863), (None, 0), (u'Troy Murphy', 2211), (None, 1610612744), (u'Erick Dampier', 956), (u'Danny Fortson', 1504), (u'Elton Brand', 1882)]
[[  15 1709 1894 ... 2240 1712    0]
 [1502 2202 1504 ...  976 1882    0]
 [  15 1709 1894 ... 2240 1712    0]
 ...
 [  15 1709 2032 ... 2240 1712    0]
 [2202 2211  956 ...  976 2032    2]
 [  15 1709 1894 ... 2240 1712    0]]


In [58]:
directory = "processed_data/"
if not os.path.exists(directory):
    os.makedirs(directory)

player_dict = {}
all_data = []
for year in range(1998,2018):
    folder = "data_collector/" + str(year-1) + "-" + str(year)[2:]
    reg_data = preprocess_data(folder)
    playoff_data = preprocess_data(folder, playoffs=True)
    
    for i, game in enumerate(reg_data):
        for key in  game['box'].keys():
            if(key[0]) is not None:
                player_dict[int(key[1])] = key[0]
        game_desc= str(i) + "-" + game['meta']['away'] +"@" + game['meta']['home'] + ":" +game['meta']['date']
        try:
            all_data.append(game_to_numpy(game))
        except:
            print "BAD GAME:", game_desc
    
    
with open("processed_data/player_keys.json","w") as f:
    f.write(json.dumps(player_dict))
    
data = np.concatenate(all_data, axis=0)
f = h5py.File(directory + "all.h5", "w")
f.create_dataset('all',data=data)
f.close()

opening: data_collector/1997-98/*regular_season*.json
OVER
PLAYER1
Andrew DeClercq
[Q3] [ 4:45] [E310] [T:03] [MT:017] [H] MISS DeClercq Free Throw Clear Path
OVER
PLAYER1
Damon Stoudamire
[Q4] [ 0:03] [E464] [T:01] [MT:001] [V] Stoudamire 27' 3PT Jump Shot (21 PTS)
OVER
PLAYER1
Detlef Schrempf
[Q1] [ 0:36] [E167] [T:06] [MT:002] [H] Schrempf S.FOUL (P1.T2)
OVER
PLAYER1
Derrick Coleman
[Q2] [ 5:28] [E257] [T:04] [MT:000] [V] Coleman REBOUND (Off:3 Def:3)
OVER
PLAYER1
Detlef Schrempf
[Q2] [ 3:25] [E210] [T:01] [MT:005] [H] Schrempf  Layup (6 PTS) (Payton 3 AST)
OVER
PLAYER1
Jim Jackson
[Q2] [ 1:57] [E227] [T:02] [MT:001] [V] MISS Jackson 14' Jump Shot
OVER
PLAYER1
Theo Ratliff
[Q2] [ 1:11] [E312] [T:06] [MT:001] [V] Ratliff P.FOUL (P2.T2)
OVER
PLAYER1
David Wingate
[Q2] [ 6:31] [E172] [T:06] [MT:002] [V] Wingate S.FOUL (P2.T3)
OVER
PLAYER1
David Wingate
[Q3] [ 0:41] [E338] [T:02] [MT:001] [V] MISS Wingate 19' Jump Shot
OVER
PLAYER1
Detlef Schrempf
[Q4] [ 5:37] [E402] [T:04] [MT:000] [V]

(385135, 11)


In [30]:
for year in range(1998,2018):
    player_dict = {}
    folder = "data_collector/" + str(year-1) + "-" + str(year)[2:]
    reg_data = preprocess_data(folder)
    playoff_data = preprocess_data(folder, playoffs=True)

    directory = "processed_data/" + str(year)
    if not os.path.exists(directory):
        os.makedirs(directory)

    f = h5py.File(directory + "/regular.h5", "w")
    for i, game in enumerate(reg_data):
        for key in  game['box'].keys():
            if(key[0]) is not None:
                player_dict[key[1]] = key[0]
        game_desc= str(i) + "-" + game['meta']['away'] +"@" + game['meta']['home'] + ":" +game['meta']['date']
        try:
            data = game_to_numpy(game)
            f.create_dataset(game_desc,data=data)
        except:
            print "BAD GAME:", game_desc
            
    f.close()
    f = h5py.File(directory + "/post.h5", "w")
    for i, game in enumerate(playoff_data):
        for key in  game['box'].keys():
            if(key[0]) is not None:
                player_dict[key[1]] = key[0]
        game_desc= str(i) + "-" + game['meta']['away'] +"@" + game['meta']['home'] + ":" +game['meta']['date']
        try:
            data = game_to_numpy(game)
            f.create_dataset(game_desc,data=data)
        except:
            print "BAD GAME:", game_desc
    f.close()
    with open("processed_data/player_keys.json","w") as f:
        f.write(json.dumps(player_dict))

opening: data_collector/1997-98/*regular_season*.json
OVER
PLAYER1
Andrew DeClercq
[Q3] [ 4:45] [E310] [T:03] [MT:017] [H] MISS DeClercq Free Throw Clear Path
OVER
PLAYER1
Damon Stoudamire
[Q4] [ 0:03] [E464] [T:01] [MT:001] [V] Stoudamire 27' 3PT Jump Shot (21 PTS)
OVER
PLAYER1
Detlef Schrempf
[Q1] [ 0:36] [E167] [T:06] [MT:002] [H] Schrempf S.FOUL (P1.T2)
OVER
PLAYER1
Derrick Coleman
[Q2] [ 5:28] [E257] [T:04] [MT:000] [V] Coleman REBOUND (Off:3 Def:3)
OVER
PLAYER1
Detlef Schrempf
[Q2] [ 3:25] [E210] [T:01] [MT:005] [H] Schrempf  Layup (6 PTS) (Payton 3 AST)
OVER
PLAYER1
Jim Jackson
[Q2] [ 1:57] [E227] [T:02] [MT:001] [V] MISS Jackson 14' Jump Shot
OVER
PLAYER1
Theo Ratliff
[Q2] [ 1:11] [E312] [T:06] [MT:001] [V] Ratliff P.FOUL (P2.T2)
OVER
PLAYER1
David Wingate
[Q2] [ 6:31] [E172] [T:06] [MT:002] [V] Wingate S.FOUL (P2.T3)
OVER
PLAYER1
David Wingate
[Q3] [ 0:41] [E338] [T:02] [MT:001] [V] MISS Wingate 19' Jump Shot
OVER
PLAYER1
Detlef Schrempf
[Q4] [ 5:37] [E402] [T:04] [MT:000] [V]

In [None]:
over = 0
under = 0
total = 0

        
print over * 1.0/ (total)
        
print under * 1.0/ (total)    

In [None]:
0.000511445531051
0.000370023092541

In [None]:
print reg_data[200]['box']