In [1]:
import ast
import glob
import json
import random

import numpy as np

In [2]:
# Load every file under nba_lineup_data/ into one flat list of
# (lineup_key, info) pairs.
# glob returns paths in arbitrary, OS-dependent order; sorting keeps the load
# order (and therefore the player indices assigned from it) reproducible.
years = sorted(glob.glob('nba_lineup_data/*'))
lineups = []
for year in years:
    # `with` closes each file promptly instead of leaking the handle.
    with open(year, 'r') as year_file:
        year_dict = json.load(year_file)
    lineups.extend(year_dict.items())

In [3]:
# Each lineup key is the repr of an (offense, defense) tuple of player lists.
# literal_eval parses that literal without executing arbitrary code, which a
# bare eval() on file-derived text would do.
print(ast.literal_eval(lineups[1][0]))

(['Josh Childress*ID:2735', 'Josh Smith*ID:2746', 'Marvin Williams*ID:101107', 'Tyronn Lue*ID:1731', 'Zaza Pachulia*ID:2585'], ['Aaron Williams*ID:1425', 'Al Thornton*ID:201154', 'Brevin Knight*ID:1510', 'Cuttino Mobley*ID:1749', 'Quinton Ross*ID:2624'])


In [4]:
# Give every player a dense integer index, in order of first appearance.
player_to_idx = {}        # 'Name*ID:123' -> index
idx_to_player = []        # index -> 'Name*ID:123'
normal_name_to_idx = {}   # lower-cased name (text before '*') -> index
for entry in lineups:
    # entry[0] is the repr of an (offense, defense) tuple of player lists;
    # literal_eval parses it without running arbitrary code like eval would.
    parsed_key = ast.literal_eval(entry[0])
    for player in parsed_key[0] + parsed_key[1]:
        if player in player_to_idx:
            continue
        # The next free index is simply the number of players seen so far.
        new_idx = len(idx_to_player)
        player_to_idx[player] = new_idx
        # NOTE: two players sharing a display name overwrite each other here;
        # the most recently added one wins.
        normal_name_to_idx[player.split('*')[0].lower()] = new_idx
        idx_to_player.append(player)

In [5]:
# Peek at one raw entry: a (lineup_key, info) pair as loaded from the JSON files.
print(lineups[1])

("([u'Josh Childress*ID:2735', u'Josh Smith*ID:2746', u'Marvin Williams*ID:101107', u'Tyronn Lue*ID:1731', u'Zaza Pachulia*ID:2585'], [u'Aaron Williams*ID:1425', u'Al Thornton*ID:201154', u'Brevin Knight*ID:1510', u'Cuttino Mobley*ID:1749', u'Quinton Ross*ID:2624'])", {'box': {'Josh Childress*ID:2735': {'2PA': 2, 'FGA': 2}, 'Josh Smith*ID:2746': {'2PA': 1, 'FGM': 1, 'PTS': 2, 'FGA': 1, '2PM': 1}, 'Cuttino Mobley*ID:1749': {'REB': 1, 'DRB': 1}, 'Marvin Williams*ID:101107': {'2PA': 2, 'FGM': 2, 'REB': 1, 'ORB': 1, 'TOV': 1, 'PTS': 4, 'FGA': 2, '2PM': 2}, 'Tyronn Lue*ID:1731': {'3PA': 1, 'TOV': 1, 'FGA': 1, 'AST': 1}, 'Al Thornton*ID:201154': {'REB': 1, 'DRB': 1}}, 'num': 7, 'score': 6})


In [6]:
# Column order of the offensive / defensive stat matrices built below.
off_stat_labels = ['PTS','TSA','FGA','AST','REB','2PA','2PM','3PA','3PM','FTA','FTM']
def_stat_labels = ['REB','STL','BLK']


def _per_num_stats(players, labels, box, num, n_slots=5):
    """Build an (n_slots, len(labels)) matrix of box[player][label] / num.

    Players or stats that are absent from `box` leave zeros in place.
    """
    stats = np.zeros((n_slots, len(labels)))
    for row, player in enumerate(players[:n_slots]):
        player_box = box.get(player, {})
        for col, stat in enumerate(labels):
            if stat in player_box:
                stats[row][col] = player_box[stat] / num
    return stats


def get_normalized_team_stats(lineup, info):
    """Turn one raw (lineup key, info) pair into index lists and rate matrices.

    Parameters
    ----------
    lineup : str
        repr of an ``(offense_players, defense_players)`` tuple of lists of
        ``'Name*ID:n'`` strings.
    info : dict
        Must contain ``'box'`` (player -> {stat: count}), ``'num'`` and
        ``'score'``.  ``'num'`` is the divisor for every count (presumably the
        number of possessions -- confirm against the data producer).

    Returns
    -------
    tuple
        ``(o_lineup, d_lineup, off_stats, def_stats, meta)`` where the lineups
        are lists of player indices from ``player_to_idx``, ``off_stats`` is
        ``(5, len(off_stat_labels))``, ``def_stats`` is
        ``(5, len(def_stat_labels))`` and ``meta`` is ``(num, score / num)``.

    Raises
    ------
    ValueError, SyntaxError
        If ``lineup`` is not a plain Python literal.
    KeyError
        If a player is missing from ``player_to_idx``.
    ZeroDivisionError
        If ``info['num']`` is 0.
    """
    box = info['box']
    # The key is parsed with literal_eval rather than eval so that text coming
    # from the data files can never execute code.
    parsed = ast.literal_eval(lineup)
    o_lineup = [player_to_idx[p] for p in parsed[0]]
    d_lineup = [player_to_idx[p] for p in parsed[1]]
    num = info['num']
    meta = (num, info['score'] / num)

    off_stats = _per_num_stats(parsed[0], off_stat_labels, box, num)
    # TSA = FGA + .44 * FTA.  Columns are looked up by label so the formula
    # stays correct if off_stat_labels is ever reordered.
    tsa_col = off_stat_labels.index('TSA')
    fga_col = off_stat_labels.index('FGA')
    fta_col = off_stat_labels.index('FTA')
    off_stats[:, tsa_col] = off_stats[:, fga_col] + .44 * off_stats[:, fta_col]

    def_stats = _per_num_stats(parsed[1], def_stat_labels, box, num)

    return o_lineup, d_lineup, off_stats, def_stats, meta

In [7]:
# Convert every raw (key, info) pair into index lists plus per-`num` stat matrices.
lineups_parsed = [
    get_normalized_team_stats(lineup_key, lineup_info)
    for lineup_key, lineup_info in lineups
]

In [8]:
def get_appx_norms(dset, n_samples=50000):
    """Estimate normalization statistics from a random subset of `dset`.

    Parameters
    ----------
    dset : sequence
        Entries shaped like ``(offense, defense, offbox, defbox,
        (n_pos, score_change))`` where ``offbox`` / ``defbox`` are 2-D arrays
        with one row per player.
    n_samples : int, optional
        Maximum number of entries to draw (default 50000).  If ``dset`` is
        smaller, every entry is used.

    Returns
    -------
    tuple
        ``(obox_mean, obox_std, dbox_mean, dbox_std, pm_mean, pm_std)``; the
        box statistics are per-column arrays, the plus/minus ones are scalars.

    Raises
    ------
    ValueError
        If ``dset`` is empty or ``n_samples`` is less than 1.
    """
    if len(dset) == 0:
        raise ValueError("cannot estimate norms from an empty dataset")
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")

    order = list(range(len(dset)))
    random.shuffle(order)
    # Only the rows actually drawn are stacked.  Preallocating n_samples rows
    # would leave all-zero padding whenever len(dset) < n_samples, dragging
    # every mean towards 0 and distorting every standard deviation.
    sampled = [dset[o] for o in order[:n_samples]]
    obox = np.concatenate([np.asarray(s[2], dtype=float) for s in sampled], axis=0)
    dbox = np.concatenate([np.asarray(s[3], dtype=float) for s in sampled], axis=0)
    full_plusminus = np.array([s[4][1] for s in sampled], dtype=float)

    return (obox.mean(axis=0),
            obox.std(axis=0),
            dbox.mean(axis=0),
            dbox.std(axis=0),
            full_plusminus.mean(),
            full_plusminus.std())

# Module-level normalization statistics shared by the helpers below and
# pickled with the metadata at the end of the notebook.
(obox_means, obox_stdevs,
 dbox_means, dbox_stdevs,
 pm_mean, pm_stdev) = get_appx_norms(lineups_parsed)

def normalize_lineup(lineup):
    """Z-score one parsed lineup with the module-level statistics.

    `lineup` is ``(offense, defense, obox, dbox, (n_pos, pm))``.  The two box
    matrices are standardized column-wise with ``obox_means`` /
    ``obox_stdevs`` and ``dbox_means`` / ``dbox_stdevs`` and stored as
    float16; ``pm`` is standardized with ``pm_mean`` / ``pm_stdev``.  The
    player index lists and ``n_pos`` are passed through unchanged.
    """
    offense, defense, raw_obox, raw_dbox, meta = lineup
    n_pos, raw_pm = meta

    z_obox = np.float16((raw_obox - obox_means) / obox_stdevs)
    z_dbox = np.float16((raw_dbox - dbox_means) / dbox_stdevs)
    z_pm = (raw_pm - pm_mean) / pm_stdev

    return offense, defense, z_obox, z_dbox, (n_pos, z_pm)

def unnormalize(box, pm):
    """Invert the z-scoring applied by `normalize_lineup`.

    Parameters
    ----------
    box : array-like
        Normalized stat matrix (or single row).  Its last axis must have as
        many columns as either the offensive or the defensive statistics; the
        matching set is chosen automatically.  If both sets ever had the same
        width, the offensive statistics would be used.
    pm : float
        Normalized plus/minus value.

    Returns
    -------
    tuple
        ``(box, pm)`` mapped back to the original scale.

    Raises
    ------
    ValueError
        If `box` is a scalar or its last-axis width matches neither set.
    """
    shape = np.shape(box)
    if not shape:
        raise ValueError("box must have at least one axis of stats")
    n_stats = shape[-1]
    # The original body referenced undefined names box_means / box_stdevs;
    # select the offensive or defensive statistics by column count instead.
    if n_stats == len(obox_means):
        means, stdevs = obox_means, obox_stdevs
    elif n_stats == len(dbox_means):
        means, stdevs = dbox_means, dbox_stdevs
    else:
        raise ValueError(
            "box has %d stat columns; expected %d (offense) or %d (defense)"
            % (n_stats, len(obox_means), len(dbox_means)))
    box = box * stdevs + means
    pm = pm * pm_stdev + pm_mean
    return box, pm


In [10]:
# Standardize every parsed lineup with the statistics estimated above.
norm_lineups = list(map(normalize_lineup, lineups_parsed))

In [11]:
import pickle
# Persist the normalized lineups as a single pickle.
lineups_pickle_path = "processed_data/pickled_lineups1998-2018"
with open(lineups_pickle_path, 'wb') as sink:
    pickle.dump(norm_lineups, sink)

In [14]:
# Everything a consumer needs to interpret the pickled lineups: the player
# index maps, the stat column labels and the normalization statistics.
# The tuple order is part of the file format, so keep it stable.
lineup_metadata = (
    player_to_idx,
    idx_to_player,
    normal_name_to_idx,
    off_stat_labels,
    def_stat_labels,
    obox_means,
    obox_stdevs,
    dbox_means,
    dbox_stdevs,
    pm_mean,
    pm_stdev,
)
with open("processed_data/lineup_metadata1998-2018", 'wb') as sink:
    pickle.dump(lineup_metadata, sink)

In [15]:
# Spot-check one normalized lineup: (offense idx, defense idx, obox, dbox, (n_pos, pm)).
print(norm_lineups[4])

([38, 39, 40, 41, 42], [43, 44, 45, 46, 47], array([[ 0.8413,  0.1849, -0.617 , -0.3037, -0.2089, -0.528 , -0.3652,
        -0.281 , -0.1678,  3.578 ,  4.355 ],
       [-0.442 , -0.6646, -0.617 , -0.3037, -0.2089, -0.528 , -0.3652,
        -0.281 , -0.1678, -0.243 , -0.1917],
       [-0.442 ,  1.267 ,  1.334 , -0.3037,  4.3   ,  1.658 , -0.3652,
        -0.281 , -0.1678, -0.243 , -0.1917],
       [-0.442 , -0.6646, -0.617 , -0.3037, -0.2089, -0.528 , -0.3652,
        -0.281 , -0.1678, -0.243 , -0.1917],
       [-0.442 ,  1.267 ,  1.334 , -0.3037, -0.2089,  1.658 , -0.3652,
        -0.281 , -0.1678, -0.243 , -0.1917]], dtype=float16), array([[-0.37  , -0.1815, -0.1431],
       [ 2.764 , -0.1815, -0.1431],
       [-0.37  , -0.1815, -0.1431],
       [-0.37  , -0.1815, -0.1431],
       [-0.37  , -0.1815,  7.26  ]], dtype=float16), (2, 0.031834606643597746))
