RAPM
===

In [262]:
import json

from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer, OneHotEncoder
from pyspark.sql import Row, SQLContext
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, RidgeRegressionWithSGD

# Build a SQLContext on top of `sc`. `sc` is not defined in any visible cell;
# presumably it is the SparkContext injected by the PySpark kernel (TODO confirm).
# NOTE(review): sqlContext is not referenced by any cell visible in this section.
sqlContext = SQLContext(sc)

In [271]:
# Load the matchup records: every line of the text file is parsed as one JSON
# document, and only records whose 'season' is '2014' or '2015' are kept.
# The result is cached because several later cells read it.
matchups = (sc
            .textFile('matchups/matchups.json')
            # Hand the line to json.loads as-is. Wrapping it in str() forces an
            # ASCII encode under Python 2 and raises UnicodeEncodeError as soon
            # as a record contains a non-ASCII character (e.g. in a name).
            .map(lambda line: json.loads(line))
            .filter(lambda matchup: matchup['season'] in ['2014','2015'])
            .cache())

In [272]:
# Sanity check: pull the first matchup record and print its key fields.
example = matchups.first()
# Each team's data lives under a key equal to the team name stored in
# example['home'] / example['away'].
example_home = example[example['home']]
example_away = example[example['away']]
# Possessions are the mean of BOTH teams' counts, the same definition used by
# createLabeledPointFromMatchup. The previous version added the home team's
# count to itself, so the away team's possessions never entered the average.
example_poss = (example_home['stats']['poss'] + example_away['stats']['poss'])/2.
print('home: {}\naway: {}\nseason: {}\nhome unit: {}\naway unit: {}\n\npossessions: {}\nhome scored: {}\naway scored: {}'
      .format(example['home'],
              example['away'],
              example['season'],
              example_home['on'],
              example_away['on'],
              example_poss,
              example_home['stats']['pts'],
              example_away['stats']['pts']))

home: Pacers
away: Magic
season: 2014
home unit: [u'George Hill', u'Lance Stephenson', u'Paul George', u'David West', u'Roy Hibbert']
away unit: [u'Jameer Nelson', u'Arron Afflalo', u'Maurice Harkless', u'Jason Maxiell', u'Nikola Vucevic']

possessions: 14.0
home scored: 14
away scored: 7


In [274]:
# Build the RDD of distinct player names that appear in any home or away unit.
# The season filter repeats the one already applied when `matchups` is loaded;
# it is kept so this cell does not silently depend on that.
players = (matchups
           .filter(lambda stint: stint[u'season'] in [u'2015', u'2014'])
           # One (home unit, away unit) pair per stint, flattened to two lists.
           .flatMap(lambda stint: (stint[stint[u'home']]['on'], stint[stint[u'away']]['on']))
           # Flatten each unit into individual player names.
           .flatMap(lambda unit: [str(player) for player in unit])
           .distinct()
           # Cache: this RDD is counted here and indexed again in the next cell;
           # without it every action re-runs the flatMap/distinct lineage.
           .cache())
# Count once and reuse the value; calling players.count() a second time for the
# print launched a second full Spark job.
num_players_broadcast = sc.broadcast(players.count())
print("There are {} players in 2014-2015.".format(num_players_broadcast.value))

There are 580 players in 2014-2015.


In [275]:
# Pair every distinct player name with an integer index: (name, index).
players_index = players.zipWithIndex()
# Collect the pairs to the driver as a plain dict {name: index}.
players_dict = players_index.collectAsMap()
# Broadcast the lookup table; createLabeledPointFromMatchup reads it via .value.
players_dict_broadcast = sc.broadcast(players_dict)
# NOTE(review): this prints the entire mapping (one entry per player).
print(players_dict_broadcast.value)

{'Pau Gasol': 74, 'George Hill': 7, 'Joakim Noah': 502, 'Derrick Williams': 77, 'Randy Foye': 508, 'Robbie Hummel': 163, 'Jeffery Taylor': 27, 'Mike Miller': 432, 'Luol Deng': 216, 'Draymond Green': 526, 'Kyle Lowry': 332, 'Carlos Boozer': 442, 'Jannero Pargo': 333, 'James Southerland': 167, 'Austin Daye': 334, 'Thaddeus Young': 273, 'Kostas Papanikolaou': 10, 'Kevin Love': 453, 'Nikola Vucevic': 461, 'Aaron Brooks': 285, 'Chauncey Billups': 170, 'Jodie Meeks': 501, 'Jordan Hamilton': 17, 'Devin Harris': 380, 'Isaiah Thomas': 477, 'James Ennis': 39, 'Jared Sullinger': 481, 'Paul Millsap': 55, 'Avery Bradley': 56, 'Joe Johnson': 95, 'Will Barton': 59, 'Nik Stauskas': 308, 'Jerami Grant': 98, 'James Jones': 66, 'DeMarcus Cousins': 67, 'Michael Carter-Williams': 348, 'Caron Butler': 351, 'Julius Randle': 352, 'Marc Gasol': 4, 'DJ White': 499, 'Tony Parker': 500, 'LeBron James': 507, 'Brook Lopez': 263, 'Dwight Powell': 357, 'Shannon Brown': 193, 'Zach Randolph': 359, 'Nazr Mohammed': 186,

In [276]:
def createLabeledPointFromMatchup(m):
    """Turn one matchup record into an MLlib LabeledPoint.

    Args:
        m: matchup dict. m['home'] / m['away'] hold the team names, and
           m[team] holds that team's 'on' list (player names on the floor)
           and 'stats' dict with 'poss' and 'pts'.

    Returns:
        LabeledPoint whose label is the home-minus-away point margin per 100
        possessions, and whose features are a SparseVector with one slot per
        player: +possessions for home players, -possessions for away players.

    Reads the notebook-level broadcasts players_dict_broadcast (name -> index)
    and num_players_broadcast (vector size). A player name missing from the
    lookup raises KeyError.
    """
    index_of = players_dict_broadcast.value
    home_side = m[m['home']]
    away_side = m[m['away']]

    # Possessions for the stint: mean of the two teams' counts.
    possessions = (home_side['stats']['poss'] + away_side['stats']['poss']) / 2.
    if possessions <= 0:
        # Substitute a small positive value so the division below is defined.
        possessions = 0.5

    # Home players first, then away, so an away entry wins on an index clash.
    features = {}
    for player in home_side['on']:
        features[index_of[player]] = possessions
    for player in away_side['on']:
        features[index_of[player]] = -possessions

    margin = home_side['stats']['pts'] - away_side['stats']['pts']
    label = 100. * margin / possessions
    return LabeledPoint(label, SparseVector(num_players_broadcast.value, features))

In [277]:
# Convert every matchup into a LabeledPoint and cache the result, since both
# trainers below read the same RDD.
parsedData = matchups.map(createLabeledPointFromMatchup).cache()

# Fit an unregularised linear model and a ridge model on the same data.
# No hyperparameters are passed, so both use the library defaults.
lm = LinearRegressionWithSGD.train(parsedData)
ridge = RidgeRegressionWithSGD.train(parsedData)

In [282]:
# Map each player name to the linear model's weight at that player's index,
# then show the players ordered from the largest weight to the smallest.
lm_weights = lm.weights
# .items() rather than the Python-2-only .iteritems(): it matches the call on
# the next line and does not break under Python 3.
lm_dict = {name:lm_weights[index] for (name, index) in players_dict.items()}
sorted(lm_dict.items(), key=lambda x: -x[1])

[('Chris Paul', 2.9232686972636377),
 ('Stephen Curry', 2.9015071871639941),
 ('Manu Ginobili', 2.7062556384349632),
 ('James Harden', 2.5273102353377115),
 ('Kyle Lowry', 2.2791737693685183),
 ('LeBron James', 2.2042299026304168),
 ('Kyle Korver', 2.1282671482247721),
 ('Carmelo Anthony', 2.0800566171698915),
 ('Patrick Mills', 2.0571306707838084),
 ('Ricky Rubio', 2.0533803959162769),
 ('Kawhi Leonard', 2.0160732007304487),
 ('Anthony Davis', 1.9951903500699244),
 ('Pero Antic', 1.9666570979720102),
 ('Zach Randolph', 1.8940552479513335),
 ('Robert Covington', 1.87384141380298),
 ('Draymond Green', 1.7928080324261628),
 ('Dirk Nowitzki', 1.7261853113085546),
 ('Danny Green', 1.7067139144364265),
 ('Kevin Durant', 1.6880941240388787),
 ('LaMarcus Aldridge', 1.6878263997408365),
 ('Marcin Gortat', 1.6829530198106788),
 ('Cory Jefferson', 1.6814470842601432),
 ('DeMarcus Cousins', 1.5925111312916338),
 ('Ty Lawson', 1.5855934737211648),
 ('Monta Ellis', 1.5831037094589238),
 ('Iman Shum