In [None]:
import libsimulation
from src import main

import os, datetime, argparse, requests, urllib.parse, sys, re, traceback, json
import math, numpy as np
from matplotlib import pyplot as plt

%matplotlib inline
import pandas as pd
pd.options.display.max_columns = None
import statsmodels.api as sm
import seaborn as sns
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# NBA data jupyter notebook

__Important__
This notebook is here so that you can quickly experiment with the data.
It is __not__ the final submission: we will only run the code you provide in `src/main.py` and any other files referenced from it.

## Set up some environmental settings

In [None]:
# Initialize the settings object that is handed to the data loader and to the simulation runner below
settings = libsimulation.SimulationSettings()
# This prevents you accidentally loading data beyond this point, and also defines the start of the simulation run period
settings.cutoff = '2019-01-01'

# Exploring data

In [None]:
# Build the data loader from the settings object (which carries the cutoff date set above)
data_loader = libsimulation.NbaDataLoader(settings)

In [None]:
# Game data for the season keyed '2011' (the modelling section wraps this same call in a DataFrame)
data_loader.getSeason('2011')

In [None]:
# Season keys can carry a suffix; 'POST' presumably selects the 2018 post-season -- TODO confirm
data_loader.getSeason('2018POST')

In [None]:
# Data for a single game; the argument is presumably a gameId -- TODO confirm
data_loader.getGame(5210)

In [None]:
# Player data for the season keyed '2011'
data_loader.getPlayers('2011')

## V2 API
The second version of the API makes some additional fields available, such as team and player IDs, and assists.
The original API has been left unchanged.

In [None]:
# V2 counterpart of getSeason (V2 exposes additional fields such as team and player IDs)
data_loader.getSeasonV2('2019')

In [None]:
# V2 counterpart of getGame, called with the same id as the V1 example above
data_loader.getGameV2(5210)

In [None]:
# V2 counterpart of getPlayers, called with the same season key as the V1 example above
data_loader.getPlayersV2('2011')

## Play by Play data

In [None]:
# Play-by-play data for one game
data_loader.getPlays(12509) # Get plays given a gameId

# Building and Testing a model

## Load some data into train and validation sets

In [None]:
# NOTE(review): repeats the loader construction from the exploration section;
# presumably so the modelling section can be run on its own -- confirm before removing.
data_loader = libsimulation.NbaDataLoader(settings)

In [None]:
def get_multi_season_game_data(data_loader, first_year, last_year):
    """Load several seasons of games into one chronologically ordered DataFrame.

    Parameters
    ----------
    data_loader : object with a ``getSeason(str)`` method
        Called once per season with the year converted to a string.
    first_year, last_year : int
        Inclusive range of seasons to load.

    Returns
    -------
    pandas.DataFrame
        All seasons stacked together, rows containing any missing value removed,
        ``dateTime`` parsed to datetimes, sorted by ``dateTime`` and re-indexed from 0.
    """
    season_frames = []
    for season in range(first_year, last_year + 1):
        season_frames.append(pd.DataFrame(data_loader.getSeason(str(season))))

    return (
        pd.concat(season_frames, axis=0)
        .dropna(axis=0)
        .assign(dateTime=lambda games: pd.to_datetime(games.dateTime))
        .sort_values('dateTime')
        .reset_index(drop=True)
    )

In [None]:
# Seasons 2009-2016 (inclusive) are used for training; 2017-2018 are held out as the test set
train_data = get_multi_season_game_data(data_loader, 2009, 2016)
test_data = get_multi_season_game_data(data_loader, 2017, 2018)

In [None]:
# Preview the two earliest training games (the frame is sorted by dateTime)
train_data.head(2)

In [None]:
# Preview the two earliest test games (the frame is sorted by dateTime)
test_data.head(2)

## Define some functions to calculate Elo ratings over time

In [None]:
## Elo model's probability of home team winning
def home_win_probability(home_elo, away_elo):
    """Return the Elo model's probability that the home team wins.

    This is a base-10 logistic curve on the rating gap with a scale of 400:
    equal ratings give 0.5, and a 400-point lead gives 10:1 odds.
    """
    rating_gap = home_elo - away_elo
    odds_against_home = math.pow(10, -rating_gap / 400)
    return 1 / (1 + odds_against_home)

## Get new Elo ratings home and away teams after a game
def get_updated_elo(
    home_elo, away_elo, 
    home_victory, ## 1 if home team won, 0 if away team won
    K,  ## model hyperparameter 
): 
    """Return the ``(home_elo, away_elo)`` ratings after one game.

    Applies the standard zero-sum Elo step: the home rating moves by
    ``K * (actual - expected)`` from the home team's point of view, and the
    away rating moves by exactly the opposite amount. The previous version
    subtracted ``K * P_home_win`` from a losing away team and
    ``K * P_away_win`` from a losing home team, so the loser's change did not
    mirror the winner's and rating points were created or destroyed.

    Parameters
    ----------
    home_elo, away_elo : float
        Ratings before the game.
    home_victory : 0, 1 or bool
        1/True if the home team won, 0/False if the away team won.
    K : float
        Step size; larger values let a single game move the ratings more.

    Raises
    ------
    ValueError
        If ``home_victory`` is not one of 0, 1, False, True.
    """
    if home_victory not in [0, 1, False, True]:
        raise ValueError(f"home_victory should be 1 if home team won, 0 if away team won. Got {home_victory}")

    expected_home_score = home_win_probability(home_elo, away_elo)

    # Positive when the home team did better than expected, negative otherwise.
    rating_shift = K * (int(home_victory) - expected_home_score)

    # Whatever one side gains the other side loses.
    return home_elo + rating_shift, away_elo - rating_shift

## Iterate through games updating each teams Elo rating
def get_elos_over_time(data, ## dataframe of games, must be in order of occurrence
                      starting_elo_dict=None,  ## dictionary of elo scores by team at the beginning of the data period
                      default_elo=0,  ## elo initially given to a team not in starting_elo_dict
                      K=10,  ## model hyperparameter; higher number means an individual game affects Elo more
                     ):
    """Replay the games in order, recording each team's pre-game Elo rating.

    Adds two float columns to ``data`` in place: ``homeElo`` and ``awayElo``,
    holding each team's rating *before* the game in that row. Reads the
    ``homeTeam``, ``awayTeam`` and ``pointsDiff`` columns; a game counts as a
    home win when ``pointsDiff > 0``.

    The ratings are collected in lists and written once at the end, matched to
    rows by position. Writing each row with ``data.loc[i, ...]`` was slow and,
    on a frame with repeated index labels, overwrote every row sharing a label.

    Returns
    -------
    dict
        Team -> rating after the last game. ``starting_elo_dict`` itself is
        never modified; ``None`` (the default) means "no prior ratings".
    """
    elo_dict = starting_elo_dict.copy() if starting_elo_dict is not None else {}
    home_elos_before = []
    away_elos_before = []

    ## Iterate over games in row order
    for home_team, away_team, points_diff in zip(data['homeTeam'], data['awayTeam'], data['pointsDiff']):
        home_elo = elo_dict.get(home_team, default_elo)
        away_elo = elo_dict.get(away_team, default_elo)

        ## Record the ratings the teams had going into the match
        home_elos_before.append(home_elo)
        away_elos_before.append(away_elo)

        ## Calculate the new elo scores and update elo_dict with them
        home_victory = bool(points_diff > 0)  # plain bool rather than numpy.bool_
        elo_dict[home_team], elo_dict[away_team] = get_updated_elo(home_elo, away_elo, home_victory, K)

    data['homeElo'] = np.array(home_elos_before, dtype=float)
    data['awayElo'] = np.array(away_elos_before, dtype=float)

    return elo_dict

In [None]:
# K is the Elo step size: a higher value means an individual game moves the ratings more
K = 10
# Each call adds homeElo / awayElo columns to the frame in place and returns the final ratings.
# The ratings at the end of the training period seed the test period, so the test-set
# ratings continue from where the training data left off.
pre_test_elo_dict = get_elos_over_time(train_data, starting_elo_dict={}, K=K)
post_test_elo_dict = get_elos_over_time(test_data, starting_elo_dict=pre_test_elo_dict, K=K)

In [None]:
# The training frame now carries the pre-game homeElo / awayElo columns
train_data.head(2)

In [None]:
# The test frame now carries the pre-game homeElo / awayElo columns, seeded from the training ratings
test_data.head(2)

## Look at Elo ratings over time

In [None]:
def plot_team_elo_over_time(data, team):
    """Plot one team's pre-game Elo rating against game date on the current axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``homeTeam``, ``awayTeam``, ``homeElo``, ``awayElo`` and
        ``dateTime`` columns.
    team : str
        Team identifier as it appears in ``homeTeam`` / ``awayTeam``; also used
        as the line's legend label.

    Rows are selected with boolean masks instead of interpolating ``team`` into
    a query string, so a name containing a quote cannot break the expression.
    Masks are applied by position, so repeated index labels are harmless.
    """
    played_at_home = (data['homeTeam'] == team).to_numpy()
    played_away = (data['awayTeam'] == team).to_numpy()
    took_part = played_at_home | played_away

    team_games = data[took_part]
    # Take the rating from whichever side the team was on in each game.
    team_elo = np.where(played_at_home[took_part], team_games['homeElo'], team_games['awayElo'])

    plt.plot(team_games['dateTime'], team_elo, label=team)

In [None]:
# Stack train and test games so each team's line spans both periods
combined_data = pd.concat([train_data, test_data])
for team in ['GS', 'MIA', 'NY', 'SA']:
    plot_team_elo_over_time(combined_data, team)
# Trailing semicolon keeps the Legend object's repr out of the cell output
plt.legend();

## Fit a linear model on our train data

In [None]:
# Feature: pre-game rating gap from the home team's perspective (home minus away)
train_data['EloDifference'] = train_data['homeElo'] - train_data['awayElo']
test_data['EloDifference'] = test_data['homeElo'] - test_data['awayElo']

In [None]:
# NOTE(review): this fits pointsSum (not pointsDiff) on EloDifference with no intercept,
# and `model` is rebound by the statsmodels fit in the next cell before anything uses it.
# Presumably an exploratory leftover -- confirm whether pointsDiff was the intended target.
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=False)
model.fit(X=train_data[['EloDifference']], y=train_data['pointsSum'])

In [None]:
## Use the statsmodels library to fit a linear model of Elo difference to points difference
# Second feature: combined pre-game rating of the two teams
train_data['EloSum'] = train_data['homeElo'] + train_data['awayElo']
test_data['EloSum'] = test_data['homeElo'] + test_data['awayElo']
X = train_data[['EloDifference', 'EloSum']]
# sm.OLS does not add an intercept by itself, so add the constant column explicitly
X = sm.add_constant(X)
y = train_data['pointsDiff']
# Rebinds `model` to the fitted OLS results that the test-set predictions below use
model = sm.OLS(y, X).fit()
model.summary()

## Make some predictions on the test set

In [None]:
# Build the test design matrix with the same feature columns and constant as the training fit
X_test = test_data[['EloDifference', 'EloSum']]
X_test = sm.add_constant(X_test)
# Predicted pointsDiff for every test game, stored next to the actual value
test_data['predictedDiff'] = model.predict(X_test)
test_data.head(2)

In [None]:
## Check how good our predictions are
# x, y and data are passed by keyword: current seaborn releases reject positional x / y,
# while the keyword form works on old and new versions alike.
sns.lmplot(x='predictedDiff', y='pointsDiff', data=test_data);

In [None]:
## remodel to see statistics on test data
# Regress the actual margin on the predicted margin. No constant is added here, so the
# fit is forced through the origin; a slope close to 1 indicates well-scaled predictions.
# NOTE: this rebinds the notebook-level X and y that held the training design matrix and target.
X = test_data['predictedDiff']
y = test_data['pointsDiff']
test_model = sm.OLS(y, X).fit()
test_model.summary()

## Set up a prediction method to incorporate the Elo model and return valid predictions

In [None]:
# Write some code
def predict(required_predictions, data_loader, log=lambda x: print(x)):
    """Predict the points difference and points sum for each requested game.

    Steps:
      1. Load seasons 2016-2020 through ``data_loader`` (the loader presumably
         enforces its own cutoff, so later seasons may come back empty).
      2. Replay those games to obtain every team's current Elo rating.
      3. Fit two OLS models on (EloDifference, EloSum): one for ``pointsDiff``,
         one for ``pointsSum``.
      4. Apply both models to the requested fixtures.

    Parameters
    ----------
    required_predictions : pandas.DataFrame
        Must contain ``homeTeam`` and ``awayTeam`` columns. It is modified in
        place: ``predictedDiff`` and ``predictedSum`` columns are added.
    data_loader : object
        Passed through to ``get_multi_season_game_data``.
    log : callable, optional
        Receives one progress message per stage; prints by default.

    Returns
    -------
    pandas.DataFrame
        The same ``required_predictions`` object, with the two new columns.
    """
    first_year = 2016
    last_year = 2020
    elo_k = 10
    # Rating for a team that never appears in the training games; matches the
    # default_elo that get_elos_over_time gives to an unseen team.
    unseen_team_elo = 0
    feature_columns = ['EloDifference', 'EloSum']

    log('Loading training data')
    train_data = get_multi_season_game_data(data_loader, first_year=first_year, last_year=last_year)

    log('Getting Elo ratings over time on train data')
    elo_dict = get_elos_over_time(train_data, starting_elo_dict={}, K=elo_k)
    train_data['EloDifference'] = train_data['homeElo'] - train_data['awayElo']
    train_data['EloSum'] = train_data['homeElo'] + train_data['awayElo']

    log('Fitting linear model from Elo difference and sum to points difference')
    # has_constant='add' always appends the intercept column. The default ('skip')
    # silently omits it when some column is already constant, which would leave the
    # training and prediction design matrices with different shapes.
    X = sm.add_constant(train_data[feature_columns], has_constant='add')
    y = train_data['pointsDiff']
    diff_model = sm.OLS(y, X).fit()

    log('Fitting linear model from Elo difference and sum to points sum')
    y = train_data['pointsSum']
    sum_model = sm.OLS(y, X).fit()

    log('Generating predictions')
    tmp = required_predictions[['homeTeam', 'awayTeam']].copy()
    # .get() keeps a team missing from the training data from raising KeyError.
    tmp['homeElo'] = [elo_dict.get(team, unseen_team_elo) for team in tmp['homeTeam']]
    tmp['awayElo'] = [elo_dict.get(team, unseen_team_elo) for team in tmp['awayTeam']]
    tmp['EloDifference'] = tmp['homeElo'] - tmp['awayElo']
    tmp['EloSum'] = tmp['homeElo'] + tmp['awayElo']
    # A single requested game makes every feature column constant, so the
    # intercept must be forced here as well.
    X = sm.add_constant(tmp[feature_columns], has_constant='add')
    tmp['predictedDiff'] = diff_model.predict(X)
    tmp['predictedSum'] = sum_model.predict(X)

    required_predictions['predictedDiff'] = tmp['predictedDiff']
    required_predictions['predictedSum'] = tmp['predictedSum']

    log('Finished')

    return required_predictions

In [None]:
# Take an explicit copy of the first 300 test fixtures: predict() adds columns in place,
# and writing into a slice of test_data would trigger pandas' SettingWithCopyWarning.
# NOTE(review): predict() trains on seasons from 2016 onward, which overlap these
# 2017-2018 test games, so the score computed below is not a true out-of-sample score.
required_predictions = test_data[:300][['homeTeam', 'awayTeam', 'dateTime', 'gameId']].copy()
required_predictions = predict(required_predictions, data_loader)

In [None]:
# Preview the fixtures together with the predictedDiff / predictedSum columns added by predict()
required_predictions.head()

In [None]:
def single_game_error(predictedDiff, predictedSum, actualDiff, actualSum):
    """Return the absolute error on the margin plus the absolute error on the total.

    Works on scalars and element-wise on array-like inputs that support
    subtraction and ``abs``.
    """
    diff_error = abs(predictedDiff - actualDiff)
    sum_error = abs(predictedSum - actualSum)
    return diff_error + sum_error

## This function adds new columns to the input dataframe (in place) holding the error, baseline error and score for each game, and returns the total score over the entire dataframe
def score_predictions(predictions):
    """Score predictions against a fixed baseline and return the total score.

    Reads the ``predictedDiff``, ``predictedSum``, ``pointsDiff`` and
    ``pointsSum`` columns and adds three columns to ``predictions`` in place:

    - ``error``: single_game_error of the model's prediction
    - ``baseline_error``: single_game_error of the baseline prediction
    - ``score``: ``baseline_error - error`` (positive when the model beats the baseline)

    Returns the sum of ``score`` over all rows.
    """
    ## baseline model
    baseline_diff = 0  ## no information about who will win
    baseline_sum = 200  ## average points total between 2009 and 2016 seasons

    predicted_diff = predictions['predictedDiff']
    predicted_sum = predictions['predictedSum']
    actual_diff = predictions['pointsDiff']
    actual_sum = predictions['pointsSum']

    predictions['error'] = single_game_error(predicted_diff, predicted_sum, actual_diff, actual_sum)
    predictions['baseline_error'] = single_game_error(baseline_diff, baseline_sum, actual_diff, actual_sum)
    predictions['score'] = predictions['baseline_error'] - predictions['error']

    return predictions['score'].sum()

In [None]:
# Attach the actual outcomes to the predictions; the left join on gameId keeps every predicted game
predicted_columns = required_predictions[['gameId', 'predictedDiff', 'predictedSum']]
actual_columns = test_data[['gameId', 'pointsDiff', 'pointsSum']]
tmp = predicted_columns.merge(actual_columns, on='gameId', how='left')
total_score = score_predictions(tmp)
print(f'Total score across entire dataframe: {total_score}')

In [None]:
# Running total of the per-game score (baseline error minus model error), in the row order of tmp
plt.plot(np.cumsum(tmp['score']))
plt.xlabel('game')
plt.title('cumulative score');

In [None]:
# Run a simulation
# Attach our predict function to the settings object, then hand the settings to the simulation runner
settings.predict = predict
simulation_result = libsimulation.runSimulation(settings)

In [None]:
# Preview the first two rows of what runSimulation returned
simulation_result.head(2)