In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score


dataset = pd.read_csv('./data/october_schedule.csv', parse_dates=["Date"])
dataset = dataset.drop(columns=['Unnamed: 6', 'Attend.'])

renamed_columns = ["Date", "Score Type", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Notes"]
dataset.columns = renamed_columns
dataset

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes
0,2018-10-16,8:00p,Philadelphia 76ers,87,Boston Celtics,105,,
1,2018-10-16,10:30p,Oklahoma City Thunder,100,Golden State Warriors,108,,
2,2018-10-17,7:00p,Milwaukee Bucks,113,Charlotte Hornets,112,,
3,2018-10-17,7:00p,Brooklyn Nets,100,Detroit Pistons,103,,
4,2018-10-17,7:00p,Memphis Grizzlies,83,Indiana Pacers,111,,
...,...,...,...,...,...,...,...,...
105,2018-10-31,8:00p,Utah Jazz,125,Minnesota Timberwolves,128,,
106,2018-10-31,8:00p,Indiana Pacers,107,New York Knicks,101,,
107,2018-10-31,10:30p,New Orleans Pelicans,121,Golden State Warriors,131,,
108,2018-10-31,10:30p,Dallas Mavericks,113,Los Angeles Lakers,114,,


In [3]:
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]
y_true = dataset["HomeWin"].values
y_true

array([ True,  True, False,  True,  True,  True,  True,  True, False,
        True, False, False,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
       False,  True, False, False,  True,  True,  True, False, False,
       False,  True,  True, False,  True,  True,  True,  True, False,
       False,  True, False,  True,  True,  True,  True, False,  True,
        True,  True, False, False,  True, False,  True,  True,  True,
       False, False,  True,  True, False,  True, False, False,  True,
        True, False, False, False, False,  True,  True,  True,  True,
        True, False, False,  True,  True, False,  True, False,  True,
       False,  True,  True,  True,  True,  True,  True, False,  True,
        True, False,  True,  True,  True, False,  True, False,  True,
        True, False])

In [71]:
from collections import defaultdict


won_last = defaultdict(int)
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    dataset.iloc[index] = row
    
won_last[home_team] = row["HomeWin"]
won_last[visitor_team] = not row["HomeWin"]


In [5]:
dataset["VisitorLastWin"] = 0
dataset['HomeLastWin'] = 0

won_last = defaultdict(int)
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    dataset.iloc[index] = row

    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
    
won_last

defaultdict(int,
            {'Boston Celtics': True,
             'Philadelphia 76ers': False,
             'Golden State Warriors': True,
             'Oklahoma City Thunder': True,
             'Charlotte Hornets': True,
             'Milwaukee Bucks': True,
             'Detroit Pistons': False,
             'Brooklyn Nets': True,
             'Indiana Pacers': True,
             'Memphis Grizzlies': True,
             'Orlando Magic': False,
             'Miami Heat': False,
             'New York Knicks': False,
             'Atlanta Hawks': False,
             'Toronto Raptors': True,
             'Cleveland Cavaliers': True,
             'Houston Rockets': False,
             'New Orleans Pelicans': False,
             'San Antonio Spurs': True,
             'Minnesota Timberwolves': True,
             'Sacramento Kings': True,
             'Utah Jazz': False,
             'Los Angeles Clippers': False,
             'Denver Nuggets': True,
             'Phoenix Suns': False,
  

In [None]:
clf = DecisionTreeClassifier(random_state=14)
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

In [None]:
standings = pd.read_csv('./data/expanded-standings.csv', skiprows=[0])
dataset["HomeTeamRanksHigher"] = 0
standings

In [None]:
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]

    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    dataset.iloc[index] = row

X_homehigher = dataset[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
dataset

In [None]:
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 61.8%


In [7]:
standings = pd.read_csv('./data/expanded-standings.csv', skiprows=[0])
dataset["HomeTeamRanksHigher"] = 0
standings

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Milwaukee Bucks,60-22,33-8,27-14,40-12,20-10,13-5,14-2,13-5,...,17-8,5-6,45-5,7-0,8-6,10-4,12-3,10-1,10-6,3-2
1,2,Toronto Raptors,58-24,32-9,26-15,36-16,22-8,12-4,10-8,14-4,...,15-8,11-7,33-9,7-1,12-3,8-7,10-5,8-1,9-6,4-1
2,3,Golden State Warriors,57-25,30-11,27-14,22-8,35-17,6-4,8-2,8-2,...,16-9,7-7,34-10,8-1,7-7,10-5,11-2,7-4,9-5,5-1
3,4,Denver Nuggets,54-28,34-7,20-21,20-10,34-18,7-3,6-4,7-3,...,15-10,13-3,23-11,6-1,9-6,8-4,12-4,7-4,9-6,3-3
4,5,Houston Rockets,53-29,31-10,22-19,21-9,32-20,8-2,6-4,7-3,...,20-5,5-7,29-12,1-5,9-6,11-4,8-6,8-4,12-3,4-1
5,6,Portland Trail Blazers,53-29,32-9,21-20,24-6,29-23,9-1,8-2,7-3,...,19-6,4-6,29-8,5-2,8-7,8-7,11-4,6-3,10-5,5-1
6,7,Philadelphia 76ers,51-31,31-10,20-21,31-21,20-10,8-8,12-6,11-7,...,14-10,10-8,22-16,4-4,12-4,7-6,11-4,6-4,9-5,2-4
7,8,Utah Jazz,50-32,29-12,21-20,20-10,30-22,6-4,7-3,7-3,...,18-7,0-7,34-12,4-3,7-9,7-7,11-4,6-3,11-4,4-2
8,9,Boston Celtics,49-33,28-13,21-20,35-17,14-16,10-6,13-5,12-6,...,12-12,5-6,24-12,5-2,7-8,9-5,11-4,5-6,8-7,4-1
9,10,Oklahoma City Thunder,49-33,27-14,22-19,21-9,28-24,6-4,8-2,7-3,...,12-13,6-7,23-12,2-4,12-3,9-6,9-5,6-5,6-10,5-0


In [8]:
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]

    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    dataset.iloc[index] = row

X_homehigher = dataset[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
dataset

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes,HomeWin,VisitorLastWin,HomeLastWin,HomeTeamRanksHigher
0,2018-10-16,8:00p,Philadelphia 76ers,87,Boston Celtics,105,,,True,0,0,1
1,2018-10-16,10:30p,Oklahoma City Thunder,100,Golden State Warriors,108,,,True,0,0,0
2,2018-10-17,7:00p,Milwaukee Bucks,113,Charlotte Hornets,112,,,False,0,0,1
3,2018-10-17,7:00p,Brooklyn Nets,100,Detroit Pistons,103,,,True,0,0,1
4,2018-10-17,7:00p,Memphis Grizzlies,83,Indiana Pacers,111,,,True,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
105,2018-10-31,8:00p,Utah Jazz,125,Minnesota Timberwolves,128,,,True,True,True,1
106,2018-10-31,8:00p,Indiana Pacers,107,New York Knicks,101,,,False,False,True,1
107,2018-10-31,10:30p,New Orleans Pelicans,121,Golden State Warriors,131,,,True,False,True,0
108,2018-10-31,10:30p,Dallas Mavericks,113,Los Angeles Lakers,114,,,True,False,False,0


In [9]:
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 67.3%
