**Import Data**

In [89]:
#Import the dataset and parse dates
import os
import numpy as np
import pandas as pd
results = pd.read_csv("/content/sample_data/leagues_NBA_2019_games_games.csv",parse_dates=["Date"])

In [90]:
results.head(5)

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes
0,2018-10-29,Box Score,Orlando Magic,87,Indiana Pacers,97,,
1,2018-10-29,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,
2,2018-10-29,Box Score,Chicago Bulls,95,Miami Heat,107,,
3,2018-10-30,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,
4,2018-10-30,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,


**Data Clearning**

In [91]:
# Don't read the first row, as it is blank, and parse the date column as a date
results = pd.read_csv("/content/sample_data/leagues_NBA_2019_games_games.csv", skiprows=[0,])
# Fix the name of the columns
results.columns = ["Date", "Score Type", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Notes"]
results.iloc[:5]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes
0,Tue Oct 29 2018,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,
1,Tue Oct 29 2018,Box Score,Chicago Bulls,95,Miami Heat,107,,
2,Wed Oct 30 2018,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,
3,Wed Oct 30 2018,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,
4,Wed Oct 30 2018,Box Score,Washington Wizards,102,Detroit Pistons,113,,


**Find HomeWin team**

In [92]:
results["HomeWin"] = results["VisitorPts"] < results["HomePts"]
# Our "class values"
y_true = results["HomeWin"].values
results.iloc[:5]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes,HomeWin
0,Tue Oct 29 2018,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True
1,Tue Oct 29 2018,Box Score,Chicago Bulls,95,Miami Heat,107,,,True
2,Wed Oct 30 2018,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True
3,Wed Oct 30 2018,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True
4,Wed Oct 30 2018,Box Score,Washington Wizards,102,Detroit Pistons,113,,,True


In [93]:
print("Home Win accuracy: {0:.1f}%".format(100 * results["HomeWin"].sum() / results["HomeWin"].count()))
results["HomeLastWin"] = False
results["VisitorLastWin"] = False
# This creates two new columns, all set to False
results.iloc[:5]

Home Win accuracy: 58.0%


Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes,HomeWin,HomeLastWin,VisitorLastWin
0,Tue Oct 29 2018,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True,False,False
1,Tue Oct 29 2018,Box Score,Chicago Bulls,95,Miami Heat,107,,,True,False,False
2,Wed Oct 30 2018,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True,False,False
3,Wed Oct 30 2018,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True,False,False
4,Wed Oct 30 2018,Box Score,Washington Wizards,102,Detroit Pistons,113,,,True,False,False


In [94]:
# Now compute the actual values for these
# Did the home and visitor teams win their last game?
from collections import defaultdict
won_last = defaultdict(int)

for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    results.iloc[index] = row    
    # Set current win(use the result of last game for the next match when the same team meet with each other)
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
results.iloc[20:25]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes,HomeWin,HomeLastWin,VisitorLastWin
20,Fri Nov 1 2018,Box Score,Miami Heat,100,Brooklyn Nets,101,,,True,False,False
21,Fri Nov 1 2018,Box Score,Cleveland Cavaliers,84,Charlotte Bobcats,90,,,True,False,True
22,Fri Nov 1 2018,Box Score,Portland Trail Blazers,113,Denver Nuggets,98,,,False,False,False
23,Fri Nov 1 2018,Box Score,Dallas Mavericks,105,Houston Rockets,113,,,True,True,True
24,Fri Nov 1 2018,Box Score,San Antonio Spurs,91,Los Angeles Lakers,85,,,False,False,True


**Decision Tree**

In [95]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)

from sklearn.model_selection import cross_val_score

# Create a dataset with just the neccessary information
X_previouswins = results[["HomeLastWin", "VisitorLastWin"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Using just the last result from the home and visitor teams")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using just the last result from the home and visitor teams
Accuracy: 59.1%


In [96]:
# What about win streaks?
results["HomeWinStreak"] = 0
results["VisitorWinStreak"] = 0
# Did the home and visitor teams win their last game?
from collections import defaultdict
win_streak = defaultdict(int)

for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeWinStreak"] = win_streak[home_team]
    row["VisitorWinStreak"] = win_streak[visitor_team]
    results.loc[index] = row    
    # Set current win
    if row["HomeWin"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1

In [97]:
clf = DecisionTreeClassifier(random_state=14)
X_winstreak =  results[["HomeLastWin", "VisitorLastWin", "HomeWinStreak", "VisitorWinStreak"]].values
scores = cross_val_score(clf, X_winstreak, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 58.4%


**Feature Engineering** 

Good feature is related to the accuracy, even  the feature is more important than algorithm sometimes.

**NBA forest result**

In [98]:
# Let's try see which team is better on the ladder. Using the previous year's ladder
ladder = pd.read_csv('/content/sample_data/leagues_NBA_2018_standings_expanded-standings.csv')
ladder

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,NW,P,SW,Pre,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Miami Heat,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,8-2,8-2,9-1,36-14,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
1,2,Oklahoma City Thunder,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,10-6,16-2,13-5,39-14,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
2,3,San Antonio Spurs,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,9-9,12-6,12-4,42-12,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6
3,4,Denver Nuggets,57-25,38-3,19-22,19-11,38-14,5-5,10-0,4-6,11-5,14-4,13-5,33-21,24-4,11-7,28-8,0-1,8-8,9-6,12-3,8-4,13-2,7-1
4,5,Los Angeles Clippers,56-26,32-9,24-17,21-9,35-17,7-3,8-2,6-4,12-6,11-5,12-6,39-17,17-9,3-5,38-12,1-0,8-6,16-0,9-7,8-5,7-7,7-1
5,6,Memphis Grizzlies,56-26,32-9,24-17,22-8,34-18,8-2,8-2,6-4,12-6,12-6,10-6,33-18,23-8,6-4,28-9,0-1,12-1,7-7,10-7,9-2,11-6,7-2
6,7,New York Knicks,54-28,31-10,23-18,37-15,17-13,10-6,12-6,15-3,6-4,5-5,6-4,32-18,22-10,7-5,31-12,,11-4,10-5,7-6,6-5,12-6,8-2
7,8,Brooklyn Nets,49-33,26-15,23-18,36-16,13-17,11-5,13-5,12-6,5-5,5-5,3-7,31-22,18-11,9-4,23-17,,11-4,5-11,11-4,7-5,8-7,7-2
8,9,Indiana Pacers,49-32,30-11,19-21,31-20,18-12,6-11,13-3,12-6,3-7,7-3,8-2,32-21,17-11,4-9,27-14,1-0,7-8,10-5,9-6,9-3,11-5,2-5
9,10,Golden State Warriors,47-35,28-13,19-22,19-11,28-24,7-3,5-5,7-3,10-8,9-7,9-9,30-22,17-13,5-3,20-18,1-0,8-6,12-4,8-7,4-8,9-7,5-3


In [99]:
# We can create a new feature -- HomeTeamRanksHigher\
results["HomeTeamRanksHigher"] = 0
for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # some team change their name but they are still the same team
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    home_rank = ladder[ladder["Team"] == home_team]["Rk"].values[0]
    visitor_rank = ladder[ladder["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    results.iloc[index] = row
results[:5]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeWinStreak,VisitorWinStreak,HomeTeamRanksHigher
0,Tue Oct 29 2018,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True,0,0,0,0,1
1,Tue Oct 29 2018,Box Score,Chicago Bulls,95,Miami Heat,107,,,True,0,0,0,0,0
2,Wed Oct 30 2018,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True,0,0,0,0,1
3,Wed Oct 30 2018,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True,0,0,0,0,1
4,Wed Oct 30 2018,Box Score,Washington Wizards,102,Detroit Pistons,113,,,True,0,0,0,0,0


In [100]:
X_homehigher =  results[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 60.2%


extract the necessary parts

In [101]:
X_homehigher =  results[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values

testing by cross-validation

In [102]:
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 60.2%


Although the team ranking helps predict(The top-ranked ones have a better chance of winning),sometimes the bottom-ranked teams can beat the top-ranked teams. There are many reasons. For example, Some styles of the lower-ranked teams happen to hit the weak underbelly of the strong.

In [103]:
# Who won the last match? We ignore home/visitor for this bit
last_match_winner = defaultdict(int)
results["HomeTeamWonLast"] = 0

for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    # Set in the row, who won the last encounter
    row["HomeTeamWonLast"] = 1 if last_match_winner[teams] == row["Home Team"] else 0
    results.iloc[index] = row
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner
results.iloc[:5]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeWinStreak,VisitorWinStreak,HomeTeamRanksHigher,HomeTeamWonLast
0,Tue Oct 29 2018,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True,0,0,0,0,1,0
1,Tue Oct 29 2018,Box Score,Chicago Bulls,95,Miami Heat,107,,,True,0,0,0,0,0,0
2,Wed Oct 30 2018,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True,0,0,0,0,1,0
3,Wed Oct 30 2018,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True,0,0,0,0,1,0
4,Wed Oct 30 2018,Box Score,Washington Wizards,102,Detroit Pistons,113,,,True,0,0,0,0,0,0


In [104]:
X_home_higher =  results[["HomeTeamRanksHigher", "HomeTeamWonLast"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
accuracy: 60.5%


In [105]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoding = LabelEncoder()
encoding.fit(results["Home Team"].values)
home_teams = encoding.transform(results["Home Team"].values)
visitor_teams = encoding.transform(results["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

In [106]:
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

In [107]:
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("accuracy: {0:.1f}%".format(np.mean(scores) * 100))

accuracy: 60.1%




Random forest averages randomly constructed decision trees to produce an algorithm that reduces the variance of the results. These used feature subsets should be able to learn more features more effectively than normal decision trees.

In [108]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Using full team labels is ranked higher")
print("accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Using full team labels is ranked higher
accuracy: 61.5%




In [109]:
X_all = np.hstack([X_home_higher, X_teams])
print(X_all.shape)

(1229, 62)


In [110]:
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Using whether the home team is ranked higher
accuracy: 62.9%




In [111]:
#n_estimators=10, criterion='gini', max_depth=None, 
#min_samples_split=2, min_samples_leaf=1,
#max_features='auto',
#max_leaf_nodes=None, bootstrap=True,
#oob_score=False, n_jobs=1,
#random_state=None, verbose=0, min_density=None, compute_importances=None
from sklearn.model_selection import GridSearchCV

parameter_space = {
                   "max_features": [2, 10, 'auto'],
                   "n_estimators": [100,],
                   "criterion": ["gini", "entropy"],
                   "min_samples_leaf": [2, 4, 6],
                   }
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("accuracy: {0:.1f}%".format(grid.best_score_ * 100))



accuracy: 65.4%


In [112]:
print(grid.best_estimator_)

RandomForestClassifier(criterion='entropy', min_samples_leaf=6, random_state=14)
