In [19]:
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import os

data_filename = "D:\\CODE\\Dataset\\basketball2016.csv"
dataset = pd.read_csv(data_filename, parse_dates=["Date"])
dataset.columns = [
    "Date", "Start", "VisitorTeam", "Visitor", "HomeTeam", "Home",
    "Score Type", "OT?", "Attend", "Arena", "Notes"
]  # type: ignore

dataset["HomeWin"] = dataset["Visitor"] < dataset["Home"]
y_true = dataset["HomeWin"].values

In [20]:
won_list = defaultdict(bool)
for index, row in dataset.iterrows():
    home_team = row["Home"]
    visitor_team = row["Visitor"]
    row["HomeLastWin"] = won_list[home_team]
    row["VisitorLastWin"] = won_list[visitor_team]
    dataset.loc[index, "HomeLastWin"] = row["HomeLastWin"]
    dataset.loc[index, "VisitorLastWin"] = row["VisitorLastWin"]
    dataset.loc[index] = row
    won_list[home_team] = row.HomeWin
    won_list[visitor_team] = not row.HomeWin

clf = DecisionTreeClassifier(random_state=14)
X_previonswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
scores = cross_val_score(clf, X_previonswins, y_true, scoring='accuracy')
print("Accuracy: {0:1f}%".format(np.mean(scores) * 100))

Accuracy: 64.620462%


In [21]:
standing_filename = "D:\\CODE\\Dataset\\basketball2015.csv"
standing = pd.read_csv(standing_filename)
dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["HomeTeam"]
    visitor_team = row["VisitorTeam"]
    home_rank = standing[standing["Team"] == home_team].Rk.values[0]
    visitor_rank = standing[standing["Team"] == visitor_team].Rk.values[0]
    if home_rank != None:
        row["HomeTeamRanksHigher"] = home_rank > visitor_rank
    else:
        row["HomeTeamRanksHigher"] = False
    dataset.loc[index, "HomeTeamRanksHigher"] = row["HomeTeamRanksHigher"]
    dataset.loc[index] = row

X_homehigher = dataset[[
    "HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"
]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print("Accuracy: {0:1f}%".format(np.mean(scores) * 100))

last_match_winner = defaultdict(int)
dataset["HomeTeamWonlast"] = 0
for index, row in dataset.iterrows():
    home_team = row["HomeTeam"]
    visitor_team = row["VisitorTeam"]
    teams = tuple(sorted([home_team, visitor_team]))  # type: ignore
    if last_match_winner[teams] == row["HomeTeam"]:
        row["HomeTeamWonLast"] = 1
    else:
        row["HomeTeamWonLast"] = 0
    dataset.loc[index, "HomeTeamWonlast"] = row["HomeTeamWonLast"]
    if row["HomeWin"]:
        winner = row["HomeTeam"]
    else:
        winner = row["VisitorTeam"]
    last_match_winner[teams] = winner  # type: ignore
X_lastwinner = dataset[["HomeTeamRanksHigher", "HomeTeamWonlast"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy')
print("Accuracy: {0:1f}%".format(np.mean(scores) * 100))

Accuracy: 65.410600%
Accuracy: 65.991070%


In [22]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

encoding = LabelEncoder()
encoding.fit(dataset["HomeTeam"].values)
home_team = encoding.transform(dataset["HomeTeam"].values)
visitor_team = encoding.transform(dataset["VisitorTeam"].values)
X_teams = np.vstack([home_team, visitor_team]).T
onehot = OneHotEncoder()
X_teams_expanded = onehot.fit_transform(X_teams).todense()
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams_expanded, y_true, scoring='accuracy')
print("Accuracy: {0:1f}%".format(np.mean(scores) * 100))

Accuracy: 58.893419%


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:1f}%".format(np.mean(scores) * 100))

X_all = np.hstack([X_homehigher, X_teams])
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:1f}%".format(np.mean(scores) * 100))

parameter_space = {
    "max_features": [2, 5, 'auto'],
    "n_estimators": [
        100,
    ],
    "min_samples_leaf": [2, 4, 6],
}
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)

Accuracy: 60.275675%
Accuracy: 62.838284%
Accuracy: 65.204815%
RandomForestClassifier(max_features=5, min_samples_leaf=4, random_state=14)
