In [1]:
import sklearn
import pandas as pd
import numpy as np
import collections
import os.path
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import tree
from sklearn.model_selection import cross_val_score
from keras.utils import np_utils
from sklearn.neighbors import KNeighborsClassifier
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import sys
from sklearn.ensemble import GradientBoostingRegressor
import math
import csv
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV
import urllib
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from datetime import datetime
import random

In [2]:
############################## LOAD TRAINING SET ##############################

if os.path.exists("Data/PrecomputedMatrices/xTrain.npy") and os.path.exists("Data/PrecomputedMatrices/yTrain.npy"):
	xTrain = np.load("Data/PrecomputedMatrices/xTrain.npy")
	yTrain = np.load("Data/PrecomputedMatrices/yTrain.npy")
	print ("Shape of xTrain:", xTrain.shape)
	print ("Shape of yTrain:", yTrain.shape)
else:
	print ('We need a training set! Run dataPreprocessing.py')
	sys.exit()

# In case you want to run with Python 2
try:
    input = raw_input
except NameError:
    pass

curYear = int(input('What year are these predictions for?\n'))

Shape of xTrain: (21449, 17)
Shape of yTrain: (21449,)
What year are these predictions for?
2014


In [3]:
############################## LOAD CSV FILES ##############################

teams_pd = pd.read_csv('Data/KaggleData/Teams.csv')

In [4]:
############################## TRAIN MODEL ##############################

model = GradientBoostingRegressor(n_estimators=100, max_depth=5)

categories=['Wins','PPG','PPGA','PowerConf','3PG', 'APG','TOP','Conference Champ','Tourney Conference Champ',
           'Seed','SOS','SRS', 'RPG', 'SPG', 'Tourney Appearances','National Championships','Location']
accuracy=[]
numTrials = 1

for i in range(numTrials):
    X_train, X_test, Y_train, Y_test = train_test_split(xTrain, yTrain)
    startTime = datetime.now() # For some timing stuff
    results = model.fit(X_train, Y_train)
    preds = model.predict(X_test)

    preds[preds < .5] = 0
    preds[preds >= .5] = 1
    localAccuracy = np.mean(preds == Y_test)
    accuracy.append(localAccuracy)
    print ("Finished run #" + str(i) + ". Accuracy = " + str(localAccuracy))
    print ("Time taken: " + str(datetime.now() - startTime))
if numTrials != 0:
	print ("The average accuracy is", sum(accuracy)/len(accuracy))

Finished run #0. Accuracy = 0.7544284915159426
Time taken: 0:00:10.240803
The average accuracy is 0.7544284915159426


In [5]:
############################## TEST MODEL ##############################

def predictGame(team_1_vector, team_2_vector, home, modelUsed):
    diff = [a - b for a, b in zip(team_1_vector, team_2_vector)]
    diff.append(home)
    if hasattr(modelUsed, 'predict_proba'):
	    return modelUsed.predict_proba([diff])[0][1]
    elif hasattr(modelUsed, 'predict'):
        return modelUsed.predict([diff])[0]
    else:
        raise AttributeError("Model does not have expected prediction method")

In [1]:

def loadTeamVectors(years):
	listDictionaries = []
	for year in years:
		curVectors = np.load("Data/PrecomputedMatrices/TeamVectors/" + str(year) + "TeamVectors.npy",allow_pickle=True).item()
		listDictionaries.append(curVectors)
	return listDictionaries



In [6]:
curYear

2014

In [7]:
############################## PREDICTING ##############################

def trainModel():
	model = GradientBoostingRegressor(n_estimators=100, max_depth=5)
	model.fit(xTrain, yTrain)
	return model

def findWinner(team1, team2, modelUsed):
	year = [curYear]
	teamVectors = loadTeamVectors(year)[0]
	team1Vector = teamVectors[int(teams_pd[teams_pd['TeamName'] == team1].values[0][0])]
	team2Vector = teamVectors[int(teams_pd[teams_pd['TeamName'] == team2].values[0][0])]
	prediction = predictGame(team1Vector, team2Vector, 0, modelUsed)
	if (prediction < 0.5):
		print ("Probability that {0} wins: {1}".format(team2, 1 - prediction))
	else:
		print ("Probability that {0} wins: {1}".format(team1, prediction))

In [8]:
trainedModel = trainModel()

In [11]:
#################Roundof 64#######################

findWinner('Florida','Albany NY' ,trainedModel)             #✓
findWinner('Pittsburgh', 'Colorado' ,trainedModel)      #✓
findWinner('SF Austin', 'VA Commonwealth' ,trainedModel) 
findWinner('UCLA', 'Tulsa' ,trainedModel)    #✓
findWinner('Dayton', 'Ohio St' ,trainedModel) 
findWinner('Syracuse', 'W Michigan' ,trainedModel) #✓
findWinner('Stanford', 'New Mexico' ,trainedModel) 
findWinner('Kansas', 'E Kentucky' ,trainedModel) #✓

Probability that Florida wins: 0.94377549302025
Probability that Pittsburgh wins: 0.6315033004975257
Probability that VA Commonwealth wins: 0.6537484184229876
Probability that UCLA wins: 0.8348477875992311
Probability that Ohio St wins: 0.6381131781997567
Probability that Syracuse wins: 0.9762501514386593
Probability that New Mexico wins: 0.52752818419239
Probability that Kansas wins: 0.9593980811273365


In [12]:
findWinner('Virginia','Coastal Car' ,trainedModel)   #✓
findWinner('Memphis', 'G Washington' ,trainedModel)     #✓ 
findWinner('Harvard', 'Cincinnati' ,trainedModel)  
findWinner('Michigan St', 'Delaware' ,trainedModel) #✓
findWinner('North Carolina', 'Providence' ,trainedModel) #✓
findWinner('Iowa St', 'NC Central' ,trainedModel) #✓
findWinner('Connecticut', "St Joseph's PA" ,trainedModel) #✓
findWinner('Villanova', 'WI Milwaukee' ,trainedModel)#✓

Probability that Virginia wins: 1.004538779592295
Probability that Memphis wins: 0.5211207437542839
Probability that Cincinnati wins: 0.6968271168146272
Probability that Michigan St wins: 0.9152267563006703
Probability that North Carolina wins: 0.5982765119108003
Probability that Iowa St wins: 0.8053684216496044
Probability that Connecticut wins: 0.8508477907761932
Probability that Villanova wins: 0.9585617812004315


In [13]:
findWinner('Arizona','Weber St' ,trainedModel)        #✓
findWinner('Gonzaga', 'Oklahoma St' ,trainedModel)      #✓
findWinner('N Dakota St', 'Oklahoma' ,trainedModel)    
findWinner('S Dakota St', 'New Mexico St' ,trainedModel) 
findWinner('Baylor', 'Nebraska' ,trainedModel)  #✓
findWinner('Creighton', 'Lafayette' ,trainedModel)   #✓
findWinner('Oregon', 'BYU' ,trainedModel) #✓
findWinner('Wisconsin', 'American Univ' ,trainedModel)   #✓

Probability that Arizona wins: 0.9450481081581927
Probability that Gonzaga wins: 0.5483375239864647
Probability that Oklahoma wins: 0.7140195399634542
Probability that New Mexico St wins: 0.7553013069186602
Probability that Baylor wins: 0.7838870933382124
Probability that Creighton wins: 0.9566739085148322
Probability that Oregon wins: 0.6950893484918282
Probability that Wisconsin wins: 0.9929276871569452


In [14]:
findWinner('Wichita St','Cal Poly SLO' ,trainedModel)        #✓     
findWinner('Kentucky', 'Kansas St' ,trainedModel)      #✓
findWinner('St Louis', 'NC State' ,trainedModel) #✓
findWinner('Louisville', 'Manhattan' ,trainedModel)   #✓
findWinner('Tennessee', 'Massachusetts' ,trainedModel) #✓
findWinner('Mercer', 'Duke' ,trainedModel) 
findWinner('Texas', 'Arizona St' ,trainedModel) #✓
findWinner('Michigan', 'Wofford' ,trainedModel) #✓

Probability that Wichita St wins: 0.9717982421991935
Probability that Kentucky wins: 0.8285202687378068
Probability that St Louis wins: 0.6318165618920558
Probability that Louisville wins: 0.9526032200424264
Probability that Tennessee wins: 0.8490916766853496
Probability that Duke wins: 0.7016090592531744
Probability that Texas wins: 0.6367838857949485
Probability that Michigan wins: 0.9432413970816135


In [15]:
############################# Round of 32#########################################

findWinner('Florida','Pittsburgh' ,trainedModel)    #✓         
findWinner('UCLA', 'SF Austin' ,trainedModel)      #✓
findWinner('Dayton', 'Syracuse' ,trainedModel) 
findWinner('Stanford', 'Kansas' ,trainedModel) 
print()
findWinner('Virginia','Memphis' ,trainedModel)           #✓  
findWinner('Michigan St', 'Harvard' ,trainedModel)     #✓ 
findWinner('Iowa St', 'North Carolina' ,trainedModel) #✓
findWinner('Connecticut', 'Villanova' ,trainedModel) #✓
print()
findWinner('Arizona','Gonzaga' ,trainedModel)            #✓ 
findWinner('S Dakota St', 'N Dakota St' ,trainedModel)      
findWinner('Baylor', 'Creighton' ,trainedModel) 
findWinner('Wisconsin', 'Oregon' ,trainedModel) #✓
print()
findWinner('Kentucky', 'Wichita St' ,trainedModel) 
findWinner('Louisville', 'St Louis' ,trainedModel) #✓
findWinner('Tennessee', 'Mercer' ,trainedModel) #✓
findWinner('Michigan', 'Texas' ,trainedModel) #✓

Probability that Florida wins: 0.7997356214370853
Probability that UCLA wins: 0.8815146836292781
Probability that Syracuse wins: 0.683947607909837
Probability that Kansas wins: 0.6869930751534805

Probability that Virginia wins: 0.5821178206547932
Probability that Michigan St wins: 0.8582642059286295
Probability that Iowa St wins: 0.7470778522300897
Probability that Connecticut wins: 0.5708688254394986

Probability that Arizona wins: 0.8216965359368915
Probability that N Dakota St wins: 0.816080629417826
Probability that Creighton wins: 0.5674473300347616
Probability that Wisconsin wins: 0.7865014432933227

Probability that Wichita St wins: 0.5050526655289621
Probability that Louisville wins: 0.8396032675449366
Probability that Tennessee wins: 0.7289979514102106
Probability that Michigan wins: 0.7055497661651663


In [16]:
#####################################Round of 16 ###########################################
findWinner('Florida','UCLA' ,trainedModel)          #✓   
findWinner('Dayton', 'Stanford' ,trainedModel)      
findWinner('Michigan St', 'Virginia' ,trainedModel) 
findWinner('Connecticut', 'Iowa St' ,trainedModel)   #✓
findWinner('Arizona', 'S Dakota St' ,trainedModel) #✓
findWinner('Wisconsin', 'Baylor' ,trainedModel) #✓
findWinner('Kentucky', 'Louisville' ,trainedModel) 
findWinner('Michigan', 'Tennessee' ,trainedModel) #✓

Probability that Florida wins: 0.7814677272994124
Probability that Stanford wins: 0.5730496290698291
Probability that Virginia wins: 0.5763623020231956
Probability that Connecticut wins: 0.5828629003708498
Probability that Arizona wins: 0.95764211278621
Probability that Wisconsin wins: 0.7205628618201545
Probability that Louisville wins: 0.5052952521996934
Probability that Michigan wins: 0.7373734143436592


In [17]:
findWinner('Florida', 'Dayton' ,trainedModel) #✓
findWinner('Connecticut', 'Michigan St' ,trainedModel) #✓
findWinner('Wisconsin', 'Arizona' ,trainedModel) 
findWinner('Kentucky', 'Michigan' ,trainedModel) 
print()
findWinner('Connecticut', 'Florida' ,trainedModel) 
findWinner('Kentucky', 'Wisconsin' ,trainedModel) 
print()
findWinner('Connecticut', 'Kentucky' ,trainedModel) #✓

Probability that Florida wins: 0.832510681814112
Probability that Connecticut wins: 0.5044975165955597
Probability that Arizona wins: 0.5356877999032337
Probability that Michigan wins: 0.5229744059500242

Probability that Florida wins: 0.6497615569598787
Probability that Wisconsin wins: 0.6009352369916763

Probability that Connecticut wins: 0.5974686267142687
