In [4]:
import sklearn
import pandas as pd
import numpy as np
from __future__ import division
import collections
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import classification_report



# Visualizing All the Historical Data

Let's first take a look at all the historical data we have. These are stats from the 1985 - 2015 NCAA basketball seasons. 

In [5]:
# This one contains stats for every single regular season game played between 1985 and 2015. It mainly
# contains info on the score of the game, the IDs for each team, and where the game was played.
reg_season_compact_pd = pd.read_csv('RegularSeasonCompactResults.csv')
reg_season_compact_pd.tail()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
145284,2016,132,1114,70,1419,50,N,0
145285,2016,132,1163,72,1272,58,N,0
145286,2016,132,1246,82,1401,77,N,1
145287,2016,132,1277,66,1345,62,N,0
145288,2016,132,1386,87,1433,74,N,0


In [6]:
# This one expands on the previous data frame by going into more in depth stats like 3 point field goals,
# free throws, steals, blocks, etc. 
reg_season_detailed_pd = pd.read_csv('RegularSeasonDetailedResults.csv')
reg_season_detailed_pd.columns

Index([u'Season', u'Daynum', u'Wteam', u'Wscore', u'Lteam', u'Lscore', u'Wloc',
       u'Numot', u'Wfgm', u'Wfga', u'Wfgm3', u'Wfga3', u'Wftm', u'Wfta',
       u'Wor', u'Wdr', u'Wast', u'Wto', u'Wstl', u'Wblk', u'Wpf', u'Lfgm',
       u'Lfga', u'Lfgm3', u'Lfga3', u'Lftm', u'Lfta', u'Lor', u'Ldr', u'Last',
       u'Lto', u'Lstl', u'Lblk', u'Lpf'],
      dtype='object')

In [7]:
# Don't think this data is honestly that important. Just contains the region areas for the tournament each 
# year. There isn't really a distinct "home field" advantage in the tourney because the games are supposed
# to be on neutral sites. 
seasons_pd = pd.read_csv('Seasons.csv')
seasons_pd.head()

Unnamed: 0,Season,Dayzero,Regionw,Regionx,Regiony,Regionz
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/2/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [12]:
teams_pd = pd.read_csv('Teams.csv')
teamList = teams_pd['Team_Name'].tolist()
teams_pd.head()

Unnamed: 0,Team_Id,Team_Name
0,1101,Abilene Chr
1,1102,Air Force
2,1103,Akron
3,1104,Alabama
4,1105,Alabama A&M


In [217]:
# Finding Kansas's Team_id cuz Frank Mason is too good
print teams_pd[teams_pd['Team_Name'] == 'Kansas']

     Team_Id Team_Name
141     1242    Kansas


In [30]:
# Jk, Lonzo Ball > Frank Mason
print teams_pd[teams_pd['Team_Name'] == 'UCLA']

     Team_Id Team_Name
316     1417      UCLA


In [36]:
# This one contains the stats for every single NCAA tournament game from 1985 to 2015
tourney_compact_pd = pd.read_csv('TourneyCompactResults.csv')
tourney_compact_pd.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [35]:
# More deatiled tourney stats (except only stats from 2003 :( )
tourney_detailed_pd = pd.read_csv('TourneyDetailedResults.csv')
tourney_detailed_pd.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [37]:
# This one tells you what seed each team was for a given tournament year
tourney_seeds_pd = pd.read_csv('TourneySeeds.csv')
tourney_seeds_pd.head()

Unnamed: 0,Season,Seed,Team
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [197]:
# Seeing what seed Duke was in every tourney
duke_id = teams_pd[teams_pd['Team_Name'] == 'Duke'].values[0][0]
tourney_seeds_pd[tourney_seeds_pd['Team'] == duke_id]

Unnamed: 0,Season,Seed,Team
34,1985,Y03,1181
64,1986,W01,1181
164,1987,Y05,1181
193,1988,W02,1181
257,1989,W02,1181
322,1990,W03,1181
417,1991,Y02,1181
448,1992,W01,1181
530,1993,X03,1181
593,1994,X02,1181


In [433]:
# Don't know how helpful this is tbh, because it just tells you what the seeds of the stronger
# and weaker seeds are (assuming that the favored team wins??), so its always 1 vs 16 and then 
# 1 vs 8 and 1 vs 4..
tourney_slots_pd = pd.read_csv('TourneySlots.csv')
tourney_slots_pd.head()

Unnamed: 0,Season,Slot,Strongseed,Weakseed
0,1985,R1W1,W01,W16
1,1985,R1W2,W02,W15
2,1985,R1W3,W03,W14
3,1985,R1W4,W04,W13
4,1985,R1W5,W05,W12


# Features

One of the most important parts of this competition is determining what features are the most important
to determining which team in a head to head matchup will win. Possible features???

* Number of regular season wins 
* Seed Number
* Any head to head matchup in the season?
* Number of top 25 wins in a season
* Number of top 25 games played in a season
* W-L record against common opponents
* Points per game scored
* Points per game allowed (Really important imo)
* 3 Pointers  made per game (Really important imo)
* 3 Pointers  allowed per game
* Turnovers per game
* Avg Steals/Blocks/Assists/Rebounds (How important are these stats tho??)
* Team's Conference (ACC, Big Ten, etc)
* Conference W-L record
* Conference Strength of Schedule
* Nonconference W-L record 
* Nonconference Strength of Schedule
* Regular Season Conference Champion?
* Conference Tournament Champion?
* Number of seniors on team (shows experience)
* Number of freshman on team (can they handle the big stage)
* Number of home wins
* Number of road wins

Given that we know the above information for Team X and Team Y, can we make a model that determines who has the 
better probability of winning?

# Model

Let's make this supervised trianing problem where our model with take in 2 d dimensional vectors, representing the two teams playing in the tournament. Each d dimensional vector will contain information about the team for that year. Below are the features we end up using. 

* Regular Season Wins 
* Points per game season average 
* Points per game allowed season average
* Whether or not in Power 5 conference (ACC, Big Ten, Big 12, SEC, Pac 12) - Binary label
* Number of 3's per game
* Number of 3's allowed per game
* Tournament Seed

In [149]:
listACCteams = ['North Carolina','Virginia','Florida St','Louisville','Notre Dame','Syracuse','Duke','Virginia Tech','Georgia Tech','Miami','Wake Forest','Clemson','NC State','Boston College','Pittsburgh']
listPac12teams = ['Arizona','Oregon','UCLA','California','USC','Utah','Washington St','Stanford','Arizona St','Colorado','Washington','Oregon St']
listSECteams = ['Kentucky','South Carolina','Florida','Arkansas','Alabama','Tennessee','Mississippi St','Georgia','Ole Miss','Vanderbilt','Auburn','Texas A&M','LSU','Missouri']
listBig10teams = ['Maryland','Wisconsin','Purdue','Northwestern','Michigan St','Indiana','Iowa','Michigan','Penn St','Nebraska','Minnesota','Illinois','Ohio St','Rutgers']
listBig12teams = ['Kansas','Baylor','West Virginia','Iowa St','TCU','Kansas St','Texas Tech','Oklahoma St','Texas','Oklahoma']

In [192]:
def checkPower5Conference(team_id):
    teamName = teams_pd.values[team_id-1101][1]
    if (teamName in listACCteams or teamName in listBig10teams or teamName in listBig12teams
       or teamName in listSECteams or teamName in listPac12teams):
        return 1
    else:
        return 0

In [436]:
def getSeasonData(team_id, year):
    # The data frame below holds stats for every single game in the given year
    year_data_pd = reg_season_detailed_pd[reg_season_detailed_pd['Season'] == year]
    # Finding number of points per game
    gamesWon = year_data_pd[year_data_pd.Wteam == team_id] 
    totalPointsScored = gamesWon['Wscore'].sum()
    gamesLost = year_data_pd[year_data_pd.Lteam == team_id] 
    totalGames = gamesWon.append(gamesLost)
    numGames = len(totalGames.index)
    totalPointsScored += gamesLost['Lscore'].sum()
    
    # Finding number of points per game allowed
    totalPointsAllowed = gamesWon['Lscore'].sum()
    totalPointsAllowed += gamesLost['Wscore'].sum()
    
    #Finding number of 3's made per game
    total3sMade = (gamesWon['Wfgm3'].sum())
    total3sMade += (gamesLost['Lfgm3'].sum())
    
    #Finding number of 3's allowed per game
    total3sAllowed = (gamesWon['Lfgm3'].sum())
    total3sAllowed += (gamesLost['Wfgm3'].sum())
    
    #Finding tournament seed for that year
    tourneyYear = tourney_seeds_pd[tourney_seeds_pd['Season'] == year]
    seed = tourneyYear[tourneyYear['Team'] == team_id]
    if (len(seed.index) != 0):
        seed = seed.values[0][1]
        tournamentSeed = int(seed[1:3])
    else:
        tournamentSeed = 50 #Not sure how to represent if a team didn't make the tourney
    
    # Finding number of wins and losses
    numWins = len(gamesWon.index)
    # There are some teams who may have dropped to Division 2, so they won't have games 
    # a certain year. In this case, we don't want to divide by 0, so we'll just set the
    # averages to 0 instead
    if numGames == 0:
        avgPointsScored = 0
        avgPointsAllowed = 0
        avg3sMade = 0
        avg3sAllowed = 0
    else:
        avgPointsScored = totalPointsScored/numGames
        avgPointsAllowed = totalPointsAllowed/numGames
        avg3sMade = total3sMade/numGames
        avg3sAllowed = total3sAllowed/numGames
    return [numWins, avgPointsScored, avgPointsAllowed, checkPower5Conference(team_id), avg3sMade, avg3sAllowed,
           tournamentSeed]

In [437]:
def compareTwoTeams(id_1, id_2, year):
    team_1 = getSeasonData(id_1, year)
    team_2 = getSeasonData(id_2, year)
    diff = [a - b for a, b in zip(team_1, team_2)]
    return diff

In [33]:
kentucky_id = teams_pd[teams_pd['Team_Name'] == 'Kentucky'].values[0][0]
duke_id = teams_pd[teams_pd['Team_Name'] == 'Duke'].values[0][0]
ucla_id = teams_pd[teams_pd['Team_Name'] == 'UCLA'].values[0][0]
kansas_id = teams_pd[teams_pd['Team_Name'] == 'Kansas'].values[0][0]
compareTwoTeams(kentucky_id, ucla_id, 2012)

NameError: name 'teams_pd' is not defined

This method returns the team vectors for each NCAA team for the given season. This information is held in a Python dictionary. 

In [439]:
def createSeasonDict(year):
    seasonDictionary = collections.defaultdict(list)
    for team in teamList:
        team_id = teams_pd[teams_pd['Team_Name'] == team].values[0][0]
        team_vector = getSeasonData(team_id, year)
        seasonDictionary[team_id] = team_vector
    return seasonDictionary

This is the most important method, where we create our training set. The idea is that we go through each of the years that are passed in, and we obtain a season dictionary for each year (by calling the previous method). Then, we look at each game that took place over the course of the season. This data is held in the reg_season_compact_pd dataframe. This dataframe contains information about the 5,200 games that occurred. For each of these games, we take a look at the two teams playing, obtain their team vectors, and then take the difference between the two. This new resultant vector is used as a sort of "representation" of the differences between the 2 teams playing. This vector will be our the X (or the input) for our supervised learning problem. The Y (or the label) will be a 1 if Team 1 wins. The way we introduce negative sampling is by associating the negative of the X vector with the label of 0. 

In [476]:
def createTrainingSet(years):
    totalNumGames = 0
    for year in years:
        season = reg_season_compact_pd[reg_season_compact_pd['Season'] == year]
        totalNumGames += len(season.index)
    xTrain = np.zeros(( totalNumGames,len(x[1101]) ))
    yTrain = np.zeros(( totalNumGames ))
    indexCounter = 0
    for year in years:
        team_vectors = createSeasonDict(year)
        season = reg_season_compact_pd[reg_season_compact_pd['Season'] == year]
        numGamesInSeason = len(season.index)
        xTrainSeason = np.zeros(( numGamesInSeason,len(x[1101]) ))
        yTrainSeason = np.zeros(( numGamesInSeason ))
        counter = 0
        for index, row in season.iterrows():
            w_team = row['Wteam']
            w_vector = team_vectors[w_team]
            l_team = row['Lteam']
            l_vector = team_vectors[l_team]
            diff = [a - b for a, b in zip(w_vector, l_vector)]
            if (counter % 2 == 0):
                xTrainSeason[counter] = diff
                yTrainSeason[counter] = 1
            else:
                xTrainSeason[counter] = [ -p for p in diff]
                yTrainSeason[counter] = 0
            counter += 1
        xTrain[indexCounter:numGamesInSeason+indexCounter] = xTrainSeason
        yTrain[indexCounter:numGamesInSeason+indexCounter] = yTrainSeason
        indexCounter += numGamesInSeason
    return xTrain, yTrain

In [477]:
def normalizeInput(arr):
    for i in range(arr.shape[1]):
        minVal = min(arr[:,i])
        maxVal = max(arr[:,i])
        arr[:,i] =  (arr[:,i] - minVal) / (maxVal - minVal)
    return arr

In [478]:
years = [2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016]
xTrain, yTrain = createTrainingSet(years)

In [479]:
X_train, X_test, Y_train, Y_test = train_test_split(xTrain, yTrain)

In [480]:
Y_train.shape

(53430,)

In [481]:
X_test.shape

(17811, 7)

# Testing Models 

SVM: 74.29% 

Logistic Regression: 75.28%

Neural Network: 75.27%

Decision Tree: 65.06%

KNN (k = 101): 74.81% (Obtained value for K using the elbow method)

In [541]:
# These are the different models I tried

#model = tree.DecisionTreeClassifier()
#model = linear_model.LogisticRegression()
model = svm.SVC()
#model = KNeighborsClassifier(n_neighbors=101)
#neuralNetwork()

In [542]:
model.fit(X_train, Y_train)
results = model.fit(X_train, Y_train)
preds = model.predict(X_test)
np.mean(preds == Y_test)

0.74060973555667842

In [547]:
target_names = ['Wins','PPG','PPG Allowed','Power 5','3PG','3PGA','Seed']
target_names
print(classification_report(Y_test, preds, target_names=target_names))

             precision    recall  f1-score   support

       Wins       0.74      0.74      0.74      8893
        PPG       0.74      0.74      0.74      8918

avg / total       0.74      0.74      0.74     17811



In [429]:
def findBestK():
    K = (list)(i for i in range(1,200) if i%2!=0)
    p = []
    for k in K:
        kmeans = KNeighborsClassifier(n_neighbors=k)
        kmeans.fit(X_train, Y_train)
        results = kmeans.fit(X_train, Y_train)
        preds = kmeans.predict(X_test)
        p.append(np.mean(preds == Y_test))
    plt.plot(K, p)
    plt.xlabel('k')
    plt.ylabel('Accuracy')
    plt.title('Selecting k with the Elbow Method')
    plt.show()

In [533]:
def neuralNetwork():
    model = Sequential()
    model.add(Dense(32, input_dim=7))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.20))
    model.add(Dense(48))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    Y_train_categorical = np_utils.to_categorical(Y_train)
    # TRAINING
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, Y_train_categorical, batch_size=64, nb_epoch=10,shuffle=True)
    preds = model.predict(X_test)
    results=[]
    for i in range(preds.shape[0]):
        if preds[i][1] < .5:
            results.append(0)
        else:
            results.append(1)
    print np.mean(results == Y_test)

In [13]:
year_data_pd = reg_season_detailed_pd[reg_season_detailed_pd['Season'] == 2016]

seasons = range(1985,2017)
team_ids = list(teams_pd["Team_Id"])

record = []
team_opponents = {}

for season in seasons:
    for team_id in team_ids: 
        won_games_df = year_data_pd.loc[year_data_pd['Wteam'] == team_id]
        lost_games_df = year_data_pd.loc[year_data_pd['Lteam'] == team_id]
        losing_games = list(lost_games_df["Wteam"])
        winning_games = list(won_games_df["Lteam"])
        opponents = winning_games + losing_games
        team_opponents[(team_id, season)] = opponents
        year_data_pd[year_data_pd.Wteam == team_id]
        OR = 0 # opponent record
        for team_id in opponents:
            OR += len(year_data_pd[year_data_pd.Wteam == team_id].index)
            OR += len(year_data_pd[year_data_pd.Lteam == team_id].index)
        row = [season, team_id, OR]
        record.append(row)
        print row
        
team_sos = pd.DataFrame(record) # Strength of Schedule
team_sos.columns = ["Season", "Team", "OR"]

[1985, 1394, 783]
[1985, 1424, 923]
[1985, 1138, 1021]
[1985, 1246, 1011]
[1985, 1411, 870]
[1985, 1380, 890]
[1985, 1216, 949]
[1985, 1290, 866]
[1985, 1109, 0]
[1985, 1250, 956]
[1985, 1209, 938]
[1985, 1332, 1025]
[1985, 1333, 1003]
[1985, 1111, 928]
[1985, 1105, 1016]
[1985, 1196, 1009]
[1985, 1111, 875]
[1985, 1118, 0]
[1985, 1221, 959]
[1985, 1397, 975]
[1985, 1121, 0]
[1985, 1293, 971]
[1985, 1275, 939]
[1985, 1242, 1016]
[1985, 1122, 902]
[1985, 1366, 898]
[1985, 1306, 914]
[1985, 1128, 0]
[1985, 1161, 924]
[1985, 1199, 1007]
[1985, 1110, 953]
[1985, 1103, 979]
[1985, 1260, 970]
[1985, 1134, 0]
[1985, 1165, 785]
[1985, 1383, 919]
[1985, 1221, 902]
[1985, 1275, 1038]
[1985, 1344, 986]
[1985, 1211, 969]
[1985, 1132, 890]
[1985, 1414, 859]
[1985, 1428, 1026]
[1985, 1205, 783]
[1985, 1233, 1034]
[1985, 1270, 809]
[1985, 1147, 0]
[1985, 1192, 883]
[1985, 1255, 800]
[1985, 1292, 1027]
[1985, 1422, 969]
[1985, 1167, 857]
[1985, 1163, 1027]
[1985, 1273, 873]
[1985, 1210, 981]
[1985, 14

In [9]:
for season in seasons:
    season_df = team_sos[team_sos['Season'] == season]
    for team_id in team_ids:
        key = (team_id, season)
        opponents = team_opponents[key]
        ORR = 0
        for team_id in opponents:
            ORR += season_df[season_df["Team"] == team_id]["OR"]
        print opponents
        print (team_id, ORR)
        team_sos.loc[(team_sos["Season"] == season) & (team_sos["Team"] == team_id), "ORR"]= ORR

NameError: name 'team_sos' is not defined

In [1]:
team_sos

NameError: name 'team_sos' is not defined