In [1]:
# Load all packages and models we will be using
import random

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import datasets, linear_model, tree

# The two classifiers compared at the end of the notebook.
logreg = linear_model.LogisticRegression()
# Fixed random_state: the tree breaks ties between equally good splits at
# random, so without a seed the fitted tree (and its score) changes per run.
clf = tree.DecisionTreeClassifier(random_state=42)

In [2]:
# Load data
# pd.read_csv replaces DataFrame.from_csv, which was deprecated and later
# removed from pandas; with index_col=None the two read the file the same way.
seasonal = pd.read_csv("RegularSeasonDetailedResults.csv", sep=",")
seeds = pd.read_csv("TourneySeeds.csv", sep=",")

# Build raw data frames for feature engineering (seasons after 2011 only).
# reindex(columns=...) returns an independent copy, so the column assignments
# below never write into a slice of the raw frames.
seasonalCols = ['Season','Wteam','Wscore','Wfgm3','Wfgm','Wfga','Lteam','Lscore','Lfgm3','Lfgm','Lfga']
seasonalDf = seasonal[seasonal['Season']>2011].reindex(columns=seasonalCols)
seedsDf = seeds[seeds['Season']>2011].copy()

# Fill in NAs in case
seasonalDf = seasonalDf.fillna(0)

# Feature engineering
# 1.) Score difference rate for win/loss team:
#     1 +/- (winner score - loser score) / (total points in the game)
scoreMargin = (seasonalDf['Wscore']-seasonalDf['Lscore'])/(seasonalDf['Wscore']+seasonalDf['Lscore'])
WscoreDiff = 1+scoreMargin
LscoreDiff = 1-scoreMargin
seasonalDf['WScoreDiff']=WscoreDiff
seasonalDf['LScoreDiff']=LscoreDiff

# 2.) Field goal made rate (made / attempted)
WfgRate = seasonalDf['Wfgm']/seasonalDf['Wfga']
LfgRate = seasonalDf['Lfgm']/seasonalDf['Lfga']
seasonalDf['WfgRate']=WfgRate
seasonalDf['LfgRate']=LfgRate

# The ratios above yield NaN/inf for rows whose denominator is 0 (possible
# after the fillna(0) above); zero them so the classifiers get finite inputs.
seasonalDf = seasonalDf.replace([np.inf, -np.inf], np.nan).fillna(0)

seedsDf = seedsDf.reindex(columns=['Team','Seed'])

# Keep only the model features and rename winner/loser columns to the generic
# team1/team2 scheme (at this point team1 is always the winner).
seasonalColsUse = ['Wteam','WfgRate','Wfgm3','WScoreDiff','Lteam','LfgRate','Lfgm3','LScoreDiff']
seasonalColsGen = ['team1','fgRate1','fgm31','ScoreDiff1','team2','fgRate2','fgm32','ScoreDiff2']
seasonalDf = seasonalDf.reindex(columns=seasonalColsUse)
seasonalDf.columns=seasonalColsGen

In [3]:
# Shuffle orders to have half positive and half negative examples
revCols = ['team2','fgRate2','fgm32','ScoreDiff2','team1','fgRate1','fgm31','ScoreDiff1']
seasonalColsGenLabel = ['team1','fgRate1','fgm31','ScoreDiff1','team2','fgRate2','fgm32','ScoreDiff2','Label']

# Half of the games (chosen at random) become negative examples; the other
# half stay positive. random_state is fixed so the draw is reproducible.
seasonalDfNeg = seasonalDf.sample(frac=0.5, random_state=42)
# .copy() so that adding 'Label' below writes to an independent frame rather
# than to a filtered view (avoids SettingWithCopyWarning).
seasonalDf = seasonalDf[~seasonalDf.index.isin(seasonalDfNeg.index)].copy()
# Reorder so the loser's columns come first; renaming the columns further down
# then puts the loser into the team1 slot and the winner into team2.
seasonalDfNeg = seasonalDfNeg.reindex(columns=revCols)

# Put in label: 1 when team1 won the game, -1 when team1 lost
seasonalDfNeg['Label'] = -1
seasonalDf['Label'] = 1
seasonalDfNeg.columns=seasonalColsGenLabel

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
seasonalDf = pd.concat([seasonalDf, seasonalDfNeg], ignore_index=True)

# Shuffle the samples
seasonalDf = seasonalDf.sample(frac=1, random_state=42).reset_index(drop=True)

# Placeholder seeding-score columns (all zeros); overwritten by getSeedsScore.
rawSeeds = np.zeros(shape=len(seasonalDf))
seasonalDf['seedsScore1'] = rawSeeds
seasonalDf['seedsScore2'] = rawSeeds

# Put engineered features into new data frame to use
seasonalColsGenSeed = ['team1','fgRate1','fgm31','ScoreDiff1','seedsScore1','team2','fgRate2','fgm32','ScoreDiff2','seedsScore2','Label']
seasonalDf = seasonalDf.reindex(columns=seasonalColsGenSeed)

In [4]:
# Calculating seeding score for each team

# Getting the number value of the seeds: the first run of digits in 'Seed'.
# Raw-string regex avoids the invalid-escape warning; expand=False keeps the
# result a Series regardless of the pandas version's default.
seedsNum = seedsDf['Seed'].str.extract(r'(\d+)', expand=False).astype(int)
seedsDf['seedsNum'] = seedsNum

seedsDf = seedsDf.sort_values(by=['Team'])

# Per-team mean seed value and number of seeded appearances, computed in one
# groupby over the numeric column only. Both statistics share the same Team
# index, so the score below is aligned by team rather than by row position,
# and neither the string 'Seed' column nor the Team id is averaged/inverted.
seedsStats = seedsDf.groupby('Team')['seedsNum'].agg(['mean', 'count'])
seedsCount = seedsStats['count']
seedsMean = 1/seedsStats['mean']

# Seed score = (1/(mean of seeding value))*seeding count
seedsScore = seedsMean*seedsCount

# Build new table for seeding score values (one row per team)
seedsData = pd.DataFrame(
    {'Team': seedsStats.index.values,
     'seedsScore': seedsScore.values})



In [5]:
# This function will get all the seeds score for each team
def getSeedsScore(inputSeasonal):
    """Attach each team's seeding score to every game row.

    Looks up ``team1`` and ``team2`` of every row in the module-level
    ``seedsData`` table (columns ``Team`` and ``seedsScore``) and writes the
    results to the ``seedsScore1`` / ``seedsScore2`` columns. Teams that do
    not appear in ``seedsData`` get a score of 0.

    Parameters
    ----------
    inputSeasonal : DataFrame
        Game rows with ``team1`` and ``team2`` columns. Modified in place.

    Returns
    -------
    DataFrame
        The same ``inputSeasonal`` object, with both score columns filled.
    """
    # Team id -> seeding score lookup. drop_duplicates guards against a
    # repeated team, which Series.map cannot handle in the lookup index.
    scoreByTeam = seedsData.drop_duplicates(subset='Team').set_index('Team')['seedsScore']
    for teamCol, scoreCol in (('team1', 'seedsScore1'), ('team2', 'seedsScore2')):
        # map() reads the argument's own rows (not a global frame) and keeps
        # the argument's index, so the scores land on the right rows whatever
        # the index looks like; teams without a seed map to NaN -> 0.
        inputSeasonal[scoreCol] = inputSeasonal[teamCol].map(scoreByTeam).fillna(0.0)
    return inputSeasonal

In [6]:
# Fill in the seeds scores for every game row: getSeedsScore writes the
# 'seedsScore1' / 'seedsScore2' columns and returns the frame it was given.
seasonalDf = getSeedsScore(seasonalDf)

In [7]:
# Randomly sample 80% of training data and 20% of test data.
# random_state is fixed so the split (and the scores reported below) can be
# reproduced on a fresh kernel.
teData = seasonalDf.sample(frac=0.2, random_state=42)
trData = seasonalDf.drop(teData.index)

# Set X and Y
seasonalColsGenSeedTr = ['team1','fgRate1','fgm31','ScoreDiff1','seedsScore1','team2','fgRate2','fgm32','ScoreDiff2','seedsScore2']
trX = trData.reindex(columns=seasonalColsGenSeedTr)
teX = teData.reindex(columns=seasonalColsGenSeedTr)

trY = trData['Label']
teY = teData['Label']

In [8]:
# Define a dataset for all teams for finding means in the future
allTeams = seasonalDf.reindex(columns=['team1','fgRate1','fgm31','ScoreDiff1','seedsScore1'])
allTeams2 = seasonalDf.reindex(columns=['team2','fgRate2','fgm32','ScoreDiff2','seedsScore2'])
allTeamsCols = ['team','fgRate','fgm3','ScoreDiff','seedsScore']

# Stack the team1 and team2 halves into one long table with one row per
# (game, team). pd.concat replaces DataFrame.append (removed in pandas 2.0).
allTeams.columns = allTeamsCols
allTeams2.columns = allTeamsCols
allTeams = pd.concat([allTeams, allTeams2], ignore_index=True)

# Sort by team id and get the mean value of features of each team
# NOTE(review): the means are taken over all of seasonalDf, which includes the
# rows sampled into the test set -- confirm this is intended.
allTeams = allTeams.sort_values(by=['team'])
allTeams = allTeams.groupby(['team'],as_index=False).mean()

# Define a function to get the average feature values of the test set teams

def getTeFeatures(teTeams):
    """Build a feature frame from per-team averages for the given match-ups.

    For every row of ``teTeams`` the ``team1`` and ``team2`` ids are looked up
    in the module-level ``allTeams`` table (columns ``team``, ``fgRate``,
    ``fgm3``, ``ScoreDiff``, ``seedsScore``) and that team's row is copied
    into the ``*1`` / ``*2`` feature columns.

    Parameters
    ----------
    teTeams : DataFrame
        Match-ups with ``team1`` and ``team2`` columns.

    Returns
    -------
    DataFrame
        Columns ``team1, fgRate1, fgm31, ScoreDiff1, seedsScore1, team2,
        fgRate2, fgm32, ScoreDiff2, seedsScore2`` on ``teTeams``' index.

    Raises
    ------
    KeyError
        If a team id in ``teTeams`` has no row in ``allTeams``.
    """
    featureCols = ['fgRate', 'fgm3', 'ScoreDiff', 'seedsScore']
    # One row per team, indexed by team id, so each side is a single lookup
    # instead of a boolean scan of the whole table per feature per row.
    teamMeans = allTeams.drop_duplicates(subset='team').set_index('team')

    result = DataFrame(index=teTeams.index)
    for side in ('1', '2'):
        teamIds = teTeams['team' + side]
        unknown = ~teamIds.isin(teamMeans.index)
        if unknown.any():
            # Fail with the offending ids rather than a scalar-conversion error.
            raise KeyError('no feature averages for team(s): %s'
                           % sorted(set(teamIds[unknown].tolist())))
        # Insertion order (team id, then its features) yields the column order
        # the models were trained on.
        result['team' + side] = teamIds.values
        sideMeans = teamMeans.reindex(teamIds.values)
        for col in featureCols:
            result[col + side] = sideMeans[col].values.astype(float)
    return result

In [9]:
# Instead of putting real seasonal data, we are going to put in the average for each team from seasonal data for testing
# (the models are still fitted on the per-game features in trX).
teXTeams = teX.reindex(columns=['team1','team2'])
teXX = getTeFeatures(teXTeams)

In [10]:
# Decision tree model, usually score around 0.61
clf = clf.fit(trX, trY)
# Dump the fitted tree in Graphviz DOT format to the file "test". The handle
# is no longer rebound to export_graphviz's return value inside the block.
with open("test", 'w') as f:
    tree.export_graphviz(clf, out_file=f)
clf.predict(teXX)
# Mean accuracy on the team-average test features; shown as the cell output.
clf.score(teXX, teY)

0.61421605401350332

In [29]:
# Logistic regression model usually score around 0.69
# Fitted on the per-game training features (trX) and evaluated on the
# team-average test features (teXX).
logreg.fit(trX,trY)
# The predictions are not stored; score() below recomputes them internally.
logreg.predict(teXX)
logreg.score(teXX, teY)

# Fitted coefficients, one per column of trX in trX's column order.
logreg.coef_

array([[  5.70975742e-05,   4.32687441e+00,   1.36131227e-02,
          3.08160084e+01,   3.94562363e-01,  -1.94688738e-04,
         -4.21503362e+00,  -1.32161528e-02,  -3.07294594e+01,
         -4.36366329e-01]])