In [441]:
#Load all packages and models we will be using
from pandas import DataFrame
import pandas as pd
import numpy as np
import math
import random
from sklearn import linear_model, datasets
logreg = linear_model.LogisticRegression()
from sklearn import tree
clf = tree.DecisionTreeClassifier()

In [442]:
#Load data
seasonal = DataFrame.from_csv("RegularSeasonDetailedResults.csv", sep=",",index_col=None)
seeds = DataFrame.from_csv("TourneySeeds.csv", sep=",",index_col=None)

#Build raw data frame for feature engineering
seasonalCols = ['Season','Wteam','Wscore','Wfgm3','Wfgm','Wfga','Lteam','Lscore','Lfgm3','Lfgm','Lfga']
seasonalDf = DataFrame(seasonal[seasonal['Season']>2010], columns=seasonalCols)
seedsDf = DataFrame(seeds[seeds['Season']>2010])

#Filling in empty seeds score

#Fill in NAs in case
seasonalDf = seasonalDf.fillna(0)

#Feature engineering
#1.) Score difference rate for win/loss team
WscoreDiff = 1+(seasonalDf['Wscore']-seasonalDf['Lscore'])/(seasonalDf['Wscore']+seasonalDf['Lscore'])
LscoreDiff = 1-(seasonalDf['Wscore']-seasonalDf['Lscore'])/(seasonalDf['Wscore']+seasonalDf['Lscore'])
seasonalDf['WScoreDiff']=WscoreDiff
seasonalDf['LScoreDiff']=LscoreDiff

#2.) Field goal made rate
WfgRate = seasonalDf['Wfgm']/seasonalDf['Wfga']
LfgRate = seasonalDf['Lfgm']/seasonalDf['Lfga']
seasonalDf['WfgRate']=WfgRate
seasonalDf['LfgRate']=LfgRate

seedsDf=DataFrame(seedsDf, columns=['Season','Team','Seed'])

seasonalColsUse = ['Season','Wteam','WfgRate','Wfgm3','WScoreDiff','Lteam','LfgRate','Lfgm3','LScoreDiff']
seasonalColsGen = ['Season','team1','fgRate1','fgm31','ScoreDiff1','team2','fgRate2','fgm32','ScoreDiff2']
seasonalDf = DataFrame(seasonalDf, columns=seasonalColsUse)
seasonalDf.columns=seasonalColsGen
seedsDf[1:5]

Unnamed: 0,Season,Team,Seed
1675,2011,1314,W02
1676,2011,1393,W03
1677,2011,1246,W04
1678,2011,1452,W05


In [443]:
# Shuffle orders to have half positive and half negative examples
revCols = ['Season','team2','fgRate2','fgm32','ScoreDiff2','team1','fgRate1','fgm31','ScoreDiff1']
seasonalColsGenLabel = ['Season','team1','fgRate1','fgm31','ScoreDiff1','team2','fgRate2','fgm32','ScoreDiff2','Label']

seasonalDfNeg = seasonalDf.sample(frac=0.5)
seasonalDf = seasonalDf[~seasonalDf.index.isin(seasonalDfNeg.index)]
seasonalDfNeg = DataFrame(seasonalDfNeg, columns=revCols)

# Put in label
seasonalDfNeg['Label'] = -1
seasonalDf['Label'] = 1
seasonalDfNeg.columns=seasonalColsGenLabel

seasonalDf=seasonalDf.append(seasonalDfNeg,ignore_index=True)

# Shuffle the samples
seasonalDf = seasonalDf.sample(frac=1).reset_index(drop=True)

rawSeeds = np.zeros(shape=len(seasonalDf))
seasonalDf['seedsScore1'] = rawSeeds
seasonalDf['seedsScore2'] = rawSeeds

# Put engineered features into new data frame to use
seasonalColsGenSeed = ['Season','team1','fgRate1','fgm31','ScoreDiff1','seedsScore1','team2','fgRate2','fgm32','ScoreDiff2','seedsScore2','Label']
seasonalDf = DataFrame(seasonalDf, columns=seasonalColsGenSeed)
seasonalDf[1:5]

Unnamed: 0,Season,team1,fgRate1,fgm31,ScoreDiff1,seedsScore1,team2,fgRate2,fgm32,ScoreDiff2,seedsScore2,Label
1,2013,1417,0.465517,3,0.937063,0.0,1332,0.45,6,1.062937,0.0,-1
2,2011,1374,0.613636,10,1.163934,0.0,1212,0.387755,5,0.836066,0.0,1
3,2012,1306,0.392857,10,1.107692,0.0,1406,0.333333,2,0.892308,0.0,1
4,2015,1124,0.446809,6,1.029851,0.0,1376,0.407407,5,0.970149,0.0,1


In [444]:
# Calculating seeding score for each team

# Getting the number value of the seeds
seedsNum = seedsDf['Seed'].str.extract('(\d+)').astype(int)
seedsDf['seedsNum'] = seedsNum

# Getting the counts of each team that were selected as seeds
seedsCount=seedsDf.groupby(['Team']).agg(['count']).reset_index()

# Seed score = (1/(mean of seeding value))*seeding count
seedsDf = seedsDf.sort_values(by=['Team'])
seedsMean = 1/(seedsDf.groupby(['Team'],as_index=False).mean())
seedsScore = np.multiply(seedsMean['seedsNum'],seedsCount['seedsNum']['count'])
seedsCount['seedsScore'] = seedsScore

# Build new table for seeding score values
seedsData = pd.DataFrame(
    {'Team': seedsCount['Team'],
     'seedsScore': seedsCount['seedsScore']})
#t = seedsDf[seedsDf.groupby(['Team'],as_index=False)['Season']]
#t
#seedsData



In [445]:
# This function will get all the seeds score for each team
def getSeedsScore(inputSeasonal):
    seedsScore1x = np.zeros(shape=len(inputSeasonal)+1)
    seedsScore2x = np.zeros(shape=len(inputSeasonal)+1)
    for i in range(len(inputSeasonal)):
        seedsLoc1 = seedsData['Team']==seasonalDf.iloc[i]['team1']
        seedsLoc2 = seedsData['Team']==seasonalDf.iloc[i]['team2']
        if sum(seedsLoc1)>0:
            a = seedsData[seedsLoc1]['seedsScore']
            seedsScore1x[i] = a
        if sum(seedsLoc2)>0:
            b = seedsData[seedsLoc2]['seedsScore']
            seedsScore2x[i] = b
    d1 = {'seedsScore1x':seedsScore1x}
    d2 = {'seedsScore2x':seedsScore2x}
    d1 = DataFrame(d1,columns=['seedsScore1x'])
    d2 = DataFrame(d2,columns=['seedsScore2x'])
    inputSeasonal['seedsScore1']=d1['seedsScore1x']
    inputSeasonal['seedsScore2']=d2['seedsScore2x']
    return inputSeasonal

In [446]:
#Filling in seeds scores for all the data
seasonalDf = getSeedsScore(seasonalDf)
seasonalDf[1:5]

Unnamed: 0,Season,team1,fgRate1,fgm31,ScoreDiff1,seedsScore1,team2,fgRate2,fgm32,ScoreDiff2,seedsScore2,Label
1,2013,1417,0.465517,3,0.937063,0.571429,1332,0.45,6,1.062937,0.571429,-1
2,2011,1374,0.613636,10,1.163934,0.166667,1212,0.387755,5,0.836066,0.0,1
3,2012,1306,0.392857,10,1.107692,0.0,1406,0.333333,2,0.892308,0.0,1
4,2015,1124,0.446809,6,1.029851,0.941176,1376,0.407407,5,0.970149,0.0,1


In [447]:
# Randomly sample 80% of training data and 20% of test data
#teData = seasonalDf.sample(frac=0.2)
#trData = seasonalDf[~seasonalDf.index.isin(teData.index)]

teData = seasonalDf[seasonalDf['Season']>2012]
trData = seasonalDf.sample(frac=0.8)

# Set X and Y 
seasonalColsGenSeedTr = ['Season','team1','fgRate1','fgm31','ScoreDiff1','seedsScore1','team2','fgRate2','fgm32','ScoreDiff2','seedsScore2']
trX = DataFrame(trData, columns = seasonalColsGenSeedTr)
teX = DataFrame(teData, columns = seasonalColsGenSeedTr)

trY = trData['Label']
teY = teData['Label']

In [484]:
# This function returns the team ids and season of all its possible matches

def getTeamList(inputTeams):
    empty = np.zeros(shape=(0,3))
    result = pd.DataFrame(empty, columns=['Season','team1','team2'])
    for year in range(2013,2016):
        curTeams = inputTeams[inputTeams['Season']==year]
        matchNum = len(curTeams)*(len(curTeams)-1)/2
        season = np.zeros(shape=matchNum)
        team1 = np.zeros(shape=matchNum)
        team2 = np.zeros(shape=matchNum)
        season[:] = year
        season = season.reshape(matchNum,1)
        curTeam2 = np.array(curTeams['Team'])
        for i in range (len(curTeams)):
            curTeam1 = int(curTeams['Team'][i:i+1])
            curTeam2 = np.delete(curTeam2, 0)
            notZero1 = sum(team1!=0)
            notZero2 = sum(team2!=0)
            i_ind = len(curTeams)-i
            team1[notZero1:notZero1+i_ind-1]=curTeam1
            team2[notZero2:notZero2+i_ind-1]= curTeam2
        team1 = team1.reshape(matchNum,1)
        team2 = team2.reshape(matchNum,1)
        test = np.concatenate((season,team1,team2),axis=1)
        tempDf = pd.DataFrame(test, columns=['Season','team1','team2'])
        result = result.append(tempDf)
    return result


In [485]:
# Define a dataset for all teams for finding means in the future
allTeams = DataFrame(seasonalDf, columns = ['Season','team1','fgRate1','fgm31','ScoreDiff1','seedsScore1'])
allTeams2 = DataFrame(seasonalDf, columns = ['Season','team2','fgRate2','fgm32','ScoreDiff2','seedsScore2'])
allTeamsCols = ['Season','team','fgRate','fgm3','ScoreDiff','seedsScore']

allTeams.columns = allTeamsCols
allTeams2.columns = allTeamsCols
allTeams = allTeams.append(allTeams2,ignore_index=True)

# Sort by team id and get the mean value of features of each team
allTeams = allTeams.sort_values(by=['team'])
allTeams = allTeams.groupby(['team'],as_index=False).mean()# Define a function to get the average feature values of the test set teams

def getTeFeatures(teTeams):
    meanFGR1 = np.zeros(shape=len(teTeams))
    meanFGM31 = np.zeros(shape=len(teTeams))
    meanSD1 = np.zeros(shape=len(teTeams))
    meanSS1 = np.zeros(shape=len(teTeams))
    meanFGR2 = np.zeros(shape=len(teTeams))
    meanFGM32 = np.zeros(shape=len(teTeams))
    meanSD2 = np.zeros(shape=len(teTeams))
    meanSS2 = np.zeros(shape=len(teTeams))
    for i in range(len(teTeams)):
        meanFGR1[i] = allTeams[allTeams['team']==teTeams.iloc[i]['team1']]['fgRate']
        meanFGM31[i] = allTeams[allTeams['team']==teTeams.iloc[i]['team1']]['fgm3']
        meanSD1[i] = allTeams[allTeams['team']==teTeams.iloc[i]['team1']]['ScoreDiff']
        meanSS1[i] = allTeams[allTeams['team']==teTeams.iloc[i]['team1']]['seedsScore']
        meanFGR2[i] = allTeams[allTeams['team']==teTeams.iloc[i]['team2']]['fgRate']
        meanFGM32[i] = allTeams[allTeams['team']==teTeams.iloc[i]['team2']]['fgm3']
        meanSD2[i] = allTeams[allTeams['team']==teTeams.iloc[i]['team2']]['ScoreDiff']
        meanSS2[i] = allTeams[allTeams['team']==teTeams.iloc[i]['team2']]['seedsScore']
    d = {'Season':teTeams['Season'],'seedsScore2':meanSS2,'ScoreDiff2':meanSD2,'fgm32':meanFGM32,'fgRate2':meanFGR2,'team2':teTeams['team2'],
                       'seedsScore1':meanSS1,'ScoreDiff1':meanSD1,'fgm31':meanFGM31,'fgRate1':meanFGR1,'team1':teTeams['team1']}
    result = DataFrame(d,columns=['Season','team1','fgRate1','fgm31','ScoreDiff1','seedsScore1',
                                  'team2','fgRate2','fgm32','ScoreDiff2','seedsScore2'],
                     index=teTeams.index)
    return result

In [486]:
# Instead of putting real seaonal data, we are going to put in the average for each team from seasonal data for testing
#teXTeams = DataFrame(teX, columns = ['Season','team1','team2'])
#teXX = getTeFeatures(teXTeams)


#Getting teams from 2013-2016
teamsUsed = seedsDf[seedsDf['Season']>2012]
teXTeams = getTeamList(teamsUsed)


In [487]:
# Getting team features
teXX = getTeFeatures(teXTeams)

In [488]:
# Decision tree model, usually score around 0.61
clf = clf.fit(trX, trY)
clf
with open("test", 'w') as f:
     f = tree.export_graphviz(clf, out_file=f)
clf.predict_proba(teXX)
#clf.score(teXX, teY)

array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.]])

In [489]:
# Logistic regression model usually score around 0.69
logreg.fit(trX,trY)
logres = logreg.predict_proba(teXX)[:,0]
#logreg.score(teXX, teY)
#pred=logreg.predict_proba(teXX)

In [532]:
# Formatting for submission
pd.options.display.float_format = '{:,f}'.format
sub_result = DataFrame(teXTeams,columns=['Season','team1','team2'])
sub_result['pred']=logres
sub_result.Season = sub_result.Season.astype(int)
sub_result.team1 = sub_result.team1.astype(int)
sub_result.team2 = sub_result.team2.astype(int)
sub_result['id']=sub_result['Season'].map(str) +"_"+ sub_result['team1'].map(str)+"_"+sub_result['team2'].map(str)
submission = DataFrame(sub_result,columns=['id','pred'])

Unnamed: 0,id,pred
1,2013_1103_1112,0.884936
2,2013_1103_1125,0.770066
3,2013_1103_1129,0.468014
4,2013_1103_1137,0.505658


In [535]:
# Writting to csv
submission.to_csv('submission.csv', sep=',',index=False)