# Inputting game stats
-note the data is 2016

In [11]:
import numpy as np
import copy

class GameStats(object):          
    
    def __init__(self, homeTeamNameIndex, homeTeamScoreIndex, homeTeamStatsIndex, visitorTeamNameIndex, visitorTeamScoreIndex, visitorTeamStatsIndex):
        #parse the text file
        self.statsFile = open("baseball2016.txt", "r")
        self.topArray = []
        self.sideArray = []  
        self.sc = np.zeros((30,30,30), np.int32) 
        self.sc[:,:,:] = -1  
        self.am = np.zeros((30,30), np.float32)
        self.gameList = []
        
        seenTeams = []
        for line in self.statsFile:
            token = line.split(',')  #tokenize the string
            tokenIndex = [homeTeamNameIndex, homeTeamScoreIndex, visitorTeamNameIndex, visitorTeamScoreIndex] + [i for i in homeTeamStatsIndex] + [i for i in visitorTeamStatsIndex]
            attributes = dict()
            
            for i in range(len(token)):
                if(i in tokenIndex):
                    attributes[i] = self.removeQuotes(token[i])
                        
            self.addScore(attributes[homeTeamNameIndex], attributes[visitorTeamNameIndex], attributes[homeTeamScoreIndex], attributes[visitorTeamScoreIndex])                
            self.addGame(attributes[homeTeamNameIndex], attributes[homeTeamScoreIndex], [attributes[i] for i in homeTeamStatsIndex], attributes[visitorTeamNameIndex], attributes[visitorTeamScoreIndex], [attributes[i] for i in visitorTeamStatsIndex])
            
            if(attributes[homeTeamNameIndex] not in seenTeams):
                seenTeams.append(attributes[homeTeamNameIndex])
            
        self.buildAvgMatrix()
        self.statsFile.close()
        #self.gameList = [bin(2**i)[2:].zfill(len(seenTeams)) if x == seenTeams[i] else x for i in range(len(seenTeams)) for x in self.gameList]
        # take the teams and convert the indexes of the order that they appeared in into the hot one format by rasiing
        # 2 to the power of them and then converting them into binary.
        seenTeamsDict = {k: v for v, k in enumerate(seenTeams)}
        temp = []
        for x in self.gameList:
            tempi = []
            for z in x:
                if(z in seenTeams):
                    tempi.append(bin(2**seenTeamsDict[z])[2:].zfill(len(seenTeamsDict)))
                else:
                    tempi.append(z)
            temp.append(tempi)
        self.gameList = temp
        
    def removeQuotes(self, string):
        if (string.startswith('"') and string.endswith('"')) or (string.startswith("'") and string.endswith("'")):
            return string[1:-1]
        return string  
    
    def addGame(self, team1, score1, stats1, team2, score2, stats2):
        self.gameList.append([team1, score1, stats1, team2, score2, stats2])
        
    #give it two teams, the scores, and it will add it to the matrix
    def addScore(self, team1, team2, score1, score2):
        '''
        for a team in top array, the index in the array corrisponds to the matrix column there located in
        for a team in side array, the index in the array corrisponds to the matrix row there located in
        '''
        #team 1 score entry
        try:
            row = self.sideArray.index(team2)    

        except:
            self.sideArray.append(team2)
            row = self.sideArray.index(team2)    

        try:
            col = self.topArray.index(team1)
        except:
            self.topArray.append(team1)
            col = self.topArray.index(team1)
        temp = self.sc[row, col]
        counter = 0
        for e in temp:
            if (e == -1):
                temp[counter] = score1
                break
            counter += 1
        self.sc[row, col] = temp
        
        #team 2 score entry
        try:
            row = self.sideArray.index(team1)    
        except:
            self.sideArray.append(team1)
            row = self.sideArray.index(team1)    
            
        try:
            col = self.topArray.index(team2)
        except:
            self.topArray.append(team2)
            col = self.topArray.index(team2)
        temp = self.sc[row, col]
        counter = 0
        for e in temp:
            if (e == -1):
                temp[counter] = score2
                break
            counter += 1
        self.sc[row, col] = temp
    
    #returns the score(s) for match up
    def getScore(self, team1, team2, gameSelect = None):
        print(team1, team2)
        try:
            score1 = self.sc[self.sideArray.index(team2), self.topArray.index(team1)]
            score2 = self.sc[self.sideArray.index(team1), self.topArray.index(team2)]
            if (gameSelect == None):
                print(team1, score1)
                print(team2, score2)
            else:
                print(team1, score1[gameSelect])
                print(team2, score2[gameSelect])
        except:
            print('Invalid input of teams')
    
    def getGameList(self):
        return copy.deepcopy(self.gameList)
    
    #constructs a matrix of the avg score in a matchup
    def buildAvgMatrix(self): 
        for col in range(len(self.sc[:,0])):   #depth
            for row in range(len(self.sc[0, :])):  #width
                tempScore = self.sc[row, col]
                avgScore = 0.0
                count = 0.0
                for j in tempScore:
                    if (j != -1):
                        avgScore += j
                        count += 1
                    else:
                        break
                try:
                    avgScore = avgScore / count
                except:
                    avgScore = -1
                self.am[row, col] = avgScore
    
    #get the value of the avg score for a match up
    def getAvgScore(self, team1, team2):
        try:
            score1 = self.am[self.sideArray.index(team2), self.topArray.index(team1)]
            score2 = self.am[self.sideArray.index(team1), self.topArray.index(team2)]
            print(team1, score1)
            print(team2, score2)        
        except:
            print('Invalid input of teams')

In [12]:
gameStats = GameStats(3, 10, [5, 51, 55, 57, 58, 69, 74, 76], 6, 9, [8, 23, 27, 29, 30, 41, 46, 47])
gameList = gameStats.getGameList()
print(gameList[0])
gameList[0] = 0
print(gameList[0])
gameListTest = gameStats.getGameList()
print(gameListTest[0])

['000000000000000000000000000001', '4', ['1', '0', '0', '0', '2', '0', '0', '0'], '000001000000000000000000000000', '3', ['1', '1', '0', '0', '6', '0', '1', '3']]
0
['000000000000000000000000000001', '4', ['1', '0', '0', '0', '2', '0', '0', '0'], '000001000000000000000000000000', '3', ['1', '1', '0', '0', '6', '0', '1', '3']]


In [13]:
# Parse the game files and grab the stats we want
# the indexes are put in the order: homeTeamName, homeTeamScore, homeTeamStats, awayName, awayScore, awayStats
gameStats = GameStats(3, 10, [5, 51, 55, 57, 58, 69, 74, 76], 6, 9, [8, 23, 27, 29, 30, 41, 46, 47])

gameList = gameStats.getGameList() # get the list of games
avgStats = dict()
np.set_printoptions(precision=4)  #IF you wanna remove this i recommend restarting the kernal

#getting rid of the strings
def removeQuotes(gameList):
    for row in gameList:
        for x in range(len(row)):
            #convert scores strings to float
            if (x == 1 or x == 4): row[x] = float(row[x])
            #convert arrays to floats
            if (x == 2 or x == 5): row[x] = list(map(float, row[x]))
    return gameList

#adding up all the teams stats
gameList = removeQuotes(gameList)
#ERROR in the data, scroll down to see the outlire binary number
for row in gameList:
    print(row[2], row[5])

([1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 6.0, 0.0, 1.0, 3.0])
([1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0])
([1.0, 2.0, 1.0, 1.0, 5.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 1.0, 5.0, 0.0, 0.0, 2.0])
([1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [1.0, 3.0, 0.0, 0.0, 7.0, 0.0, 0.0, 1.0])
([1.0, 3.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0], [1.0, 3.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])
([1.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 2.0, 1.0, 0.0, 0.0, 0.0])
([2.0, 1.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0], [2.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0])
([1.0, 0.0, 0.0, 1.0, 6.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0])
([1.0, 2.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0], [1.0, 4.0, 0.0, 1.0, 4.0, 1.0, 0.0, 1.0])
([1.0, 0.0, 0.0, 1.0, 5.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 5.0, 0.0, 0.0, 2.0])
([1.0, 3.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0])
([1.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0], [1.0, 3.0, 0.0, 0.0, 5

In [14]:
for row in gameList:
    #home
    print("visitor", row[0])
    if row[0] in avgStats:
        print(avgStats[row[0]])
        avgStats[row[0]][0] = 0   #set the proir games to 0,because there set to right amount next line
        avgStats[row[0]] = np.sum([avgStats[row[0]], row[2]], axis=0)  #total stats + indivudal game stats
    else:
        avgStats[row[0]] = row[2]
    print(row[2])
    print(avgStats[row[0]])
    print('-------------')
    
    #away
    print("home", row[3])
    if row[3] in avgStats:
        print(avgStats[row[3]])
        avgStats[row[3]][0] = 0
        avgStats[row[3]] = np.sum([avgStats[row[3]], row[5]], axis=0)
    else:
        avgStats[row[3]] = row[5]
    print(row[5])
    print(avgStats[row[3]])
    print('-------------')




('visitor', '000000000000000000000000000001')
[1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0]
[1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0]
-------------
('home', '000001000000000000000000000000')
[1.0, 1.0, 0.0, 0.0, 6.0, 0.0, 1.0, 3.0]
[1.0, 1.0, 0.0, 0.0, 6.0, 0.0, 1.0, 3.0]
-------------
('visitor', '000000000000000000000000000010')
[1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]
[1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]
-------------
('home', '000000000000100000000000000000')
[1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0]
[1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0]
-------------
('visitor', '000000000000000000000000000100')
[1.0, 2.0, 1.0, 1.0, 5.0, 0.0, 0.0, 0.0]
[1.0, 2.0, 1.0, 1.0, 5.0, 0.0, 0.0, 0.0]
-------------
('home', '000000001000000000000000000000')
[1.0, 0.0, 0.0, 1.0, 5.0, 0.0, 0.0, 2.0]
[1.0, 0.0, 0.0, 1.0, 5.0, 0.0, 0.0, 2.0]
-------------
('visitor', '000000000000000000000000001000')
[1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
-------------
('home', '00001

In [15]:
#make the average..
keyList = avgStats.keys()    
for key in keyList:
    print(key)
    print(avgStats[key])  #print total stats
    for index in range(1, len(avgStats[key])):  #divide all stats by games played (162, 162)
        avgStats[key][index] = avgStats[key][index] / avgStats[key][0]  
    print(avgStats[key])  #print the avg stats
    print('-------------------------------------------')

000000000000000000000000001000
[ 162.  205.   31.   63.  495.   77.   13.   58.]
[  1.6200e+02   1.2654e+00   1.9136e-01   3.8889e-01   3.0556e+00
   4.7531e-01   8.0247e-02   3.5802e-01]
-------------------------------------------
000000000000000100000000000000
[ 161.  250.   26.   49.  465.   84.   14.   91.]
[  1.6100e+02   1.5528e+00   1.6149e-01   3.0435e-01   2.8882e+00
   5.2174e-01   8.6957e-02   5.6522e-01]
-------------------------------------------
000000000000000010000000000000
[ 162.  320.   18.   39.  453.   65.    9.   79.]
[  1.6200e+02   1.9753e+00   1.1111e-01   2.4074e-01   2.7963e+00
   4.0123e-01   5.5556e-02   4.8765e-01]
-------------------------------------------
000000000000000001000000000000
[ 162.  251.   24.   65.  490.   67.   20.   70.]
[  1.6200e+02   1.5494e+00   1.4815e-01   4.0123e-01   3.0247e+00
   4.1358e-01   1.2346e-01   4.3210e-01]
-------------------------------------------
000000000000001000000000000000
[ 162.  260.   19.   53.  534.   62.   12

In [16]:
#needs to be imported again
gameList = gameStats.getGameList() # get the list of games

#fixing 000000000000000000000000100000 because the hits number is bad (there are more problems in 2015 data)
avgStats['000000000000000000000000100000'][1] = 1.59   #some arbitrary value - i think the problem could be a cancelled game at game 97 or 96


#adding avg values to the list
for row in gameList:
    #home
    temp = avgStats[row[0]]   #using key to get avg stats array
    temp[0] = row[2][0]  #perserve the game num in the swap
    temp = np.around(temp, decimals=4)
    row[2] = temp
    #away
    temp = avgStats[row[3]]   #using key to get avg stats array
    temp[0] = row[5][0]
    temp = np.around(temp, decimals=4)
    row[5] = temp 

In [17]:
# split the stats up into different parts (this takes a lot of time, but I think it will be worth it)

#gameList = gameStats.getGameList() # I want to test what happens if we just use the stats of the game.
gameList = removeQuotes(gameList)   #because i initialized gamestats above, must remove quote again
homeTeamName = np.empty((0, len(list(gameList[0][0])))) 
homeTeamScore = np.empty((0, 1))
homeTeamStats = np.empty((0, len(gameList[0][2])))

visitingTeamName = np.empty((0, len(list(gameList[0][3])))) 
visitingTeamScore = np.empty((0, 1))
visitingTeamStats = np.empty((0, len(gameList[0][5])))

for row in gameList:
    homeTeamName = np.vstack((homeTeamName, list(row[0])))
    homeTeamScore = np.vstack((homeTeamScore, row[1]))
    homeTeamStats = np.vstack((homeTeamStats, row[2]))
    visitingTeamName = np.vstack((visitingTeamName, list(row[3])))
    visitingTeamScore = np.vstack((visitingTeamScore, row[4]))
    visitingTeamStats = np.vstack((visitingTeamStats, row[5]))

In [18]:
from scipy import stats

# take the stats and zscore them, I do a number of transformations on the array because of how the zScore input contstraints 
zScoredStatsHome = np.empty((0, len(homeTeamStats)))
zScoredStatsVisitor = np.empty((0, len(visitingTeamStats)))
for x in range(len(homeTeamStats[0])):
    zScoredStatsHome = np.vstack((zScoredStatsHome, stats.zscore([i[x] for i in homeTeamStats])))
    zScoredStatsVisitor = np.vstack((zScoredStatsVisitor, stats.zscore([i[x] for i in visitingTeamStats])))

homeTeamStatsTemp = np.empty((len(zScoredStatsHome[0]), 0))
visitingTeamStatsTemp = np.empty((len(zScoredStatsVisitor[0]), 0))
for i in zScoredStatsHome:
    homeTeamStatsTemp = np.hstack((homeTeamStatsTemp, [[x] for x in i]))

for i in zScoredStatsVisitor:
    visitingTeamStatsTemp = np.hstack((visitingTeamStatsTemp, [[x] for x in i]))

In [21]:
import tensorflow as tf
import tflearn

tf.reset_default_graph()

# the input layer needs to have the same dimensions as our input (in this case the teams)
homeTeamNameInput = tflearn.input_data(shape=[None, len(homeTeamName[0])], name='nameInput1')
homeTeamStatsInput = tflearn.input_data(shape=[None, len(homeTeamStatsTemp[0])], name='statsInput1')
visitingTeamNameInput = tflearn.input_data(shape=[None, len(visitingTeamName[0])], name='nameInput2')
visitingTeamStatsInput = tflearn.input_data(shape=[None, len(visitingTeamStatsTemp[0])], name='statsInput2')

nameProcess1 = tflearn.fully_connected(homeTeamNameInput, 4)
nameProcess2 = tflearn.fully_connected(visitingTeamNameInput, 4)
net = tflearn.layers.merge_ops.merge([nameProcess1, homeTeamStatsInput, nameProcess2, visitingTeamStatsInput], 'concat', axis=1)
# next we have the hidden layer it is the feature matrix the size is arbitrary.
net = tflearn.fully_connected(net, 30)
#net = tflearn.fully_connected(net, 30)
#net = tflearn.fully_connected(net, 30)
net = tflearn.dropout(net, keep_prob=0.5)
# The output layer
net = tflearn.fully_connected(net, 2)
net = tflearn.regression(net, name='target', learning_rate=0.00000001)

# take only the stats for each team and put them into an array
NNOutput = [[i[1], i[4]] for i in gameList]
NNOutput = np.array(NNOutput)

# Define model
model = tflearn.DNN(net)
# Start training (apply gradient descent algorithm)
model.fit({'nameInput1':homeTeamName, 'statsInput1':homeTeamStatsTemp, 'nameInput2':visitingTeamName, 'statsInput2':visitingTeamStatsTemp}, NNOutput, validation_set=0.1, n_epoch=20, show_metric=True)

Training Step: 699  | total loss: [1m[32m59.62721[0m[0m | time: 0.182s
| Adam | epoch: 020 | loss: 59.62721 - acc: 0.4903 -- iter: 2176/2185
Training Step: 700  | total loss: [1m[32m60.49315[0m[0m | time: 1.195s
| Adam | epoch: 020 | loss: 60.49315 - acc: 0.4881 | val_loss: 68.45396 - val_acc: 0.4774 -- iter: 2185/2185
--


In [22]:
# Prediction Test
from tqdm import tqdm_notebook

# trying to write all of the data seems to crash my browser
# so I'm dumping the raw output to a file to read later.
file = open('statsNetOuput.txt', 'w')

numFalse = 0
testIndex = 0
closest = 10000000000
for i in tqdm_notebook(range(len(homeTeamName))):
    predict = model.predict([[homeTeamName[i]], [homeTeamStatsTemp[i]], [visitingTeamName[i]], [visitingTeamStatsTemp[i]]])

    if((predict[0][0]-float(NNOutput[i][0]))**2+(predict[0][1]-float(NNOutput[i][1]))**2 < closest):
        closest = ((predict[0][0]-float(NNOutput[i][0]))+(predict[0][1]-float(NNOutput[i][1])))**2 
        testIndex = i
    
    file.write("hits, RBI")
    file.write("\n")
    file.write(str(predict))
    file.write("\n")
    file.write(str(NNOutput[i]))
    file.write("\n")
    
    if(predict[0][0]-predict[0][1] < 0 and float(NNOutput[i][0])-float(NNOutput[i][1]) < 0): file.write("True")
    elif(predict[0][0]-predict[0][1] > 0 and float(NNOutput[i][0])-float(NNOutput[i][1]) > 0): file.write("True")
    else: 
        file.write("False")
        numFalse += 1
    file.write("\n")
    
file.close()

print("hits, RBI")
print(testIndex)
print(model.predict([[homeTeamName[i]], [homeTeamStats[i]], [visitingTeamName[i]], [visitingTeamStats[i]]]))
print(NNOutput[testIndex])
print(1-(numFalse/len(NNOutput)))


hits, RBI
1371
[[-0.3652026653289795, 0.1256241351366043]]
[ 1.  0.]
1


In [28]:
learnArr = [1*(10**(x)) for x in xrange(-8, 0)]
regulArr = [1*(2**(x)) for x in xrange(-10, 0)]
layerArr = xrange(2,10)
widthArr = xrange(10, 30)
errorList = np.empty((0, 1))
for learnRate in tqdm_notebook(learnArr):
    for numWidth in tqdm_notebook(widthArr):
        for numLayers in tqdm_notebook(layerArr):                                  
            tf.reset_default_graph()
    
            # the input layer needs to have the same dimensions as our input (in this case the teams)
            homeTeamNameInput = tflearn.input_data(shape=[None, len(homeTeamName[0])], name='nameInput1')
            homeTeamStatsInput = tflearn.input_data(shape=[None, len(homeTeamStatsTemp[0])], name='statsInput1')
            visitingTeamNameInput = tflearn.input_data(shape=[None, len(visitingTeamName[0])], name='nameInput2')
            visitingTeamStatsInput = tflearn.input_data(shape=[None, len(visitingTeamStatsTemp[0])], name='statsInput2')
        
            nameProcess1 = tflearn.fully_connected(homeTeamNameInput, 4)
            nameProcess2 = tflearn.fully_connected(visitingTeamNameInput, 4)
            net = tflearn.layers.merge_ops.merge([nameProcess1, homeTeamStatsInput, nameProcess2, visitingTeamStatsInput], 'concat', axis=1)
            # next we have the hidden layer it is the feature matrix the size is arbitrary.
            for _ in range(numLayers):
                net = tflearn.fully_connected(net, numWidth)
            
            net = tflearn.dropout(net, keep_prob=0.5)
            # The output layer
            net = tflearn.fully_connected(net, 2)
            net = tflearn.regression(net, name='target', learning_rate=learnRate)
        
            # take only the stats for each team and put them into an array
            NNOutput = [[i[1], i[4]] for i in gameList]
            NNOutput = np.array(NNOutput)
        
            # Define model
            model = tflearn.DNN(net)
            # Start training (apply gradient descent algorithm)
            model.fit({'nameInput1':homeTeamName, 'statsInput1':homeTeamStatsTemp, 'nameInput2':visitingTeamName, 'statsInput2':visitingTeamStatsTemp}, NNOutput, validation_set=0.1, n_epoch=20, show_metric=True)
            
            error = np.empty((0, 1))
            for i in tqdm_notebook(range(len(homeTeamName))):
                predict = model.predict([[homeTeamName[i]], [homeTeamStatsTemp[i]], [visitingTeamName[i]], [visitingTeamStatsTemp[i]]])
                error = np.append(error, ((predict[0][0] - NNOutput[0][0])**2 + (predict[0][1] - NNOutput[0][1]))**0.5)
            error = np.sum(error) / len(homeTeamName)
            errorList = np.append(errorList, error)
        

Training Step: 699  | total loss: [1m[32m6.27702[0m[0m | time: 0.308s
| Adam | epoch: 020 | loss: 6.27702 - acc: 0.5271 -- iter: 2176/2185
Training Step: 700  | total loss: [1m[32m6.28897[0m[0m | time: 1.319s
| Adam | epoch: 020 | loss: 6.28897 - acc: 0.5150 | val_loss: 6.06802 - val_acc: 0.5761 -- iter: 2185/2185
--


KeyboardInterrupt: 