# Baseball learning network

In [25]:
import numpy as np
import tensorflow as tf
import tflearn
from tqdm import tqdm_notebook
import copy
from scipy import stats

In [26]:
class GameStats(object):          
    
    def __init__(self, homeTeamNameIndex, homeTeamScoreIndex, homeTeamStatsIndex, visitorTeamNameIndex, visitorTeamScoreIndex, visitorTeamStatsIndex):
        #parse the text file
        self.statsFile = open("baseball2016.txt", "r")
        self.topArray = []
        self.sideArray = []  
        self.sc = np.zeros((30,30,30), np.int32) 
        self.sc[:,:,:] = -1  
        self.am = np.zeros((30,30), np.float32)
        self.gameList = []
        
        seenTeams = []
        for line in self.statsFile:
            token = line.split(',')  #tokenize the string
            tokenIndex = [homeTeamNameIndex, homeTeamScoreIndex, visitorTeamNameIndex, visitorTeamScoreIndex] + [i for i in homeTeamStatsIndex] + [i for i in visitorTeamStatsIndex]
            attributes = dict()
            
            for i in range(len(token)):
                if(i in tokenIndex):
                    attributes[i] = self.removeQuotes(token[i])
                        
            self.addScore(attributes[homeTeamNameIndex], attributes[visitorTeamNameIndex], attributes[homeTeamScoreIndex], attributes[visitorTeamScoreIndex])                
            self.addGame(attributes[homeTeamNameIndex], attributes[homeTeamScoreIndex], [attributes[i] for i in homeTeamStatsIndex], attributes[visitorTeamNameIndex], attributes[visitorTeamScoreIndex], [attributes[i] for i in visitorTeamStatsIndex])
            
            if(attributes[homeTeamNameIndex] not in seenTeams):
                seenTeams.append(attributes[homeTeamNameIndex])
            
        self.buildAvgMatrix()
        self.statsFile.close()
        #self.gameList = [bin(2**i)[2:].zfill(len(seenTeams)) if x == seenTeams[i] else x for i in range(len(seenTeams)) for x in self.gameList]
        # take the teams and convert the indexes of the order that they appeared in into the hot one format by rasiing
        # 2 to the power of them and then converting them into binary.
        seenTeamsDict = {k: v for v, k in enumerate(seenTeams)}
        temp = []
        for x in self.gameList:
            tempi = []
            for z in x:
                if(z in seenTeams):
                    tempi.append(bin(2**seenTeamsDict[z])[2:].zfill(len(seenTeamsDict)))
                else:
                    tempi.append(z)
            temp.append(tempi)
        self.gameList = temp
        
    def removeQuotes(self, string):
        if (string.startswith('"') and string.endswith('"')) or (string.startswith("'") and string.endswith("'")):
            return string[1:-1]
        return string  
    
    def addGame(self, team1, score1, stats1, team2, score2, stats2):
        self.gameList.append([team1, score1, stats1, team2, score2, stats2])
        
    #give it two teams, the scores, and it will add it to the matrix
    def addScore(self, team1, team2, score1, score2):
        '''
        for a team in top array, the index in the array corrisponds to the matrix column there located in
        for a team in side array, the index in the array corrisponds to the matrix row there located in
        '''
        #team 1 score entry
        try:
            row = self.sideArray.index(team2)    

        except:
            self.sideArray.append(team2)
            row = self.sideArray.index(team2)    

        try:
            col = self.topArray.index(team1)
        except:
            self.topArray.append(team1)
            col = self.topArray.index(team1)
        temp = self.sc[row, col]
        counter = 0
        for e in temp:
            if (e == -1):
                temp[counter] = score1
                break
            counter += 1
        self.sc[row, col] = temp
        
        #team 2 score entry
        try:
            row = self.sideArray.index(team1)    
        except:
            self.sideArray.append(team1)
            row = self.sideArray.index(team1)    
            
        try:
            col = self.topArray.index(team2)
        except:
            self.topArray.append(team2)
            col = self.topArray.index(team2)
        temp = self.sc[row, col]
        counter = 0
        for e in temp:
            if (e == -1):
                temp[counter] = score2
                break
            counter += 1
        self.sc[row, col] = temp
    
    #returns the score(s) for match up
    def getScore(self, team1, team2, gameSelect = None):
        print(team1, team2)
        try:
            score1 = self.sc[self.sideArray.index(team2), self.topArray.index(team1)]
            score2 = self.sc[self.sideArray.index(team1), self.topArray.index(team2)]
            if (gameSelect == None):
                print(team1, score1)
                print(team2, score2)
            else:
                print(team1, score1[gameSelect])
                print(team2, score2[gameSelect])
        except:
            print('Invalid input of teams')
    
    def getGameList(self):
        return self.gameList
    
    #constructs a matrix of the avg score in a matchup
    def buildAvgMatrix(self): 
        for col in range(len(self.sc[:,0])):   #depth
            for row in range(len(self.sc[0, :])):  #width
                tempScore = self.sc[row, col]
                avgScore = 0.0
                count = 0.0
                for j in tempScore:
                    if (j != -1):
                        avgScore += j
                        count += 1
                    else:
                        break
                try:
                    avgScore = avgScore / count
                except:
                    avgScore = -1
                self.am[row, col] = avgScore
    
    #get the value of the avg score for a match up
    def getAvgScore(self, team1, team2):
        try:
            score1 = self.am[self.sideArray.index(team2), self.topArray.index(team1)]
            score2 = self.am[self.sideArray.index(team1), self.topArray.index(team2)]
            print(team1, score1)
            print(team2, score2)        
        except:
            print('Invalid input of teams')

Extract the data form the file.

In [27]:
gameStats = GameStats(6, 10, [8, 50, 53, 54, 58], 3, 9, [5, 22, 25, 26, 30])
gameList = gameStats.getGameList() # get the list of games
#getting rid of the strings
def removeQuotes(gameList):
    for row in gameList:
        for x in range(len(row)):
            #convert scores strings to float
            if (x == 1 or x == 4): row[x] = float(row[x])
            #convert arrays to floats
            if (x == 2 or x == 5): row[x] = list(map(float, row[x]))
    return gameList


Print off all the data used into file name 'gameList.txt' so the user can see what stats were working with.

In [28]:
gameListFile = open("gameList.txt", "w")
for row in gameList:
    gameListFile.write('\n-------------------------------------------')
    gameListFile.write('\nHome: ')
    gameListFile.write(row[0])
    gameListFile.write('\nScore: ')
    gameListFile.write(str(row[1]))
    
    gameListFile.write('\n  game: ')
    gameListFile.write(str(row[2][0]))
    gameListFile.write('\n  hits: ')
    gameListFile.write(str(row[2][1]))
    gameListFile.write('\n  home runs: ')
    gameListFile.write(str(row[2][2]))
    gameListFile.write('\n  RBI: ')
    gameListFile.write(str(row[2][3]))
    gameListFile.write('\n  walks: ')
    gameListFile.write(str(row[2][4]))
    
    gameListFile.write('\n-----------------')
    
    gameListFile.write('\nAway: ')
    gameListFile.write((row[3]))
    gameListFile.write('\nScore: ')
    gameListFile.write(str(row[4]))
    
    gameListFile.write('\n  game: ')
    gameListFile.write(str(row[5][0]))
    gameListFile.write('\n  hits: ')
    gameListFile.write(str(row[5][1]))
    gameListFile.write('\n  home runs: ')
    gameListFile.write(str(row[5][2]))
    gameListFile.write('\n  RBI: ')
    gameListFile.write(str(row[5][3]))
    gameListFile.write('\n  walks: ')
    gameListFile.write(str(row[5][4]))
    gameListFile.write('\n-------------------------------------------')
gameListFile.close()
gameList = removeQuotes(gameList) #now thats its printed to the file, lets remove the quotes from the #

Now lets make the average stats for each team.

In [29]:
avgStats = dict()
np.set_printoptions(precision=4)  #IF you wanna remove this i recommend restarting the kernal

#add up all the stats for each team
for row in gameList:
    #home
    if row[0] in avgStats:
        avgStats[row[0]][0] = 0   #set the proir games to 0,because there set to right amount next line
        avgStats[row[0]] = np.sum([avgStats[row[0]], row[2]], axis=0)  #total stats + indivudal game stats
    else:
        avgStats[row[0]] = row[2]
    
    #away
    if row[3] in avgStats:
        avgStats[row[3]][0] = 0
        avgStats[row[3]] = np.sum([avgStats[row[3]], row[5]], axis=0)
    else:
        avgStats[row[3]] = row[5]

#divide the sum, to make the average..
keyList = avgStats.keys()    
for key in keyList:
    for index in range(1, len(avgStats[key])):  #divide all stats by games played (162, 162)
        avgStats[key][index] = avgStats[key][index] / avgStats[key][0]  

In [30]:
# split the stats up into different parts (this takes a lot of time, but I think it will be worth it)

#gameList = gameStats.getGameList() # I want to test what happens if we just use the stats of the game.
gameList = removeQuotes(gameList)   #because i initialized gamestats above, must remove quote again
homeTeamName = np.empty((0, len(list(gameList[0][0])))) 
homeTeamScore = np.empty((0, 1))
homeTeamStats = np.empty((0, len(gameList[0][2])))

visitingTeamName = np.empty((0, len(list(gameList[0][3])))) 
visitingTeamScore = np.empty((0, 1))
visitingTeamStats = np.empty((0, len(gameList[0][5])))

for row in gameList:
    homeTeamName = np.vstack((homeTeamName, list(row[0])))
    homeTeamScore = np.vstack((homeTeamScore, row[1]))
    homeTeamStats = np.vstack((homeTeamStats, row[2]))
    visitingTeamName = np.vstack((visitingTeamName, list(row[3])))
    visitingTeamScore = np.vstack((visitingTeamScore, row[4]))
    visitingTeamStats = np.vstack((visitingTeamStats, row[5]))

In [32]:
# take the stats and zscore them, I do a number of transformations on the array because of how the zScore input contstraints 
zScoredStatsHome = np.empty((0, len(homeTeamStats)))
zScoredStatsVisitor = np.empty((0, len(visitingTeamStats)))
for x in range(len(homeTeamStats[0])):
    zScoredStatsHome = np.vstack((zScoredStatsHome, stats.zscore([i[x] for i in homeTeamStats])))
    zScoredStatsVisitor = np.vstack((zScoredStatsVisitor, stats.zscore([i[x] for i in visitingTeamStats])))

homeTeamStatsTemp = np.empty((len(zScoredStatsHome[0]), 0))
visitingTeamStatsTemp = np.empty((len(zScoredStatsVisitor[0]), 0))
for i in zScoredStatsHome:
    homeTeamStatsTemp = np.hstack((homeTeamStatsTemp, [[x] for x in i]))

for i in zScoredStatsVisitor:
    visitingTeamStatsTemp = np.hstack((visitingTeamStatsTemp, [[x] for x in i]))

In [33]:
learnArr = [1*(10**(x)) for x in [-6]]
regulArr = [1*(2**(x)) for x in xrange(-10)]
layerArr = [4]
widthArr = [30]
paramErr = dict()

for learnRate in tqdm_notebook(learnArr):
    for numWidth in tqdm_notebook(widthArr):
        for numLayers in tqdm_notebook(layerArr):                                  
            tf.reset_default_graph()
    
            # the input layer needs to have the same dimensions as our input (in this case the teams)
            homeTeamNameInput = tflearn.input_data(shape=[None, len(homeTeamName[0])], name='nameInput1')
            homeTeamStatsInput = tflearn.input_data(shape=[None, len(homeTeamStatsTemp[0])], name='statsInput1')
            visitingTeamNameInput = tflearn.input_data(shape=[None, len(visitingTeamName[0])], name='nameInput2')
            visitingTeamStatsInput = tflearn.input_data(shape=[None, len(visitingTeamStatsTemp[0])], name='statsInput2')
        
            nameProcess1 = tflearn.fully_connected(homeTeamNameInput, 4)
            nameProcess2 = tflearn.fully_connected(visitingTeamNameInput, 4)
            net = tflearn.layers.merge_ops.merge([nameProcess1, homeTeamStatsInput, nameProcess2, visitingTeamStatsInput], 'concat', axis=1)
            # next we have the hidden layer it is the feature matrix the size is arbitrary.
            for _ in range(numLayers):
                net = tflearn.fully_connected(net, numWidth)
            
            net = tflearn.dropout(net, keep_prob=0.5)
            # The output layer
            net = tflearn.fully_connected(net, 2)
            net = tflearn.regression(net, name='target', learning_rate=learnRate)
        
            # take only the stats for each team and put them into an array
            NNOutput = [[i[1], i[4]] for i in gameList]
            NNOutput = np.array(NNOutput)
        
            # Define model
            model = tflearn.DNN(net)
            # Start training (apply gradient descent algorithm)
            model.fit({'nameInput1':homeTeamName, 'statsInput1':homeTeamStatsTemp, 'nameInput2':visitingTeamName, 'statsInput2':visitingTeamStatsTemp}, NNOutput, validation_set=0.1, n_epoch=20, show_metric=True)

Training Step: 699  | total loss: [1m[32m6.16042[0m[0m | time: 0.254s
| Adam | epoch: 020 | loss: 6.16042 - acc: 0.5953 -- iter: 2176/2185
Training Step: 700  | total loss: [1m[32m6.23273[0m[0m | time: 1.262s
| Adam | epoch: 020 | loss: 6.23273 - acc: 0.6014 | val_loss: 6.11604 - val_acc: 0.6337 -- iter: 2185/2185
--



In [34]:
for i in tqdm_notebook(range(len(homeTeamName))):
    predict = model.predict([[homeTeamName[i]], [homeTeamStatsTemp[i]], [visitingTeamName[i]], [visitingTeamStatsTemp[i]]])
    print(predict)

[[8.648396760690957e-05, 7.396349974442273e-05]]
[[8.497802627971396e-05, 7.926917896838859e-05]]
[[8.461883408017457e-05, 6.824581214459613e-05]]
[[8.591054938733578e-05, 8.089279435807839e-05]]
[[8.092743519227952e-05, 7.283744344022125e-05]]
[[8.01481437520124e-05, 7.595709757879376e-05]]
[[8.515044464729726e-05, 7.359946903306991e-05]]
[[7.955776527523994e-05, 6.898077845107764e-05]]
[[9.248727292288095e-05, 7.784565968904644e-05]]
[[8.544801676180214e-05, 7.156085484893993e-05]]
[[8.081635314738378e-05, 7.538073987234384e-05]]
[[9.223461529472843e-05, 7.70094120525755e-05]]
[[8.459286618744954e-05, 8.671903924550861e-05]]
[[8.050242468016222e-05, 8.29850323498249e-05]]
[[8.495346264680848e-05, 7.413722050841898e-05]]
[[8.09663615655154e-05, 7.539196667494252e-05]]
[[8.271495607914403e-05, 7.46085643186234e-05]]
[[8.53006640681997e-05, 8.081816486082971e-05]]
[[8.370581781491637e-05, 7.826211367500946e-05]]
[[8.801338844932616e-05, 7.476264727301896e-05]]
[[9.247614798368886e-05, 7