# Baseball learning network

In [6]:
import numpy as np
import tensorflow as tf
import tflearn
from tqdm import tqdm_notebook
import copy
from scipy import stats

In [7]:
class GameStats(object):          
    
    def __init__(self, homeTeamNameIndex, homeTeamScoreIndex, homeTeamStatsIndex, visitorTeamNameIndex, visitorTeamScoreIndex, visitorTeamStatsIndex):
        #parse the text file
        self.statsFile = open("baseball2016.txt", "r")
        self.topArray = []
        self.sideArray = []  
        self.sc = np.zeros((30,30,30), np.int32) 
        self.sc[:,:,:] = -1  
        self.am = np.zeros((30,30), np.float32)
        self.gameList = []
        
        seenTeams = []
        for line in self.statsFile:
            token = line.split(',')  #tokenize the string
            tokenIndex = [homeTeamNameIndex, homeTeamScoreIndex, visitorTeamNameIndex, visitorTeamScoreIndex] + [i for i in homeTeamStatsIndex] + [i for i in visitorTeamStatsIndex]
            attributes = dict()
            
            for i in range(len(token)):
                if(i in tokenIndex):
                    attributes[i] = self.removeQuotes(token[i])
                        
            self.addScore(attributes[homeTeamNameIndex], attributes[visitorTeamNameIndex], attributes[homeTeamScoreIndex], attributes[visitorTeamScoreIndex])                
            self.addGame(attributes[homeTeamNameIndex], attributes[homeTeamScoreIndex], [attributes[i] for i in homeTeamStatsIndex], attributes[visitorTeamNameIndex], attributes[visitorTeamScoreIndex], [attributes[i] for i in visitorTeamStatsIndex])
            
            if(attributes[homeTeamNameIndex] not in seenTeams):
                seenTeams.append(attributes[homeTeamNameIndex])
            
        self.buildAvgMatrix()
        self.statsFile.close()
        #self.gameList = [bin(2**i)[2:].zfill(len(seenTeams)) if x == seenTeams[i] else x for i in range(len(seenTeams)) for x in self.gameList]
        # take the teams and convert the indexes of the order that they appeared in into the hot one format by rasiing
        # 2 to the power of them and then converting them into binary.
        seenTeamsDict = {k: v for v, k in enumerate(seenTeams)}
        temp = []
        for x in self.gameList:
            tempi = []
            for z in x:
                if(z in seenTeams):
                    tempi.append(bin(2**seenTeamsDict[z])[2:].zfill(len(seenTeamsDict)))
                else:
                    tempi.append(z)
            temp.append(tempi)
        self.gameList = temp
        
    def removeQuotes(self, string):
        if (string.startswith('"') and string.endswith('"')) or (string.startswith("'") and string.endswith("'")):
            return string[1:-1]
        return string  
    
    def addGame(self, team1, score1, stats1, team2, score2, stats2):
        self.gameList.append([team1, score1, stats1, team2, score2, stats2])
        
    #give it two teams, the scores, and it will add it to the matrix
    def addScore(self, team1, team2, score1, score2):
        '''
        for a team in top array, the index in the array corrisponds to the matrix column there located in
        for a team in side array, the index in the array corrisponds to the matrix row there located in
        '''
        #team 1 score entry
        try:
            row = self.sideArray.index(team2)    

        except:
            self.sideArray.append(team2)
            row = self.sideArray.index(team2)    

        try:
            col = self.topArray.index(team1)
        except:
            self.topArray.append(team1)
            col = self.topArray.index(team1)
        temp = self.sc[row, col]
        counter = 0
        for e in temp:
            if (e == -1):
                temp[counter] = score1
                break
            counter += 1
        self.sc[row, col] = temp
        
        #team 2 score entry
        try:
            row = self.sideArray.index(team1)    
        except:
            self.sideArray.append(team1)
            row = self.sideArray.index(team1)    
            
        try:
            col = self.topArray.index(team2)
        except:
            self.topArray.append(team2)
            col = self.topArray.index(team2)
        temp = self.sc[row, col]
        counter = 0
        for e in temp:
            if (e == -1):
                temp[counter] = score2
                break
            counter += 1
        self.sc[row, col] = temp
    
    #returns the score(s) for match up
    def getScore(self, team1, team2, gameSelect = None):
        print(team1, team2)
        try:
            score1 = self.sc[self.sideArray.index(team2), self.topArray.index(team1)]
            score2 = self.sc[self.sideArray.index(team1), self.topArray.index(team2)]
            if (gameSelect == None):
                print(team1, score1)
                print(team2, score2)
            else:
                print(team1, score1[gameSelect])
                print(team2, score2[gameSelect])
        except:
            print('Invalid input of teams')
    
    def getGameList(self):
        return self.gameList
    
    #constructs a matrix of the avg score in a matchup
    def buildAvgMatrix(self): 
        for col in range(len(self.sc[:,0])):   #depth
            for row in range(len(self.sc[0, :])):  #width
                tempScore = self.sc[row, col]
                avgScore = 0.0
                count = 0.0
                for j in tempScore:
                    if (j != -1):
                        avgScore += j
                        count += 1
                    else:
                        break
                try:
                    avgScore = avgScore / count
                except:
                    avgScore = -1
                self.am[row, col] = avgScore
    
    #get the value of the avg score for a match up
    def getAvgScore(self, team1, team2):
        try:
            score1 = self.am[self.sideArray.index(team2), self.topArray.index(team1)]
            score2 = self.am[self.sideArray.index(team1), self.topArray.index(team2)]
            print(team1, score1)
            print(team2, score2)        
        except:
            print('Invalid input of teams')

Extract the data form the file.

In [8]:
gameStats = GameStats(6, 10, [8, 50, 53, 54, 58], 3, 9, [5, 22, 25, 26, 30])
gameList = gameStats.getGameList() # get the list of games
#getting rid of the strings
def removeQuotes(gameList):
    for row in gameList:
        for x in range(len(row)):
            #convert scores strings to float
            if (x == 1 or x == 4): row[x] = float(row[x])
            #convert arrays to floats
            if (x == 2 or x == 5): row[x] = list(map(float, row[x]))
    return gameList


Print off all the data used into file name 'gameList.txt' so the user can see what stats were working with.

In [9]:
gameListFile = open("gameList.txt", "w")
for row in gameList:
    gameListFile.write('\n-------------------------------------------')
    gameListFile.write('\nHome: ')
    gameListFile.write(row[0])
    gameListFile.write('\nScore: ')
    gameListFile.write(str(row[1]))
    
    gameListFile.write('\n  game: ')
    gameListFile.write(str(row[2][0]))
    gameListFile.write('\n  hits: ')
    gameListFile.write(str(row[2][1]))
    gameListFile.write('\n  home runs: ')
    gameListFile.write(str(row[2][2]))
    gameListFile.write('\n  RBI: ')
    gameListFile.write(str(row[2][3]))
    gameListFile.write('\n  walks: ')
    gameListFile.write(str(row[2][4]))
    
    gameListFile.write('\n-----------------')
    
    gameListFile.write('\nAway: ')
    gameListFile.write((row[3]))
    gameListFile.write('\nScore: ')
    gameListFile.write(str(row[4]))
    
    gameListFile.write('\n  game: ')
    gameListFile.write(str(row[5][0]))
    gameListFile.write('\n  hits: ')
    gameListFile.write(str(row[5][1]))
    gameListFile.write('\n  home runs: ')
    gameListFile.write(str(row[5][2]))
    gameListFile.write('\n  RBI: ')
    gameListFile.write(str(row[5][3]))
    gameListFile.write('\n  walks: ')
    gameListFile.write(str(row[5][4]))
    gameListFile.write('\n-------------------------------------------')
gameListFile.close()
gameList = removeQuotes(gameList) #now thats its printed to the file, lets remove the quotes from the #

Now lets make the average stats for each team.

In [10]:
avgStats = dict()
np.set_printoptions(precision=4)  #IF you wanna remove this i recommend restarting the kernal

#add up all the stats for each team
for row in gameList:
    #home
    if row[0] in avgStats:
        avgStats[row[0]][0] = 0   #set the proir games to 0,because there set to right amount next line
        avgStats[row[0]] = np.sum([avgStats[row[0]], row[2]], axis=0)  #total stats + indivudal game stats
    else:
        avgStats[row[0]] = row[2]
    
    #away
    if row[3] in avgStats:
        avgStats[row[3]][0] = 0
        avgStats[row[3]] = np.sum([avgStats[row[3]], row[5]], axis=0)
    else:
        avgStats[row[3]] = row[5]

#divide the sum, to make the average..
keyList = avgStats.keys()    
for key in keyList:
    for index in range(1, len(avgStats[key])):  #divide all stats by games played (162, 162)
        avgStats[key][index] = avgStats[key][index] / avgStats[key][0]  

In [58]:
# split the stats up into different parts (this takes a lot of time, but I think it will be worth it)

#gameList = gameStats.getGameList() # I want to test what happens if we just use the stats of the game.
gameList = removeQuotes(gameList)   #because i initialized gamestats above, must remove quote again
homeTeamName = np.empty((0, len(list(gameList[0][0])))) 
homeTeamScore = np.empty((0, 1))
homeTeamStats = np.empty((0, len(gameList[0][2])))

visitingTeamName = np.empty((0, len(list(gameList[0][3])))) 
visitingTeamScore = np.empty((0, 1))
visitingTeamStats = np.empty((0, len(gameList[0][5])))

for row in gameList:
    homeTeamName = np.vstack((homeTeamName, list(row[0])))
    homeTeamScore = np.vstack((homeTeamScore, row[1]))
    homeTeamStats = np.vstack((homeTeamStats, row[2]))
    visitingTeamName = np.vstack((visitingTeamName, list(row[3])))
    visitingTeamScore = np.vstack((visitingTeamScore, row[4]))
    visitingTeamStats = np.vstack((visitingTeamStats, row[5]))

In [77]:
# take the stats and zscore them, I do a number of transformations on the array because of how the zScore input contstraints 
zScoredStatsHome = np.empty((0, len(homeTeamStats)))
zScoredStatsVisitor = np.empty((0, len(visitingTeamStats)))
for x in range(len(homeTeamStats[0])):
    zScoredStatsHome = np.vstack((zScoredStatsHome, stats.zscore([i[x] for i in homeTeamStats])))
    zScoredStatsVisitor = np.vstack((zScoredStatsVisitor, stats.zscore([i[x] for i in visitingTeamStats])))

homeTeamStatsTemp = np.empty((len(zScoredStatsHome[0]), 0))
visitingTeamStatsTemp = np.empty((len(zScoredStatsVisitor[0]), 0))
for i in zScoredStatsHome:
    homeTeamStatsTemp = np.hstack((homeTeamStatsTemp, [[x] for x in i]))

for i in zScoredStatsVisitor:
    visitingTeamStatsTemp = np.hstack((visitingTeamStatsTemp, [[x] for x in i]))

In [126]:
from hyperopt import hp, fmin, tpe
import math

def runModel(width, depth, dropOut, learningRate):
    tf.reset_default_graph()
    
    # the input layer needs to have the same dimensions as our input (in this case the teams)
    homeTeamNameInput = tflearn.input_data(shape=[None, len(homeTeamName[0])], name='nameInput1')
    homeTeamStatsInput = tflearn.input_data(shape=[None, len(homeTeamStatsTemp[0])], name='statsInput1')
    visitingTeamNameInput = tflearn.input_data(shape=[None, len(visitingTeamName[0])], name='nameInput2')
    visitingTeamStatsInput = tflearn.input_data(shape=[None, len(visitingTeamStatsTemp[0])], name='statsInput2')

    nameProcess1 = tflearn.fully_connected(homeTeamNameInput, 4)
    nameProcess2 = tflearn.fully_connected(visitingTeamNameInput, 4)
    net = tflearn.layers.merge_ops.merge([nameProcess1, homeTeamStatsInput, nameProcess2, visitingTeamStatsInput], 'concat', axis=1)
    # next we have the hidden layer it is the feature matrix the size is arbitrary.
    for _ in range(depth):
        net = tflearn.fully_connected(net, width)
    
    net = tflearn.layers.normalization.batch_normalization(net)
    #net = tflearn.dropOut(keep_prob=dropOut)
    # The output layer
    net = tflearn.fully_connected(net, 2, activation="RELU")
    net = tflearn.regression(net, name='target', learning_rate=learningRate)

    # take only the stats for each team and put them into an array
    NNOutput = [[i[1], i[4]] for i in gameList]
    NNOutput = np.array(NNOutput)

    # Define model
    model = tflearn.DNN(net)
    # Start training (apply gradient descent algorithm)
    model.fit({'nameInput1':homeTeamName, 'statsInput1':homeTeamStatsTemp, 'nameInput2':visitingTeamName, 'statsInput2':visitingTeamStatsTemp}, NNOutput, validation_set=0.1, n_epoch=20, show_metric=True)
    return [model.predict([[homeTeamName[i]], [homeTeamStatsTemp[i]], [visitingTeamName[i]], [visitingTeamStatsTemp[i]]]) for i in range(len(homeTeamStats))]
    
def objective(args):
    width = args[0]
    depth = args[1]
    dropOut = args[2]
    learningRate = args[3]
    
    file = open("bayesTest.txt", "a+") 
    
    NNOutput = [[i[1], i[4]] for i in gameList]
    NNOutput = np.array(NNOutput)
    
    predict = runModel(width, depth, dropOut, learningRate)
    error = np.sum(([(predict[i][0][0]-NNOutput[i][0])**2+(predict[i][0][1]-NNOutput[i][1])**2 for i in range(len(NNOutput))]))
    
    file.write(str(args))
    file.write("\t")
    file.write(str(error))
    file.write("\n---------------------------\n")
    file.close()
    
    if(math.isnan(error) or error == 144414.0):
        error = 10*(10**50)
    
    return error

open("bayesTest.txt", 'w').close()

space = [hp.choice('width', range(4, 30)), hp.choice('depth', range(1, 20)), hp.uniform('dropOut', 0.5, 1), hp.uniform('learningRate', 0, 0.05)]
best = fmin(objective, space=space, algo=tpe.suggest, max_evals=50)
print(best)

Training Step: 699  | total loss: 5.57029 | time: 0.553s
| Adam | epoch: 020 | loss: 5.57029 - acc: 0.9009 -- iter: 2176/2185
Training Step: 700  | total loss: 5.51940 | time: 1.575s
| Adam | epoch: 020 | loss: 5.51940 - acc: 0.9077 | val_loss: 5.80657 - val_acc: 0.9095 -- iter: 2185/2185
--
{'width': 20, 'depth': 3, 'dropOut': 0.812295321409586, 'learningRate': 0.01811675281641176}


In [121]:
print(np.array(runModel(best['width'], best['depth'], best['dropOut'], best['learningRate'])))

Training Step: 699  | total loss: 5.29699 | time: 0.726s
| Adam | epoch: 020 | loss: 5.29699 - acc: 0.9366 -- iter: 2176/2185
Training Step: 700  | total loss: 5.26975 | time: 1.748s
| Adam | epoch: 020 | loss: 5.26975 - acc: 0.9367 | val_loss: 5.57406 - val_acc: 0.9753 -- iter: 2185/2185
--
[[[ 1.9548  1.4401]]

 [[ 1.6642  2.3743]]

 [[ 1.9898  0.4011]]

 ..., 
 [[ 3.9573  1.2103]]

 [[ 5.0834  2.6941]]

 [[ 4.8862  3.8192]]]


In [116]:
print(np.array(runModel(30, 5, 0.5, 0.01)))
print(np.array(runModel(30, 8, 0.5, 0.01)))

Training Step: 699  | total loss: 104.20676 | time: 0.754s
| Adam | epoch: 020 | loss: 104.20676 - acc: 0.5187 -- iter: 2176/2185
Training Step: 700  | total loss: 104.93923 | time: 1.788s
| Adam | epoch: 020 | loss: 104.93923 - acc: 0.5169 | val_loss: 99.49442 - val_acc: 0.5226 -- iter: 2185/2185
--
[[[ 0.0735 -0.0712]]

 [[ 0.0735 -0.0712]]

 [[ 0.0735 -0.0712]]

 ..., 
 [[ 0.0735 -0.0712]]

 [[ 0.0735 -0.0712]]

 [[ 0.0735 -0.0712]]]


In [119]:
print(runModel(15, 1, 0.8951874132466489, 0.023931862762443558))

Training Step: 699  | total loss: 6.86074 | time: 0.415s
| Adam | epoch: 020 | loss: 6.86074 - acc: 0.5384 -- iter: 2176/2185
Training Step: 700  | total loss: 6.72550 | time: 1.431s
| Adam | epoch: 020 | loss: 6.72550 - acc: 0.5455 | val_loss: 6.36976 - val_acc: 0.5144 -- iter: 2185/2185
--
[[[10.912230491638184, 10.251882553100586]], [[-23.89528465270996, -22.88817024230957]], [[-20.34653663635254, -20.007579803466797]], [[10.183149337768555, 9.662129402160645]], [[18.892345428466797, 17.7828369140625]], [[43.53056335449219, 41.37114334106445]], [[-34.79275894165039, -33.202945709228516]], [[-41.710445404052734, -39.8624153137207]], [[-47.207027435302734, -45.28887939453125]], [[-17.769399642944336, -17.253089904785156]], [[8.385429382324219, 7.595714092254639]], [[-17.243541717529297, -16.366008758544922]], [[-8.641778945922852, -8.309176445007324]], [[17.240583419799805, 16.244531631469727]], [[0.22023573517799377, -0.11804108321666718]], [[8.75871753692627, 8.291305541992188]], [[

In [122]:
print(runModel(13, 14, 0.9794684410170154, 0.0232353944492859))

Training Step: 699  | total loss: 101.41436 | time: 0.649s
| Adam | epoch: 020 | loss: 101.41436 - acc: 0.5138 -- iter: 2176/2185
Training Step: 700  | total loss: 102.49802 | time: 1.667s
| Adam | epoch: 020 | loss: 102.49802 - acc: 0.5030 | val_loss: 97.59928 - val_acc: 0.5473 -- iter: 2185/2185
--
[[[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [[0.6376098990440369, 0.0]], [

In [123]:
print(runModel(13, 14, 0.9794684410170154, 0.01))

Training Step: 699  | total loss: 5.66805 | time: 0.600s
| Adam | epoch: 020 | loss: 5.66805 - acc: 0.9288 -- iter: 2176/2185
Training Step: 700  | total loss: 5.71905 | time: 1.614s
| Adam | epoch: 020 | loss: 5.71905 - acc: 0.9359 | val_loss: 5.67627 - val_acc: 0.9753 -- iter: 2185/2185
--
[[[0.6122021079063416, 0.517082154750824]], [[0.5252010226249695, 0.6689401865005493]], [[0.6810218095779419, 0.39695918560028076]], [[0.3169148564338684, 1.0324982404708862]], [[0.6324098706245422, 0.48180997371673584]], [[0.6129065752029419, 0.5158525705337524]], [[0.5161089897155762, 0.6848101019859314]], [[0.6161758899688721, 0.510145902633667]], [[0.4138452410697937, 0.8633087277412415]], [[0.5588492155075073, 0.6102081537246704]], [[0.6487983465194702, 0.4532043933868408]], [[0.3279695212841034, 1.0132025480270386]], [[0.22057199478149414, 1.2006618976593018]], [[0.42141032218933105, 0.850104033946991]], [[0.4931246042251587, 0.7249286770820618]], [[0.5395039319992065, 0.643974781036377]], [[

In [124]:
print(runModel(34, 1, 0.7192399016588593, 0.00915038472556007))

Training Step: 699  | total loss: 5.82519 | time: 0.601s
| Adam | epoch: 020 | loss: 5.82519 - acc: 0.9416 -- iter: 2176/2185
Training Step: 700  | total loss: 5.80138 | time: 1.626s
| Adam | epoch: 020 | loss: 5.80138 - acc: 0.9459 | val_loss: 5.44269 - val_acc: 0.9712 -- iter: 2185/2185
--
[[[2.0978503227233887, 1.6088217496871948]], [[1.765230417251587, 2.501530408859253]], [[2.1804239749908447, 0.817314624786377]], [[0.15302270650863647, 4.042115211486816]], [[1.7962449789047241, 1.2910664081573486]], [[1.6726056337356567, 1.2492170333862305]], [[1.3097375631332397, 2.4924120903015137]], [[1.1868106126785278, 0.9544075727462769]], [[2.7871618270874023, 4.858866214752197]], [[1.6534777879714966, 2.220454692840576]], [[2.442999839782715, 1.2896119356155396]], [[1.9351158142089844, 5.782303810119629]], [[0.2013871669769287, 5.979429244995117]], [[0.652956485748291, 2.8921544551849365]], [[1.1159355640411377, 2.5256009101867676]], [[0.05404886603355408, 1.0887620449066162]], [[1.526279

{'width': 34, 'depth': 1, 'dropOut': 0.7192399016588593, 'learningRate': 0.00915038472556007}

New Model

In [127]:
print(runModel(20, 3, 0.812295321409586, 0.01811675281641176))

Training Step: 699  | total loss: 5.31932 | time: 0.446s
| Adam | epoch: 020 | loss: 5.31932 - acc: 0.9353 -- iter: 2176/2185
Training Step: 700  | total loss: 5.32454 | time: 1.464s
| Adam | epoch: 020 | loss: 5.32454 - acc: 0.9402 | val_loss: 4.99514 - val_acc: 0.9877 -- iter: 2185/2185
--
[[[3.0693655014038086, 2.0337347984313965]], [[3.0942611694335938, 4.40205717086792]], [[3.1394295692443848, 0.8377951383590698]], [[0.5851091146469116, 7.5668768882751465]], [[2.8613171577453613, 1.8465676307678223]], [[2.4439377784729004, 1.6676623821258545]], [[2.8957176208496094, 5.001878261566162]], [[2.2186529636383057, 2.0631706714630127]], [[5.415373802185059, 9.467059135437012]], [[2.8709640502929688, 3.4108669757843018]], [[3.5185842514038086, 1.437737226486206]], [[4.181451797485352, 11.1953706741333]], [[1.1884318590164185, 11.217314720153809]], [[1.5648335218429565, 5.748600006103516]], [[2.235537052154541, 4.614236831665039]], [[0.13679760694503784, 1.4186620712280273]], [[2.699946403

In [128]:
print(NNOutput)

[[  4.   3.]
 [  3.   5.]
 [  4.   1.]
 ..., 
 [  7.   1.]
 [ 10.   4.]
 [ 10.   7.]]
