# Models.ipynb

## This is the file where all of my models will go

### The code below is used to convert loaded data into x and y vectors 

data is loaded from game objects into x and y vectors. The x vector will store info about the previous games for both the home and away team of the current game. It also will store information about the strength of opposition in each of these games and the player selection. 

In [5]:
vLen = 734
def timeStepX(x):
    xRet = []
    for xt in x:
        xTemp = []
        for i in range(5):
            h = xt[i * vLen: (i +1) * vLen]
            a = xt[(i+5) * vLen: (i + 6) * vLen]
            val = h + a
            xTemp.append(val)
        xRet.append(xTemp)
    return xRet
def resVal(y):
    yVals = []
    for val in y:
        if val[0] == 1:
            yVals.append(3)
        elif val[1] == 1:
            yVals.append(1)
        else:
            yVals.append(0)
    return yVals

#changes the output of a game to a one hot encoding [win, tie, loss]
def resOneHot(y):
    if y == 3:
        return [1 , 0, 0]
    elif y == 1:
        return [0, 1, 0]
    else:
        return [0, 0, 1]

#creates a flat version of the x as a 1 x 8240 matrix and the y's as one hot vectors.
def createDataSet2(gameInfo, season):
    
    ''' code used to create x and y values from a given season and set of games
    creates a y value that is listed as a one hot vector
     and x as a 1 x 8240 matrix that give information about the past 5 games'''
    x = []
    y = []
    cnt = 0
    teamDic = dict()
    teams = season.teams
    #this code block is meant to fix a mistake in the definition of my data structure, should really make performances
    # a dictionary which each team name as a key, and the performances as a val. 
    for team in teams:
        teamDic[team] = cnt
        cnt += 1
    for game in gameInfo:
        example = []
        stage = game[0]
        res = game[5]
        #the result of the game will constitute our y set. 0 = home loss, 1 = home draw, 3 = home win.
        y.append(resOneHot(res))
        home = game[1]
        away = game[2]
        homeLoc = teamDic[home]
        awayLoc = teamDic[away]
        #a training example will be a 10 * 824, with the home teams most recent 5 games as the first 5 values
        #followed by the away teams most recent 5 games.
        for i in range((stage-6), (stage - 1)):
            game = season.performances[homeLoc].performance[i]
            #first two items in game are the stage and away team api Id which I don't use in this
            # model
            game1 = game[2:]
            if len(game1) < 734:
                for k in range(len(game1), 734):
                    game1.append(0)
            if len(game1) != 734:        
                print(len(game1))
            for d in game1:
                example.append(d)
        for j in range((stage-6), (stage - 1)):
            game = season.performances[awayLoc].performance[i]
            game1 = game[2:]
            if len(game1) < 734:
                for k in range(len(game1), 734):
                    game1.append(0)
            if len(game1) != 734:        
                print(len(game1))
            for d in game1:
                example.append(d)
        x.append(example)
    return x, y

def generateSets(seasons):
    trainX = []
    trainY = []
    testX = []
    testY = []
    for i in range(6):
        season1 = seasons.seasonData[i].pVals[0]
        numTeams = len(season1.teams)
        n = int(numTeams / 2 *  5)
        t = season1.info[n:]
        x ,y = createDataSet2(t, season1)
        trainX = trainX + x
        trainY = trainY + y
    for j in range(2):
        season1 = seasons.seasonData[j + 6].pVals[0]
        numTeams = len(season1.teams)
        t = season1.info[n:]
        x ,y = createDataSet2(t, season1)
        testX = testX + x
        testY = testY + y
    return trainX, trainY, testX, testY

def filterX(tx):
    xRet = []
    for x in tx:
        xTemp = []
        for i in x:
            if type(i) != type(1) and type(i) != type(1.):
                xTemp.append(0)
            else:
                xTemp.append(i)
        xRet.append(xTemp)
    return xRet

# Loading Data

this is where data is loaded from the pickle files with the information stored in them and converted to train and test data for each league

In [6]:
import pickle
with open('soccerPerformanceEPL.pickle', 'rb') as handle:
    prem = pickle.load(handle)
with open('soccerPerformanceSL.pickle', 'rb') as handle:
    ll = pickle.load(handle)
with open('soccerPerformanceSWL.pickle', 'rb') as handle:
    swl = pickle.load(handle)
with open('soccerPerformanceFL.pickle', 'rb') as handle:
    fl = pickle.load(handle)
with open('soccerPerformanceNL.pickle', 'rb') as handle:
    nl = pickle.load(handle)
trainX1, trainY1, testX1, testY1 = generateSets(prem)
trainX2, trainY2, testX2, testY2 = generateSets(ll)
trainX3, trainY3, testX3, testY3 = generateSets(nl)
trainX4, trainY4, testX4, testY4 = generateSets(fl)
#trainX4, trainY4, testX4, testY4 = generateSets(swl)
trainX = trainX1 + trainX2 + trainX3 + trainX4 
trainY = trainY1 + trainY2 + trainY3 + trainY4
testX = testX1 + testX2 + testX3 + testX4 
testY = testY1 + testY2 + testY3 + testY4 
trainX = filterX(trainX)
testX = filterX(testX)

print(len(trainY), len(testY))



7506 2502


# Convolutional Neural Network

this is an attempt at running a convolutional neural network on the data. right now it is defunct, may go back and edit this

In [None]:
import math

import random as rand
POOL = 4
W1 = 32
W2 = 16
W3 = 8
WF1 = 256
NUMFEATURES = 3
D1 = 32 #depths
D2 = 64
D3 = 128


def cw(nLayers, p, w= 824):
    ''' calculates the width for fully connected layer
    args are number of layers,
    pool size, and the width of the argument default to 824'''
    w1 = float(w)
    for i in range(nLayers):
        w1 = math.ceil(w1/p)
    return w1
W_FINAL = cw(3, POOL)

def genBatch(x, y, n =100, shuffle = True):
    x1 = []
    y1 = []
    a = []
    for j in range(len(y)):
        a.append(j)
    if shuffle:
        rand.shuffle(a)
    for i in range(n):
        j = a[i]
        x1.append(x[j])
        y1.append(y[j])
    return x1, y1
    
import tensorflow as tf
sess = tf.InteractiveSession()
def weight_variable(shape):
  initial = tf.truncated_normal(shape, stddev=0.1)
  return tf.Variable(initial)

def bias_variable(shape):
  initial = tf.constant(0.1, shape=shape)
  return tf.Variable(initial)

def conv2d(x, W):
  return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
  return tf.nn.max_pool(x, ksize=[1, 4, 2, 1],
                        strides=[1, 4, 2, 1], padding='SAME')

x = tf.placeholder(tf.float32, shape=[None, 8240])
y_ = tf.placeholder(tf.float32, shape=[None, 3])

W_conv1 = weight_variable([W1, 10, 1, D1])
b_conv1 = bias_variable([D1])
x_image = tf.reshape(x, [-1,824,10,1])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

W_conv2 = weight_variable([W2, 4, D1, D2])
b_conv2 = bias_variable([D2])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

W_conv3 = weight_variable([W3, 2, D2, D3])
b_conv3 = bias_variable([D3])

h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)
h_pool3 = max_pool_2x2(h_conv3)

W_fc1 = weight_variable([W_FINAL * 2 * D3, WF1])
b_fc1 = bias_variable([WF1])
h_pool3_flat = tf.reshape(h_pool3, [-1,W_FINAL * 2 * D3])
h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)

keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

W_fc2 = weight_variable([256, 3])
b_fc2 = bias_variable([3])

y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2


cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_conv, y_))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

sess.run(tf.initialize_all_variables())
for i in range(1000):
    x_batch, y_batch = genBatch(trainX, trainY) 
  
    if i%100 == 0:
        train_accuracy = accuracy.eval(feed_dict={
            x:x_batch, y_: y_batch, keep_prob: 1.0})
        print("step %d, training accuracy %g"%(i, train_accuracy))
        train_step.run(feed_dict={x: x_batch, y_:y_batch, keep_prob: 0.5})
acc = 0
for i in range(18):
    x_btest = testX[i * 139: (i+1) * 139]
    y_btest = testY[i * 139: (i+1) * 139]
    temp = accuracy.eval(feed_dict={x: x_btest, y_: y_btest, keep_prob: 1.0})
    acc += temp / 18.0
print('accuracy is %g'%acc)

## Test distribution 

this cell gives information about the test distribution

In [7]:

s = 0
s1 = 0
s2 = 0
for i in range(len(testY)):
    if testY[i][0] == 1:
        s+= 1
    elif testY[i][1] == 1:
        s1 += 1
    else:
        s2 += 1
print(s/len(testY), s1/len(testY), s2/len(testY))

0.4572342126298961 0.24620303756994405 0.2965627498001599


# Random Forest Classifier

this is a random forest classifier that predicts values based on our training set. It is currently our best model but we are hoping to beat its performance with an RNN

In [8]:
#first model that is better than a naive guess woot 46% is naive!!!
from sklearn.ensemble import RandomForestClassifier
import numpy as np
trainY2 = resVal(trainY)
testY2 = resVal(testY)
clf = RandomForestClassifier(n_estimators=150)
clf.fit(trainX, trainY2)
print(clf.score(trainX, trainY2))
print(clf.score(testX, testY2))


1.0
0.521183053557


# RNN MODEL

this is the part of the code that trains a lstm rnn model on the simple data set above. It takes input as a time sequence
which is a 5 * 12 vector denoting 12 stats for the previous 5 games, the home team result goals scored and goals against, and strength of the oppsotion, and similiarly the away teams result goals scored and goals against and strength of opposition. 
It then has to 512 * 512 nodes in which it learns features and dependencies and finaly this is brought together in a 3 softmax classificaiton. 

In [10]:
if 'session' in locals() and session is not None:
    print('Close interactive session')
    session.close()

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(5, 1468)))
model.add(Dropout(0))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0))
model.add(Dense(3))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

trainX1 = timeStepX(trainX)
testX1 = timeStepX(testX)

for iteration in range(1, 10):
    print
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(trainX1, trainY, batch_size=139, nb_epoch=1)
score = model.evaluate(testX1, testY, verbose=0)
yVals = model.predict_on_batch(testX1)
ytVals = model.predict(trainX1, batch_size=32)
print(len(yVals))

--------------------------------------------------
Iteration 1
Epoch 1/1
--------------------------------------------------
Iteration 2
Epoch 1/1
--------------------------------------------------
Iteration 3
Epoch 1/1
--------------------------------------------------
Iteration 4
Epoch 1/1
--------------------------------------------------
Iteration 5
Epoch 1/1
--------------------------------------------------
Iteration 6
Epoch 1/1
--------------------------------------------------
Iteration 7
Epoch 1/1
--------------------------------------------------
Iteration 8
Epoch 1/1
--------------------------------------------------
Iteration 9
Epoch 1/1
2502


# RNN TEST PERFORMANCE

this maps the test performance of the recurrent neural network

In [11]:
s4 = 0

    

for i in range(len(yVals)):
    y = yVals[i]
    y = list(y)
    idx = y.index(max(y))
    if testY[i][idx] == 1:
        s4 += 1
print(s4/2502)

0.4572342126298961


# this is code that I'm working on to create an RNN with just tensorflow 

still a work in progress, will come back to this shortly

In [None]:
import tensorflow as tf
lstm_size = 20
batch_size = 50
sess = tf.InteractiveSession()
lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
# Initial state of the LSTM memory.
state = tf.zeros([batch_size, lstm_size])
probabilities = []
loss = 0.0

trainX1 = timeStepX(trainX)
testX1 = timeStepX(testX)

for game in trainX1:
    
    # The value of state is updated after processing each batch of words.
    output, state = lstm(game, state)

    # The LSTM output can be used to make next word predictions
    logits = tf.matmul(output, softmax_w) + softmax_b
    probabilities.append(tf.nn.softmax(logits))
    