https://medium.com/@curiousily/tensorflow-for-hackers-part-ii-building-simple-neural-network-2d6779d2f91b

In [52]:
import pandas as pd
from math import floor
import tensorflow as tf
import numpy as np
from sklearn import preprocessing

In [53]:
data1 = pd.read_csv('premier_14_15.csv')
data2 = pd.read_csv('premier_15_16.csv')
data3 = pd.read_csv('premier_16_17.csv')
data4 = pd.read_csv('premier_17_18.csv')

dataCon = [data1,data2,data3,data4]
data = pd.concat(dataCon)

In [54]:
b = np.random.uniform(low=-0.01,high=0.01,size=(3,3))
a = pd.DataFrame(b,columns=['a','b','c'])
a

Unnamed: 0,a,b,c
0,-0.004052,-0.003067,0.00784
1,0.004441,-0.004384,-0.008359
2,-0.007877,-0.00737,-0.006368


In [55]:
def normalization(raw_data):
    for col_num in range(raw_data.shape[1]):
        if raw_data.iloc[:,col_num].dtype == np.float or raw_data.iloc[:,col_num].dtype == np.int:
            raw_data.iloc[:,col_num] = (raw_data.iloc[:,col_num] - raw_data.iloc[:,col_num].mean()) / (raw_data.iloc[:,col_num].max() - raw_data.iloc[:,col_num].min())
    return raw_data

In [56]:
def embedding_matrix(column):
    labels = []
    embeddings = np.array([])
    num_of_uniques = len(np.unique(column))
    for i in range(num_of_uniques):
        if embeddings.size == 0:
            embeddings = np.random.uniform(low=-0.01,high=0.01,size=(min(50,(num_of_uniques+1)//2),1))
        else:
            embeddings = np.append(embeddings,np.random.uniform(low=-0.01,high=0.01,size=(min(50,(num_of_uniques+1)//2),1)),axis=1)
        labels.append(np.unique(column)[i])
    print("embeddings.shape:",embeddings.shape)
    print("labels.shape:",len(labels))
    return pd.DataFrame(data=embeddings,columns=labels)

In [57]:
em = embedding_matrix(data['HomeTeam'])
em.head()

embeddings.shape: (13, 26)
labels.shape: 26


Unnamed: 0,Arsenal,Aston Villa,Bournemouth,Brighton,Burnley,Chelsea,Crystal Palace,Everton,Huddersfield,Hull,...,Norwich,QPR,Southampton,Stoke,Sunderland,Swansea,Tottenham,Watford,West Brom,West Ham
0,0.009482,0.00354,-0.001117,0.007661,-0.002521,-0.001396,0.002739,0.003336,-0.000349,0.007904,...,-0.000243,0.003307,-0.007425,-0.006154,-0.001202,0.009593,0.00959,-0.00589,0.004915,-0.006597
1,-0.00493,0.00476,0.005727,0.00687,0.007289,0.008137,0.001416,0.001077,0.008796,-0.003759,...,0.0048,0.006528,-0.009955,-0.005135,-0.001517,-0.000657,0.000997,0.000307,0.000867,0.000593
2,-0.00083,6.6e-05,-0.009269,0.006025,0.007971,0.007626,0.009229,-0.007929,0.002798,0.006845,...,-0.009823,0.009596,0.001406,0.005775,0.003084,0.003508,0.001323,-0.000558,-0.006736,-0.004647
3,-0.002128,0.001575,0.009596,-0.001026,0.003067,0.002954,0.002109,-0.001817,-0.006577,-0.009358,...,0.004314,-0.002393,-0.007971,-0.006591,-0.005913,-0.008133,-0.000215,0.008346,-0.008767,-0.008361
4,-0.000325,-0.005892,0.009058,0.004134,0.006893,-0.005885,-0.007285,0.001174,0.004744,0.001155,...,-0.007234,-0.001093,-0.002983,0.001343,-0.006763,-0.005245,-0.003617,-0.003745,0.001057,-0.002464


In [58]:
#x_data = data.drop(['FTR','FTAG','FTHG'],1)
data = normalization(data)
x_data = np.column_stack((np.transpose(em[data['HomeTeam']].values),
                          np.transpose(em[data['AwayTeam']].values),
                          data['B365H'][:,None],
                          data['B365D'][:,None],
                          data['B365A'][:,None]))
y_data = data['FTR']

y_data = pd.get_dummies(y_data)

#differences = data['FTHG']-data['FTAG']

In [59]:
#x_data = data.drop(['FTR','FTAG','FTHG'],1)
x_data = np.column_stack((data['B365H'][:,None],
                          data['B365D'][:,None],
                          data['B365A'][:,None]))
y_data = data['FTR']

y_data = pd.get_dummies(y_data)

#differences = data['FTHG']-data['FTAG']

In [60]:
x_data.shape

(1520, 3)

In [61]:
# Two first games
x_data[:,0:2]

array([[-0.10172656,  0.23538092],
       [ 0.02076088, -0.07461908],
       [-0.09481701,  0.08538092],
       ...,
       [-0.06089741, -0.03561908],
       [-0.09670143,  0.18538092],
       [-0.02634968, -0.06461908]])

In [62]:
y_data.shape

(1520, 3)

In [63]:
y_data.head()

Unnamed: 0,A,D,H
0,0,0,1
1,0,1,0
2,1,0,0
3,1,0,0
4,1,0,0


In [64]:
train_size = 0.9
valid_size = 0.3

train_cnt = floor(x_data.shape[0] * train_size)

x_train = x_data[0:train_cnt]
y_train = y_data.iloc[0:train_cnt].values

valid_cnt = floor((x_data.shape[0] - train_cnt) * valid_size)

x_valid = x_data[train_cnt:train_cnt+valid_cnt]
y_valid = y_data.iloc[train_cnt:train_cnt+valid_cnt].values

x_test = x_data[train_cnt+valid_cnt:]
y_test = y_data.iloc[train_cnt+valid_cnt:]

print("x_train:",x_train.shape)
print("x_valid:",x_valid.shape)
print("x_test:",x_test.shape)

x_train: (1368, 3)
x_valid: (45, 3)
x_test: (107, 3)


In [65]:
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)

In [66]:
# Parameters
LEARNING_RATE_STA = 1e-3
num_epochs = 50
batch_size = 128
display_step = 5

In [67]:
# Network Parameters
num_input = x_data.shape[1]
num_classes = y_data.shape[1]
num_hidden_1 = 50
num_hidden_2 = 50
KEEP_PROB_STA = 0.9

In [68]:
def neural_network(x,weights,biases,keep_prob):
    layer_1 = tf.add(tf.matmul(x,weights['w1']),biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    layer_1 = tf.nn.dropout(layer_1,keep_prob)
    
    layer_2 = tf.add(tf.matmul(layer_1,weights['w2']),biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    layer_2 = tf.nn.dropout(layer_2,keep_prob)
    
    layer_out = tf.add(tf.matmul(layer_2, weights['out']), biases['out'])
    return layer_out

In [69]:
# Store layers weight & bias
weights = {
    'w1': tf.Variable(tf.random_normal([num_input,num_hidden_1])),
    'w2': tf.Variable(tf.random_normal([num_hidden_1,num_hidden_2])),
    'out': tf.Variable(tf.random_normal([num_hidden_2, num_classes]))
}

biases = {
    'b1': tf.Variable(tf.random_normal([num_hidden_1])),
    'b2': tf.Variable(tf.random_normal([num_hidden_2])),
    'out': tf.Variable(tf.random_normal([num_classes]))
}

keep_prob = tf.placeholder("float")

In [70]:
predictions = neural_network(x, weights, biases, keep_prob)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predictions, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE_STA).minimize(cost)

In [71]:
costs = []
lrn_rate_sizes = []

In [72]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epochs):
        avg_cost = 0.0
        total_batch = int(len(x_train) / batch_size)
        x_batches = np.array_split(x_train, total_batch)
        y_batches = np.array_split(y_train, total_batch)
        
        for i in range(total_batch):
            batch_x, batch_y = x_batches[i], y_batches[i]
            _, c = sess.run([optimizer, cost], 
                            feed_dict={
                                x: batch_x, 
                                y: batch_y, 
                                keep_prob: KEEP_PROB_STA
                            })
            avg_cost += c / total_batch
            
        if epoch % display_step == 0:
            print("Train: Epoch:", '%04d' % (epoch+display_step), "cost=", "{:.9f}".format(avg_cost))
            _, valid_cost = sess.run([optimizer, cost], feed_dict={x: x_valid, y: y_valid, keep_prob: 1})
            print("Valid: Epoch:", '%04d' % (epoch+display_step), "cost=", "{:.9f}".format(valid_cost))
            
            correct_prediction = tf.equal(tf.argmax(predictions, 1), tf.argmax(y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
            print("Accuracy:", accuracy.eval({x: x_test, y: y_test, keep_prob: 1.0}))
            print("______________________________________")
    print("Optimization Finished!")
    correct_prediction = tf.equal(tf.argmax(predictions, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: x_test, y: y_test, keep_prob: 1.0}))

Train: Epoch: 0005 cost= 14.639687252
Valid: Epoch: 0005 cost= 7.217744350
Accuracy: 0.45794392
______________________________________
Train: Epoch: 0010 cost= 8.123380804
Valid: Epoch: 0010 cost= 3.757234097
Accuracy: 0.39252338
______________________________________
Train: Epoch: 0015 cost= 7.011940718
Valid: Epoch: 0015 cost= 2.546033621
Accuracy: 0.44859812
______________________________________
Train: Epoch: 0020 cost= 6.122287321
Valid: Epoch: 0020 cost= 2.085423470
Accuracy: 0.48598132
______________________________________
Train: Epoch: 0025 cost= 5.233317709
Valid: Epoch: 0025 cost= 2.182467699
Accuracy: 0.57009345
______________________________________
Train: Epoch: 0030 cost= 4.904307556
Valid: Epoch: 0030 cost= 2.305070400
Accuracy: 0.5420561
______________________________________
Train: Epoch: 0035 cost= 4.445039535
Valid: Epoch: 0035 cost= 2.415178537
Accuracy: 0.5607477
______________________________________
Train: Epoch: 0040 cost= 3.930668139
Valid: Epoch: 0040 cost= 2