In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from datetime import datetime,timedelta
from collections import Counter
from tqdm import tqdm
import glob

from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim

import matplotlib.pyplot as plt
%matplotlib inline

from data_augment import *

### Importing the Excel files - one per year (from tennis.co.uk)
 Some preprocessing is necessary because the odds are missing for several years.
 
 We consider only the odds of Bet365 and Pinnacle.

In [2]:
# Collect the yearly workbooks. glob returns paths in arbitrary (filesystem-dependent)
# order, so sort them to make the concatenation order reproducible from run to run.
filenames = sorted(glob.glob("Data/Men/20*.xls*"))
print(len(filenames))
# One DataFrame per workbook.
l_df = [pd.read_excel(file) for file in filenames]


21


In [3]:
# Keep the first 13 columns of every yearly frame plus the set counts and the
# match-completion comment, so that all frames share the same schema.
l_df = [d[list(d.columns)[:13] + ["Wsets", "Lsets", "Comment"]] for d in l_df]
# Stack the yearly frames row-wise. `axis` must be passed by keyword: the positional
# form is rejected by pandas >= 2.0.
df = pd.concat(l_df, axis=0)

### Data cleaning

In [4]:
# Put the matches in chronological order.
df = df.sort_values("Date")

# Rankings: missing values become 0, the "NR" marker becomes 2000, then cast to int.
for rank_col in ("WRank", "LRank"):
    df[rank_col] = df[rank_col].replace(np.nan, 0).replace("NR", 2000).astype(int)

# Set counts as floats; the stray "`1" entry in Lsets is repaired to 1 first.
df["Wsets"] = df["Wsets"].astype(float)
df["Lsets"] = df["Lsets"].replace("`1", 1).astype(float)

# Fresh 0..n-1 index that follows the chronological order.
df = df.reset_index(drop=True)

In [5]:
# Persist the cleaned match table (without the index column).
df.to_csv(path_or_buf="Generated Data/atp_data.csv", index=False)

### Building training set
#Focusing on years post 2008

In [6]:
# Reload the cleaned matches. The file is written by DataFrame.to_csv above, whose
# default encoding is UTF-8, so it must be read back as UTF-8; decoding it as
# ISO-8859-1 would garble any non-ASCII player or location name.
data = pd.read_csv("Generated Data/atp_data.csv", encoding="utf-8")
# Vectorised date parsing (same '%Y-%m-%d' format as before).
data["Date"] = pd.to_datetime(data["Date"], format="%Y-%m-%d")

In [7]:
# Window of interest: strictly after 2008-01-01 up to the date of the last row.
beg = datetime(2008, 1, 1)
end = data.Date.iloc[-1]

# Index labels of the rows whose date falls inside (beg, end].
in_window = (data.Date > beg) & (data.Date <= end)
indices = data.loc[in_window].index

data.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,Wsets,Lsets,Comment
0,1,Adelaide,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,63,77,2.0,0.0,Completed
1,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Kiefer N.,Tarango J.,6,59,2.0,0.0,Completed
2,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Gaudio G.,Luxa P.,73,174,2.0,1.0,Completed
3,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,El Aynaoui Y.,Dupuis A.,33,78,2.0,1.0,Completed
4,3,Doha,Qatar Open,2000-01-03,International,Outdoor,Hard,1st Round,3,Cherkasov A.,Arazi H.,206,35,2.0,0.0,Completed


In [8]:
# Look-back windows; they also appear as the suffixes of the pc_win1_*/pc_win2_* columns.
last_days = [15, 30, 60, 90, 120]

# Momentum is provided by the project-local data_augment module.
# NOTE(review): the meaning of the last two arguments (1000 and 360*4) is not visible
# from this notebook - confirm against data_augment before tuning them.
data_aug = Momentum(
    data,
    indices,
    last_days,
    1000,
    360 * 4,
)

  0%|                                                                                        | 0/32270 [00:00<?, ?it/s]

0


 16%|███████████▉                                                                 | 5002/32270 [06:32<34:29, 13.18it/s]

5000


 31%|███████████████████████▌                                                    | 10002/32270 [12:56<27:46, 13.36it/s]

10000


 46%|███████████████████████████████████▎                                        | 15001/32270 [19:18<22:27, 12.81it/s]

15000


 62%|███████████████████████████████████████████████                             | 20001/32270 [25:40<15:19, 13.34it/s]

20000


 77%|██████████████████████████████████████████████████████████▉                 | 25001/32270 [32:17<09:16, 13.07it/s]

25000


 93%|██████████████████████████████████████████████████████████████████████▋     | 30001/32270 [38:37<02:50, 13.34it/s]

30000


100%|████████████████████████████████████████████████████████████████████████████| 32270/32270 [41:46<00:00, 12.88it/s]


In [11]:
# Scratch checkpoint, intentionally left disabled: un-comment the first line to snapshot
# data_aug, or the second to restore it from that snapshot.
#toto = data_aug.copy()
#data_aug = toto.copy()

In [46]:
# Preview the first five rows of the augmented table.
data_aug.head(n=5)

Unnamed: 0,Player1,Player2,pc_win1_30,pc_win2_60,pc_win1_15,pc_win1_60,pc_win2_15,pc_win2_90,pc_win1_90,pc_win2_30,...,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Best of_3,Best of_5
23191,Kohlschreiber P.,Volandri F.,0.5,0.5,0.5,0.125,0.166667,0.5,0.5,0.5,...,0,1,0,0,0,0,0,0,1,0
23192,Sirianni J.,Querrey S.,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0,1,0,0,0,0,0,0,1,0
23193,Mathieu P.H.,Guccione C.,0.5,0.5,0.5,0.192308,0.1,0.5,0.5,0.5,...,0,1,0,0,0,0,0,0,1,0
23194,Becker B.,Russell M.,0.5,0.5,0.5,0.0,0.136364,0.5,0.5,0.5,...,0,1,0,0,0,0,0,0,1,0
23195,Johansson T.,Berrer M.,0.5,0.5,0.5,0.125,0.166667,0.5,0.5,0.5,...,0,1,0,0,0,0,0,0,1,0


### Augmenting the dataset with dummy variables

In [12]:
# Categorical columns to one-hot encode.
dummy_fields = ['Location', 'Series', 'Court', 'Surface', 'Round', "Best of"]
print("Shape before {}".format(data_aug.shape))

# One call encodes every listed column, drops the originals and appends the indicator
# columns (named "<field>_<value>") in the order of dummy_fields.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas version:
# pandas >= 2.0 defaults to bool, and a frame mixing bool and float columns converts to
# an object array, which torch.from_numpy cannot consume later on.
data_aug = pd.get_dummies(
    data_aug,
    columns=dummy_fields,
    prefix=dummy_fields,
    drop_first=False,
    dtype=np.uint8,
)

# Drop the columns that are not used as model inputs.
data_aug = data_aug.drop(columns=["Tournament", "Date", "Wsets", "Lsets", "Loser", "ATP"])

Shape before (55460, 38)


### Keeping only the data post 2008

In [13]:
# Restrict to the rows selected earlier (indices are used as positions here).
data_aug = data_aug.iloc[list(indices), :]

# Keep completed matches only, then discard the now-constant Comment column.
is_completed = data_aug.loc[:, "Comment"] == "Completed"
data_aug = data_aug.loc[is_completed, :].drop(columns=["Comment"])

print("Shape after {}".format(data_aug.shape))

Shape after (31013, 166)


### Doubling the size of the data set: we predict the winner irrespective of the order of the players,
so each match is represented from both Player1's and Player2's point of view

In [50]:
# Win-percentage columns for each player, one per look-back window.
win1_cols = ['pc_win1_' + str(d) for d in last_days]
win2_cols = ['pc_win2_' + str(d) for d in last_days]
last_days_list = win1_cols + win2_cols
last_days_list_rev = win2_cols + win1_cols

# Columns to overwrite and, position by position, the columns their new values come
# from: every player-1 attribute is exchanged with its player-2 counterpart.
swap_targets = ["Player1", "Player2"] + last_days_list + ["Duel1", "Duel2", "WRank", "LRank"]
swap_sources = ["Player2", "Player1"] + last_days_list_rev + ["Duel2", "Duel1", "LRank", "WRank"]

# Mirrored copy of every match (same match seen from the other player's side).
data_temp = data_aug.copy()
data_temp.loc[:, swap_targets] = data_aug.loc[:, swap_sources].values

# Original rows first, mirrored rows second.
data_aug2 = pd.concat([data_aug, data_temp], axis=0)

# Binary target: 1 when Player1 is the winner, 0 otherwise.
data_aug2.loc[:, "Winner"] = ((data_aug2.loc[:, "Player1"] == data_aug2.loc[:, "Winner"]) + 0)

# Keep a single duel feature, give the rank columns player-centric names, renumber rows.
data_aug2 = (
    data_aug2
    .drop(columns=["Duel2"])
    .rename(columns={"Duel1": "Duel_from1", "WRank": "Player1_Rank", "LRank": "Player2_Rank"})
    .reset_index(drop=True)
)
print("Final Shape {}".format(data_aug2.shape))

Final Shape (62026, 165)


### Normalizing Data

In [51]:
# Snapshot of the un-normalised table; un-comment the second line to restore it.
toto2 = data_aug2.copy(deep=True)
#data_aug2 = toto2.copy()
print(data_aug2.columns)

Index(['Player1', 'Player2', 'pc_win1_30', 'pc_win2_60', 'pc_win1_15',
       'pc_win1_60', 'pc_win2_15', 'pc_win2_90', 'pc_win1_90', 'pc_win2_30',
       ...
       'Round_1st Round', 'Round_2nd Round', 'Round_3rd Round',
       'Round_4th Round', 'Round_Quarterfinals', 'Round_Round Robin',
       'Round_Semifinals', 'Round_The Final', 'Best of_3', 'Best of_5'],
      dtype='object', length=165)


In [52]:
# Standardise both rank columns with the mean / std of Player1_Rank.
rank_cols = ["Player1_Rank", "Player2_Rank"]
player1_rank = data_aug2.loc[:, "Player1_Rank"]
m, std = np.mean(player1_rank), np.std(player1_rank)
data_aug2.loc[:, rank_cols] = (data_aug2.loc[:, rank_cols] - m) / std

# Extra feature: difference of the two standardised ranks.
data_aug2.loc[:, "Player_Rank_diff"] = (
    data_aug2.loc[:, "Player1_Rank"] - data_aug2.loc[:, "Player2_Rank"]
)

In [53]:
# Boolean mask selecting every column whose name contains "pc" (the pc_win* features).
pc_col = ["pc" in names for names in data_aug2.columns]

# Work on an explicit float copy so the statistics do not depend on the columns' dtype.
pc_values = data_aug2.loc[:, pc_col].astype(float)

# Per-column mean and population std (ddof=0, which is what np.std uses).
# The reductions are spelled out with axis=0: np.mean(DataFrame) / np.std(DataFrame)
# forward axis=None to pandas, which reduces over BOTH axes on recent versions and
# would standardise every column with one global scalar.
m, std = pc_values.mean(axis=0), pc_values.std(axis=0, ddof=0)
data_aug2.loc[:, pc_col] = (pc_values - m) / std


### Storing the data for later use

In [55]:
# Persist the normalised, doubled data set (without the index column).
data_aug2.to_csv(path_or_buf="Generated Data/atp_data_augmented2.csv", index=False)

### Now Doing the training
#Focusing on years post 2008

In [None]:
# Reload the augmented table written above.
data = pd.read_csv("Generated Data/atp_data_augmented2.csv", encoding = "ISO-8859-1")

# Skip the first two columns and keep the rest as model data.
num_df = data.iloc[:, 2:].copy()

# Column vector of labels, and the feature matrix made of every other column.
Target = np.array(num_df["Winner"]).reshape(-1, 1)
feature_mask = [c != "Winner" for c in num_df.columns]
Data = np.array(num_df.loc[:, feature_mask])
print(Data.shape, Target.shape)

In [None]:
# The augmented CSV stores every match twice: the first half of the rows holds the
# original matches and the second half their mirrored copies, in the same order (see
# the doubling step that builds data_aug2). A purely random row split would put a match
# in the training set and its mirror in the test set, leaking the label. We therefore
# split MATCHES and send both rows of a match to the same side.
n_rows = Data.shape[0]
assert n_rows % 2 == 0, "expected every match to appear twice (original + mirrored row)"
n_matches = n_rows // 2

train_matches, test_matches = train_test_split(
    np.arange(n_matches), test_size=0.25, random_state=42
)
train_rows = np.concatenate([train_matches, train_matches + n_matches])
test_rows = np.concatenate([test_matches, test_matches + n_matches])

X_train, X_test = Data[train_rows], Data[test_rows]
y_train, y_test = Target[train_rows], Target[test_rows]
input_dim = Data.shape[1]

# Convert everything to float32 tensors for the network and BCELoss.
X_train = torch.from_numpy(X_train).type(torch.FloatTensor)
X_test = torch.from_numpy(X_test).type(torch.FloatTensor)
y_train = torch.from_numpy(y_train).type(torch.FloatTensor)
y_test = torch.from_numpy(y_test).type(torch.FloatTensor)

In [None]:
# Fully connected classifier: input_dim -> 512 -> 128 -> 16 -> 1, with batch norm on the
# two widest layers, ReLU + dropout after every hidden layer and a sigmoid output.
layers = [
    nn.Linear(input_dim, 512),
    nn.BatchNorm1d(512),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(512, 128),
    nn.BatchNorm1d(128),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(128, 16),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(16, 1),
    nn.Sigmoid(),
]
Model = nn.Sequential(*layers)
Model

In [None]:
# Binary cross-entropy on the sigmoid output, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(Model.parameters(), lr=1e-6)

# Training schedule.
n_epochs = 50
batch_size = 40

In [None]:
# Mini-batch iterators: the training set is reshuffled every epoch, the test set is not.
train_set = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, batch_size=batch_size)

test_set = torch.utils.data.TensorDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_set, shuffle=False, batch_size=batch_size)

# Where the best weights are checkpointed. A forward slash is used because "\m" is an
# invalid escape sequence in a Python string and a backslash is not a path separator
# outside Windows; "/" works on every platform.
save_path = "trained_models/mlp_batch.pt"

In [None]:
# Best test loss seen so far (np.inf: the capitalised alias np.Inf was removed in NumPy 2.0).
test_loss_min = np.inf

# Per-epoch history, used for the plots below.
list_train_loss, list_test_loss, list_accuracy = [], [], []

for e in range(n_epochs):
    train_loss = 0.0
    ###################
    # train the model #
    ###################
    Model.train()
    for batch_idx, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        y_hat = Model(x)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()

        # Running mean of the per-batch training loss.
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

    ##################
    # Test the model #
    ##################
    Model.eval()
    test_loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for x, y in test_loader:
            y_hat = Model(x)
            n_batch = y.shape[0]
            # Weight each batch by its size so that a smaller final batch does not
            # distort the epoch averages.
            test_loss_sum += criterion(y_hat, y).item() * n_batch
            pred_y_hat = (y_hat > 0.5).type(torch.FloatTensor)
            n_correct += (pred_y_hat == y).sum().item()
            n_seen += n_batch

    # Per-sample averages, stored as plain Python floats.
    test_loss = test_loss_sum / n_seen
    accuracy = n_correct / n_seen
    print('Epoch: {}/{} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}\tTest Accuracy: {:.3f} %'.format(e+1,
                                                                                                     n_epochs,
                                                                                                     train_loss,
                                                                                                     test_loss,
                                                                                                     accuracy*100))

    # Checkpoint the weights whenever the test loss improves.
    if test_loss < test_loss_min:
        print('Test loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(test_loss_min, test_loss))
        torch.save(Model.state_dict(), save_path)
        test_loss_min = test_loss

    list_train_loss.append(train_loss)
    list_test_loss.append(test_loss)
    list_accuracy.append(accuracy)

# Learning curves: losses on the left, test accuracy on the right.
epochs = np.arange(1, n_epochs + 1)
fig = plt.figure(figsize=(20,4))

ax = fig.add_subplot(1, 2, 1)
ax.plot(epochs, list_train_loss, "--g", label='Train Loss')
ax.plot(epochs, list_test_loss, "-b", label='Test Loss')
ax.set_xlabel("n epochs")
ax.set_ylabel("Loss")
ax.legend()
ax.set_ylim(0.4, 0.8)

ax = fig.add_subplot(1, 2, 2)
ax.plot(epochs, list_accuracy, "-b", label='Accuracy')
ax.set_xlabel("n epochs")
ax.set_ylabel("Accuracy")
ax.legend()
ax.set_ylim(0.5, 1);