<a href="https://colab.research.google.com/github/AeroEng16/TennisPrediction/blob/main/dataPrepAndModelCreation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [16]:
import pandas as pd
import numpy as np
import math
import datetime
import seaborn as sns
import plotly.express as px
import ast
import plotly.graph_objects as go
from dateutil.relativedelta import relativedelta
# Imports used to find the best match for a player whose name has no exact match
from difflib import SequenceMatcher
from operator import itemgetter
import ast

In [26]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [39]:
import requests
from pathlib import Path

# Download helper functions from the Learn PyTorch repo (only when not already present locally).
# NOTE(review): this fetches and later imports code from the unpinned `main` branch —
# consider pinning to a commit hash so the notebook stays reproducible.
if not Path("helper_functions.py").is_file():
    print("Downloading helper_functions.py")
    request = requests.get(
        "https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/helper_functions.py",
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Fail loudly on HTTP errors instead of saving an error page as a Python module
    request.raise_for_status()
    with open("helper_functions.py", "wb") as f:
        f.write(request.content)

from helper_functions import accuracy_fn

## Data Loading

In [18]:
# Read the training-data CSV directly from the project's GitHub repository (needs network access)
url ="https://raw.githubusercontent.com/AeroEng16/TennisPrediction/main/subsetTrainingData_140724.csv"
df = pd.read_csv(url)
# Columns flagged as not usable as model inputs (dates, names, identifiers, raw list-valued stats)
columnsToDrop_notUseable = (
    ['Date', 'Court Type', 'Tournament']
    + ['ScoreProgression_Sets', 'ScoreProgression_Games', 'RatingCalculated']
    + ['matchID', 'Player1', 'Player2', 'Unnamed: 0']
    + ['matchID']  # appears twice in the original list; kept so the list contents are unchanged
    + ['percentageToVictory', 'percentageOfGamesWon', 'dominanceRatio']
)

# In-match statistics that exist once per player; per the variable name below these
# are excluded because there is not enough data for them.
perPlayerMatchStats = [
    'Aces',
    'BreakpointsWon',
    'DoubleFaults',
    'FirstServePointsWon',
    'FirstServeSuccessful',
    'MaxPointsInARow',
    'PointsWon',
    'SecondServePointsWon',
    'SecondServeSuccessful',
    'ServiceGamesWon',
    'ServicePointsLost',
    'ServicePointsWon',
    'TiebreaksWon',
    'TotalBreakpoints',
    'GamesWon',
    'MaxGamesInARow',
]
# All Player1_* columns first, then all Player2_* columns
columnsToDrop_notEnoughData = [
    f"Player{playerNumber}_{statName}"
    for playerNumber in (1, 2)
    for statName in perPlayerMatchStats
]

# Everything that must be removed before a row is handed to the model
allColumnsToDrop = columnsToDrop_notEnoughData + columnsToDrop_notUseable
#df = unfiltered_df.drop(columnsToDrop,axis=1)
#df = unfiltered_df.drop([],axis=1)

# Binary encoding for the playing hand: 1 = right-handed ('R'), 0 = any other value
# NOTE(review): missing/unknown hands also become 0 here, so they are no longer NaN — confirm this is intended.
for handColumn in ('Player1_PlayHand', 'Player2_PlayHand'):
    df[handColumn] = (df[handColumn] == 'R').astype('int64')

# Binary encoding for the winner: 1 = 'Player1', 0 = any other value
df['Winner'] = (df['Winner'] == 'Player1').astype('int64')


# TODO: play hand and the 3 derived stats still need converting to float.
# Each derived stat is stored as the text of a two-element list ("[player1, player2]"):
# parse the text, then spread the pair over one new column per player.
derivedStatColumns = {
    'dominanceRatio': ['Player1DominanceRatio', 'Player2DominanceRatio'],
    'percentageToVictory': ['Player1percentageToVictory', 'Player2percentageToVictory'],
    'percentageOfGamesWon': ['Player1percentageOfGamesWon', 'Player2percentageOfGamesWon'],
}

for rawColumn in derivedStatColumns:
    df[rawColumn] = df[rawColumn].apply(ast.literal_eval)

for rawColumn, playerColumns in derivedStatColumns.items():
    df[playerColumns] = pd.DataFrame(df[rawColumn].tolist(), index=df.index)

df.head()

Unnamed: 0.1,Unnamed: 0,Date,Tournament,Court Type,Player1,Player1_Rank,Player1_RankingPoints,Player2,Player2_Rank,Player2_RankingPoints,...,Player2_WeightKg,percentageToVictory,percentageOfGamesWon,dominanceRatio,Player1DominanceRatio,Player2DominanceRatio,Player1percentageToVictory,Player2percentageToVictory,Player1percentageOfGamesWon,Player2percentageOfGamesWon
0,0,2024-05-16,Rome,red-clay,"Hurkacz, Hubert",9,3730,"Paul, Tommy",16,2300,...,82.0,"[0.083, 0.0]","[1.0, 0.0]","[1, 0.0]",1.0,0.0,0.083,0.0,1.0,0.0
1,1,2024-05-16,Rome,red-clay,"Hurkacz, Hubert",9,3730,"Paul, Tommy",16,2300,...,82.0,"[0.083, 0.083]","[0.5, 0.5]","[1.0, 1.0]",1.0,1.0,0.083,0.083,0.5,0.5
2,2,2024-05-16,Rome,red-clay,"Hurkacz, Hubert",9,3730,"Paul, Tommy",16,2300,...,82.0,"[0.083, 0.167]","[0.333, 0.667]","[0.5, 2.0]",0.5,2.0,0.083,0.167,0.333,0.667
3,3,2024-05-16,Rome,red-clay,"Hurkacz, Hubert",9,3730,"Paul, Tommy",16,2300,...,82.0,"[0.167, 0.167]","[0.5, 0.5]","[1.0, 1.0]",1.0,1.0,0.167,0.167,0.5,0.5
4,4,2024-05-16,Rome,red-clay,"Hurkacz, Hubert",9,3730,"Paul, Tommy",16,2300,...,82.0,"[0.167, 0.25]","[0.4, 0.6]","[0.667, 1.5]",0.667,1.5,0.167,0.25,0.4,0.6


## Clean any missing data and regularize input data

In [19]:
# To begin with, simply drop every row that contains a NaN value.
# In the sample dataset only 144 rows of 7700 are missing (~1.5%), so no significant data is removed.
# NOTE(review): dropna() inspects every column, including those listed in allColumnsToDrop
# that never reach the model — confirm rows should be discarded for gaps in those columns too.
df = df.dropna()

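**TODO:** Normalise the input features before training — the raw columns sit on very different scales (e.g. ranking points in the thousands versus ratios between 0 and 1).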

## Model Hyperparameters

In [20]:
# Number of samples per mini-batch handed out by the DataLoader
BATCH_SIZE = 64

## Prep Dataset

### Split dataset into training, test and validation

In [21]:
# Fraction of matches assigned to the training, test and validation sets
train_split, test_split, valid_split = 0.8, 0.1, 0.1

# Split on match IDs rather than on individual samples, so every sample of a
# match ends up in the same set. unique() keeps first-appearance order; the
# list is that order reversed.
matchSamples = list(df["matchID"].unique()[::-1])

# TODO: sort matchSamples by date

# Slice boundaries, truncated to whole matches
matchCount = len(matchSamples)
trainEnd = int(matchCount * train_split)
testEnd = int(matchCount * (train_split+test_split))
validEnd = int(matchCount * (train_split+test_split+valid_split))

trainSplitIDs = matchSamples[:trainEnd]
testSplitIDs = matchSamples[trainEnd:testEnd]
validSplitIDs = matchSamples[testEnd:validEnd]

# Keep only the rows whose match belongs to the respective set
train_df, test_df, valid_df = (
    df[df["matchID"].isin(splitIDs)]
    for splitIDs in (trainSplitIDs, testSplitIDs, validSplitIDs)
)

len(train_df), len(test_df), len(valid_df)

(6153, 740, 769)

## Dataset Creation

In [22]:
# Create the dataset

class tennisMatchDataset(Dataset):
  """
  Map-style dataset that serves one (features, label) pair per DataFrame row.

  Args:
    dataFrame: source frame; must contain a "Winner" column (used as the label).
    ignoredFeatures: column label(s) removed before serving rows. ``None``
      (the default) keeps every column.

  Each item is ``(features, label)``: ``features`` is a 1-D float64 tensor of all
  remaining columns except "Winner", ``label`` is a 0-dim float64 tensor.

  NOTE(review): nn.Linear weights default to float32 and class-index losses such
  as CrossEntropyLoss expect integer targets, so consumers may need to cast
  these float64 tensors — confirm the intended dtypes.
  """

  def __init__(self,dataFrame,ignoredFeatures = None):
    # DataFrame.drop(None, axis=1) raises, so map "nothing to ignore" to an empty list
    labelsToDrop = [] if ignoredFeatures is None else ignoredFeatures
    self.df = dataFrame.drop(labelsToDrop,axis=1)

  def __len__(self):
    return len(self.df)

  def __getitem__(self,idx):
    # Look the row up once and keep everything local: nothing is stored on
    # self, so fetching an item never mutates the dataset.
    row = self.df.iloc[idx]

    # Label: the "Winner" value as a 0-dim float64 tensor
    label = torch.from_numpy(np.asarray(row["Winner"], dtype=np.float64))

    # Features: every other column as float64; copy=True guarantees a writable
    # array, which torch.from_numpy expects
    features = torch.from_numpy(
        row.drop("Winner").to_numpy(dtype=np.float64, copy=True)
    )
    return features, label

In [23]:
# One dataset per split, all sharing the same list of ignored columns
trainingDataSet, testDataSet, validationDataSet = (
    tennisMatchDataset(dataFrame = splitFrame, ignoredFeatures = allColumnsToDrop)
    for splitFrame in (train_df, test_df, valid_df)
)

In [24]:
# Serve the training set in shuffled mini-batches of BATCH_SIZE samples
trainDataLoader = DataLoader(trainingDataSet, batch_size=BATCH_SIZE, shuffle=True)

In [25]:
# Pull a single batch from the loader to inspect what the model will receive
train_features, train_labels = next(iter(trainDataLoader))

train_features, train_labels

(tensor([[1.4200e+02, 4.3900e+02, 7.0000e+01,  ..., 8.3300e-01, 2.8600e-01,
          7.1400e-01],
         [1.4200e+02, 4.3900e+02, 1.0400e+02,  ..., 7.5000e-01, 4.1200e-01,
          5.8800e-01],
         [8.1000e+01, 7.1800e+02, 6.1000e+01,  ..., 8.3300e-01, 4.4400e-01,
          5.5600e-01],
         ...,
         [7.2000e+01, 7.6000e+02, 2.0700e+02,  ..., 5.0000e-01, 5.0000e-01,
          5.0000e-01],
         [5.0000e+00, 5.4350e+03, 1.0000e+02,  ..., 4.1700e-01, 4.4400e-01,
          5.5600e-01],
         [1.0600e+02, 5.9000e+02, 8.0000e+00,  ..., 2.5000e-01, 5.7100e-01,
          4.2900e-01]], dtype=torch.float64),
 tensor([0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0.,
         0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
         0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0.,
         0., 1., 0., 1., 1., 0., 0., 1., 0., 1.], dtype=torch.float64))

In [32]:
# Batch shapes: the second dimension of the feature tensor is the number of model inputs
train_features.size(),train_labels.size()

(torch.Size([64, 62]), torch.Size([64]))

## Setup Model

In [28]:
# Device-agnostic setup: use the GPU when CUDA is available, otherwise fall back to the CPU

device = "cuda" if torch.cuda.is_available() else "cpu"

device

'cpu'

In [35]:
# Create model class
class tennisPredModel(nn.Module):
  '''
    First pass at tennis prediction model. Network design has had no real thought.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear.
    The network returns raw logits (no final activation). Losses such as
    nn.CrossEntropyLoss apply log-softmax internally; for probabilities call
    torch.softmax(logits, dim=1) on the output. argmax is unaffected either way.

    Args:
      input_shape: number of input features per sample.
      hidden_units: width of the two hidden layers.
      output_shape: number of output logits per sample.
      numberOfLayers: currently unused; accepted for interface compatibility.
  '''

  def __init__(self,
               input_shape: int,
               hidden_units: int,
               output_shape: int,
               numberOfLayers: int = None):
    super().__init__()
    self.layer_stack = nn.Sequential(
        nn.Linear(in_features=input_shape,
                  out_features = hidden_units),
        nn.ReLU(),
        nn.Linear(in_features = hidden_units,
                  out_features = hidden_units),
        nn.ReLU(),
        # No activation after the output layer: a ReLU here would clamp the
        # logits to be non-negative, and a Softmax would be applied a second
        # time inside CrossEntropyLoss.
        nn.Linear(in_features = hidden_units,
                  out_features = output_shape),
    )

  # forward must be a method of the class (not nested inside __init__),
  # otherwise nn.Module cannot find it and calling the model raises.
  def forward(self, x: torch.Tensor):
    '''Run x through the layer stack and return the logits.'''
    return self.layer_stack(x)

In [37]:
# Instance of model
# Seed the RNG first so the initial weights are the same on every run
torch.manual_seed(42)

# input_shape = 62 matches the feature count of a batch (train_features.size() shows [64, 62]);
# it must be updated by hand if the set of dropped columns changes.
# output_shape = 2 — presumably one output per possible winner (train_step takes argmax over dim 1).
model1 = tennisPredModel(input_shape = 62,
                         hidden_units = 30,
                         output_shape = 2).to(device)

In [None]:
def train_step(model:torch.nn.Module,
              dataLoader:torch.utils.data.DataLoader,
              loss_fn:torch.nn.Module,
              optimiser:torch.optim.Optimizer,
              accuracy_fn,
              device:torch.device = device):
  """
  Performs one training pass (one epoch) of the model over the dataloader.

  Args:
    model: network to train; put into training mode by this function.
    dataLoader: iterable of (X, y) batches.
    loss_fn: callable invoked as loss_fn(y_pred, y).
    optimiser: optimiser that is stepped once per batch.
    accuracy_fn: callable invoked as accuracy_fn(y_true=..., y_pred=...) with
      the argmax (dim=1) of the model output as prediction.
    device: device the batches are moved to before the forward pass.

  Returns:
    (train_loss, train_acc): loss and accuracy averaged over the batches.

  Raises:
    ValueError: if the dataloader yields no batches.

  NOTE(review): labels are passed to loss_fn unchanged. Class-index losses such
  as nn.CrossEntropyLoss require integer (long) targets — confirm the label
  dtype produced by the dataset matches the chosen loss.
  """

  ### Put model into training mode
  model.train()

  # Floating-point dtype of the weights (None when the model has no parameters).
  # Used to align floating-point inputs, e.g. float64 batches vs float32 weights.
  firstParam = next(model.parameters(), None)
  modelDtype = None
  if firstParam is not None and firstParam.is_floating_point():
    modelDtype = firstParam.dtype

  ### Training

  train_loss, train_acc = 0.0, 0.0
  numBatches = 0

  # Loop through the training batches
  for X, y in dataLoader:
    X, y = X.to(device), y.to(device)
    # Only floating-point inputs are cast; integer inputs are left untouched
    if modelDtype is not None and X.is_floating_point() and X.dtype != modelDtype:
      X = X.to(modelDtype)

    # 1. Forward pass (outputs raw logits from model)
    y_pred = model(X)
    # 2. Calculate loss (per batch)
    loss = loss_fn(y_pred, y)
    # Accumulate a plain float: adding the tensor itself would keep every
    # batch's autograd graph alive until the end of the epoch
    train_loss += loss.item()
    train_acc += accuracy_fn(y_true=y,
                             y_pred=y_pred.argmax(dim=1))
    # 3. Zero grad
    optimiser.zero_grad()
    # 4. Loss backwards
    loss.backward()
    # 5. Optimiser step (parameters are updated once per batch, not once per epoch)
    optimiser.step()
    numBatches += 1

  if numBatches == 0:
    raise ValueError("dataLoader yielded no batches; cannot compute average loss/accuracy")

  # Average the accumulated metrics over the number of batches processed
  train_loss /= numBatches
  train_acc /= numBatches

  print(f"Train loss: {train_loss:.4f} | Train acc: {train_acc:.2f}%")
  return train_loss, train_acc