In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [2]:
#ELO rating
def ELO_math(ELO1,ELO2,ScalingFactor,winner):
    Prob1=1/(1+(10**((ELO2-ELO1)/400)))
    Prob2=1-Prob1
    if winner==1:
        SA,SB=1,0
    else:
        SB,SA=0,1
    ELO1N=ELO1+(ScalingFactor*(SA-Prob1))
    ELO2N=ELO2+(ScalingFactor*(SB-Prob2))
    return ELO1N,ELO2N

In [3]:
ELO_math(1600,1700,32,1)

(1620.4820799936924, 1679.5179200063076)

In [4]:
df=pd.read_csv('atp_matches_2015_2024.csv')

In [5]:
df1=df

In [6]:
def get_h2h_features(df, player1, player2):
    h2h = df[
        ((df['winner_name'] == player1) & (df['loser_name'] == player2)) |
        ((df['winner_name'] == player2) & (df['loser_name'] == player1))
    ]

    total_matches = len(h2h)
    player1_wins = (h2h['winner_name'] == player1).sum()
    player2_wins = (h2h['winner_name'] == player2).sum()

    if total_matches > 0:
        player1_winrate = player1_wins / total_matches
    else:
        player1_winrate = 0.5

    return {
        'h2h_matches': total_matches,
        'player1_h2h_wins': player1_wins,
        'player2_h2h_wins': player2_wins,
        'player1_h2h_winrate': player1_winrate
    }

In [7]:
k=get_h2h_features(df1,'Carlos Alcaraz','Jannik Sinner')

In [8]:
k['player1_h2h_wins']

np.int64(6)

In [9]:
def add_h2h_columns(df):

    df['player1_h2h_wins'] = 0
    df['player2_h2h_wins'] = 0

    h2h_records = {}

    for idx, row in df.iterrows():
        p1, p2 = row['winner_name'], row['loser_name'] 
        key = tuple(sorted([p1, p2]))

        if key in h2h_records:
            w1, w2 = h2h_records[key]
        else:
            w1, w2 = 0, 0

        if key[0] == p1:
            df.at[idx, 'player1_h2h_wins'] = w1
            df.at[idx, 'player2_h2h_wins'] = w2
        else:
            df.at[idx, 'player1_h2h_wins'] = w2
            df.at[idx, 'player2_h2h_wins'] = w1

        if row['winner_name'] == p1:
            if key[0] == p1:
                h2h_records[key] = (w1 + 1, w2)
            else:
                h2h_records[key] = (w1, w2 + 1)
        else:
            if key[0] == p2:
                h2h_records[key] = (w1 + 1, w2)
            else:
                h2h_records[key] = (w1, w2 + 1)

    return df


In [10]:
df=add_h2h_columns(df)

In [11]:
df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,season,player1_h2h_wins,player2_h2h_wins
0,2015-339,Brisbane,Hard,28,A,20150104,6,105238,7.0,,...,8.0,11.0,15.0,23.0,1455.0,71.0,700.0,2015,0,0
1,2015-339,Brisbane,Hard,28,A,20150104,25,103819,1.0,,...,8.0,3.0,7.0,2.0,9625.0,11.0,3645.0,2015,0,0
2,2015-339,Brisbane,Hard,28,A,20150104,26,105683,3.0,,...,18.0,3.0,3.0,8.0,4440.0,5.0,5025.0,2015,0,0
3,2015-339,Brisbane,Hard,28,A,20150104,8,105032,,,...,8.0,1.0,4.0,85.0,586.0,84.0,595.0,2015,0,0
4,2015-339,Brisbane,Hard,28,A,20150104,7,103997,,Q,...,11.0,4.0,7.0,177.0,282.0,16.0,2080.0,2015,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27667,2024-7696,Next Gen Finals,Hard,8,F,20241218,399,211663,8.0,,...,9.0,10.0,14.0,145.0,409.0,128.0,471.0,2024,0,0
27668,2024-7696,Next Gen Finals,Hard,8,F,20241218,388,209414,6.0,,...,11.0,11.0,13.0,128.0,471.0,50.0,1115.0,2024,0,0
27669,2024-7696,Next Gen Finals,Hard,8,F,20241218,389,210506,2.0,,...,12.0,4.0,7.0,41.0,1245.0,138.0,440.0,2024,0,0
27670,2024-7696,Next Gen Finals,Hard,8,F,20241218,390,210506,2.0,,...,11.0,1.0,3.0,41.0,1245.0,128.0,471.0,2024,0,1


In [12]:
df['player2_h2h_wins'].unique()

array([ 0,  1,  2,  4,  3,  8,  9,  7,  5,  6, 10, 11])

In [13]:
df.columns


Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'season', 'player1_h2h_wins', 'player2_h2h_wins'],
      dtype='object')

In [14]:
columns_to_drop = [
    'tourney_id',
    'tourney_name',
    'draw_size',
    'tourney_level',
    'match_num',
    'winner_id',
    'loser_id',
    'winner_entry',
    'loser_entry',
    'winner_hand',
    'loser_hand',
    'winner_ht',
    'loser_ht',
    'score',       
    'minutes',    
    'season'      
]


In [15]:
df=df.drop(columns_to_drop,axis=1)

In [16]:
# Keep only rows where Surface is 'Hard'
df = df[df['surface'] == 'Hard'].reset_index(drop=True)

In [17]:
df['surface'].unique()

array(['Hard'], dtype=object)

In [18]:
len(df['winner_name'].unique())

667

In [19]:
df.columns

Index(['surface', 'tourney_date', 'winner_seed', 'winner_name', 'winner_ioc',
       'winner_age', 'loser_seed', 'loser_name', 'loser_ioc', 'loser_age',
       'best_of', 'round', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon',
       'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df',
       'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved',
       'l_bpFaced', 'winner_rank', 'winner_rank_points', 'loser_rank',
       'loser_rank_points', 'player1_h2h_wins', 'player2_h2h_wins'],
      dtype='object')

In [20]:
df.dtypes

surface                object
tourney_date            int64
winner_seed           float64
winner_name            object
winner_ioc             object
winner_age            float64
loser_seed            float64
loser_name             object
loser_ioc              object
loser_age             float64
best_of                 int64
round                  object
w_ace                 float64
w_df                  float64
w_svpt                float64
w_1stIn               float64
w_1stWon              float64
w_2ndWon              float64
w_SvGms               float64
w_bpSaved             float64
w_bpFaced             float64
l_ace                 float64
l_df                  float64
l_svpt                float64
l_1stIn               float64
l_1stWon              float64
l_2ndWon              float64
l_SvGms               float64
l_bpSaved             float64
l_bpFaced             float64
winner_rank           float64
winner_rank_points    float64
loser_rank            float64
loser_rank

In [21]:
df=df.rename(columns={'loser_name':'Player2','winner_name':'Player1'})
df=df.drop(['w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon','w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced','l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon','l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced'],axis=1)

In [22]:
df=df.drop(['tourney_date','best_of'],axis=1)

In [23]:
LL=df.columns

In [24]:
len(LL)

16

In [25]:
LL[0]

'surface'

In [26]:
LL

Index(['surface', 'winner_seed', 'Player1', 'winner_ioc', 'winner_age',
       'loser_seed', 'Player2', 'loser_ioc', 'loser_age', 'round',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'player1_h2h_wins', 'player2_h2h_wins'],
      dtype='object')

In [27]:
L=[]
for i in LL:
    if df[i].dtypes==object:
        L.append(i)

In [28]:
df[L]

Unnamed: 0,surface,Player1,winner_ioc,Player2,loser_ioc,round
0,Hard,Alexandr Dolgopolov,UKR,Carlos Berlocq,ARG,R32
1,Hard,Roger Federer,SUI,Grigor Dimitrov,BUL,SF
2,Hard,Milos Raonic,CAN,Kei Nishikori,JPN,SF
3,Hard,Sam Groth,AUS,Lleyton Hewitt,AUS,R32
4,Hard,Lukasz Kubot,POL,Kevin Anderson,RSA,R32
...,...,...,...,...,...,...
16320,Hard,Joao Fonseca,BRA,Luca Van Assche,FRA,SF
16321,Hard,Luca Van Assche,FRA,Juncheng Shang,CHN,RR
16322,Hard,Alex Michelsen,USA,Nishesh Basavareddy,USA,RR
16323,Hard,Alex Michelsen,USA,Luca Van Assche,FRA,RR


In [29]:
df

Unnamed: 0,surface,winner_seed,Player1,winner_ioc,winner_age,loser_seed,Player2,loser_ioc,loser_age,round,winner_rank,winner_rank_points,loser_rank,loser_rank_points,player1_h2h_wins,player2_h2h_wins
0,Hard,7.0,Alexandr Dolgopolov,UKR,26.1,,Carlos Berlocq,ARG,31.9,R32,23.0,1455.0,71.0,700.0,0,0
1,Hard,1.0,Roger Federer,SUI,33.4,4.0,Grigor Dimitrov,BUL,23.6,SF,2.0,9625.0,11.0,3645.0,0,0
2,Hard,3.0,Milos Raonic,CAN,24.0,2.0,Kei Nishikori,JPN,25.0,SF,8.0,4440.0,5.0,5025.0,0,0
3,Hard,,Sam Groth,AUS,27.2,,Lleyton Hewitt,AUS,33.8,R32,85.0,586.0,84.0,595.0,0,0
4,Hard,,Lukasz Kubot,POL,32.6,5.0,Kevin Anderson,RSA,28.6,R32,177.0,282.0,16.0,2080.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16320,Hard,8.0,Joao Fonseca,BRA,18.3,6.0,Luca Van Assche,FRA,20.5,SF,145.0,409.0,128.0,471.0,0,0
16321,Hard,6.0,Luca Van Assche,FRA,20.5,4.0,Juncheng Shang,CHN,19.8,RR,128.0,471.0,50.0,1115.0,0,0
16322,Hard,2.0,Alex Michelsen,USA,20.3,7.0,Nishesh Basavareddy,USA,19.6,RR,41.0,1245.0,138.0,440.0,0,0
16323,Hard,2.0,Alex Michelsen,USA,20.3,6.0,Luca Van Assche,FRA,20.5,RR,41.0,1245.0,128.0,471.0,0,1


In [30]:
df['winner_seed'] = df['winner_seed'].fillna(0)
df['loser_seed'] = df['loser_seed'].fillna(0)
df['winner_rank'] = df['winner_rank'].fillna(0)
df['loser_rank'] = df['loser_rank'].fillna(0)
df['winner_rank_points'] = df['winner_rank_points'].fillna(0)
df['loser_rank_points'] = df['loser_rank_points'].fillna(0)

In [31]:
df['seed_diff'] = df['winner_seed'] - df['loser_seed']
df['rank_diff'] = df['winner_rank'] - df['loser_rank']
df['rank_point_diff'] = df['winner_rank_points'] - df['loser_rank_points']
df['age_diff'] = df['winner_age'] - df['loser_age']

In [32]:
df=df.dropna()

In [33]:
df

Unnamed: 0,surface,winner_seed,Player1,winner_ioc,winner_age,loser_seed,Player2,loser_ioc,loser_age,round,winner_rank,winner_rank_points,loser_rank,loser_rank_points,player1_h2h_wins,player2_h2h_wins,seed_diff,rank_diff,rank_point_diff,age_diff
0,Hard,7.0,Alexandr Dolgopolov,UKR,26.1,0.0,Carlos Berlocq,ARG,31.9,R32,23.0,1455.0,71.0,700.0,0,0,7.0,-48.0,755.0,-5.8
1,Hard,1.0,Roger Federer,SUI,33.4,4.0,Grigor Dimitrov,BUL,23.6,SF,2.0,9625.0,11.0,3645.0,0,0,-3.0,-9.0,5980.0,9.8
2,Hard,3.0,Milos Raonic,CAN,24.0,2.0,Kei Nishikori,JPN,25.0,SF,8.0,4440.0,5.0,5025.0,0,0,1.0,3.0,-585.0,-1.0
3,Hard,0.0,Sam Groth,AUS,27.2,0.0,Lleyton Hewitt,AUS,33.8,R32,85.0,586.0,84.0,595.0,0,0,0.0,1.0,-9.0,-6.6
4,Hard,0.0,Lukasz Kubot,POL,32.6,5.0,Kevin Anderson,RSA,28.6,R32,177.0,282.0,16.0,2080.0,0,0,-5.0,161.0,-1798.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16320,Hard,8.0,Joao Fonseca,BRA,18.3,6.0,Luca Van Assche,FRA,20.5,SF,145.0,409.0,128.0,471.0,0,0,2.0,17.0,-62.0,-2.2
16321,Hard,6.0,Luca Van Assche,FRA,20.5,4.0,Juncheng Shang,CHN,19.8,RR,128.0,471.0,50.0,1115.0,0,0,2.0,78.0,-644.0,0.7
16322,Hard,2.0,Alex Michelsen,USA,20.3,7.0,Nishesh Basavareddy,USA,19.6,RR,41.0,1245.0,138.0,440.0,0,0,-5.0,-97.0,805.0,0.7
16323,Hard,2.0,Alex Michelsen,USA,20.3,6.0,Luca Van Assche,FRA,20.5,RR,41.0,1245.0,128.0,471.0,0,1,-4.0,-87.0,774.0,-0.2


In [34]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') 
encoded_surface = encoder.fit_transform(df[['surface']])
encoded_cols = encoder.get_feature_names_out(['surface'])
encoded_df = pd.DataFrame(encoded_surface, columns=encoded_cols, index=df.index)
df = pd.concat([df.drop(columns='surface'), encoded_df], axis=1)
round_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_round = round_encoder.fit_transform(df[['round']])
encoded_round_cols = round_encoder.get_feature_names_out(['round'])
encoded_round_df = pd.DataFrame(encoded_round, columns=encoded_round_cols, index=df.index)
df = pd.concat([df.drop(columns='round'), encoded_round_df], axis=1)
print(df.head())



   winner_seed              Player1 winner_ioc  winner_age  loser_seed  \
0          7.0  Alexandr Dolgopolov        UKR        26.1         0.0   
1          1.0        Roger Federer        SUI        33.4         4.0   
2          3.0         Milos Raonic        CAN        24.0         2.0   
3          0.0            Sam Groth        AUS        27.2         0.0   
4          0.0         Lukasz Kubot        POL        32.6         5.0   

           Player2 loser_ioc  loser_age  winner_rank  winner_rank_points  ...  \
0   Carlos Berlocq       ARG       31.9         23.0              1455.0  ...   
1  Grigor Dimitrov       BUL       23.6          2.0              9625.0  ...   
2    Kei Nishikori       JPN       25.0          8.0              4440.0  ...   
3   Lleyton Hewitt       AUS       33.8         85.0               586.0  ...   
4   Kevin Anderson       RSA       28.6        177.0               282.0  ...   

   surface_Hard  round_BR  round_F  round_QF  round_R128  round_R16 

In [35]:
df

Unnamed: 0,winner_seed,Player1,winner_ioc,winner_age,loser_seed,Player2,loser_ioc,loser_age,winner_rank,winner_rank_points,...,surface_Hard,round_BR,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF
0,7.0,Alexandr Dolgopolov,UKR,26.1,0.0,Carlos Berlocq,ARG,31.9,23.0,1455.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,Roger Federer,SUI,33.4,4.0,Grigor Dimitrov,BUL,23.6,2.0,9625.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3.0,Milos Raonic,CAN,24.0,2.0,Kei Nishikori,JPN,25.0,8.0,4440.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,Sam Groth,AUS,27.2,0.0,Lleyton Hewitt,AUS,33.8,85.0,586.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,Lukasz Kubot,POL,32.6,5.0,Kevin Anderson,RSA,28.6,177.0,282.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16320,8.0,Joao Fonseca,BRA,18.3,6.0,Luca Van Assche,FRA,20.5,145.0,409.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16321,6.0,Luca Van Assche,FRA,20.5,4.0,Juncheng Shang,CHN,19.8,128.0,471.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16322,2.0,Alex Michelsen,USA,20.3,7.0,Nishesh Basavareddy,USA,19.6,41.0,1245.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16323,2.0,Alex Michelsen,USA,20.3,6.0,Luca Van Assche,FRA,20.5,41.0,1245.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [36]:
df = df.rename(columns={
    'winner_seed': 'Player1_seed',
    'loser_seed': 'Player2_seed',
    'winner_age': 'Player1_age',
    'loser_age': 'Player2_age',
    'winner_rank': 'Player1_rank',
    'loser_rank': 'Player2_rank',
    'winner_rank_points': 'Player1_rank_points',
    'loser_rank_points': 'Player2_rank_points'
})


In [37]:
df

Unnamed: 0,Player1_seed,Player1,winner_ioc,Player1_age,Player2_seed,Player2,loser_ioc,Player2_age,Player1_rank,Player1_rank_points,...,surface_Hard,round_BR,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF
0,7.0,Alexandr Dolgopolov,UKR,26.1,0.0,Carlos Berlocq,ARG,31.9,23.0,1455.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,Roger Federer,SUI,33.4,4.0,Grigor Dimitrov,BUL,23.6,2.0,9625.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3.0,Milos Raonic,CAN,24.0,2.0,Kei Nishikori,JPN,25.0,8.0,4440.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,Sam Groth,AUS,27.2,0.0,Lleyton Hewitt,AUS,33.8,85.0,586.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,Lukasz Kubot,POL,32.6,5.0,Kevin Anderson,RSA,28.6,177.0,282.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16320,8.0,Joao Fonseca,BRA,18.3,6.0,Luca Van Assche,FRA,20.5,145.0,409.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16321,6.0,Luca Van Assche,FRA,20.5,4.0,Juncheng Shang,CHN,19.8,128.0,471.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16322,2.0,Alex Michelsen,USA,20.3,7.0,Nishesh Basavareddy,USA,19.6,41.0,1245.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16323,2.0,Alex Michelsen,USA,20.3,6.0,Luca Van Assche,FRA,20.5,41.0,1245.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [38]:
df=df.drop(['winner_ioc','loser_ioc'],axis=1)

In [39]:
df.columns

Index(['Player1_seed', 'Player1', 'Player1_age', 'Player2_seed', 'Player2',
       'Player2_age', 'Player1_rank', 'Player1_rank_points', 'Player2_rank',
       'Player2_rank_points', 'player1_h2h_wins', 'player2_h2h_wins',
       'seed_diff', 'rank_diff', 'rank_point_diff', 'age_diff', 'surface_Hard',
       'round_BR', 'round_F', 'round_QF', 'round_R128', 'round_R16',
       'round_R32', 'round_R64', 'round_RR', 'round_SF'],
      dtype='object')

In [40]:
df = df.loc[:, ~df.columns.duplicated()]


In [41]:
df  

Unnamed: 0,Player1_seed,Player1,Player1_age,Player2_seed,Player2,Player2_age,Player1_rank,Player1_rank_points,Player2_rank,Player2_rank_points,...,surface_Hard,round_BR,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF
0,7.0,Alexandr Dolgopolov,26.1,0.0,Carlos Berlocq,31.9,23.0,1455.0,71.0,700.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,Roger Federer,33.4,4.0,Grigor Dimitrov,23.6,2.0,9625.0,11.0,3645.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3.0,Milos Raonic,24.0,2.0,Kei Nishikori,25.0,8.0,4440.0,5.0,5025.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,Sam Groth,27.2,0.0,Lleyton Hewitt,33.8,85.0,586.0,84.0,595.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,Lukasz Kubot,32.6,5.0,Kevin Anderson,28.6,177.0,282.0,16.0,2080.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16320,8.0,Joao Fonseca,18.3,6.0,Luca Van Assche,20.5,145.0,409.0,128.0,471.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
16321,6.0,Luca Van Assche,20.5,4.0,Juncheng Shang,19.8,128.0,471.0,50.0,1115.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16322,2.0,Alex Michelsen,20.3,7.0,Nishesh Basavareddy,19.6,41.0,1245.0,138.0,440.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16323,2.0,Alex Michelsen,20.3,6.0,Luca Van Assche,20.5,41.0,1245.0,128.0,471.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [42]:
df.columns

Index(['Player1_seed', 'Player1', 'Player1_age', 'Player2_seed', 'Player2',
       'Player2_age', 'Player1_rank', 'Player1_rank_points', 'Player2_rank',
       'Player2_rank_points', 'player1_h2h_wins', 'player2_h2h_wins',
       'seed_diff', 'rank_diff', 'rank_point_diff', 'age_diff', 'surface_Hard',
       'round_BR', 'round_F', 'round_QF', 'round_R128', 'round_R16',
       'round_R32', 'round_R64', 'round_RR', 'round_SF'],
      dtype='object')

In [43]:
df['ELO1']=1500
df['ELO2']=1500

In [44]:
df["ELO1"] = df["ELO1"].astype(float)
df["ELO2"] = df["ELO2"].astype(float)


In [45]:
def get_ELO_rating(df):

    # Dictionary to store latest ratings
    ratings = {}

    df = df.reset_index(drop=True)

    for i in range(len(df)):
        p1, p2 = df.loc[i, "Player1"], df.loc[i, "Player2"]

        # Get latest ratings (default 1500)
        r1 = ratings.get(p1, 1500.0)
        r2 = ratings.get(p2, 1500.0)

        # Update row with current ratings
        df.loc[i, "ELO1"] = r1
        df.loc[i, "ELO2"] = r2

        # Compute new ratings (Player1 always winner here)
        new_r1, new_r2 = ELO_math(r1, r2, 32, 1)

        # Save back to dictionary
        ratings[p1] = new_r1
        ratings[p2] = new_r2

    return df


In [46]:
df=get_ELO_rating(df)

In [47]:
df

Unnamed: 0,Player1_seed,Player1,Player1_age,Player2_seed,Player2,Player2_age,Player1_rank,Player1_rank_points,Player2_rank,Player2_rank_points,...,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF,ELO1,ELO2
0,7.0,Alexandr Dolgopolov,26.1,0.0,Carlos Berlocq,31.9,23.0,1455.0,71.0,700.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000
1,1.0,Roger Federer,33.4,4.0,Grigor Dimitrov,23.6,2.0,9625.0,11.0,3645.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1500.000000,1500.000000
2,3.0,Milos Raonic,24.0,2.0,Kei Nishikori,25.0,8.0,4440.0,5.0,5025.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1500.000000,1500.000000
3,0.0,Sam Groth,27.2,0.0,Lleyton Hewitt,33.8,85.0,586.0,84.0,595.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000
4,0.0,Lukasz Kubot,32.6,5.0,Kevin Anderson,28.6,177.0,282.0,16.0,2080.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16317,8.0,Joao Fonseca,18.3,6.0,Luca Van Assche,20.5,145.0,409.0,128.0,471.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1602.878162,1517.998016
16318,6.0,Luca Van Assche,20.5,4.0,Juncheng Shang,19.8,128.0,471.0,50.0,1115.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1505.830937,1611.356733
16319,2.0,Alex Michelsen,20.3,7.0,Nishesh Basavareddy,19.6,41.0,1245.0,138.0,440.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1644.211400,1484.096453
16320,2.0,Alex Michelsen,20.3,6.0,Luca Van Assche,20.5,41.0,1245.0,128.0,471.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1653.319001,1526.546460


In [48]:
df['target']=1

In [49]:
import pandas as pd
winners_df = df.copy()
winners_df['target'] = 1

losers_df = df.copy()
swap_pairs = [
    ('Player1_seed', 'Player2_seed'),
    ('Player1', 'Player2'),
    ('Player1_age', 'Player2_age'),
    ('Player1_rank', 'Player2_rank'),
    ('Player1_rank_points', 'Player2_rank_points')
]
for p1, p2 in swap_pairs:
    losers_df[p1], losers_df[p2] = df[p2], df[p1]
losers_df['seed_diff'] = losers_df['Player1_seed'] - losers_df['Player2_seed']
losers_df['rank_diff'] = losers_df['Player1_rank'] - losers_df['Player2_rank']
losers_df['rank_point_diff'] = losers_df['Player1_rank_points'] - losers_df['Player2_rank_points']
losers_df['age_diff'] = losers_df['Player1_age'] - losers_df['Player2_age']
losers_df['target'] = 0
final_df = pd.concat([winners_df, losers_df], ignore_index=True)

print(final_df['target'].value_counts())



target
1    16322
0    16322
Name: count, dtype: int64


In [50]:
print(final_df['target'].value_counts())

target
1    16322
0    16322
Name: count, dtype: int64


In [51]:
final_df = final_df.drop(columns=['surface_Hard'])


In [52]:
dftrain=final_df.drop(['Player1','Player2'],axis=1)

In [53]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32644 entries, 0 to 32643
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player1_seed         32644 non-null  float64
 1   Player1_age          32644 non-null  float64
 2   Player2_seed         32644 non-null  float64
 3   Player2_age          32644 non-null  float64
 4   Player1_rank         32644 non-null  float64
 5   Player1_rank_points  32644 non-null  float64
 6   Player2_rank         32644 non-null  float64
 7   Player2_rank_points  32644 non-null  float64
 8   player1_h2h_wins     32644 non-null  int64  
 9   player2_h2h_wins     32644 non-null  int64  
 10  seed_diff            32644 non-null  float64
 11  rank_diff            32644 non-null  float64
 12  rank_point_diff      32644 non-null  float64
 13  age_diff             32644 non-null  float64
 14  round_BR             32644 non-null  float64
 15  round_F              32644 non-null 

In [54]:
df.columns

Index(['Player1_seed', 'Player1', 'Player1_age', 'Player2_seed', 'Player2',
       'Player2_age', 'Player1_rank', 'Player1_rank_points', 'Player2_rank',
       'Player2_rank_points', 'player1_h2h_wins', 'player2_h2h_wins',
       'seed_diff', 'rank_diff', 'rank_point_diff', 'age_diff', 'surface_Hard',
       'round_BR', 'round_F', 'round_QF', 'round_R128', 'round_R16',
       'round_R32', 'round_R64', 'round_RR', 'round_SF', 'ELO1', 'ELO2',
       'target'],
      dtype='object')

In [55]:
dftrain.columns

Index(['Player1_seed', 'Player1_age', 'Player2_seed', 'Player2_age',
       'Player1_rank', 'Player1_rank_points', 'Player2_rank',
       'Player2_rank_points', 'player1_h2h_wins', 'player2_h2h_wins',
       'seed_diff', 'rank_diff', 'rank_point_diff', 'age_diff', 'round_BR',
       'round_F', 'round_QF', 'round_R128', 'round_R16', 'round_R32',
       'round_R64', 'round_RR', 'round_SF', 'ELO1', 'ELO2', 'target'],
      dtype='object')

In [56]:
y=dftrain['target']
X=dftrain.drop(['target'],axis=1)

In [57]:
y.unique()

array([1, 0])

In [58]:
X

Unnamed: 0,Player1_seed,Player1_age,Player2_seed,Player2_age,Player1_rank,Player1_rank_points,Player2_rank,Player2_rank_points,player1_h2h_wins,player2_h2h_wins,...,round_F,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF,ELO1,ELO2
0,7.0,26.1,0.0,31.9,23.0,1455.0,71.0,700.0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000
1,1.0,33.4,4.0,23.6,2.0,9625.0,11.0,3645.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1500.000000,1500.000000
2,3.0,24.0,2.0,25.0,8.0,4440.0,5.0,5025.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1500.000000,1500.000000
3,0.0,27.2,0.0,33.8,85.0,586.0,84.0,595.0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000
4,0.0,32.6,5.0,28.6,177.0,282.0,16.0,2080.0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32639,6.0,20.5,8.0,18.3,128.0,471.0,145.0,409.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1602.878162,1517.998016
32640,4.0,19.8,6.0,20.5,50.0,1115.0,128.0,471.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1505.830937,1611.356733
32641,7.0,19.6,2.0,20.3,138.0,440.0,41.0,1245.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1644.211400,1484.096453
32642,6.0,20.5,2.0,20.3,128.0,471.0,41.0,1245.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1653.319001,1526.546460


In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32644 entries, 0 to 32643
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player1_seed         32644 non-null  float64
 1   Player1_age          32644 non-null  float64
 2   Player2_seed         32644 non-null  float64
 3   Player2_age          32644 non-null  float64
 4   Player1_rank         32644 non-null  float64
 5   Player1_rank_points  32644 non-null  float64
 6   Player2_rank         32644 non-null  float64
 7   Player2_rank_points  32644 non-null  float64
 8   player1_h2h_wins     32644 non-null  int64  
 9   player2_h2h_wins     32644 non-null  int64  
 10  seed_diff            32644 non-null  float64
 11  rank_diff            32644 non-null  float64
 12  rank_point_diff      32644 non-null  float64
 13  age_diff             32644 non-null  float64
 14  round_BR             32644 non-null  float64
 15  round_F              32644 non-null 

In [61]:
dftrain['Player1_age'].unique()

array([26.1, 33.4, 24. , 27.2, 32.6, 22.9, 27. , 27.8, 25.4, 25.5, 25. ,
       22.2, 18.7, 23.6, 32.7, 35.8, 30.8, 29.3, 29.7, 28.9, 26.7, 27.6,
       26.6, 31.5, 23.4, 31.3, 29.2, 28.5, 24.6, 24.5, 31.6, 27.7, 21.4,
       30. , 34.5, 31.1, 22.8, 18.1, 24.1, 29. , 26.3, 33. , 27.9, 21.5,
       26.5, 20.8, 28.6, 26.9, 31.4, 24.7, 22.4, 23.5, 29.8, 19.7, 27.1,
       28.7, 25.9, 24.2, 29.4, 31.2, 33.6, 30.9, 29.5, 25.8, 33.5, 33.3,
       33.9, 28.3, 30.3, 23.9, 20.9, 28.4, 30.4, 22.7, 28.8, 29.6, 27.4,
       25.7, 31.7, 30.1, 23.7, 26.8, 27.3, 25.1, 22.3, 35.9, 18.3, 19.3,
       26.2, 29.1, 28. , 24.8, 18.2, 26. , 17.3, 18.8, 32.8, 27.5, 24.9,
       25.6, 22. , 24.3, 23. , 21.8, 19.5, 33.1, 21.9, 21.7, 31.8, 20.6,
       19. , 25.3, 31. , 34.6, 23.8, 18.9, 25.2, 32.9, 22.5, 19.8, 23.1,
       24.4, 33.7, 17.9, 30.2, 29.9, 28.1, 32.1, 33.8, 17.4, 35.4, 23.3,
       31.9, 22.6, 23.2, 17.7, 20. , 36.6, 36.3, 35. , 34.9, 26.4, 32.2,
       34.1, 34.4, 28.2, 19.2, 36.4, 30.6, 20.2, 33

In [62]:
df2=df

In [63]:
df2

Unnamed: 0,Player1_seed,Player1,Player1_age,Player2_seed,Player2,Player2_age,Player1_rank,Player1_rank_points,Player2_rank,Player2_rank_points,...,round_QF,round_R128,round_R16,round_R32,round_R64,round_RR,round_SF,ELO1,ELO2,target
0,7.0,Alexandr Dolgopolov,26.1,0.0,Carlos Berlocq,31.9,23.0,1455.0,71.0,700.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000,1
1,1.0,Roger Federer,33.4,4.0,Grigor Dimitrov,23.6,2.0,9625.0,11.0,3645.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1500.000000,1500.000000,1
2,3.0,Milos Raonic,24.0,2.0,Kei Nishikori,25.0,8.0,4440.0,5.0,5025.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1500.000000,1500.000000,1
3,0.0,Sam Groth,27.2,0.0,Lleyton Hewitt,33.8,85.0,586.0,84.0,595.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000,1
4,0.0,Lukasz Kubot,32.6,5.0,Kevin Anderson,28.6,177.0,282.0,16.0,2080.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1500.000000,1500.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16317,8.0,Joao Fonseca,18.3,6.0,Luca Van Assche,20.5,145.0,409.0,128.0,471.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1602.878162,1517.998016,1
16318,6.0,Luca Van Assche,20.5,4.0,Juncheng Shang,19.8,128.0,471.0,50.0,1115.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1505.830937,1611.356733,1
16319,2.0,Alex Michelsen,20.3,7.0,Nishesh Basavareddy,19.6,41.0,1245.0,138.0,440.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1644.211400,1484.096453,1
16320,2.0,Alex Michelsen,20.3,6.0,Luca Van Assche,20.5,41.0,1245.0,128.0,471.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1653.319001,1526.546460,1


In [64]:
def get_player_rating_from_df(df, player):
    p1_matches = df[df["Player1"] == player][["ELO1"]]
    p2_matches = df[df["Player2"] == player][["ELO2"]]
    all_matches = []
    if not p1_matches.empty:
        all_matches.append(p1_matches.iloc[-1, 0])  # last ELO1
    if not p2_matches.empty:
        all_matches.append(p2_matches.iloc[-1, 0])  # last ELO2
    
    if not all_matches:  # player not found
        return 1500.0
    
    return all_matches[-1] 


In [65]:
X_train=X_train.dropna()

In [66]:
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_test_scaled  = scaler.fit_transform(X_test.values)
X_train_tensor = torch.FloatTensor(X_train_scaled)
X_test_tensor  = torch.FloatTensor(X_test_scaled)

y_train_tensor = torch.LongTensor(y_train.values)
y_test_tensor  = torch.LongTensor(y_test.values)
class Model(nn.Module):
    def __init__(self, in_features=21, h1=42, h2=21,h3=21, out_features=2): 
        super().__init__()
        self.fc1 = nn.Linear(in_features, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h2, h3)
        self.out = nn.Linear(h3, out_features)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.out(x)

model = Model(in_features=X_train_tensor.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
epochs = 200
for epoch in range(epochs):
    optimizer.zero_grad()
    y_pred = model(X_train_tensor)
    loss = criterion(y_pred, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f"epoch: {epoch} loss: {loss.item():.4f}")


epoch: 9 loss: 0.5896
epoch: 19 loss: 0.4788
epoch: 29 loss: 0.4329
epoch: 39 loss: 0.4097
epoch: 49 loss: 0.4008
epoch: 59 loss: 0.3949
epoch: 69 loss: 0.3901
epoch: 79 loss: 0.3857
epoch: 89 loss: 0.3865
epoch: 99 loss: 0.3811
epoch: 109 loss: 0.3773
epoch: 119 loss: 0.3765
epoch: 129 loss: 0.3727
epoch: 139 loss: 0.3711
epoch: 149 loss: 0.3693
epoch: 159 loss: 0.3702
epoch: 169 loss: 0.3666
epoch: 179 loss: 0.3655
epoch: 189 loss: 0.3671
epoch: 199 loss: 0.3634


In [67]:
with torch.no_grad():
    y_pred_test = model(X_test_tensor)
    predicted_classes = torch.argmax(y_pred_test, dim=1)
    accuracy = (predicted_classes == y_test_tensor).sum().item() / len(y_test_tensor)
    print(f"Test Accuracy: {accuracy*100:.2f}%")


Test Accuracy: 77.93%


In [68]:
import numpy as np

print("NaNs in X_train:", X_train.isnull().sum().sum())
print("Infs in X_train:", np.isinf(X_train.values).sum())
print("NaNs in y_train:", y_train.isnull().sum())
print(X_train.dtypes)


NaNs in X_train: 0
Infs in X_train: 0
NaNs in y_train: 0
Player1_seed           float64
Player1_age            float64
Player2_seed           float64
Player2_age            float64
Player1_rank           float64
Player1_rank_points    float64
Player2_rank           float64
Player2_rank_points    float64
player1_h2h_wins         int64
player2_h2h_wins         int64
seed_diff              float64
rank_diff              float64
rank_point_diff        float64
age_diff               float64
round_BR               float64
round_F                float64
round_QF               float64
round_R128             float64
round_R16              float64
round_R32              float64
round_R64              float64
round_RR               float64
round_SF               float64
ELO1                   float64
ELO2                   float64
dtype: object


In [69]:
X_train['Player1_rank'].isnull().sum()

np.int64(0)

In [70]:
player1_seed = 1
player1_name = 'Jannik Sinner'
player1_age = 23
player1_rank = 1
player1_rank_points = 11480

player2_seed = 2
player2_name = 'Carlos Alcaraz'
player2_age = 22
player2_rank = 2
player2_rank_points = 9590
ELO1=get_player_rating_from_df(df2,player1_name)
ELO2=get_player_rating_from_df(df2,player2_name)

#Head to head
h3hh=get_h2h_features(df1,player1_name,player2_name)

seed_diff = (player1_seed - player2_seed)
rank_diff = (player1_rank - player2_rank)
rank_point_diff = (player1_rank_points - player2_rank_points)
age_diff = (player1_age - player2_age)

round_cols = {
    'round_BR': 0, 'round_F': 1, 'round_QF': 0,
    'round_R128': 0, 'round_R16': 0, 'round_R32': 0,
    'round_R64': 0, 'round_RR': 0, 'round_SF': 0
}

# ==== Create DataFrame for Match ====
match_df = pd.DataFrame([{
    'Player1_seed': player1_seed,
    'Player1_age': player1_age,
    'Player2_seed': player2_seed,
    'Player2_age': player2_age,
    'Player1_rank': player1_rank,
    'Player1_rank_points': player1_rank_points,
    'Player2_rank': player2_rank,
    'Player2_rank_points': player2_rank_points,
    'player1_h2h_wins':h3hh['player1_h2h_wins'],
    'player2_h2h_wins':h3hh['player2_h2h_wins'],
    'seed_diff': seed_diff,
    'rank_diff': rank_diff,
    'rank_point_diff': rank_point_diff,
    'age_diff': age_diff,
    **round_cols,
    'ELO1':ELO1,
    'ELO2':ELO2
}])
X_tensor = torch.tensor(match_df.values, dtype=torch.float32)

model.eval() 
with torch.no_grad():
    output = model(X_tensor)
    probs = F.softmax(output, dim=1)
    pred_class = torch.argmax(probs, dim=1).item()
    pred_prob = probs[0][pred_class].item() * 100

winner = player1_name if pred_class == 1 else player2_name

print(f"Predicted Winner: {winner}")

Predicted Winner: Jannik Sinner
