In [161]:
import pandas as pd
import numpy as np

data_combined = pd.read_csv('./../data/data_combined.csv')
data_head_to_head = pd.read_csv('./../data/data_head_to_head.csv')



In [162]:
combined_dict = {}
for index, row in data_combined.iterrows():
    combined_dict[row['PlayerName']] = row
    
def get_player_data(player_name, column):
    if player_name not in combined_dict:
        def_row = pd.Series(
            data ={
            'PlayerName': player_name,
            'EloOutdoor': 1500,
            'EloIndoor': 1500,
            'EloHard': 1500,
            'EloClay': 1500,
            'EloGrass': 1500,
            'EloCarpet': 1500,
            'Last30': 0,
            'Last7': 0,
            'AvgOddsProb': 0.0
        })
        return def_row[column]
    else:
        return combined_dict[player_name][column]
    
def get_player_data_row(player_name):
    if player_name not in combined_dict:
        def_row = pd.Series(
            data ={
            'PlayerName': player_name,
            'EloOutdoor': 1500,
            'EloIndoor': 1500,
            'EloHard': 1500,
            'EloClay': 1500,
            'EloGrass': 1500,
            'EloCarpet': 1500,
            'Last30': 0,
            'Last7': 0,
            'AvgOddsProb': 0.0
        })
        return def_row
    else:
        return combined_dict[player_name]

In [163]:
head_head_dict = {}
for index, row in data_head_to_head.iterrows():
    player1 = row['PlayerName']
    for column in data_head_to_head.columns[1:]:
        player2 = column
        key = f'{player1} vs {player2}'
        val = row[column]
        head_head_dict[key] = val

def get_win_amount_row(player_1, player_2):
    key = f'{player_1} vs {player_2}'
    return head_head_dict.get(key, 0)

In [164]:
max_elo = 4000
all_columns = ['EloDiff', 'HeadToHead', 'ProbDiff', 'Winner']


def get_nn_input(player1, player2, surface):
    player1_data = get_player_data_row(player1)
    player2_data = get_player_data_row(player2)
    p1_vs_p2 = get_win_amount_row(player_1=player1, player_2=player2)
    p2_vs_p1 = get_win_amount_row(player_1=player2, player_2=player1)
    
    elo_diff = player1_data[f'Elo{surface}'] - player2_data[f'Elo{surface}']
    head_to_head = p1_vs_p2 - p2_vs_p1
    prob_dif = player1_data['AvgOddsProb'] - player2_data['AvgOddsProb']
    
    elo_diff /= max_elo
    if head_to_head > 0:
        head_to_head = 1 - 1/head_to_head
    if head_to_head < 0:
        head_to_head = -1 + 1/abs(head_to_head)
    
    prob_dif = prob_dif
    
    return [elo_diff, head_to_head, prob_dif]

def get_data_set_for_nn(dataset : pd.DataFrame):
    data = []
    for index, row in dataset.iterrows():
        player1 = row['Player_1']
        player2 = row['Player_2']
        surface = row['Surface']
        winner = row['Winner']

        nn_in = get_nn_input(player1, player2, surface)
        data.append(
            nn_in + [1 if player1 == winner else 0]
        )
     

    nn_data = pd.DataFrame(data, columns=all_columns)
    nn_data = nn_data.sample(frac=1).reset_index(drop=True)
    return nn_data

data_for_training = pd.read_csv('./../data/data_for_training.csv')
nn_data = get_data_set_for_nn(dataset=data_for_training)
print(nn_data.head())
nn_data.to_csv('./../data/nn_data.csv', index=False)

    EloDiff  HeadToHead  ProbDiff  Winner
0 -0.006789         0.5 -0.010572       1
1  0.022271         0.0 -0.027261       1
2 -0.002064         0.0  0.000000       1
3  0.028253        -0.5 -0.076674       0
4  0.052406         0.0  0.595692       0


In [165]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf

input_data = nn_data[all_columns[:-1]]
output_data = nn_data['Winner']

X_train, y_train = input_data, output_data

model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=30, batch_size=50, validation_split=0.2)


# _, accuracy = model.evaluate(X_test, y_test)
# print(f'Accuracy: {accuracy}')

tf.keras.models.save_model(model, './../models/nn_model.keras')

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1028/1028[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6879 - loss: 0.5852 - val_accuracy: 0.7317 - val_loss: 0.5294
Epoch 2/30
[1m1028/1028[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7235 - loss: 0.5505 - val_accuracy: 0.7309 - val_loss: 0.5275
Epoch 3/30
[1m1028/1028[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7275 - loss: 0.5450 - val_accuracy: 0.7303 - val_loss: 0.5284
Epoch 4/30
[1m1028/1028[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7287 - loss: 0.5434 - val_accuracy: 0.7309 - val_loss: 0.5294
Epoch 5/30
[1m1028/1028[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7256 - loss: 0.5487 - val_accuracy: 0.7279 - val_loss: 0.5294
Epoch 6/30
[1m1028/1028[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7263 - loss: 0.5466 - val_accuracy: 0.7329 - val_loss: 0.5273
Epoch 7/30
[1m1028/1028[0

In [166]:
# TESTING

data_for_testing = pd.read_csv('./../data/data_for_testing.csv')
print(len(data_for_testing))

test_data = get_data_set_for_nn(dataset=data_for_testing)
X_test, y_test = test_data[all_columns[:-1]], test_data['Winner']

_, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy:.3f}')

588
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6422 - loss: 0.6534 
Accuracy: 0.636


In [167]:
def predict_match(player1, player2, surface):
    nn_in = get_nn_input(player1, player2, surface)
    input_data = np.array([nn_in])
    prediction = model.predict(input_data)
    
    res = pd.DataFrame({
        'PlayerName': [player1, player2],
        'WinProb': [1 - prediction[0][0], prediction[0][0]]
    })
    
    return res


model = tf.keras.models.load_model('./../models/nn_model.keras')

# player1 = 'Martinez P.'
# player2 = 'Brooksby J.'
# surface = 'Hard'

player1 = 'Tiafoe F.'
player2 = 'Davidovich Fokina A.'
surface = 'Hard'

prediction1 = predict_match(player1, player2, surface)
prediction2 = predict_match(player2, player1, surface)

print('==================================')
print(prediction1)
print('==================================')
print(prediction2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
             PlayerName   WinProb
0             Tiafoe F.  0.133112
1  Davidovich Fokina A.  0.866888
             PlayerName   WinProb
0  Davidovich Fokina A.  0.857679
1             Tiafoe F.  0.142321
