In [47]:
import pandas as pd

############ load and digitize the data
data=pd.read_csv('/home/ebotian/MCM/tennis.csv')
data = pd.get_dummies(data, columns=['winner_shot_type','serve_width','serve_depth','return_depth'])
############

############# pre-process the data
# fill nan with 0, and replace AD with 50
data = data.fillna(0)
data = data.replace('AD', 50.0)
data['point_victor']=data["point_victor"].replace(2,0)
#print(data.iloc[:,15])

# split the data into different match
grouped = dict(tuple(data.groupby(data['match_id'].ne(data['match_id'].shift()).cumsum())))

# Rename the subdata
subdata = {df['match_id'].iloc[0]: df for _, df in grouped.items()}

# Create a new dataset from the first column, excluding duplicates
match = pd.DataFrame(data.iloc[:, 0].drop_duplicates()).iloc[:,0].tolist()
#print(match_id[0])
##############


In [48]:
# the id of the match
id=0

In [49]:
# Calculate the time difference between consecutive rows

# Convert the timestamp column to datetime format
subdata[match[id]]['elapsed_time'] = pd.to_timedelta(subdata[match[id]]['elapsed_time'])

# Calculate the time difference between consecutive rows
subdata[match[id]]['time_diff'] = subdata[match[id]]['elapsed_time'].diff()

# Calculate the 5th and 95th percentiles
lower_threshold = subdata[match[id]]['time_diff'].quantile(0.05)
upper_threshold = subdata[match[id]]['time_diff'].quantile(0.95)

# Exclude the top 5% and bottom 5% of periods
filtered_diff = subdata[match[id]]['time_diff'][(subdata[match[id]]['time_diff'] > lower_threshold) & (subdata[match[id]]['time_diff'] < upper_threshold)]

# Calculate the average of the remaining intervals
average_interval = filtered_diff.mean()

print(f'Average interval (excluding top 5% and bottom 5%): {average_interval}')

# Convert the time differences to integer seconds
subdata[match[id]]['time_diff'] = subdata[match[id]]['time_diff'].dt.total_seconds()

# Replace 'NaT' values with 0
subdata[match[id]]['time_diff'] = subdata[match[id]]['time_diff'].fillna(0).astype(int)
#fill nan with 0
#subdata[match[id]]['time_diff'] = subdata[match[id]]['time_diff'].fillna(0)

Average interval (excluding top 5% and bottom 5%): 0 days 00:00:43.784090909


In [50]:
############ add features
add_feature=["score_diff"]

########## defining the new features

#subdata[match_id[0]][add_feature[0]] = subdata[match_id[0]]['p1_games'] - subdata[match_id[0]]['p2_games']

In [51]:
# split the data into features and target
target=pd.DataFrame(subdata[match[id]]["point_victor"])
# Add the "elapsed_time" column to the "target" DataFrame
target.insert(0, 'elapsed_time', subdata[match[id]]['elapsed_time'])
target['elapsed_time'] = target['elapsed_time'].dt.total_seconds()

subdata[match[id]]=subdata[match[id]].drop(columns=["point_victor"])
features=subdata[match[id]].iloc[:,4:]
#print(target)

In [52]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.linear_model import LinearRegression
import numpy as np

# Assuming 'target' is your time series data...

# Split data into training set and test set
train_size = int(len(target) * 0.7)
train, test = target[0:train_size], target[train_size:len(target)]
train=train.astype(float)

In [53]:

# ARIMA
model_arima = ARIMA(train["point_victor"], order=(5,1,0))
model_arima_fit = model_arima.fit()
forecast_output = model_arima_fit.forecast(steps=len(test))
#print(forecast_output)
#print(test)
rmse_arima = sqrt(mean_squared_error(test["point_victor"], forecast_output))
print(rmse_arima)

0.5253652185900073


In [54]:

# SARIMA
model_sarima = SARIMAX(train["point_victor"], order=(1, 1, 1), seasonal_order=(0, 0, 0, 0))
model_sarima_fit = model_sarima.fit(disp=0)
predictions_sarima = model_sarima_fit.forecast(steps=len(test))
rmse_sarima = sqrt(mean_squared_error(test["point_victor"], predictions_sarima))
print(rmse_sarima)


0.49952779636230954


In [61]:
# LSTM
from re import X


X_train = np.array([[train["elapsed_time"].values[i]] for i in range(1, len(train))])
y_train = np.array([train["point_victor"].values[i] for i in range(1, len(train))])
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(1, 1)))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer='adam', loss='mse')
model_lstm.fit(X_train, y_train, epochs=50, verbose=0)
X_test = np.array([[test["elapsed_time"].values[i]] for i in range(1, len(test))])
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
predictions_lstm = model_lstm.predict(X_test)
print(X_test)
print(predictions_lstm)
rmse_lstm = sqrt(mean_squared_error(test["point_victor"][1:], predictions_lstm))
print(rmse_lstm)

[[[ 9989.]]

 [[10157.]]

 [[10203.]]

 [[10248.]]

 [[10272.]]

 [[10300.]]

 [[10355.]]

 [[10390.]]

 [[10443.]]

 [[10483.]]

 [[10510.]]

 [[10553.]]

 [[10629.]]

 [[10666.]]

 [[10712.]]

 [[10755.]]

 [[10788.]]

 [[10840.]]

 [[10907.]]

 [[10945.]]

 [[11007.]]

 [[11046.]]

 [[11104.]]

 [[11136.]]

 [[11176.]]

 [[11229.]]

 [[11258.]]

 [[11342.]]

 [[11395.]]

 [[11427.]]

 [[11554.]]

 [[11571.]]

 [[11592.]]

 [[11627.]]

 [[11647.]]

 [[11684.]]

 [[11712.]]

 [[11753.]]

 [[11818.]]

 [[11842.]]

 [[11875.]]

 [[11924.]]

 [[11953.]]

 [[11986.]]

 [[12021.]]

 [[12054.]]

 [[12178.]]

 [[12235.]]

 [[12303.]]

 [[12328.]]

 [[12364.]]

 [[12422.]]

 [[12488.]]

 [[12536.]]

 [[12561.]]

 [[12594.]]

 [[12631.]]

 [[12747.]]

 [[12781.]]

 [[12825.]]

 [[12864.]]

 [[12893.]]

 [[12932.]]

 [[12954.]]

 [[12986.]]

 [[13012.]]

 [[13041.]]

 [[13172.]]

 [[13214.]]

 [[13260.]]

 [[13292.]]

 [[13336.]]

 [[13368.]]

 [[13429.]]

 [[13463.]]

 [[13510.]]

 [[13548.]]


In [57]:

# Linear Regression
from re import X


X_train = np.array([i for i in range(0, len(train))]).reshape(-1, 1)
y_train = train.values
model_linear = LinearRegression()
model_linear.fit(X_train, y_train)
X_test = np.array([i for i in range(len(train), len(train) + len(test))]).reshape(-1, 1)
predictions_linear = model_linear.predict(X_test)
rmse_linear = sqrt(mean_squared_error(test, predictions_linear))

print(X_test)

# Print RMSE values
print('RMSE values:')
print('ARIMA: ', rmse_arima)
print('SARIMA: ', rmse_sarima)
print('LSTM: ', rmse_lstm)
print('Linear Regression: ', rmse_linear)

[[210]
 [211]
 [212]
 [213]
 [214]
 [215]
 [216]
 [217]
 [218]
 [219]
 [220]
 [221]
 [222]
 [223]
 [224]
 [225]
 [226]
 [227]
 [228]
 [229]
 [230]
 [231]
 [232]
 [233]
 [234]
 [235]
 [236]
 [237]
 [238]
 [239]
 [240]
 [241]
 [242]
 [243]
 [244]
 [245]
 [246]
 [247]
 [248]
 [249]
 [250]
 [251]
 [252]
 [253]
 [254]
 [255]
 [256]
 [257]
 [258]
 [259]
 [260]
 [261]
 [262]
 [263]
 [264]
 [265]
 [266]
 [267]
 [268]
 [269]
 [270]
 [271]
 [272]
 [273]
 [274]
 [275]
 [276]
 [277]
 [278]
 [279]
 [280]
 [281]
 [282]
 [283]
 [284]
 [285]
 [286]
 [287]
 [288]
 [289]
 [290]
 [291]
 [292]
 [293]
 [294]
 [295]
 [296]
 [297]
 [298]
 [299]]
RMSE values:
ARIMA:  0.5253652185900073
SARIMA:  0.49952779636230954
LSTM:  0.17932097083592433
Linear Regression:  63.11021858487515
