In [60]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [61]:
torch.version.cuda

'11.8'

In [62]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [63]:
fights_df = pd.read_csv('fights_final.csv')
fights_df.head()

Unnamed: 0.1,Unnamed: 0,fighter_1,fighter_2,method,round,time,time format,kd_fighter_1,kd_fighter_2,sig_landed_fighter_1,...,age_fighter_2,slpm_fighter_2,str_acc_fighter_2,sapm_fighter_2,str_def_fighter_2,td_avg_fighter_2,td_acc_fighter_2,td_def_fighter_2,sub_avg_fighter_2,winner
0,0,Israel Adesanya,Sean Strickland,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),0,1,94,...,32.0,5.82,41,4.24,63,0.92,64,84,0.2,1
1,1,Tai Tuivasa,Alexander Volkov,Submission,2,4:37,3 Rnd (5-5-5),0,1,28,...,35.0,5.1,57,3.0,54,0.49,63,73,0.2,1
2,2,Manel Kape,Felipe dos Santos,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),1,0,112,...,23.0,6.6,32,7.47,38,0.0,0,0,0.0,0
3,3,Justin Tafa,Austen Lane,KO/TKO,1,1:22,3 Rnd (5-5-5),1,0,11,...,36.0,4.48,47,2.85,48,0.0,0,0,0.0,0
4,4,Tyson Pedro,Anton Turkalj,KO/TKO,1,2:12,3 Rnd (5-5-5),1,0,16,...,27.0,1.72,53,2.48,49,6.55,51,16,0.0,0


In [64]:
fights_df.isnull().values.any()

False

In [65]:
def get_time(round, time):
  minutes, seconds =  [int(s) for s in time.split(":")]
  return (round - 1)*5 + minutes + seconds/60

In [66]:
for ind in fights_df.index:
  time = get_time(fights_df["round"][ind], fights_df["time"][ind])
  fights_df.loc[ind, "sig_landed_fighter_1"] /= time
  fights_df.loc[ind, "sig_landed_fighter_2"] /= time
  fights_df.loc[ind, "td_landed_fighter_1"] /= time
  fights_df.loc[ind, "td_landed_fighter_2"] /= time

In [67]:
X_features = ["slpm_fighter_1","sapm_fighter_2","str_acc_fighter_1","str_def_fighter_1","slpm_fighter_2","sapm_fighter_1","str_acc_fighter_2","str_def_fighter_2"]
y_features = ["sig_landed_fighter_1", "sig_landed_fighter_2"]

num_fights = 250

X, y = fights_df[X_features], fights_df[y_features]
X_train, X_test, y_train, y_test = X[num_fights:], X[:num_fights], y[num_fights:], y[:num_fights]

In [68]:
my_model = XGBRegressor()
my_model.fit(X_train, y_train,
             early_stopping_rounds=5,
             eval_set=[(X_test, y_test)],
             verbose=False)



In [69]:
preds = my_model.predict(X_test)

In [70]:
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error: " + str(mean_absolute_error(preds, y_test)))

Mean Absolute Error: 1.715958491199919


In [71]:
fights_df.loc[y_test.index, ["sig_landed_fighter_1", "sig_landed_fighter_2"]]=preds

In [72]:
fights_df.head()

Unnamed: 0.1,Unnamed: 0,fighter_1,fighter_2,method,round,time,time format,kd_fighter_1,kd_fighter_2,sig_landed_fighter_1,...,age_fighter_2,slpm_fighter_2,str_acc_fighter_2,sapm_fighter_2,str_def_fighter_2,td_avg_fighter_2,td_acc_fighter_2,td_def_fighter_2,sub_avg_fighter_2,winner
0,0,Israel Adesanya,Sean Strickland,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),0,1,5.032218,...,32.0,5.82,41,4.24,63,0.92,64,84,0.2,1
1,1,Tai Tuivasa,Alexander Volkov,Submission,2,4:37,3 Rnd (5-5-5),0,1,6.122884,...,35.0,5.1,57,3.0,54,0.49,63,73,0.2,1
2,2,Manel Kape,Felipe dos Santos,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),1,0,6.730522,...,23.0,6.6,32,7.47,38,0.0,0,0,0.0,0
3,3,Justin Tafa,Austen Lane,KO/TKO,1,1:22,3 Rnd (5-5-5),1,0,5.117199,...,36.0,4.48,47,2.85,48,0.0,0,0,0.0,0
4,4,Tyson Pedro,Anton Turkalj,KO/TKO,1,2:12,3 Rnd (5-5-5),1,0,3.727228,...,27.0,1.72,53,2.48,49,6.55,51,16,0.0,0


In [73]:
def get_time(round, time):
  minutes, seconds =  [int(s) for s in time.split(":")]
  return (round - 1)*5 + minutes + seconds/60

In [74]:
cats = ["time format"]
for cat in cats:
    fights_df[cat].fillna("unk", inplace=True)
    fights_df[cat] = fights_df[cat].astype("category")

In [75]:
fights_df[cats] = fights_df[cats].apply(lambda col: col.cat.codes)
fights_df.head(10)

Unnamed: 0.1,Unnamed: 0,fighter_1,fighter_2,method,round,time,time format,kd_fighter_1,kd_fighter_2,sig_landed_fighter_1,...,age_fighter_2,slpm_fighter_2,str_acc_fighter_2,sapm_fighter_2,str_def_fighter_2,td_avg_fighter_2,td_acc_fighter_2,td_def_fighter_2,sub_avg_fighter_2,winner
0,0,Israel Adesanya,Sean Strickland,Decision - Unanimous,5,5:00,1,0,1,5.032218,...,32.0,5.82,41,4.24,63,0.92,64,84,0.2,1
1,1,Tai Tuivasa,Alexander Volkov,Submission,2,4:37,0,0,1,6.122884,...,35.0,5.1,57,3.0,54,0.49,63,73,0.2,1
2,2,Manel Kape,Felipe dos Santos,Decision - Unanimous,3,5:00,0,1,0,6.730522,...,23.0,6.6,32,7.47,38,0.0,0,0,0.0,0
3,3,Justin Tafa,Austen Lane,KO/TKO,1,1:22,0,1,0,5.117199,...,36.0,4.48,47,2.85,48,0.0,0,0,0.0,0
4,4,Tyson Pedro,Anton Turkalj,KO/TKO,1,2:12,0,1,0,3.727228,...,27.0,1.72,53,2.48,49,6.55,51,16,0.0,0
5,5,Carlos Ulberg,Da Woon Jung,Submission,3,4:49,0,1,0,6.089372,...,30.0,3.49,43,3.89,51,1.9,50,77,0.2,0
6,6,Jack Jenkins,Chepe Mariscal,KO/TKO,2,3:19,0,0,0,5.142348,...,31.0,4.67,53,3.95,52,3.22,38,100,0.6,1
7,7,Jamie Mullarkey,John Makdessi,Decision - Unanimous,3,5:00,0,0,0,5.142348,...,38.0,5.52,49,4.15,68,0.0,0,86,0.0,0
8,8,Nasrat Haqparast,Landon Quinones,Decision - Unanimous,3,5:00,0,0,0,9.226934,...,28.0,9.87,41,11.4,54,0.0,0,100,0.0,0
9,9,Blood Diamond,Charles Radtke,Decision - Unanimous,3,5:00,0,0,0,3.657702,...,33.0,2.33,46,2.93,53,1.0,16,100,0.0,1


In [78]:
fights_df = fights_df[["time format","sig_landed_fighter_1", "sig_landed_fighter_2","td_landed_fighter_1","td_landed_fighter_2", "winner"]]

In [79]:
num_fights = 250
X, y = fights_df.drop("winner", axis=1), fights_df["winner"]
X_train, X_test, y_train, y_test = X[num_fights:], X[:num_fights], y[num_fights:], y[:num_fights]

In [80]:
# normalization
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

conts = X.drop(cats, axis=1).columns.values.tolist()

scaler = StandardScaler()
X_train_scaled.loc[:, conts] = scaler.fit_transform(X_train[conts])
X_test_scaled.loc[:, conts] = scaler.transform(X_test[conts])

In [81]:
class UfcDataset(Dataset):
    def __init__(self, df, cats, conts, targets):
        self.X_cats = df[cats].astype(np.int64).values
        self.X_conts = df[conts].astype(np.float32).values
        self.y = targets.astype(np.float32).values.reshape(-1, 1)


    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, i):
        return [self.X_cats[i], self.X_conts[i], self.y[i]]

In [82]:
train_dataset = UfcDataset(X_train_scaled, cats, conts, y_train)
test_dataset = UfcDataset(X_test_scaled, cats, conts, y_test)

In [83]:
# Data loaders
bz=32
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=bz, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=bz, shuffle=False)

In [84]:
class UfcNet(nn.Module):
    def __init__(self, emb_dims, num_conts, fc_layer_sizes, emb_drop, ps):
        super(UfcNet, self).__init__()

        # embedding layers for categorical features
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        self.num_embs = sum([y for _, y in emb_dims])
        self.num_conts = num_conts

        # fully connected layers
        fc_layer_sizes = [self.num_embs + self.num_conts] + fc_layer_sizes
        self.fc_layers = nn.ModuleList([nn.Linear(fc_layer_sizes[i],fc_layer_sizes[i+1])
                                        for i in range(len(fc_layer_sizes)-1)])

        # out layer
        self.out = nn.Linear(fc_layer_sizes[-1], 1)

        # batch norm layers
        self.first_bn = nn.BatchNorm1d(self.num_conts)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(sz)
                                        for sz in fc_layer_sizes[1:]])
        # dropout layers
        self.emb_drop = nn.Dropout(emb_drop)
        self.dropout_layers = nn.ModuleList([nn.Dropout(p) for p in ps])

    def forward(self, x_cats, x_conts):
        x = [e(x_cats[:, i]) for i, e in enumerate(self.emb_layers)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)

        x_c = self.first_bn(x_conts)
        x = torch.cat([x, x_c], 1)

        for fc, bn, d in zip(self.fc_layers, self.bn_layers, self.dropout_layers):
            x = F.relu(fc(x))
            x = bn(x)
            x = d(x)

        x = self.out(x)
        return torch.sigmoid(x)

In [85]:
emb_dims = [(len(fights_df[cat].unique()), min(50, len(fights_df[cat].unique())//2)) for cat in cats]
emb_dims

[(2, 1)]

In [86]:
num_conts = len(conts)
fc_layer_sizes = [512, 128, 32]
emb_drop = 0.5
ps = [0.5] * 3

In [87]:
ufc_model = UfcNet(emb_dims, num_conts, fc_layer_sizes, emb_drop, ps).to(device)

In [88]:
criterion = nn.BCELoss()
learning_rate = 1e-2
optimizer = torch.optim.Adam(ufc_model.parameters(), lr=learning_rate, weight_decay=1e-4)

In [89]:
num_epochs = 5
total_step = len(train_dl)
for epoch in range(num_epochs):
    for i, (x_cats, x_conts, y) in enumerate(train_dl):
        x_cats, x_conts, y = x_cats.to(device), x_conts.to(device), y.to(device)

        # forward
        outputs = ufc_model(x_cats, x_conts)
        loss = criterion(outputs, y)

        # backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/5], Step [100/183], Loss: 0.4244
Epoch [2/5], Step [100/183], Loss: 0.3756
Epoch [3/5], Step [100/183], Loss: 0.4509
Epoch [4/5], Step [100/183], Loss: 0.4002
Epoch [5/5], Step [100/183], Loss: 0.4197


In [90]:
with torch.no_grad():
    correct = 0
    total = 0
    for x_cats, x_conts, y in test_dl:
        x_cats, x_conts, y = x_cats.to(device), x_conts.to(device), y.to(device)
        outputs = ufc_model(x_cats, x_conts)
        preds = (outputs>0.5).type(torch.cuda.FloatTensor)
        total += y.size(0)
        correct += (preds == y).sum().item()

print ("Accuracy: {:.2f}%".format(100*correct/total))

Accuracy: 70.40%
