In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy import stats



import seaborn as sns
from sklearn import metrics, datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [13]:
# Read the feature table and keep only the rows whose y_draw flag is 0.
df = (
    pd.read_csv("/Users/ik/QMinersHackathon42/data/features_many.csv")
    .loc[lambda frame: frame["y_draw"] == 0]
)

In [14]:
# Discard the columns that are not used further on, then print the remaining schema.
df = df.drop(
    columns=["Season", "Date", "y_draw", "elo_p_h", "market_type", "y_away_win"],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6806 entries, 0 to 7395
Data columns (total 91 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   orig_index                   6806 non-null   int64  
 1   HID                          6806 non-null   int64  
 2   AID                          6806 non-null   int64  
 3   y_home_win                   6806 non-null   int64  
 4   oddsH                        6806 non-null   float64
 5   oddsA                        6806 non-null   float64
 6   elo_h                        6806 non-null   float64
 7   elo_a                        6806 non-null   float64
 8   elo_diff                     6806 non-null   float64
 9   pts_pg_h                     6806 non-null   float64
 10  pts_pg_a                     6806 non-null   float64
 11  gd_pg_h                      6806 non-null   float64
 12  gd_pg_a                      6806 non-null   float64
 13  games_season_h         

In [15]:
# Missing values are replaced by 0 before anything is derived from the frame.
df = df.replace(np.nan, 0)

# Bookmaker's implied home-win probability: inverse home odds divided by the
# sum of both inverse odds, so the home/away pair is normalised to sum to 1.
df["Bookmaker_prob"] = (1 / df["oddsH"]) / (1 / df["oddsH"] + 1 / df["oddsA"])
df = df.drop(columns=["oddsH", "oddsA"])

# NumPy views of the bookmaker baseline, the binary target and the feature matrix
# (every remaining column except the target and the bookmaker probability).
bookmaker_prob = df["Bookmaker_prob"].values
target = df["y_home_win"].values
data = df.drop(columns=["y_home_win", "Bookmaker_prob"]).values

random_seed = 42
scaler = StandardScaler()

print(len(bookmaker_prob))
print(len(data))

# Ordered split (no shuffling): the trailing 25% of rows become the validation set.
(
    Xtrain, Xval,
    ytrain, yval_data,
    bookmaker_prob_train, bookmaker_prob_val_data,
) = train_test_split(data, target, bookmaker_prob, test_size=0.25, shuffle=False)

# Standardise using statistics of the training rows only, then reuse them for validation.
Xtrain = scaler.fit_transform(Xtrain)
Xval_data = scaler.transform(Xval)

6806
6806


In [16]:
# Training arrays as float32 tensors. The 1-D target and bookmaker arrays are
# reshaped to column vectors so they line up with the model's (N, 1) output.
X_train = torch.tensor(Xtrain, dtype=torch.float32)
y_train = torch.tensor(ytrain, dtype=torch.float32).reshape(-1, 1)
bookmaker_prob_train = torch.tensor(bookmaker_prob_train, dtype=torch.float32).reshape(-1, 1)

# Each dataset item is (features, bookmaker probability, target); batches keep row order.
train_dataset = TensorDataset(X_train, bookmaker_prob_train, y_train)
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=64)

In [17]:
import torch.nn.functional as F

class ProbabilityEstimatorNN(nn.Module):
    """Two-hidden-layer ReLU network with a single linear output unit.

    The forward pass returns the raw output of the last linear layer (a logit);
    no sigmoid is applied inside the module, so callers that need a probability
    must apply ``torch.sigmoid`` themselves (or use a loss that works on logits).

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of both hidden layers.
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, 1)  # single output neuron

        # Xavier/Glorot-uniform weights for the hidden layers (in layer order).
        for hidden_layer in (self.fc1, self.fc2):
            nn.init.xavier_uniform_(hidden_layer.weight)

        # Small uniform weights in [-0.1, 0.1] for the output layer.
        nn.init.uniform_(self.output.weight, -0.1, 0.1)

        # Every bias starts at zero.
        for layer in (self.fc1, self.fc2, self.output):
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map a batch of feature rows to one unbounded logit per row, shape (batch, 1)."""
        first_hidden = F.relu(self.fc1(x))
        second_hidden = F.relu(self.fc2(first_hidden))
        return self.output(second_hidden)

import torch

def decorrelation_loss(outputs, bookmaker_prob, lambda_decorr):
    """Penalty proportional to the squared covariance of two batches.

    Both tensors are flattened to 1-D, centred on their own batch mean, and the
    mean of the element-wise product (the biased batch covariance) is squared
    and scaled by ``lambda_decorr``.

    Args:
        outputs: model outputs for the batch (any shape, flattened internally).
        bookmaker_prob: bookmaker probabilities for the same samples.
        lambda_decorr: multiplier applied to the squared covariance.

    Returns:
        Scalar tensor ``lambda_decorr * cov(outputs, bookmaker_prob) ** 2``.
    """
    flat_outputs = outputs.view(-1)
    flat_bookmaker = bookmaker_prob.view(-1)

    centred_outputs = flat_outputs - flat_outputs.mean()
    centred_bookmaker = flat_bookmaker - flat_bookmaker.mean()
    covariance = (centred_outputs * centred_bookmaker).mean()

    return lambda_decorr * covariance ** 2


def l2_regularization(model, lambda_):
    """Return ``lambda_`` times the sum of squared entries of the model's weight matrices.

    Only trainable parameters with more than one dimension are counted, which
    leaves out 1-D parameters such as biases.
    """
    squared_norm = 0
    for weight in model.parameters():
        # Skip frozen parameters and anything 1-D (e.g. bias vectors).
        if not weight.requires_grad or weight.dim() <= 1:
            continue
        squared_norm = squared_norm + torch.sum(weight ** 2)
    return lambda_ * squared_norm
# in training loop:
# outputs = model(X_batch)
# loss = criterion(outputs, y_batch)
# reg = l2_regularization(model, lambda_)
# total_loss = loss + reg
# total_loss.backward()



In [18]:
# Model, optimiser and loss for the training tensors built earlier.
# Shapes: X_train is (num_samples, feature_dim); y_train and
# bookmaker_prob_train are column vectors of shape (num_samples, 1).
feature_dim = data.shape[1]
model = ProbabilityEstimatorNN(feature_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# The criterion works on raw logits (it applies the sigmoid internally).
criterion = nn.BCEWithLogitsLoss()

num_samples_train = len(X_train)

# Loss hyperparameters: L2 penalty strength, decorrelation penalty strength,
# and the number of passes over the training data.
lambda_ = 0.01
lambda_decorr = 0.3
epochs = 100


# Training loop
def train(model, optimizer, criterion, num_samples_train, lambda_decorr, epochs, train_loader, lambda_):
    """Optimise ``model`` for ``epochs`` passes over ``train_loader``.

    The per-batch objective is
    ``criterion(outputs, y) + decorrelation_loss(outputs, bookmaker, lambda_decorr)
    + l2_regularization(model, lambda_)``.

    Args:
        model: network whose outputs are fed to ``criterion``.
        optimizer: optimiser already bound to ``model``'s parameters.
        criterion: data-fit loss called as ``criterion(outputs, y_batch)``.
        num_samples_train: unused; kept so existing call sites keep working.
        lambda_decorr: weight of the decorrelation penalty.
        epochs: number of passes over the loader.
        train_loader: yields ``(features, bookmaker_prob, target)`` batches.
        lambda_: weight of the L2 penalty.

    Returns:
        Sample-weighted average of the full (penalised) batch loss over the
        last epoch, or ``nan`` when ``epochs <= 0`` and nothing was trained.
    """
    model.train()

    # Defined up front so a zero-epoch call returns a value instead of raising
    # UnboundLocalError.
    avg_loss = float("nan")

    for epoch in range(epochs):
        print(epoch)
        total_loss = 0.0
        for X_batch, bookmaker_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            # Arguments follow decorrelation_loss(outputs, bookmaker_prob, ...);
            # the penalty is symmetric, so the value is unchanged.
            loss = (
                criterion(outputs, y_batch)
                + decorrelation_loss(outputs, bookmaker_batch, lambda_decorr)
                + l2_regularization(model, lambda_)
            )
            loss.backward()   # backpropagation
            optimizer.step()  # parameter update
            # Weight by batch size so a short final batch does not skew the mean.
            total_loss += loss.item() * X_batch.size(0)
        avg_loss = total_loss / len(train_loader.dataset)

    return avg_loss

In [19]:
# Validation arrays as float32 tensors; target and bookmaker probability are
# reshaped to column vectors, mirroring the training tensors.
X_val = torch.tensor(Xval_data, dtype=torch.float32)
y_val = torch.tensor(yval_data, dtype=torch.float32).reshape(-1, 1)
bookmaker_prob_val = torch.tensor(bookmaker_prob_val_data, dtype=torch.float32).reshape(-1, 1)

# Item order is (features, bookmaker probability, target); row order is preserved.
val_dataset = TensorDataset(X_val, bookmaker_prob_val, y_val)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

In [20]:
# Number of validation rows (length of the first tensor dimension).
num_samples_val = len(X_val)

def predict(model, criterion, val_loader):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Batches are unpacked as ``(features, bookmaker_prob, target)``, the same
    order in which the notebook builds its ``TensorDataset`` objects. (The
    previous version swapped the last two items, so the loss was computed
    against the bookmaker probabilities instead of the labels.)

    Args:
        model: network returning one logit per sample.
        criterion: loss called as ``criterion(logits, target)``.
        val_loader: loader yielding ``(features, bookmaker_prob, target)``.

    Returns:
        Tuple ``(avg_loss, flat_array, mse, r2)`` where ``avg_loss`` is the
        sample-weighted mean loss, ``flat_array`` is a 1-D NumPy array of
        ``sigmoid(logit)`` in loader order, and ``mse`` / ``r2`` compare those
        probabilities with the targets read from the loader itself.
    """
    all_predictions = []
    all_targets = []
    num_samples = 0
    total_loss = 0.0

    # Evaluate in eval mode, then restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X_batch, _bookmaker_batch, y_batch in val_loader:
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                batch_size = X_batch.size(0)
                total_loss += loss.item() * batch_size
                num_samples += batch_size
                all_predictions.append(torch.sigmoid(outputs).view(-1).cpu())
                all_targets.append(y_batch.view(-1).cpu())
    finally:
        model.train(was_training)

    avg_loss = total_loss / num_samples
    flat_array = torch.cat(all_predictions).numpy()
    # Targets come from the loader, so the metrics always match the data that
    # was actually evaluated rather than a notebook-level global.
    targets = torch.cat(all_targets).numpy()
    mse = mean_squared_error(targets, flat_array)
    r2 = r2_score(targets, flat_array)

    return avg_loss, flat_array, mse, r2

In [21]:
def accuracy(flat_array, sensitivity, y_true=None):
    """Score thresholded predictions with an undecided band around 0.5.

    Probabilities above ``0.5 + sensitivity`` become class 1, those below
    ``0.5 - sensitivity`` become class 0, and everything in between stays at
    0.5 (no guess), which never matches a 0/1 label.

    Args:
        flat_array: 1-D array-like of predicted probabilities.
        sensitivity: half-width of the undecided band around 0.5.
        y_true: labels aligned with ``flat_array``. When omitted, the
            notebook-level ``yval_data`` array is used (original behaviour).

    Returns:
        Tuple ``(acc, acc_with_making_a_guess, volatility)``:
        ``acc`` is the share of all samples predicted correctly (undecided
        samples count as wrong), ``acc_with_making_a_guess`` is the share of
        correct predictions among the decided samples (``nan`` if none were
        decided), and ``volatility`` is the number of decided samples.
    """
    if y_true is None:
        y_true = yval_data  # notebook-level validation labels
    labels = np.asarray(y_true).reshape(-1)
    probs = np.asarray(flat_array).reshape(-1)

    # Explicit float array: np.full_like would truncate 0.5 for integer input.
    adjusted = np.full(probs.shape, 0.5)
    adjusted[probs > (0.5 + sensitivity)] = 1
    adjusted[probs < (0.5 - sensitivity)] = 0

    acc = (adjusted == labels).mean()

    volatility = np.sum(adjusted != 0.5)
    if volatility > 0:
        acc_with_making_a_guess = acc * len(adjusted) / volatility
    else:
        # No sample left the undecided band: the ratio is undefined.
        acc_with_making_a_guess = float("nan")

    print(f"Validation accuracy of normalized model: {acc:.4f}")
    print(np.unique(labels))
    print("acc on making a guess:", acc_with_making_a_guess)
    return acc, acc_with_making_a_guess, volatility

In [23]:


# Hyperparameters for this run
epochs = 4
lambda_decorr = 0.7
lambda_ = 0.02

# Half-width of the undecided band around 0.5 used by accuracy()
acc_coeff = 0.03

# Result histories (one entry is appended per run of this cell)
avg_train_loss_set = []
val_loss_set = []
array_for_outcomes = np.array([])
mse_arr = []
r2_arr = []
x_axis = []
acc_arr = []
acc_with_making_a_guess_arr = []
volatility_arr = []

# Seed PyTorch before the model is built so the weight initialisation, and
# therefore every metric below, is reproducible across re-runs of this cell.
torch.manual_seed(random_seed)

# Fresh model / optimiser / loss for this run. The feature count is derived
# here (previously a misspelt `ffeature_dim` left this cell relying on a
# value defined in an earlier cell).
feature_dim = data.shape[1]
model = ProbabilityEstimatorNN(feature_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

num_samples_train = X_train.shape[0]

# Training
avg_train_loss_set.append(train(model, optimizer, criterion, num_samples_train, lambda_decorr, epochs, train_loader, lambda_))

print(avg_train_loss_set)

# Validation
val_loss, flat_array, mse, r2 = predict(model, criterion, val_loader)
acc, acc_with_making_a_guess, volatility = accuracy(flat_array, acc_coeff )
val_loss_set.append(val_loss)
#array_for_outcomes = np.vstack([flat_array, array_for_outcomes])

mse_arr.append(mse)
r2_arr.append(r2)
x_axis.append(epochs)

volatility_arr.append(volatility)
acc_with_making_a_guess_arr.append(acc_with_making_a_guess)
acc_arr.append(acc)

print("MSE arr:", mse_arr)
print("R2 arr:", r2_arr)


0
1
2
3
[0.6794074441198271]
Validation accuracy of normalized model: 0.6099
[0 1]
acc on making a guess: 0.6098707403055229
MSE arr: [0.23310312628746033]
R2 arr: [0.02028048038482666]


In [34]:
import sys

# Make the project folder importable (the folder that contains the module,
# not the file itself). Guarded so re-running the cell does not keep
# appending the same entry to sys.path.
project_dir = "/Users/ik/QMinersHackathon42/"
if project_dir not in sys.path:
    sys.path.append(project_dir)

# Import the module by its file name without the .py extension.
import betting_strategy

In [None]:
# Ask the project's betting helper for stakes, given the bookmaker's implied
# probabilities and the model's predicted probabilities on the validation set.
# NOTE(review): the meaning of the third argument (0.03) and of `max_sharpe`
# is defined in betting_strategy.py, which is not visible here — confirm there.
bets, max_sharpe = betting_strategy.betting_strategy(bookmaker_prob_val_data, flat_array, 0.03)
for i in range(len(bets)):
    bet = bets[i]  # per-match record with 'label', 'prob' and 'bet' entries
    if bet["label"] == "A":
        print("yes")
    if bet["bet"] != 0:
        # Format the stake itself; formatting the whole record with ':.4f'
        # raises TypeError as soon as a non-zero bet appears.
        print(f"Bet {bet['bet']:.4f} on match {i+1}")

{0: {'label': 'H', 'prob': np.float32(0.62741107), 'bet': 0.0}, 1: {'label': 'H', 'prob': np.float32(0.60069835), 'bet': 0.0}, 2: {'label': 'H', 'prob': np.float32(0.5836718), 'bet': 0.0}, 3: {'label': 'H', 'prob': np.float32(0.6846425), 'bet': 0.0}, 4: {'label': 'H', 'prob': np.float32(0.6027455), 'bet': 0.0}, 5: {'label': 'H', 'prob': np.float32(0.64880276), 'bet': 0.0}, 6: {'label': 'H', 'prob': np.float32(0.5823147), 'bet': 0.0}, 7: {'label': 'H', 'prob': np.float32(0.68680614), 'bet': 0.0}, 8: {'label': 'H', 'prob': np.float32(0.60973775), 'bet': 0.0}, 9: {'label': 'H', 'prob': np.float32(0.6415861), 'bet': 0.0}, 10: {'label': 'H', 'prob': np.float32(0.6049995), 'bet': 0.0}, 11: {'label': 'H', 'prob': np.float32(0.6354989), 'bet': 0.0}, 12: {'label': 'H', 'prob': np.float32(0.6212227), 'bet': 0.0}, 13: {'label': 'H', 'prob': np.float32(0.70937914), 'bet': 0.0}, 14: {'label': 'H', 'prob': np.float32(0.5926671), 'bet': 0.0}, 15: {'label': 'H', 'prob': np.float32(0.57756734), 'bet': 