In [1]:
# import packages 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

In [2]:
# load data 
df= pd.read_csv(r'D:\kollege_stuff\Grad\Deepleanring\Projects\nfl_betting-main\scraped_data\elo_pfr_betting_data.csv')

# shift target column 
target = 'ou_line'    # column name to predict
df[target] = df[target].shift(-16)

# Remove the last raw
df = df.iloc[:-16]

# Remove columns with strings 
df = df.drop(['team1','team2','qb1','qb2', 'spread_result', 'ou_result', "home_time_of_poss",'away_time_of_poss', 'date', 'playoff'], axis=1)
df = df.dropna(axis='columns')

# Select training features 
features = list(df.columns.difference([target]))

In [3]:
test_start = int(df.shape[0]*0.8)
df_train = df.loc[:test_start].copy()
df_test  = df.loc[test_start:].copy()

In [4]:
# Standardize features 
target_mean = df_train[target].mean()
target_stdev = df_train[target].std()

for c in df_train.columns:
    mean = df_train[c].mean()
    stdev = df_train[c].std()

    df_train[c] = (df_train[c] - mean) / stdev
    df_test[c] = (df_test[c] - mean) / stdev

In [5]:
# Create a dataset that pytorch loader can work with 
class SequenceDataset(Dataset):
    def __init__(self, dataframe, target, features, sequence_length=5):
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.y = torch.tensor(dataframe[target].values).float()
        self.X = torch.tensor(dataframe[features].values).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i): 
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x = self.X[i_start:(i + 1), :]
        else:
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = self.X[0:(i + 1), :]
            x = torch.cat((padding, x), 0)

        return x, self.y[i]

In [6]:
sequence_length = 16

train_dataset = SequenceDataset(
    df_train,
    target=target,
    features=features,
    sequence_length=sequence_length
)

test_dataset = SequenceDataset(
    df_test,
    target=target,
    features=features,
    sequence_length=sequence_length
)

In [7]:
# Set the dataset in pytorch dataloader
torch.manual_seed(99)
batch_size = 5
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
X, y = next(iter(train_loader))
print(X.shape)
print(y.shape)
print(X)

torch.Size([5, 16, 29])
torch.Size([5])
tensor([[[-0.8342, -0.8346, -0.6544,  ..., -0.8317,  1.3568, -0.8564],
         [-0.8329, -0.8332,  1.6077,  ..., -0.8317, -0.3811, -0.8564],
         [-0.8315, -0.8319,  0.4651,  ..., -0.8317, -0.1205, -0.8564],
         ...,
         [-0.8165, -0.8170,  0.2053,  ..., -0.8317, -0.5549, -0.8564],
         [-0.8152, -0.8157,  1.8085,  ..., -0.8317, -1.9452, -0.8564],
         [-0.8138, -0.8143, -0.4123,  ..., -0.8317, -0.8156, -0.8564]],

        [[-1.0030, -1.0033, -1.0412,  ..., -0.8317,  0.9223, -0.8564],
         [-1.0016, -1.0019,  0.9855,  ..., -0.8317, -0.5549, -0.8564],
         [-1.0002, -1.0006, -0.5775,  ..., -0.8317,  0.8354, -0.8564],
         ...,
         [-0.9853, -0.9844, -1.4555,  ..., -0.8317,  1.0092, -0.8564],
         [-0.9839, -0.9830,  0.7155,  ..., -0.8317, -0.9894, -0.8564],
         [-0.9826, -0.9817,  1.1957,  ..., -0.8317, -0.8156, -0.8564]],

        [[ 1.4915,  1.4933,  0.4446,  ...,  1.3256, -0.6418,  1.3062],
     

In [9]:
# model 
class RegressionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(RegressionLSTM, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        
        self.lstm = nn.LSTM(input_size=self.input_size,
                            hidden_size=self.hidden_size, 
                            num_layers = self.num_layers, 
                            dropout = self.dropout, 
                            batch_first = True)
        self.linear = nn.Linear(in_features=self.hidden_size, out_features=1)
        
    def forward(self,x):
        batch_size = x.shape[0]
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).requires_grad_() 
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).requires_grad_()
        
        _, (hn, _) = self.lstm(x,(h0,c0))
        out = self.linear(hn[0]).flatten()
        
        return out 

In [10]:
# hyperparameters
lr = 5e-5
layers = 16
epoch = 10
input_size = len(features)
hidden_size = 16
num_layers = 1
dropout = 0.1

model = RegressionLSTM(input_size=input_size, 
                       hidden_size=hidden_size, 
                       num_layers= num_layers,
                       dropout= dropout
                      )
loss_func = nn.MSELoss()
criterion = torch.optim.Adam(model.parameters(), lr = lr)



In [11]:
# Training 
def train(data, model, loss_func, optimizer):
    n_batches = len(data)
    total_loss = 0
    model.train()
    
    for x,y in data:
        output= model(x)
        loss = loss_func(output, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    mean_loss = total_loss/n_batches
    print(f'Training loss: {mean_loss}')
    return mean_loss
    

def test(data,model,loss_func):
    n_batches = len(data)
    total_loss = 0
    
    model.eval()
    with torch.no_grad():
        for x, y in data:
            output= model(x)
            total_loss += loss_func(output,y).item()
            
    mean_loss = total_loss/n_batches
    print(f'Test loss: {mean_loss}')
    
    return mean_loss
        

In [12]:
# prediction function

def predict(data, model):
    
    output = torch.tensor([])
    model.eval()
    
    with torch.no_grad():
        for X,_ in data:
            y_fit = model(X)
            output = torch.cat((output,y_fit), 0)
    
    return output



In [13]:
# Run lstm 
trainloss = []
testloss = []
for i in range(epoch):
    print(f'Epoch {i}\n -----')
    train_loss = train(train_loader, model, loss_func, optimizer=criterion)
    trainloss.append(train_loss)
    test_loss  = test(test_loader, model, loss_func)
    testloss.append(test_loss)    
    print()

Epoch 0
 -----
Training loss: 1.012671143994877
Test loss: 1.245895646046847

Epoch 1
 -----
Training loss: 1.0011415424992738
Test loss: 1.193063557439018

Epoch 2
 -----
Training loss: 0.9932370279240468
Test loss: 1.1547718836809509

Epoch 3
 -----
Training loss: 0.9874899357937876
Test loss: 1.1279847248224542

Epoch 4
 -----
Training loss: 0.9833414684871326
Test loss: 1.108748380560428

Epoch 5
 -----
Training loss: 0.9800564460053836
Test loss: 1.0949938714620657

Epoch 6
 -----
Training loss: 0.977358784870572
Test loss: 1.0836523938341998

Epoch 7
 -----
Training loss: 0.9751536191417333
Test loss: 1.0765529919299297

Epoch 8
 -----
Training loss: 0.9731546289311294
Test loss: 1.0718925732071511

Epoch 9
 -----
Training loss: 0.9713348280235921
Test loss: 1.0685123428702354



In [14]:
# prediction

train_eval_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

y_pred = "Game Prediction"
df_train[y_pred] = predict(train_eval_loader, model).numpy()
df_test[y_pred]  = predict(test_loader, model).numpy()

df_out = pd.concat((df_train, df_test))[[target, y_pred]]

for c in df_out.columns:
    df_out[c] = df_out[c] * target_stdev + target_mean
    
df_out

Unnamed: 0,ou_line,Game Prediction
0,43.0,44.474228
1,37.0,44.154800
2,37.5,44.117092
3,41.5,43.656174
4,43.5,43.811203
...,...,...
3176,48.0,45.328022
3177,54.5,45.548115
3178,55.0,45.443195
3179,46.0,45.593185


In [None]:
plt.plot(df_out['ou_line'])
plt.plot(df_out['Game Prediction'])
plt.show()