<a href="https://colab.research.google.com/github/op-dvorak/kalshi_covid_model/blob/main/0_0_0COVIDWeeklyNewCasecount.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kalshi RNN Predicting Weekly COVID New Cases
**Authors:** Oam Patel, Khoi Nguyen

**Date Created:** 04/19/2022

**Last Modified:** Last Night

**Description:** Building an RNN that forecasts the cumulative weekly count of new COVID cases and uses that forecast to place bets on Kalshi

## Data Processing
Collect Casecount Data

Collect Validation Data



In [35]:
import pandas as pd
import numpy as np
from torch import nn
import torch
from torch.utils.data import TensorDataset,DataLoader

In [3]:
# Read the daily case-trends CSV from the Colab runtime and preview the first rows.
casecount_csv_path = r'/content/data_table_for_daily_case_trends__the_united_states (1).csv'
casecount_df = pd.read_csv(casecount_csv_path)
casecount_df.head()

Unnamed: 0,State,Date,New Cases,7-Day Moving Avg,7-Day % Positivity,Historic Cases
0,United States,Apr 18 2022,47339,37132,,1521
1,United States,Apr 17 2022,19597,35384,,0
2,United States,Apr 16 2022,10823,35113,5.29,0
3,United States,Apr 15 2022,39871,35458,5.05,1232
4,United States,Apr 14 2022,63086,35636,4.66,1736


RNN data input structure?
* x: a window of the past 14 days of case data, plus last week's total case count?
* the date should be split into separate day, month, and year features
* y: this week's total case count (note: Kalshi measures **Thursday to Wednesday**)

In [4]:
# Parse the textual dates once so the calendar parts can be derived from them.
parsed_dates = pd.to_datetime(casecount_df['Date'])

# Replace 'State' and 'Date' with numeric day / month / year columns
# (appended after the remaining original columns, in that order).
casecount_df = (
    casecount_df
    .drop(columns=['State', 'Date'])
    .assign(
        day=parsed_dates.dt.day,
        month=parsed_dates.dt.month,
        year=parsed_dates.dt.year,
    )
)

# Placeholder 'Last Week' column: a copy of 'New Cases' for every row except
# the final 12, which index alignment leaves as NaN.
casecount_df['Last Week'] = casecount_df['New Cases'].iloc[:-12]
casecount_df

Unnamed: 0,New Cases,7-Day Moving Avg,7-Day % Positivity,Historic Cases,day,month,year,Last Week
0,47339,37132,,1521,18,4,2022,47339.0
1,19597,35384,,0,17,4,2022,19597.0
2,10823,35113,5.29,0,16,4,2022,10823.0
3,39871,35458,5.05,1232,15,4,2022,39871.0
4,63086,35636,4.66,1736,14,4,2022,63086.0
...,...,...,...,...,...,...,...,...
812,0,0,,0,27,1,2020,
813,1,0,,0,26,1,2020,
814,0,0,,0,25,1,2020,
815,1,0,,0,24,1,2020,


In [5]:
# convert to numpy and add last week data
# The frame is ordered newest-first. The 3 newest rows are skipped and the
# remaining rows are grouped into consecutive 7-row blocks below.
casecount = np.asarray(casecount_df)[3:]

days_per_week = 7
new_cases_col = 0   # position of 'New Cases' in casecount_df
last_week_col = 7   # position of 'Last Week' in casecount_df

# For every 7-row block that is followed by a complete older 7-row block,
# write the older block's 'New Cases' total into 'Last Week' for each row of
# the current block. The bound is derived from the array length (instead of a
# hard-coded row count) so the loop never indexes past the end and covers every
# complete block regardless of how many rows the CSV has.
# The accumulator is deliberately not called `sum`, which would shadow the
# builtin for every later cell in the notebook.
last_block_start = len(casecount) - 2 * days_per_week
for block_start in range(0, last_block_start + 1, days_per_week):
  older_block = slice(block_start + days_per_week, block_start + 2 * days_per_week)
  weekly_total = casecount[older_block, new_cases_col].sum()
  casecount[block_start:block_start + days_per_week, last_week_col] = weekly_total

# Drop the 2 oldest rows, then flip to oldest-first order.
casecount = casecount[:-2]
casecount = casecount[::-1]

df = pd.DataFrame(casecount)
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,,0.0,25.0,1.0,2020.0,
1,1.0,0.0,,0.0,26.0,1.0,2020.0,
2,0.0,0.0,,0.0,27.0,1.0,2020.0,
3,0.0,0.0,,0.0,28.0,1.0,2020.0,
4,0.0,0.0,,0.0,29.0,1.0,2020.0,
...,...,...,...,...,...,...,...,...
807,35103.0,30293.0,3.75,125.0,11.0,4.0,2022.0,196225.0
808,27840.0,29519.0,4.04,548.0,12.0,4.0,2022.0,196225.0
809,51370.0,31421.0,4.34,584.0,13.0,4.0,2022.0,196225.0
810,63086.0,35636.0,4.66,1736.0,14.0,4.0,2022.0,196225.0


In [29]:
# create windows and prediction data
window_size = 14

# One sample per window end position: the features are the `window_size` rows
# immediately before `end`, the target is column 7 of the row 7 positions
# before `end`.
# NOTE(review): that target row lies inside the feature window, so the target
# value is also present in the features - confirm this is intended.
window_ends = range(window_size, len(casecount))
x = [casecount[end - window_size:end] for end in window_ends]
y = [casecount[end - 7][7] for end in window_ends]


798
798


In [34]:
# create training, validation, test split
batch_size = 32

# Split by position, without shuffling: first 600 windows for training,
# the next 50 for validation, the remainder for testing.
x_train = x[:600]
y_train = y[:600]
x_val = x[600:650]
y_val = y[600:650]
x_test = x[650:]
y_test = y[650:]


def _to_float_tensor(samples):
  """Stack a list of numpy arrays / scalars into a single float32 tensor.

  Going through one contiguous numpy array avoids the slow element-by-element
  path (and the accompanying UserWarning) that `torch.Tensor(list_of_ndarrays)`
  takes, while producing the same float32 values.
  """
  return torch.as_tensor(np.asarray(samples, dtype=np.float32))


train_features = _to_float_tensor(x_train)
train_target = _to_float_tensor(y_train)
val_features = _to_float_tensor(x_val)
val_target = _to_float_tensor(y_val)
test_features = _to_float_tensor(x_test)
test_target = _to_float_tensor(y_test)

train = TensorDataset(train_features, train_target)
val = TensorDataset(val_features, val_target)
test = TensorDataset(test_features, test_target)

# shuffle=False keeps the samples in their original order; drop_last=True
# discards a final batch that has fewer than `batch_size` samples.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(val, batch_size = batch_size, shuffle=False, drop_last = True)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, drop_last=True)
# Single-sample loader over the test set.
test_loader_one = DataLoader(test, batch_size=1, shuffle=False, drop_last=True)

  # This is added back by InteractiveShellApp.init_path()


## Machine Learning Model


In [37]:
class RNNModel(torch.nn.Module):
  """Stacked `nn.RNN` followed by a linear layer applied to the last time step.

  Args:
    input_dim: number of features per time step.
    hidden_dim: size of the recurrent hidden state.
    layer_dim: number of stacked recurrent layers.
    output_dim: number of values predicted per input sequence.
    dropout_prob: dropout probability forwarded to `nn.RNN`.
  """

  def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
    super(RNNModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.layer_dim = layer_dim
    # RNN layers (batch_first=True: inputs are (batch, seq_len, input_dim))
    self.rnn = nn.RNN(
        input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
    )
    # fully connected layer
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Return predictions of shape (batch, output_dim) for a batch of sequences.

    Args:
      x: tensor of shape (batch, seq_len, input_dim).
    """
    # Fresh zero hidden state for every call, created on the same device and
    # with the same dtype as the input so the model also works after
    # `.to(device)` / `.double()`.
    h0 = torch.zeros(
        self.layer_dim, x.size(0), self.hidden_dim, dtype=x.dtype, device=x.device
    )
    # out holds the top layer's hidden state at every time step
    out, _ = self.rnn(x, h0)
    # keep only the last time step: (batch, seq_len, hidden) -> (batch, hidden)
    out = out[:, -1, :]
    # project to the desired output shape (batch, output_dim)
    out = self.fc(out)
    return out

class LSTMModel(torch.nn.Module):
  """Stacked `nn.LSTM` followed by a linear layer applied to the last time step.

  Args:
    input_dim: number of features per time step.
    hidden_dim: size of the LSTM hidden and cell states.
    layer_dim: number of stacked LSTM layers.
    output_dim: number of values predicted per input sequence.
    dropout_prob: dropout probability forwarded to `nn.LSTM`.
  """

  def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
    super(LSTMModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.layer_dim = layer_dim
    # LSTM layers (batch_first=True: inputs are (batch, seq_len, input_dim))
    self.lstm = nn.LSTM(
        input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
    )
    # Fully connected layer
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Return predictions of shape (batch, output_dim) for a batch of sequences.

    Args:
      x: tensor of shape (batch, seq_len, input_dim).
    """
    # Fresh zero hidden and cell states for every call, so no state (and no
    # gradient history) carries over from one batch to the next. They are
    # created on the input's device and dtype so the model also works after
    # `.to(device)` / `.double()`.
    state_shape = (self.layer_dim, x.size(0), self.hidden_dim)
    h0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
    c0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
    # out holds the top layer's hidden state at every time step
    out, _ = self.lstm(x, (h0, c0))
    # keep only the last time step: (batch, seq_len, hidden) -> (batch, hidden)
    out = out[:, -1, :]
    out = self.fc(out)
    return out

In [38]:
class Optimization:
  def __init__(self, model, loss_fn, optimizer):
    self.model = model
    self.loss_fn = loss_fn
    self.optimizer = optimizer
    self.train_losses = []
    self.val_losses = []

  def train_step(self, x, y):
    # Sets model to train mode
    self.model.train()
    yhat = self.model(x)
    loss = self.loss_fn(y, yhat)
    loss.backward()
    # Updates parameters and zeroes gradients
    self.optimizer.step()
    self.optimizer.zero_grad()
    return loss.item()