# <strong>Bitcoin Recurrent Neural Network</strong>
### Justin Marlor & Habit Blunk
##### *Colorado State University*

This notebook automatically copies data from [this dataset hosted on Kaggle](https://www.kaggle.com/datasets/mczielinski/bitcoin-historical-data), feeds it into several recurrent neural networks, and uses them to predict the price of Bitcoin.

To run it:

1. Run the script located in this repository at `./env-script`. This will set up your virtual environment. 
2. Run `source ./venv/bin/activate`. This will put you in the virtual environment we have set up, so this notebook can be run on any machine so long as it has Python 3.x and can install the dependencies at `./dependencies.txt`.
3. Paste this into `~/.config/kaggle/kaggle.json`:
    ```json
    {
      "username": "justinmarlor",
      "key": "b98017f9291bfa83686f6c6780d38e04"
    }
    ```
4. Execute each cell in sequence.

#### Cell 1: imports, grabbing, and preprocessing dataset

In [None]:
import pandas as pd
import subprocess
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.autograd import Variable
from torch.nn import GRU, RNN
from sklearn.preprocessing import StandardScaler

class TimeSeriesDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over a time-series tensor.

    Sample ``i`` is the window ``tensor[i:i + seq_length]`` paired with the
    value found in column ``target_idx`` of the row immediately after that
    window.

    Args:
        tensor: Tensor indexed as ``[row, column]``; rows are presumably in
            chronological order (verify against the caller).
        seq_length: Number of consecutive rows in each input window.
        target_idx: Column index of the value to predict.
    """

    def __init__(self, tensor, seq_length, target_idx):
        self.tensor = tensor
        self.seq_length = seq_length
        self.target_idx = target_idx

    def __len__(self):
        # Clamp at zero: a series shorter than one window simply has no
        # samples, whereas a negative value would make len() raise ValueError.
        return max(0, len(self.tensor) - self.seq_length)

    def __getitem__(self, idx):
        """Return ``(window, target)`` for sample ``idx``.

        Negative indices count from the end, like a list.

        Raises:
            IndexError: If ``idx`` is outside the range of valid samples.
        """
        size = len(self)
        if idx < 0:
            # Rebind instead of using += so a tensor index owned by the
            # caller is never modified in place.
            idx = idx + size
        if not 0 <= idx < size:
            raise IndexError(
                f"index {idx} out of range for dataset of size {size}"
            )
        window_end = idx + self.seq_length
        seq = self.tensor[idx:window_end]
        # The target is the row right after the window, not its last row.
        target = self.tensor[window_end, self.target_idx]
        return seq, target

# Run the repository's helper script (expected to fetch the Kaggle dataset)
# and capture its output so it can be echoed into the notebook.
result = subprocess.run(
    ['bash', './add-run-kaggle-bitcoin'],
    capture_output=True,
    text=True,
)

print(result.stdout)
print(result.stderr)

# Only load the CSV when the helper script exited successfully.
if result.returncode == 0:
    df = pd.read_csv(
        "kaggle-bitcoin/upload/btcusd_1-min_data.csv",
        dtype={"Volume": float},
        low_memory=False,
    )
    # Timestamps are read as epoch seconds; values that cannot be converted
    # become NaT because of errors='coerce'.
    df['datetime'] = pd.to_datetime(
        df['Timestamp'].astype('Int64'),
        unit='s',
        errors='coerce',
    )
    # Split the timestamp into separate calendar/clock feature columns.
    parts = df['datetime'].dt
    df['Year'] = parts.year
    df['month'] = parts.month
    df['day'] = parts.day
    df['hour'] = parts.hour
    df['minute'] = parts.minute
    display(df)

#### Cell 2: plotting dataset

In [None]:
# Overlay the four OHLC price columns on a shared time axis; each legend
# label is the lower-cased column name.
for column, colour in (
    ('Open', 'blue'),
    ('Close', 'orange'),
    ('High', 'red'),
    ('Low', 'green'),
):
    plt.plot(df['datetime'], df[column], label=column.lower(), color=colour)

# Axis labelling and legend.
plt.xlabel('datetime')
plt.ylabel('price')
plt.title('ohlc time series')
plt.legend()

# Layout tweaks: slanted tick labels, background grid, tight margins.
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

#### Cell 3: create tensor and define functions

In [None]:
# Keep only the numeric columns, cast them to float32 and drop every row that
# contains a missing value.
df_tensor = df.drop(columns=['datetime']).astype('float32').dropna()

# Standardise each column to zero mean / unit variance so that large-valued
# columns (e.g. Timestamp) do not dominate the loss and gradients.
# NOTE(review): the scaler is fitted on the full dataset, including rows that
# later end up in the validation split — confirm this is acceptable.
scaler = StandardScaler()
normalized_values = scaler.fit_transform(df_tensor.values)

# Build the model input from the standardised values (not the raw frame), so
# the scaler's output is actually used. Targets and predictions are therefore
# in standardised units; use scaler.inverse_transform to map back to prices.
tensor = torch.tensor(normalized_values, dtype=torch.float32)

# Look the target column up in df_tensor, the frame whose column order the
# tensor follows, rather than in df, which still holds the 'datetime' column.
close_idx = df_tensor.columns.get_loc('Close')

def create_sequences(tensor, seq_length, target_idx):
    """Materialise every sliding window of ``tensor`` with its next-step target.

    Args:
        tensor: Tensor indexed as ``[row, column]``.
        seq_length: Number of consecutive rows in each window.
        target_idx: Column whose value in the row following each window is
            used as that window's target.

    Returns:
        A pair ``(sequences, targets)``: the stacked windows, and the targets
        as a column (one trailing dimension of size 1 added).
    """
    window_starts = range(len(tensor) - seq_length)
    windows = [tensor[start:start + seq_length] for start in window_starts]
    # Each target is taken from the row right after its window.
    next_values = [
        tensor[start + seq_length, target_idx] for start in window_starts
    ]
    return torch.stack(windows), torch.tensor(next_values).unsqueeze(1)

# Window length (rows per sample) and mini-batch size.
seq_length, batch_size = 60, 64
dataset = TimeSeriesDataset(
    tensor=tensor,
    seq_length=seq_length,
    target_idx=close_idx,
)

# 95 % of the samples go to training; whatever is left is used for validation.
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset, lengths=[train_size, val_size]
)

# Only the training loader shuffles its samples.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)

def train_model(model, train_loader, val_loader, num_epochs=4, lr=1e-3):
    """Train ``model`` with Adam on an MSE loss, printing losses every epoch.

    Args:
        model: Module called as ``model(batch_X)``; its output is compared
            with ``batch_y`` after the trailing dimension is squeezed.
        train_loader: Sized iterable yielding ``(inputs, targets)`` batches
            used for the optimisation steps.
        val_loader: Sized iterable yielding ``(inputs, targets)`` batches,
            evaluated without gradient tracking after every epoch.
        num_epochs: Number of full passes over ``train_loader``.
        lr: Learning rate handed to the Adam optimizer.

    Returns:
        None. The mean train and validation loss per batch are printed once
        per epoch.
    """
    loss_fn = nn.MSELoss()
    # Use the caller-supplied learning rate rather than a fixed constant.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Guard the per-batch averages against empty loaders.
    num_train_batches = max(len(train_loader), 1)
    num_val_batches = max(len(val_loader), 1)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        for batch_X, batch_y in train_loader:
            # squeeze(-1) removes only the trailing output dimension, so a
            # final batch of size 1 keeps its batch axis and still lines up
            # with batch_y instead of collapsing to a scalar.
            pred = model(batch_X).squeeze(-1)
            loss = loss_fn(pred, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for val_X, val_y in val_loader:
                val_pred = model(val_X).squeeze(-1)
                val_loss += loss_fn(val_pred, val_y).item()
        print(f"epoch {epoch + 1}, train loss: {total_loss / num_train_batches:.4f}, val loss: {val_loss / num_val_batches:.4f}")


#### Cell 4: building and training vanilla RNN against dataset, then saving it in a `*.pth` file

In [None]:
class RNNModel(nn.Module):
    """ReLU recurrent network followed by a single-output linear head.

    Args:
        input_dim: Number of features in each time step of the input.
        hidden_dim: Size of the recurrent hidden state.
        layer_dim: Number of stacked recurrent layers.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim=1):
        super().__init__()
        self.hidden_dim, self.layer_dim = hidden_dim, layer_dim
        # batch_first=True: inputs are laid out as (batch, step, feature).
        self.rnn = nn.RNN(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=layer_dim,
            nonlinearity='relu',
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Return one prediction per sequence, taken from the last time step."""
        hidden_states, _ = self.rnn(x)
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# Build a vanilla RNN whose input width equals the number of feature columns,
# train it with the default settings, then save only its learned weights.
rnn_model = RNNModel(input_dim=tensor.size(1), hidden_dim=64)
train_model(model=rnn_model, train_loader=train_loader, val_loader=val_loader)
torch.save(rnn_model.state_dict(), "rnn_model.pth")

#### Cell 5: building and training LSTM against dataset, then saving it in a `*.pth` file

In [None]:
class LSTMModel(nn.Module):
    """LSTM followed by a single-output linear head.

    Args:
        input_dim: Number of features in each time step of the input.
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim=1):
        super().__init__()
        self.hidden_dim, self.layer_dim = hidden_dim, layer_dim
        # batch_first=True: inputs are laid out as (batch, step, feature).
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=layer_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Return one prediction per sequence, taken from the last time step."""
        hidden_states, _ = self.lstm(x)
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# Build an LSTM whose input width equals the number of feature columns,
# train it with the default settings, then save only its learned weights.
lstm_model = LSTMModel(input_dim=tensor.size(1), hidden_dim=64)
train_model(model=lstm_model, train_loader=train_loader, val_loader=val_loader)
torch.save(lstm_model.state_dict(), "lstm_model.pth")