# This dataset contains information on historic trades for several cryptoassets, such as Bitcoin and Ethereum. Your challenge is to predict their future returns.

As historic cryptocurrency prices are not confidential this will be a forecasting competition using the time series API. Furthermore the public leaderboard targets are publicly available and are provided as part of the competition dataset. Expect to see many people submitting perfect submissions for fun. Accordingly, THE PUBLIC LEADERBOARD FOR THIS COMPETITION IS NOT MEANINGFUL and is only provided as a convenience for anyone who wants to test their code. The final private leaderboard will be determined using real market data gathered after the submission period closes.

train.csv - The training set

- timestamp - A timestamp for the minute covered by the row.
- Asset_ID - An ID code for the cryptoasset.
- Count - The number of trades that took place this minute.
- Open - The USD price at the beginning of the minute.
- High - The highest USD price during the minute.
- Low - The lowest USD price during the minute.
- Close - The USD price at the end of the minute.
- Volume - The number of cryptoasset units traded during the minute.
- VWAP - The volume-weighted average price for the minute.
- Target - 15 minute residualized returns. See the 'Prediction and Evaluation' section of this notebook for details of how the target is calculated.
- Weight - The weight each cryptoasset receives in the evaluation metric, as defined by the competition hosts.
- Asset_Name - Human readable Asset name.

example_test.csv - An example of the data that will be delivered by the time series API.

example_sample_submission.csv - An example of the data that will be delivered by the time series API. The data is just copied from train.csv.

asset_details.csv - Provides the real name of the cryptoasset for each Asset_ID and the weight each cryptoasset receives in the metric.

supplemental_train.csv - After the submission period is over this file's data will be replaced with cryptoasset prices from the submission period. In the Evaluation phase, the train, train supplement, and test set will be contiguous in time, apart from any missing data. The current copy, which is just filled with approximately the right amount of data from train.csv, is provided as a placeholder.

    📌 There are 14 coins in the dataset

    📌 There are 4 years in the [full] dataset


#  Import

In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import time

In [2]:
# check if CUDA is available
# The flag is stored at module level and echoed so the cell output shows
# whether a GPU can be used.
use_cuda = torch.cuda.is_available()
print(use_cuda)

# Load data

In [3]:
# Lift the row limit on displayed frames: with max_rows=None pandas renders
# every row of whatever frame a cell displays.
pd.set_option('display.max_rows', None)
# Read each competition CSV from the Kaggle input directory into its own frame.
train_csv = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/train.csv")
test_csv = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/example_test.csv")
samples_submission_csv = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/example_sample_submission.csv")
asset_csv = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/asset_details.csv")
supplemental_train = pd.read_csv("/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv")

# Remove NaN rows

In [4]:
# dropna() with its defaults removes a row as soon as ANY column is NaN;
# the filtered copy replaces the original frame.
train_csv = train_csv.dropna()

In [5]:
# Peek at rows 1 through 9 by position (the first row, position 0, is not part of this slice).
train_csv.iloc[1:10]

In [6]:
# Display the asset metadata table read from asset_details.csv.
asset_csv

In [7]:
# Keep only the rows whose Asset_ID is 0.
# NOTE(review): the variable name suggests Bitcoin/Dash, but which asset carries
# ID 0 is defined by asset_details.csv — confirm the name matches the selection.
train_csv_BitcoinDash = train_csv[train_csv.Asset_ID==0]

In [8]:
# Number of rows left for the selected asset.
len(train_csv_BitcoinDash)

# Target evolution visualization

In [9]:
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Value', dpi=100):
    """Draw one red line chart of ``y`` against ``x`` and show it.

    Args:
        df: Frame the data comes from. It is only consulted when ``x`` or
            ``y`` is given as a column name instead of as the values.
        x: Values for the horizontal axis, or the name of a column of ``df``.
        y: Values for the vertical axis, or the name of a column of ``df``.
        title: Title of the figure.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
        dpi: Resolution of the figure.
    """
    # Accept column names as well as ready-made sequences, so `df` has a purpose.
    x_values = df[x] if isinstance(x, str) else x
    y_values = df[y] if isinstance(y, str) else y

    plt.figure(figsize=(16,5), dpi=dpi)
    plt.plot(x_values, y_values, color='tab:red')
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()


# Target over time for the asset selected above; the axes are labelled with what
# is actually plotted rather than the generic defaults.
plot_df(train_csv_BitcoinDash, x=train_csv_BitcoinDash.timestamp, y=train_csv_BitcoinDash.Target,
        title='Target evolution', xlabel='Timestamp', ylabel='Target')

In [13]:
# One panel per asset showing its Target over time, three panels per row.
# With the 14 assets of this dataset the grid is 5x3, as before.
n_assets = len(asset_csv)
n_cols = 3
n_rows = max(1, -(-n_assets // n_cols))  # ceiling division without importing math
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(10,10), squeeze=False)
panels = axs.ravel()

# Pair every Asset_ID with the name stored on the SAME row of asset_csv.
# Indexing Asset_Name by the ID value is a lookup on the frame's index, which
# only yields the right name when the row order happens to equal the ID order.
# Walking the flattened grid also replaces the manual i/j bookkeeping, which
# moved to the next column when i reached 4 and therefore skipped a panel.
for ax, nb, temp in zip(panels, asset_csv.Asset_ID, asset_csv.Asset_Name):
    train_csv_temp = train_csv[train_csv.Asset_ID==nb]
    ax.plot(train_csv_temp.timestamp, train_csv_temp.Target)
    ax.title.set_text(temp)

# Hide the panels that no asset was drawn on.
for ax in panels[n_assets:]:
    ax.set_visible(False)

fig.tight_layout(pad=1.0)
plt.show()

In [None]:
# Scratch check of a counter that cycles through 0..4: step it once per
# element of a 20-element tuple and print the value after every step.
n = (1,) * 20
i = 0

for nb in n:
    i += 1
    i %= 5
    print(i)

# Memory review

In [None]:
# Print the column summary of the frame; memory_usage="deep" asks pandas to
# measure the real memory consumption instead of an estimate.
train_csv.info(memory_usage = "deep")

# Memory Size Reduction

In [None]:
# Shrink the frame in place: echo every column name and, for 64-bit float or
# integer columns, let pandas downcast to the smallest subtype it accepts.
# Underscore-prefixed helper names keep the notebook's namespace uncluttered.
_downcast_by_dtype = {'float64': 'float', 'int64': 'integer'}

for column in train_csv.columns:
    print(column)
    _downcast_to = _downcast_by_dtype.get(str(train_csv[column].dtype))
    if _downcast_to is not None:
        train_csv[column] = pd.to_numeric(train_csv[column], downcast=_downcast_to)

In [None]:
# Print the same summary again so the memory figure can be compared with the
# one reported before the downcast.
train_csv.info(memory_usage = "deep")

# Dataset description

In [None]:
# Summary statistics of the training frame, kept in a variable and displayed.
dtf_description_train = train_csv.describe()
dtf_description_train

In [None]:
# Summary statistics of the example test frame, kept in a variable and displayed.
dtf_description_test = test_csv.describe()
dtf_description_test

In [None]:
# Summary statistics of the sample submission frame, kept in a variable and displayed.
dtf_description_sample = samples_submission_csv.describe()
dtf_description_sample

In [None]:
# Summary statistics of the asset metadata frame, kept in a variable and displayed.
dtf_description_asset = asset_csv.describe()
dtf_description_asset

In [None]:
# Summary statistics of the supplemental training frame, kept in a variable and displayed.
dtf_supplemental_train= supplemental_train.describe()
dtf_supplemental_train

# Do not Run

Do not evaluate `train_csv` by itself in a cell (i.e. do not display the whole frame); otherwise the kernel will die.

# Number of entries that are null/void

In [None]:
def list_null(dataset, feature, per):
    """Return the features whose share of missing values exceeds ``per``.

    Args:
        dataset: Table to inspect. It is indexed by feature name and ``len``
            of it is used as the number of rows.
        feature: Iterable with the names of the columns to check.
        per: Threshold on the missing fraction (missing rows / all rows),
            e.g. ``0.001`` for 0.1 %.

    Returns:
        A list with the names from ``feature``, in iteration order, whose
        missing fraction is strictly greater than ``per``.
    """
    row_count = len(dataset)
    return [
        name
        for name in feature
        if dataset[name].isna().sum() / row_count > per
    ]

# List the columns of the training frame in which more than 0.1 % of the rows are missing.
list_null(train_csv, train_csv.columns, 0.001)


# Distribution info

In [None]:
# Disabled experiment: the code below sits inside a bare string literal, so none
# of it executes (the cell merely evaluates to the string itself).
# NOTE(review): it reads `train_csv_nnan`, which is not defined in the visible
# cells — define it (or switch to `train_csv`) before re-enabling.
'''y = train_csv_nnan['Target']
plt.figure(1); plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=stats.johnsonsu)
plt.figure(2); plt.title('Normal')
sns.distplot(y, kde=False, fit=stats.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(y, kde=False, fit=stats.lognorm)'''

# Note: output is highly concentrated around 0

# Datashowcase

In [None]:
# Show a short slice taken from the start of the Target column.
train_csv.Target[0:9]

In [None]:
# Show the Target series next to its autocorrelation function (ACF).
# The earlier version of this cell read a `value` column, which the training
# frame does not have, and called `plot_acf`, which is never imported in this
# notebook. The series of interest is `Target`, and the ACF is computed with
# NumPy so that no extra dependency is required.
# NOTE(review): train_csv interleaves the rows of all assets — confirm whether
# the ACF should be computed per asset instead.
max_lag = 40
series = train_csv.Target.to_numpy(dtype='float64')
centered = series - series.mean()
denominator = np.dot(centered, centered)

# acf[k] = sum(c[t] * c[t + k]) / sum(c[t] ** 2); lag 0 is 1 by definition.
acf = np.ones(max_lag + 1)
for lag in range(1, max_lag + 1):
    acf[lag] = np.dot(centered[:-lag], centered[lag:]) / denominator

# sharex='col' shares the x axis within a column only, so the lag axis is not
# stretched to the length of the series.
fig, axes = plt.subplots(3, 2, sharex='col')
axes[0, 0].plot(train_csv.Target)
axes[0, 0].set_title('Original Series')
axes[0, 1].stem(np.arange(max_lag + 1), acf)
axes[0, 1].set_title('Autocorrelation')

# Dataset preparation

In [None]:
class TimeSeriesDataset (Dataset):
    """Sliding-window view over a feature sequence and a target sequence.

    Item ``i`` is the pair ``(X[i:i + seq_len], y[i + seq_len - 1])``: a window
    of ``seq_len`` consecutive entries of ``X`` together with the entry of
    ``y`` at the position of the window's last element.

    Args:
        X: Feature sequence. It has to support ``len()`` and slicing.
        y: Target sequence. It has to support integer indexing.
            NOTE(review): indexing is assumed to be positional (lists, NumPy
            arrays, tensors) — confirm before passing label-indexed objects.
        seq_len: Number of consecutive steps per window, at least 1.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """

    def __init__(self, X, y, seq_len=1):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        self.X = X
        self.y = y
        self.seq_len = seq_len

    def __len__(self):
        """Return the number of complete windows (0 if the data is too short)."""
        # Clamp at zero: a negative value would make len() itself raise.
        return max(0, len(self.X) - (self.seq_len - 1))

    def __getitem__(self, index):
        """Return ``(window, target)`` for window number ``index``.

        Negative indexes count from the end. An index outside the valid range
        raises ``IndexError`` instead of returning a truncated window.
        """
        window_count = len(self)
        if index < 0:
            index += window_count
        if not 0 <= index < window_count:
            raise IndexError(f"window index out of range for {window_count} windows")
        end = index + self.seq_len
        return (self.X[index:end], self.y[end - 1])

# Dataloader

In [None]:
# Wrap the data in sliding windows of 100 steps and serve them in their
# original order (no shuffling), 50 windows per batch.
# NOTE(review): `X` and `target` are not defined in the visible cells — they
# have to be prepared before this cell runs.
train_dataset = TimeSeriesDataset(X, target, seq_len=100)
train_loader = DataLoader(train_dataset, batch_size=50, shuffle=False)

# Print the shapes of every batch. The loop has to iterate `train_loader`,
# the loader created above; `train_dataloader` was never defined.
for i, d in enumerate(train_loader):
    print(i, d[0].shape, d[1].shape)

# Define the RNN

Next, we define an RNN in PyTorch. We'll use `nn.RNN` to create an RNN layer, then we'll add a last, fully-connected layer to get the output size that we want. An RNN takes in a number of parameters:
* **input_size** - the size of the input
* **hidden_dim** - the number of features in the RNN output and in the hidden state
* **n_layers** - the number of layers that make up the RNN, typically 1-3; greater than 1 means that you'll create a stacked RNN
* **batch_first** - whether or not the input/output of the RNN will have the batch_size as the first dimension (batch_size, seq_length, hidden_dim)

Take a look at the [RNN documentation](https://pytorch.org/docs/stable/nn.html#rnn) to read more about recurrent layers.

In [None]:
class RNN(nn.Module):
    """Stacked ``nn.RNN`` followed by a fully-connected output layer.

    Args:
        input_size: Number of features per time step.
        output_size: Number of values produced per time step.
        hidden_dim: Number of features in the RNN output and hidden state.
        n_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super().__init__()

        self.hidden_dim = hidden_dim

        # batch_first means that the first dim of the input and output is the batch_size
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)

        # last, fully-connected layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        """Run the recurrent layers and project every time step.

        Args:
            x: Input of shape (batch_size, seq_length, input_size).
            hidden: Initial hidden state of shape
                (n_layers, batch_size, hidden_dim), or ``None``.

        Returns:
            Tuple ``(output, hidden)``: ``output`` has shape
            (batch_size * seq_length, output_size) and ``hidden`` is the
            hidden state returned by the recurrent layers.
        """
        # r_out (batch_size, seq_length, hidden_dim)
        r_out, hidden = self.rnn(x, hidden)

        # Flatten to (batch_size*seq_length, hidden_dim). reshape() is used
        # instead of view() because the batch_first output is not guaranteed
        # to be contiguous, in which case view() raises for batch_size > 1.
        r_out = r_out.reshape(-1, self.hidden_dim)

        # get final output
        output = self.fc(r_out)

        return output, hidden

### Check the input and output dimensions

As a check that your model is working as expected, test out how it responds to input data.

In [None]:
# test that dimensions are as expected
test_rnn = RNN(input_size=1, output_size=1, hidden_dim=10, n_layers=2)

# generate evenly spaced, test data pts: half a sine period sampled at
# seq_length points. This code had been disabled inside a string literal,
# which left both `data` and `seq_length` undefined for the lines below.
seq_length = 20  # arbitrary length, only used for this shape check
time_steps = np.linspace(0, np.pi, seq_length)
data = np.sin(time_steps).reshape((seq_length, 1))  # (seq_length, input_size=1)

test_input = torch.Tensor(data).unsqueeze(0) # give it a batch_size of 1 as first dimension
print('Input size: ', test_input.size())

# test out rnn sizes
test_out, test_h = test_rnn(test_input, None)
print('Output size: ', test_out.size())
print('Hidden state size: ', test_h.size())

# Training the RNN

In [None]:
# decide on hyperparameters
# One input feature and one output value per time step, a hidden state of 32
# features and a single recurrent layer.
input_size=1 
output_size=1
hidden_dim=32
n_layers=1

# instantiate an RNN
# Printing the module lists its layers.
rnn = RNN(input_size, output_size, hidden_dim, n_layers)
print(rnn)

### Loss and Optimization

This is a regression problem: can we train an RNN to accurately predict the next data point, given a current data point?

* The data points are coordinate values, so to compare a predicted and ground_truth point, we'll use a regression loss: the mean squared error.
* It's typical to use an Adam optimizer for recurrent models.

In [None]:
# MSE loss and Adam optimizer with a learning rate of 0.01
# Both are module-level names that train() reads as globals; the optimizer is
# bound to the parameters of the `rnn` instance that exists when this cell runs.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.01) 

### Defining the training function

This function takes in an rnn, a number of steps to train for, and returns a trained rnn. This function is also responsible for displaying the loss and the predictions, every so often.

#### Hidden State

Pay close attention to the hidden state, here:
* Before looping over a batch of training data, the hidden state is initialized
* After a new hidden state is generated by the rnn, we get the latest hidden state, and use that as input to the rnn for the following steps

In [None]:
# train the RNN
def train(rnn, n_steps, print_every, seq_length=20):
    """Train ``rnn`` to predict the next point of a sine wave.

    Every step generates ``seq_length + 1`` samples of ``sin`` over the
    interval ``[step * pi, (step + 1) * pi]``, feeds the first ``seq_length``
    samples as input and uses the samples shifted by one as targets.

    The loss and the optimizer are read from the module-level names
    ``criterion`` and ``optimizer``.

    Args:
        rnn: Model called as ``rnn(x, hidden)`` and returning
            ``(prediction, hidden)``.
        n_steps: Number of training steps.
        print_every: Print the loss and plot the predictions every this many
            steps; ``0`` or ``None`` disables the reporting.
        seq_length: Number of input points per step. It used to be read from
            a global that no cell defines, so it is now an explicit parameter.

    Returns:
        The same ``rnn`` object, trained in place.
    """
    # initialize the hidden state
    hidden = None

    for step in range(n_steps):
        # defining the training data
        time_steps = np.linspace(step * np.pi, (step + 1) * np.pi, seq_length + 1)
        data = np.sin(time_steps).reshape((seq_length + 1, 1))  # input_size=1

        x = data[:-1]
        y = data[1:]

        # convert data into Tensors
        x_tensor = torch.Tensor(x).unsqueeze(0)  # unsqueeze adds a batch dimension of size 1
        y_tensor = torch.Tensor(y)

        # outputs from the rnn
        prediction, hidden = rnn(x_tensor, hidden)

        ## Representing Memory ##
        # detach the hidden state from its history; this way we don't
        # backpropagate through the entire history
        hidden = hidden.detach()

        # calculate the loss
        loss = criterion(prediction, y_tensor)
        # zero gradients
        optimizer.zero_grad()
        # perform backprop and update weights
        loss.backward()
        optimizer.step()

        # display loss and predictions; the truthiness check avoids a
        # modulo-by-zero when reporting is switched off
        if print_every and step % print_every == 0:
            print('Loss: ', loss.item())
            plt.plot(time_steps[1:], x, 'r.')  # input
            plt.plot(time_steps[1:], prediction.detach().numpy().flatten(), 'b.')  # predictions
            plt.show()

    return rnn
