In [None]:
!pip install jovian opendatasets matplotlib seaborn xgboost --upgrade --quiet


In [5]:
# Import packages
import jovian
import opendatasets as od
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

## Downloading the Data
The dataset is obtained from Kaggle. It contains information on historic trades for several cryptoassets, such as Bitcoin and Ethereum.

In [6]:
# URL of the Kaggle competition page that hosts the dataset.
data_url = 'https://www.kaggle.com/c/g-research-crypto-forecasting/data'
# Fetch the competition files with opendatasets.
# NOTE(review): presumably this prompts for Kaggle API credentials — confirm.
od.download(data_url)

In [8]:
# Directory (relative to the working directory) that holds the competition files.
data_dir = './g-research-crypto-forecasting'


In [9]:
# Show the entries present in the data directory.
os.listdir(data_dir)


In [11]:
# Execute this to save new versions of the notebook
#jovian.commit(project="g-research-crypto-forecasting")

## Data Preparation and Cleaning
In this section, train.csv will be read in. The training set has the following variables

timestamp - A timestamp for the minute covered by the row.

Asset_ID - An ID code for the cryptoasset.

Count - The number of trades that took place this minute.

Open - The USD price at the beginning of the minute.

High - The highest USD price during the minute.

Low - The lowest USD price during the minute.

Close - The USD price at the end of the minute.

Volume - The number of cryptoasset units traded during the minute.

VWAP - The volume weighted average price for the minute.

Target - 15 minute residualized returns.

The following variables will be created based on the timestamp column.

hour is the hour of the day

weekday is the day of the week (0 = Monday, ..., 6 = Sunday)

Categorical variables (Asset_ID, hour, and weekday) will be one-hot encoded.

Numeric variables (Count, Open, High, Low, Close, Volume, and VWAP) will be scaled to the value of 0 to 1 for each of the Asset_ID.

## Read in Dataset

In [12]:
# Load the competition files from `data_dir` (defined above) instead of
# repeating the directory path in every call.
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
asset_details_df = pd.read_csv(os.path.join(data_dir, 'asset_details.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'example_test.csv'))

In [13]:
# (rows, columns) of the training set as loaded from train.csv.
train_df.shape


In [14]:
# Convert the epoch-seconds timestamp column to datetime.
train_df['timestamp'] = train_df.timestamp.astype('datetime64[s]')

# Keep only November 2020. A single combined mask avoids filtering twice, and
# `.copy()` makes the result an independent frame so that later cells can add
# or modify columns without triggering SettingWithCopyWarning.
is_nov_2020 = (train_df.timestamp.dt.year == 2020) & (train_df.timestamp.dt.month == 11)
train_df = train_df[is_nov_2020].copy()
train_df.shape

The reduced training set has 598769 records.

## Handling Missing Values
Check missing values

In [15]:
# Number of missing (NaN/NaT) entries in each column.
train_df.isnull().sum()


There are 8814 records with missing values.

In [16]:
# Per-column count of entries that are NaN or +/- infinity.
non_finite_values = [np.nan, np.inf, -np.inf]
train_df.isin(non_finite_values).sum()


The following code drops the rows that contain missing or infinite values.

In [18]:
# Drop rows with missing values. Reassigning (instead of `inplace=True`) avoids
# mutating a frame that may itself be a slice of an earlier one.
train_df = train_df.dropna()

# Drop rows holding +/- infinity. The finiteness check is limited to numeric
# columns, so it does not depend on ufunc support for the datetime column and
# still works if the cell is re-run after non-numeric columns were added.
numeric_cols = train_df.select_dtypes(include='number').columns
train_df = train_df[np.isfinite(train_df[numeric_cols]).all(axis=1)].copy()

In [19]:
# Summary statistics for the cleaned training set.
train_df.describe()


## Feature Engineering
The following code creates the variables hour and weekday from the timestamp column, which was converted to datetime above.

In [20]:
# Derive calendar features from the datetime `timestamp` column.
ts = train_df['timestamp']

# Hour of the day
train_df['hour'] = ts.dt.hour

# Day of the week (`dayofweek` is an alias of `weekday`)
train_df['weekday'] = ts.dt.dayofweek

### One Hot Encode Categorical Variables
This section converts the variables Asset_ID, hour and weekday to categorical variables and fits a one-hot encoder on them.

In [21]:
# Cast Asset_ID, hour and weekday to the pandas `category` dtype, one column at a time.
for col in ['Asset_ID', 'hour', 'weekday']:
    train_df[col] = train_df[col].astype('category')

In [22]:
# Set up the encoder so that it returns a dense array and ignores categories
# not seen during fitting. scikit-learn >= 1.2 calls the dense-output option
# `sparse_output` (the old `sparse` argument was removed in 1.4); fall back to
# `sparse` on older releases, whose constructor rejects the new keyword.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the one-hot encoder on the categorical columns
encoder.fit(train_df[['Asset_ID','hour','weekday']])

In [23]:
# Columns that were passed to the encoder when it was fitted.
cat_cols = ['Asset_ID', 'hour', 'weekday']

# Get the names of the new encoded columns. `get_feature_names` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out`; use whichever exists.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out(cat_cols))
else:
    encoded_cols = list(encoder.get_feature_names(cat_cols))

# Add the one-hot encoded columns to the training frame
train_df[encoded_cols] = encoder.transform(train_df[cat_cols])

### Scale Numeric Variables
This section will scale numeric variables Count, Open, High, Low, Close, Volume, and VWAP to range from 0 to 1.

In [24]:
# Numeric feature columns to rescale.
num_cols = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']

# Fit the min-max scaler and rescale the columns in one step.
# NOTE: the scaler is fit on all rows together, i.e. across every Asset_ID,
# not separately per asset.
scaler = MinMaxScaler()
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])

## Time Series Plots
The following code gets the time range for each Asset_ID.

In [25]:
# First and last timestamp recorded for each asset.
# String aggregation names replace the NumPy callables, which pandas >= 2.1
# deprecates inside `agg` and which label the result columns 'amin'/'amax'.
# `observed=True` is explicit because Asset_ID is categorical.
train_df.groupby("Asset_ID", observed=True).agg({'timestamp': ['min', 'max']})


The date range is slightly different for each asset.



In [26]:
# Time series of Target, one line per asset, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(50, 25))
sns.lineplot(x="timestamp", y="Target", hue="Asset_ID", data=train_df, ax=ax)

The Target variable shows a very different pattern for each asset.

#### Correlation Matrix
The following code computes the correlation matrix of the numeric variables and the Target, using the records of all assets together.

In [27]:
# Pairwise correlations between the numeric features and the target,
# computed over all rows of the training frame.
corr_cols = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Target']
corr_matrix = train_df[corr_cols].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cmap='Blues', ax=ax);

The Open, High, Low, Close, and VWAP are highly correlated.