# Data Collection

Data for prediction can be collected from either Kaggle or Poloniex. To ensure consistency, the column names of the data collected from Poloniex are changed to match Kaggle’s.

In [14]:
import json
import numpy as np
import os
import pandas as pd
import urllib
from urllib.request import urlopen

# Download XRP/USDT 5-minute candles from Poloniex's public chart-data API and
# store them, oldest first, in a single CSV file.
url_0 = 'https://poloniex.com/public?command=returnChartData&currencyPair=USDT_XRP&start=1612860000&end=1618242600&period=300'

# url[k] and length[k] describe request k: its address and the number of rows
# it is expected to return. Entry 0 is the most recent window.
url = [url_0]
length = [17943]
start = 1612860000
end = 1621499700

url_left = 'https://poloniex.com/public?command=returnChartData&currencyPair=USDT_XRP&start='
url_mid = '&end='
url_right = '&period=300'

# Poloniex forbids large requests, so the older history is fetched in 18
# further windows. Each step moves the window 8640000 seconds back in time,
# which corresponds to 28800 candles at a 300-second period.
for _ in range(18):
    start -= 8640000
    end -= 8640000
    url.append(url_left + str(start) + url_mid + str(end) + url_right)
    length.append(28800)

# Change data request length, because some data are missing from poloniex.
length[18] = 28794
length[17] = 28766
length[16] = 28764
length[15] = 28799
length[6] = 28796
length[2] = 28799

# Column names as returned by the API, and the names they are stored under.
original_columns = [u'date', u'high', u'low', u'open', u'close']
new_columns = ['Timestamp', 'High', 'Low', 'Open', 'Close']

csv_path = 'data/ripple2016to2021.csv'
# Create the output directory up front so the first write cannot fail on a
# missing folder.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# The requests were generated newest-first; walk them in reverse so the CSV
# ends up in chronological order. The first chunk creates the file and writes
# the header row, every later chunk is appended without a header.
header_pending = True
for chunk_url, n_rows in zip(reversed(url), reversed(length)):
    # The context manager closes the connection even if read() raises.
    with urlopen(chunk_url) as openUrl:
        r = openUrl.read()
    d = json.loads(r.decode())

    # The rows below are built from a list of records; anything else (for
    # example an error object) must not be written to the CSV as data.
    if not isinstance(d, list):
        raise ValueError('Unexpected response from {}: {!r}'.format(chunk_url, d))

    # Parse json returned from the API to Pandas DF. The index is sized from
    # the row count expected for this window.
    df = pd.DataFrame(d, index=range(n_rows), columns=original_columns)

    # Rename data table
    df.columns = new_columns
    df.to_csv(
        csv_path,
        mode='w' if header_pending else 'a',
        index=False,
        header=header_pending,
    )
    header_pending = False

# Preview of the last chunk written (the most recent window).
df.head()

Unnamed: 0,Timestamp,High,Low,Open,Close
0,1612860000,0.4743,0.472879,0.474271,0.474276
1,1612860300,0.474507,0.474178,0.474276,0.474451
2,1612860600,0.474141,0.473067,0.474141,0.473875
3,1612860900,0.473562,0.472824,0.473539,0.472931
4,1612861200,0.472445,0.4713,0.472445,0.4716


# Data Preparation

Data collected from the source needs to be parsed before it can be sent to the model for prediction. The PastSampler class splits the data into a list of input samples and labels. The input size (N) is 256, while the output size (K) is 16. Note that the data collected from Poloniex is sampled at 5-minute intervals. This means that the input spans 1280 minutes, while the output covers 80 minutes.

In [15]:
class PastSampler:
    '''
    Cuts a sequence into (past, future) pairs for training a forecaster.
    '''

    def __init__(self, N, K, sliding_window = True):
        '''
        N: number of past steps in every sample.
        K: number of future steps in every target.
        sliding_window: if true, consecutive windows start one step apart
        (overlapping); otherwise they start N + K steps apart (back to back).
        '''
        self.N = N
        self.K = K
        self.sliding_window = sliding_window

    def transform(self, A):
        '''
        Split A into windows of N + K consecutive steps along its first axis.

        A is indexed with three axes; windows are taken along axis 0.
        Returns a tuple (samples, targets) where samples has shape
        (windows, N * A.shape[1], A.shape[2]) and targets has shape
        (windows, K * A.shape[1], A.shape[2]). Trailing steps that do not
        fill a complete window are dropped.
        '''
        window = self.N + self.K    # steps per row (sample + target)

        # Distance between the first steps of consecutive windows.
        stride = 1 if self.sliding_window else window

        # Every start position that still leaves room for a full window.
        first_steps = np.arange(0, A.shape[0] - window + 1, stride)

        # Row r of `picks` holds the step indices first_steps[r] .. + window - 1.
        picks = first_steps[:, np.newaxis] + np.arange(window)

        # Gather the windows and fold the window axis into axis 1.
        rows = A[picks].reshape(-1, window * A.shape[1], A.shape[2])

        # Position along axis 1 where the past part ends and the target begins.
        split = self.N * A.shape[1]
        samples = rows[:, :split]
        targets = rows[:, split:]
        return samples, targets

# Data file path
dfp = 'data/ripple2016to2021.csv'

# Columns of price data to use
columns = ['Close']

# Read the CSV once and derive every view from that single parse.
df = pd.read_csv(dfp)
time_stamps = df['Timestamp']
df = df.loc[:, columns]

# Independent copy of the selected columns, so that later in-place changes to
# df leave these values untouched.
original_df = df.copy()

After creating the PastSampler class, we apply it to the collected data. Since the original data ranges from 0 to over 60000, data scaling is needed so that the neural network can learn from the data more easily.

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Name of the HDF5 file that receives the prepared arrays.
file_name = 'ripple2016to2021_close.h5'

# Normalization: each selected column is passed through the scaler as a
# single-feature column vector and written back in place.
scaler = MinMaxScaler()
for c in columns:
    column_vector = df[c].values.reshape(-1, 1)
    df[c] = scaler.fit_transform(column_vector)

# Features are input sample dimensions(channels).
# Insert a length-1 middle axis so every array is indexed with three axes,
# which is what PastSampler.transform reads (shape[0], shape[1], shape[2]).
A = np.array(df)[:, np.newaxis, :]
original_A = np.array(original_df)[:, np.newaxis, :]
time_stamps = np.array(time_stamps)[:, np.newaxis, np.newaxis]

# Make samples of temporal sequences of pricing data (channel)
NPS, NFS = 256, 16         #Number of past and future samples
ps = PastSampler(NPS, NFS, sliding_window=False)
B, Y = ps.transform(A)                                  # scaled values
input_times, output_times = ps.transform(time_stamps)   # matching timestamps
original_B, original_Y = ps.transform(original_A)       # unscaled values

# Create h5 file.
import h5py

# (dataset name, array) pairs, written in this order.
h5_datasets = [
    ("inputs", B),
    ('outputs', Y),
    ("input_times", input_times),
    ('output_times', output_times),
    ("original_datas", np.array(original_df)),
    ('original_inputs', original_B),
    ('original_outputs', original_Y),
]
with h5py.File(file_name, 'w') as f:
    for dataset_name, dataset_values in h5_datasets:
        f.create_dataset(dataset_name, data=dataset_values)