In [1]:
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install statsmodels
# !pip install sklearn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import pickle
import math

In [3]:
import tensorflow.keras as keras
from statsmodels.api import OLS
from sklearn import svm
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.layers import Dense, LSTM

# Load dataset

In [4]:
with open('dataset.pkl', 'rb') as f:
    data = pickle.load(f)

# Generate the dataset of all subsets

In [5]:
def work_with_nan(df):
    df = df.fillna(0)
    
    return df

In [6]:
tXY = pd.DataFrame(columns = data['MutantApeYachtClub'].columns)
tXY['collection'] = None

for i in data.keys():
    tmp = data[i].copy()
    tmp = work_with_nan(tmp)
    tmp['collection'] = i
    
    tXY = pd.concat([tXY, tmp], sort=False)

print('len of the whole dataset:', len(tXY))
tXY.head()

len of the whole dataset: 9563


Unnamed: 0,dates,prices_avg,prices_floor,prices_median,volume,marketcap,lowwatermark,amount_transactions,amount_owners,amount_sellers,...,btc adj close,btc volume,eth adj close,eth volume,sol adj close,sol volume,change in btc,change in eth,change in sol,collection
0,2022-05-01,23420.275634,0.0,20321.779297,196543000.0,238973.5,0.0,8392,31476.0,18,...,38469.09375,27002760110,2827.756104,15332730152,89.671509,1426749080,0.0,0.0,0.0,Otherdeed
1,2022-05-02,28144.463557,0.0,14212.331055,110945500.0,2493893.0,0.0,3942,33094.0,3,...,38529.328125,32922642426,2857.4104,18609741545,87.581017,1265220129,60.234375,29.654297,-2.090492,Otherdeed
2,2022-05-03,23819.450167,0.0,12583.413086,39230630.0,1252085.0,0.0,1647,33520.0,4,...,37750.453125,27326943244,2783.476318,13026093219,85.842377,1011988792,-778.875,-73.934082,-1.73864,Otherdeed
3,2022-05-04,23066.77605,0.0,11631.999023,7888837.0,728096700.0,0.0,342,33825.0,72,...,39698.371094,36754404490,2940.644775,18186749944,92.770851,1428455342,1947.917969,157.168457,6.928474,Otherdeed
4,2022-05-05,38025.247593,9555.102539,12412.468262,5399585.0,860742700.0,0.0,142,33831.0,1037,...,36575.140625,43106256317,2749.213135,22642925048,84.596024,1805979399,-3123.230469,-191.431641,-8.174828,Otherdeed


In [7]:
with open('all data.pkl', 'wb') as f:
    pickle.dump(tXY, f)

# Feature engineering

In [8]:
tXY.head()

Unnamed: 0,dates,prices_avg,prices_floor,prices_median,volume,marketcap,lowwatermark,amount_transactions,amount_owners,amount_sellers,...,btc adj close,btc volume,eth adj close,eth volume,sol adj close,sol volume,change in btc,change in eth,change in sol,collection
0,2022-05-01,23420.275634,0.0,20321.779297,196543000.0,238973.5,0.0,8392,31476.0,18,...,38469.09375,27002760110,2827.756104,15332730152,89.671509,1426749080,0.0,0.0,0.0,Otherdeed
1,2022-05-02,28144.463557,0.0,14212.331055,110945500.0,2493893.0,0.0,3942,33094.0,3,...,38529.328125,32922642426,2857.4104,18609741545,87.581017,1265220129,60.234375,29.654297,-2.090492,Otherdeed
2,2022-05-03,23819.450167,0.0,12583.413086,39230630.0,1252085.0,0.0,1647,33520.0,4,...,37750.453125,27326943244,2783.476318,13026093219,85.842377,1011988792,-778.875,-73.934082,-1.73864,Otherdeed
3,2022-05-04,23066.77605,0.0,11631.999023,7888837.0,728096700.0,0.0,342,33825.0,72,...,39698.371094,36754404490,2940.644775,18186749944,92.770851,1428455342,1947.917969,157.168457,6.928474,Otherdeed
4,2022-05-05,38025.247593,9555.102539,12412.468262,5399585.0,860742700.0,0.0,142,33831.0,1037,...,36575.140625,43106256317,2749.213135,22642925048,84.596024,1805979399,-3123.230469,-191.431641,-8.174828,Otherdeed


In [9]:
# label encoder
def label_encoder(df):
    label = LabelEncoder()
    df["collection"] = label.fit_transform(df["collection"])
    
    return df

In [10]:
# Normalizing the Data
def normalize(df):
    scaler = MinMaxScaler(feature_range=(0, 1))
    df = scaler.fit_transform(df)
    
    return df

# Models

## LSTM

In [64]:
tmp = tXY.copy()
tmp = tmp.drop(['prices_floor', 'prices_median', 'dates'], axis=1)
tmp = label_encoder(tmp)
tmp = tmp.astype('float32')

scaler = MinMaxScaler(feature_range=(0, 1))
DataScaler = scaler.fit(tmp)
X = DataScaler.transform(tmp)

In [70]:
# Extracting the closing prices of each day
FullData=tmp[['prices_avg']].values
print(FullData[0:5])
 
# Feature Scaling for fast training of neural networks
from sklearn.preprocessing import StandardScaler, MinMaxScaler
 
# Choosing between Standardization or normalization
#sc = StandardScaler()
sc=MinMaxScaler()
 
DataScaler = sc.fit(FullData)
X=DataScaler.transform(FullData)
#X=FullData
 
print('### After Normalization ###')
X[0:5]

[[23420.275]
 [28144.463]
 [23819.45 ]
 [23066.775]
 [38025.246]]
### After Normalization ###


array([[0.02832816],
       [0.03404449],
       [0.02881117],
       [0.02790042],
       [0.04600037]], dtype=float32)

In [71]:
# split into samples
X_samples = list()
y_samples = list()

NumerOfRows = len(X)
TimeSteps=10  # next day's Price Prediction is based on last how many past day's prices

# Iterate thru the values to create combinations
for i in range(TimeSteps , NumerOfRows , 1):
    x_sample = X[i-TimeSteps:i]
    y_sample = X[i]
    X_samples.append(x_sample)
    y_samples.append(y_sample)

################################################
# Reshape the Input as a 3D (number of samples, Time Steps, Features)
X_data=np.array(X_samples)
X_data=X_data.reshape(X_data.shape[0],X_data.shape[1], 1)
print('\n#### Input Data shape ####')
print(X_data.shape)

# We do not reshape y as a 3D data  as it is supposed to be a single column only
y_data=np.array(y_samples)
y_data=y_data.reshape(y_data.shape[0], 1)
print('\n#### Output Data shape ####')
print(y_data.shape)


#### Input Data shape ####
(9553, 10, 1)

#### Output Data shape ####
(9553, 1)


In [72]:
# Choosing the number of testing data records
TestingRecords=5

# Splitting the data into train and test
X_train=X_data[:-TestingRecords]
X_test=X_data[-TestingRecords:]
y_train=y_data[:-TestingRecords]
y_test=y_data[-TestingRecords:]

############################################

# Printing the shape of training and testing
print('\n#### Training Data shape ####')
print(X_train.shape)
print(y_train.shape)
print('\n#### Testing Data shape ####')
print(X_test.shape)
print(y_test.shape)


#### Training Data shape ####
(9548, 10, 1)
(9548, 1)

#### Testing Data shape ####
(5, 10, 1)
(5, 1)


In [73]:
# Defining Input shapes for LSTM
TimeSteps=X_train.shape[1]
TotalFeatures=X_train.shape[2]
print("Number of TimeSteps:", TimeSteps)
print("Number of Features:", TotalFeatures)

Number of TimeSteps: 10
Number of Features: 1


In [79]:
# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# Initialising the RNN
regressor = Sequential()

# Adding the First input hidden layer and the LSTM layer
# return_sequences = True, means the output of every time step to be shared with hidden next layer
regressor.add(LSTM(units = 10, activation = 'relu', input_shape = (TimeSteps, TotalFeatures), return_sequences=True))

# Adding the Second Second hidden layer and the LSTM layer
regressor.add(LSTM(units = 5, activation = 'relu', input_shape = (TimeSteps, TotalFeatures), return_sequences=True))

# Adding the Second Third hidden layer and the LSTM layer
regressor.add(LSTM(units = 5, activation = 'relu', return_sequences=False ))


# Adding the output layer
regressor.add(Dense(units = 1))

# Compiling the RNN
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

##################################################

import time
# Measuring the time taken by the model to train
StartTime=time.time()

# Fitting the RNN to the Training set
regressor.fit(X_train, y_train, batch_size = 5, epochs = 5)

EndTime=time.time()
print("## Total Time Taken: ", round((EndTime-StartTime)/60), 'Minutes ##')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
## Total Time Taken:  0 Minutes ##


In [80]:
# Making predictions on test data
predicted_Price = regressor.predict(X_test)
predicted_Price = DataScaler.inverse_transform(predicted_Price)
 
# Getting the original price values for testing data
orig=y_test
orig=DataScaler.inverse_transform(y_test)
 
# Accuracy of the predictions
print('Accuracy:', 100 - (100*(abs(orig-predicted_Price)/orig)).mean())
 

Accuracy: -351.42059326171875
