In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import plotly.graph_objs as go
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from math import sqrt
from sklearn import svm
import plotly.offline as py
py.init_notebook_mode(connected=True)

Using TensorFlow backend.


In [2]:
df = pd.read_csv('bitstampUSD_1-min_data_2012-01-01_to_2018-11-11.csv')

In [3]:
df.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
1,1325317980,,,,,,,
2,1325318040,,,,,,,
3,1325318100,,,,,,,
4,1325318160,,,,,,,


In [4]:
df.tail()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
3603131,1541894100,6348.54,6348.54,6348.54,6348.54,0.007997,50.769274,6348.54
3603132,1541894160,6348.54,6349.01,6348.54,6349.01,0.011729,74.466671,6348.93609
3603133,1541894220,6349.01,6349.01,6349.01,6349.01,0.068436,434.503642,6349.01
3603134,1541894280,,,,,,,
3603135,1541894340,6349.17,6349.32,6349.17,6349.32,0.038261,242.92741,6349.214148


In [5]:
df.shape

(3603136, 8)

In [6]:
df.isnull().sum()

Timestamp                  0
Open                 1214307
High                 1214307
Low                  1214307
Close                1214307
Volume_(BTC)         1214307
Volume_(Currency)    1214307
Weighted_Price       1214307
dtype: int64

In [7]:
df = df.dropna()

In [8]:
df.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
478,1325346600,4.39,4.39,4.39,4.39,48.0,210.72,4.39
547,1325350740,4.5,4.57,4.5,4.57,37.862297,171.380338,4.526411
548,1325350800,4.58,4.58,4.58,4.58,9.0,41.22,4.58
1224,1325391360,4.58,4.58,4.58,4.58,1.502,6.87916,4.58


In [9]:
df.isnull().sum()

Timestamp            0
Open                 0
High                 0
Low                  0
Close                0
Volume_(BTC)         0
Volume_(Currency)    0
Weighted_Price       0
dtype: int64

## Linear Regression model with single feature

In [10]:
df1 = df[['Timestamp', 'Weighted_Price']]

In [11]:
X = df1.drop('Weighted_Price', axis = 1)
y = df1['Weighted_Price'].copy()

In [12]:
bitcoin_x_train, bitcoin_x_test, bitcoin_y_train, bitcoin_y_test = train_test_split(X, y, test_size = 0.20, random_state = 41) 

In [13]:
print(bitcoin_x_train.shape)
print(bitcoin_x_test.shape)
print(bitcoin_y_train.shape)
print(bitcoin_y_test.shape)

(1911063, 1)
(477766, 1)
(1911063,)
(477766,)


In [14]:
regr = linear_model.LinearRegression()

In [15]:
regr.fit(bitcoin_x_train, bitcoin_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
bitcoin_y_pred = regr.predict(bitcoin_x_test)

In [17]:
print("Mean squared error for Linear Regression with single feature: %.3f"
      % mean_squared_error(bitcoin_y_test, bitcoin_y_pred))

Mean squared error for Linear Regression with single feature: 6428612.095


## Linear Regression with multiple feature

In [18]:
df2 = df[['Volume_(BTC)', 'Timestamp', 'Weighted_Price']]

In [None]:
X = df2.drop('Weighted_Price', axis = 1)
y = df2['Weighted_Price'].copy()

In [None]:
bitcoin_x_train, bitcoin_x_test, bitcoin_y_train, bitcoin_y_test = train_test_split(X, y, test_size = 0.20, random_state = 41) 

In [None]:
print(bitcoin_x_train.shape)
print(bitcoin_x_test.shape)
print(bitcoin_y_train.shape)
print(bitcoin_y_test.shape)

(1911063, 2)
(477766, 2)
(1911063,)
(477766,)


In [None]:
regr.fit(bitcoin_x_train, bitcoin_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [None]:
bitcoin_y_pred = regr.predict(bitcoin_x_test)

In [None]:
print("Mean squared error for Linear Regression with multiple features: %.3f"
      % mean_squared_error(bitcoin_y_test, bitcoin_y_pred))

Mean squared error for Linear Regression with multiple features: 6421020.470


## Data Analysis

In [None]:
df['date'] = pd.to_datetime(df['Timestamp'], unit='s').dt.date
group = df.groupby('date')
Real_Price = group['Weighted_Price'].mean()

In [None]:
Real_Price.shape

(2504,)

In [None]:
prediction_days = 300
df_train = Real_Price[:len(Real_Price)-prediction_days-1]
df_test = Real_Price[len(Real_Price)-prediction_days:]

In [None]:
working_data = [df_train, df_test]
working_data = pd.concat(working_data)
working_data = working_data.reset_index()
working_data['date'] = pd.to_datetime(working_data['date'])
working_data = working_data.set_index('date')

In [None]:
s = sm.tsa.seasonal_decompose(working_data.Weighted_Price.values, freq=60)

trace1 = go.Scatter(x = np.arange(0, len(s.trend), 1),y = s.trend,mode = 'lines',name = 'Trend',
    line = dict(color = ('rgb(244, 146, 65)'), width = 4))
trace2 = go.Scatter(x = np.arange(0, len(s.seasonal), 1),y = s.seasonal,mode = 'lines',name = 'Seasonal',
    line = dict(color = ('rgb(66, 244, 155)'), width = 2))

trace3 = go.Scatter(x = np.arange(0, len(s.resid), 1),y = s.resid,mode = 'lines',name = 'Residual',
    line = dict(color = ('rgb(209, 244, 66)'), width = 2))

trace4 = go.Scatter(x = np.arange(0, len(s.observed), 1),y = s.observed,mode = 'lines',name = 'Observed',
    line = dict(color = ('rgb(66, 134, 244)'), width = 2))

data = [trace1, trace2, trace3, trace4]
layout = dict(title = 'Seasonal decomposition', xaxis = dict(title = 'Time'), yaxis = dict(title = 'Price, USD'))
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='seasonal_decomposition')

Prediction using Recurrent neural networks for day wise data

In [None]:
print("Train data length:" + str(len(df_train)))
print("Test data length:" + str(len(df_test)))

Train data length:2203
Test data length:300


In [None]:
training_set = df_train.values
training_set = np.reshape(training_set, (len(training_set), 1))
sc = MinMaxScaler()
training_set = sc.fit_transform(training_set)
#print(len(training_set))
X_train = training_set[0:len(training_set)-1]
#print(X_train)
#print(len(X_train))
y_train = training_set[1:len(training_set)]
#print(y_train)
#print(len(y_train))
X_train = np.reshape(X_train, (len(X_train), 1, 1))

In [None]:
regressor = Sequential()

regressor.add(LSTM(units = 4, activation = 'sigmoid', input_shape = (None, 1)))

regressor.add(Dense(units = 1))

regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

regressor.fit(X_train, y_train, batch_size = 5, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x16ce4b05cf8>

In [None]:
test_set = df_test.values
inputs = np.reshape(test_set, (len(test_set), 1))
inputs = sc.transform(inputs)
inputs = np.reshape(inputs, (len(inputs), 1, 1))
predicted_BTC_price = regressor.predict(inputs)
prediction1_inverse = sc.inverse_transform(predicted_BTC_price)
Y_test_inverse = sc.inverse_transform(inputs.reshape(-1,1))
prediction2_inverse = np.array(prediction1_inverse[:,0][0:])
Y_test2_inverse = np.array(Y_test_inverse[:,0])

In [None]:
trace1 = go.Scatter(
    x = np.arange(0, len(prediction2_inverse), 1),
    y = prediction2_inverse,
    mode = 'lines',
    name = 'Predicted labels',
    line = dict(color=('rgb(244, 146, 65)'), width=2)
)
trace2 = go.Scatter(
    x = np.arange(0, len(Y_test2_inverse), 1),
    y = Y_test2_inverse,
    mode = 'lines',
    name = 'True labels',
    line = dict(color=('rgb(66, 244, 155)'), width=2)
)

data = [trace1, trace2]
layout = dict(title = 'Comparison of true prices (on the test dataset) with prices our model predicted',
             xaxis = dict(title = 'Day number'), yaxis = dict(title = 'Price, USD'))
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='results_demonstrating0')

In [None]:
RMSE = sqrt(mean_squared_error(Y_test2_inverse, prediction2_inverse))
print('Test RMSE: %.3f' % RMSE)

Test RMSE: 168.448


Prediction using Recurrent neural networks for minute wise data

In [None]:
df1['minute'] = df['Timestamp']
group1 = df1.groupby('minute')
Real_Price1 = group1['Weighted_Price'].mean()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [None]:
Real_Price1.shape

(2388829,)

In [None]:
prediction_days = 300000
df_train = Real_Price1[:len(Real_Price)-prediction_days-1]
df_test = Real_Price1[len(Real_Price)-prediction_days:]

In [None]:
print("Train data length:" + str(len(df_train)))
print("Test data length:" + str(len(df_test)))

Train data length:2091332
Test data length:297496


In [None]:
training_set = df_train.values
training_set = np.reshape(training_set, (len(training_set), 1))
sc = MinMaxScaler()
training_set = sc.fit_transform(training_set)
#print(len(training_set))
X_train = training_set[0:len(training_set)-1]
#print(X_train)
#print(len(X_train))
y_train = training_set[1:len(training_set)]
#print(y_train)
#print(len(y_train))
X_train = np.reshape(X_train, (len(X_train), 1, 1))

In [None]:
regressor = Sequential()

regressor.add(LSTM(units = 4, activation = 'sigmoid', input_shape = (None, 1)))

regressor.add(Dense(units = 1))

regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

regressor.fit(X_train, y_train, batch_size = 50, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
test_set = df_test.values
inputs = np.reshape(test_set, (len(test_set), 1))
inputs = sc.transform(inputs)
inputs = np.reshape(inputs, (len(inputs), 1, 1))
predicted_BTC_price = regressor.predict(inputs)
prediction1_inverse = sc.inverse_transform(predicted_BTC_price)
Y_test_inverse = sc.inverse_transform(inputs.reshape(-1,1))
prediction2_inverse = np.array(prediction1_inverse[:,0][0:])
Y_test2_inverse = np.array(Y_test_inverse[:,0])

In [None]:
trace1 = go.Scatter(
    x = np.arange(0, len(prediction2_inverse), 1),
    y = prediction2_inverse,
    mode = 'lines',
    name = 'Predicted labels',
    line = dict(color=('rgb(244, 146, 65)'), width=2)
)
trace2 = go.Scatter(
    x = np.arange(0, len(Y_test2_inverse), 1),
    y = Y_test2_inverse,
    mode = 'lines',
    name = 'True labels',
    line = dict(color=('rgb(66, 244, 155)'), width=2)
)

data = [trace1, trace2]
layout = dict(title = 'Comparison of true prices (on the test dataset) with prices our model predicted',
             xaxis = dict(title = 'Minute number'), yaxis = dict(title = 'Price, USD'))
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='results_demonstrating0')

In [None]:
RMSE = sqrt(mean_squared_error(Y_test2_inverse, prediction2_inverse))
print('RMSE error for minute wise data: %.3f' % RMSE)