In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import metrics
import plotly as plt
import plotly.graph_objects as go
import plotly.express as px
import datetime
import joblib

In [13]:
df=pd.read_csv('../data/input_data/bitfinix.csv')
df.time = pd.to_datetime(df.time, unit='ms')

In [14]:
df = df.set_index('time').asfreq('1Min')

In [16]:
df = df.mask(np.isinf(df))
cols = ['open','high','low', 'close'] 
df[cols] = df[cols].fillna(method='ffill')
df.update(df['volume'].fillna(0))

In [17]:
def StochRSI(price, window=14):
    delta = price['close'].diff()
    upDays = delta.copy()
    upDays[delta<=0]=0.0
    downDays = abs(delta.copy())
    downDays[delta>0]=0.0
    

    RolUp = upDays.rolling(window).mean()
    RolDown = downDays.rolling(window).mean()

    RS = RolUp / RolDown

    rsi= 100.0 - (100.0 / (1.0 + RS))
    df['RSI'] = rsi
    stochrsi  = (rsi - rsi.rolling(window).min()) / (rsi.rolling(window).max() - rsi.rolling(window).min())
    df['SRSI'] = stochrsi
    
StochRSI(df)

In [18]:
def addBollinger(df, period=20, col='close'):
    bbmid_series = df[col].rolling(window=period).mean()
    series_stdev = df[col].rolling(window=period).std()
    BBUpperBand = bbmid_series + 2*series_stdev
    BBLowerBand = bbmid_series - 2*series_stdev
    df['B%'] = (df['close'] - BBLowerBand)/(BBUpperBand - BBLowerBand)
    df['BBBandwidth'] = (BBUpperBand - BBLowerBand ) / bbmid_series
    return df

df = addBollinger(df)

In [19]:
def MACD(df, n_fast, n_slow):
    EMAfast = pd.Series(df['close'].ewm(span = n_fast, min_periods = n_fast - 1).mean())  
    EMAslow = pd.Series(df['close'].ewm(span = n_slow, min_periods = n_slow - 1).mean()) 
    MACD = pd.Series(EMAfast - EMAslow, name = 'MACD')  
    df = df.join(MACD)  
    return df

df = MACD(df,12,26)

In [20]:
def WillR(df):
    highest_high = df['high'].rolling(window=14,center=False).max()
    lowest_low = df['low'].rolling(window=14,center=False).min()
    df['WillR'] = (-100) * ((highest_high - df['close']) / (highest_high - lowest_low))
willR = WillR(df)

In [21]:
df['y_shifted'] = df['close'].shift(10)

In [22]:
df = df.mask(np.isinf(df))

In [23]:
df.fillna(0,inplace=True)

In [24]:
df_split_date = pd.Timestamp('2017-01-01 00:00:00')

In [25]:
df = df.reset_index()

In [26]:
df = df[(pd.to_datetime(df["time"]) >= df_split_date)]

In [32]:
split_date = pd.Timestamp('2019-01-01 00:00:00')

In [33]:
train_test_data = df[(pd.to_datetime(df["time"]) < split_date)]

In [34]:
validation_data = df[(pd.to_datetime(df["time"]) >= split_date)]

In [35]:
split_row = len(train_test_data) - int(0.2 * len(train_test_data))
train_data = train_test_data.iloc[:split_row]
test_data = train_test_data.iloc[split_row:]

In [36]:
#x_train = train_data[['high','low','open','volume']]
#y_train = train_data[['close']]
#x_test = test_data[['high','low','open','volume']]
#y_test = test_data[['close']]

x_train = train_data[['high','low','open','volume','RSI','SRSI','B%','BBBandwidth','MACD','WillR']]
y_train = train_data[['y_shifted']]
x_test = test_data[['high','low','open','volume','RSI','SRSI','B%','BBBandwidth','MACD','WillR']]
y_test = test_data[['y_shifted']]

In [51]:
candlestickFigure = go.Figure()
SRSIFig = go.Figure()
candlestickFigure.add_trace(go.Candlestick(x=train_test_data['time'].tail(10000),
                open=train_test_data['open'].tail(10000),
                high=train_test_data['high'].tail(10000),
                low=train_test_data['low'].tail(10000),
                close=train_test_data['close'].tail(10000)))
candlestickFigure.update_layout(title='Bitcoin OHLC Price', xaxis_rangeslider_visible=True,  yaxis_fixedrange= False)
candlestickFigure.show()


In [28]:
import plotly.figure_factory as ff
corrs = x_train.corr()

correlation_figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)
correlation_figure.show()

In [22]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth= 100,
                           n_estimators=400,
                           min_samples_split=2,
                           max_features=4,
                           n_jobs=30
                          )
rf.fit(x_train, np.ravel(y_train));


In [21]:
filename = 'randomforestpredict.sav'
joblib.dump(rf,  filename,compress=10)

['randomforestpredict.sav']

In [None]:
randomForest_Model = joblib.load('randomforest.sav')

In [29]:
from plotly.offline import plot as py

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)

cols = ['high','low','open','volume','RSI','SRSI','B%','BBBandwidth','MACD','WillR']
data = []

# Print the feature ranking
print("Feature ranking:")
for f in range(x_train.shape[1]):
    data.append(importances[indices[f]])
features = pd.DataFrame([(data[0], data[1], data[2], data[3], data[4], data[5],
                         data[6], data[7], data[8], data[9])], columns=cols)  

trace = go.Bar(x=cols, y=importances[indices],
               marker=dict(color='red'),
               opacity=0.5
              )

layout = go.Layout(title="Feature importances")
fig = go.Figure(data=[trace], layout=layout)
fig.show()

Feature ranking:


In [30]:
features

Unnamed: 0,high,low,open,volume,RSI,SRSI,B%,BBBandwidth,MACD,WillR
0,0.424323,0.348115,0.198008,0.018328,0.005418,0.005213,0.000421,0.000166,6e-06,7.804116e-07


In [32]:
predicted_y = rf.predict(x_test)

In [93]:
forecasted_close_30min = pd.DataFrame(predicted_y, columns=['y_shifted'])

In [100]:
import pickle
with open('randomforestPredict.pickle', 'wb') as f:
    pickle.dump([validation_data,forecasted_close_10min,forecasted_close_30min,forecasted_close_60min], f)

In [33]:
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
print("Root Mean Squared Error(RMSE) : ", sqrt(mean_squared_error(y_test,predicted_y)))
print("Mean Absolute Error(MAE) : ", mean_absolute_error(y_test,predicted_y))
r2 =  r2_score(y_test, predicted_y)
print("R Squared (R2) : ",r2)


Root Mean Squared Error(RMSE) :  2.670360055028455
Mean Absolute Error(MAE) :  1.1507454239552313
R Squared (R2) :  0.9999951763536954


In [60]:
adj_r2 = 1 - (1 - r2 ** 2) * ((x_train.shape[1] - 1) / (x_train.shape[0] - x_train.shape[1] - 1))

In [87]:
print(rf.score(x_train, y_train), 1 - (1-rf.score(x_train, y_train))*(len(y_train)-1)/(len(y_train)-x_train.shape[1]-1))

0.9999997678637826 0.9999997678610222


In [28]:
import pickle
with open('randomforestPredict.pickle', 'rb') as f:
    actual,forecasted_close_10min,forecasted_close_30min,forecasted_close_60min = pickle.load(f)

In [32]:
forecasted_close_10min.head()

Unnamed: 0,close
0,3832.23725
1,3832.6925
2,3832.5695
3,3832.40675
4,3829.28


In [31]:
forecasted_close_10min = pd.DataFrame(forecasted_close_10min, columns=['close'])
forecasted_close_30min = pd.DataFrame(forecasted_close_30min, columns=['close'])
forecasted_close_60min = pd.DataFrame(forecasted_close_60min, columns=['close'])

In [170]:
time = pd.DataFrame(actual['time'].head(10).values, columns=['time'])

In [171]:
df_10min = pd.concat([time, forecasted_close_10min], ignore_index=True, axis=1)
df_10min.columns = ['time','close']
df_30min = pd.concat([time, forecasted_close_30min], ignore_index=True, axis=1)
df_30min.columns = ['time','close']
df_60min = pd.concat([time, forecasted_close_60min], ignore_index=True, axis=1)
df_60min.columns = ['time','close']

In [172]:
predctionFig = go.Figure()
predctionFig.add_trace(go.Scatter(x=actual['time'].head(10), y=actual['close'].head(10), name="Actual",mode='lines',
                         line_color='deepskyblue'))
predctionFig.add_trace(go.Scatter(x=df_10min['time'], y=df_10min['close'], name="10 minutes ahead",mode='lines',
                         line_color='violet'))

predctionFig.add_trace(go.Scatter(x=df_30min['time'], y=df_30min['close'], name="30 minutes ahead",mode='lines',
                         line_color='red'))

predctionFig.add_trace(go.Scatter(x=df_60min['time'], y=df_60min['close'], name="60 minutes ahead",mode='lines',
                         line_color='green'))
predctionFig.layout.update(title_text='Predicted VS Actual Value')

predctionFig.show()

In [35]:
preds_df = pd.DataFrame(predicted_y, columns=['close'])

In [None]:
predictionFig = go.Figure()
predictionFig.add_trace(go.Scatter(x=test_data['time'].tail(100000), y=preds_df['close'].tail(100000), name="Predicted"))
predictionFig.add_trace(go.Scatter(x=test_data['time'].tail(100000), y=y_test['close'].tail(100000), name="Actual"))
predictionFig.layout.update(title_text='Predicted VS Actual Value',xaxis_rangeslider_visible=True)

predictionFig.show()