In [16]:
# for data handling:
import numpy as np
import pandas as pd
from datetime import date, datetime

# for RNN:
from tensorflow import keras
from tensorflow.keras import layers

# for Plotting:
!pip install chart-studio
import plotly.graph_objects as go



In [17]:
# import the Historic Crypto package:
!pip install Historic-Crypto
from Historic_Crypto import HistoricalData

# obtain bitcoin data, calculate returns and intraday volatility:
dataset = HistoricalData(start_date = '2013-06-06',ticker = 'BTC').retrieve_data()
dataset['Returns'] = dataset['Close'].pct_change()
dataset['Volatility'] = np.abs(dataset['Close']- dataset['Open'])
dataset.dropna(axis = 0, how = 'any', inplace = True)
dataset.head()



Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market_Cap,Returns,Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-06-07,118.97,119.0,106.42,111.5,0,1255258638,-0.055085,7.47
2013-06-08,111.0,111.42,107.3,108.3,0,1219717920,-0.0287,2.7
2013-06-09,107.89,108.99,88.5,100.0,0,1126642500,-0.076639,7.89
2013-06-10,100.44,110.1,95.0,106.35,0,1198638945,0.0635,5.91
2013-06-11,106.35,109.6,104.0,108.9,0,1227874725,0.023977,2.55


In [18]:
def plot_dates_values(data_timestamps, data_plot):
  '''
  This function provides plotly plots of the input series.

  Arguments: 
          data_timestamps: the timestamp associated with each instance of data.
          data_plot: the series of data to be plotted.

  Returns:
          fig: displays a figure of the series with a slider and buttons.
  '''

  fig = go.Figure()
  fig.add_trace(go.Scatter(x = data_timestamps, y = data_plot,
                           mode = 'lines',
                           name = data_plot.name,
                           connectgaps=True))
  fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1 Years", step="year", stepmode="backward"),
            dict(count=2, label="2 Years", step="year", stepmode="backward"),
            dict(count=3, label="3 Years", step="year", stepmode="backward"),
            dict(label="All", step="all")
        ]))) 
  
  fig.update_layout(
    title=data_plot.name,
    xaxis_title="Date",
    yaxis_title="",
    font=dict(
        family="Arial",
        size=11,
        color="#7f7f7f"
    ))
  
  return fig.show()

In [19]:
plot_dates_values(dataset.index, dataset['Volume'])

In [20]:
plot_dates_values(dataset.index, dataset['Close'])

In [21]:
plot_dates_values(dataset.index, dataset['Open'])

In [22]:
plot_dates_values(dataset.index, dataset['Volatility'])

In [23]:
plot_dates_values(dataset.index, dataset['Returns'])

In [24]:
def generate_train_test_split(data, train_end, test_start):
  '''
  This function splits the dataset into training data and testing data through
  use of strings. The strings provided as arguments for 'train_end' and
  'test_start' must be on sequential days.

  Arguments: 
          data: the DataFrame to be split into training and testing data.
          train_end: the date on which the training data ends (str).
          test_start: the date on which the testing data begins (str).

  Returns:
          training_data: data to be used in model training (Pandas DataFrame).
          testing_data: the data to be used in model testing (Pandas DataFrame).
  '''
  if isinstance(train_end, str) is False:
    raise TypeError("train_end argument should be a string.")
  
  if isinstance(test_start, str) is False:
    raise TypeError("test_start argument should be a string.")

  train_end_datetime = datetime.strptime(train_end, '%Y-%m-%d')
  test_start_datetime = datetime.strptime(test_start, '%Y-%m-%d')
  while train_end_datetime >= test_start_datetime:
    raise ValueError("train_end argument cannot occur prior to the test_start argument.")
  while abs((train_end_datetime - test_start_datetime).days) > 1:
    raise ValueError("the train_end argument and test_start argument should be seperated by 1 day.")

  training_data = data[:train_end]
  testing_data = data[test_start:]

  print('Train Dataset Shape:',training_data.shape)
  print('Test Dataset Shape:',testing_data.shape)

  return training_data, testing_data

In [25]:
# We now call the above function, generating training and testing data:
training_data, testing_data = generate_train_test_split(dataset, '2018-12-31','2019-01-01')

Train Dataset Shape: (2034, 8)
Test Dataset Shape: (656, 8)


In [26]:
def normalise_training_values(data):
  '''
  This function normalises the input values by both mean and standard deviation.
  The mean and standard deviation must be saved for test set standardisation downstream.

  Arguments: 
          data: the DataFrame column to be normalised.

  Returns:
          values: normalised data to be used in model training (numpy array).
          mean: the training set mean, to be used for normalising test set (float).
          std: the training set standard deviation, to be used for normalising the test set (float).
  '''
  if isinstance(data, pd.Series) is False:
    raise TypeError("data argument should be a Pandas Series.")

  values = data.to_list()
  mean = np.mean(values)
  values -= mean
  std = np.std(values)
  values /= std
  print("*"*80)
  print("The length of the training data is: {}".format(len(values)))
  print("The mean of the training data is: {}".format(mean.round(2)))
  print("The standard deviation of the training data is {}".format(std.round(2)))
  print("*"*80)
  return values, mean, std



In [27]:
# now call above function:
training_values, training_mean, training_std = normalise_training_values(training_data['Volume'])

********************************************************************************
The length of the training data is: 2034
The mean of the training data is: 1541822040.56
The standard deviation of the training data is 3084022262.69
********************************************************************************


In [28]:
# define the number of time-steps in each sequence:
TIME_STEPS = 30

def generate_sequences(values, time_steps = TIME_STEPS):
  '''
  This function generates sequences of length 'TIME_STEPS' to be passed to the model.

  Arguments: 
          values: the normalised values which generate sequences (numpy array).
          time_steps: the length of the sequences (int).

  Returns:
          train_data: 3D data to be used in model training (numpy array).
  '''
  if isinstance(values, np.ndarray) is False:
    raise TypeError("values argument must be a numpy array.")
  if isinstance(time_steps, int) is False:
    raise TypeError("time_steps must be an integer object.")

  output = []

  for i in range(len(values) - time_steps):
    output.append(values[i : (i + time_steps)])
  train_data = np.expand_dims(output, axis =2)
  print("Training input data shape: {}".format(train_data.shape))

  return train_data

In [29]:
# now call the above function to generate x_train:  
x_train = generate_sequences(training_values)

Training input data shape: (2004, 30, 1)


In [30]:
def define_model(x_train):
  '''
  This function uses the dimensions of x_train to generate an RNN model.

  Arguments: 
          x_train: 3D data to be used in model training (numpy array).

  Returns:
          model: the model architecture (Tensorflow Object).
          model_summary: a summary of the model's architecture.
  '''

  if isinstance(x_train, np.ndarray) is False:
    raise TypeError("The x_train argument should be a 3 dimensional numpy array.")

  num_steps = x_train.shape[1]
  num_features = x_train.shape[2]

  keras.backend.clear_session()
  
  model = keras.Sequential(
      [
       layers.Input(shape=(num_steps, num_features)),
       layers.Conv1D(filters=32, kernel_size = 15, padding = 'same', data_format= 'channels_last',
                     dilation_rate = 1, activation = 'linear'),
       layers.LSTM(units = 25, activation = 'tanh', name = 'LSTM_layer_1',return_sequences= False),
       layers.RepeatVector(num_steps),
       layers.LSTM(units = 25, activation = 'tanh', name = 'LSTM_layer_2', return_sequences= True),
       layers.Conv1D(filters = 32, kernel_size = 15, padding = 'same', data_format = 'channels_last',
                     dilation_rate = 1, activation = 'linear'),
       layers.TimeDistributed(layers.Dense(1, activation = 'linear'))
      ]
  )

  model.compile(optimizer=keras.optimizers.Adam(learning_rate = 0.001), loss = "mse")
  return model, model.summary()

In [31]:
def model_fit():
  '''
  This function calls the above 'define_model()' function, subsequently training
  the model on the x_train data.

  Arguments: 
          N/A.

  Returns:
          model: the trained model.
          history: a summary of how the model trained (training error, validation error).
  '''
  # call the define_model function above on x_train:
  model, summary = define_model(x_train)

  history = model.fit(
    x_train,
    x_train,
    epochs=400,
    batch_size=128,
    validation_split=0.1,
    callbacks=[keras.callbacks.EarlyStopping(monitor="val_loss", 
                                              patience=25, 
                                              mode="min", 
                                              restore_best_weights=True)])
  
  return model, history

In [32]:
# call the above function, generating the model and the model's history:
model, history = model_fit()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 30, 32)            512       
_________________________________________________________________
LSTM_layer_1 (LSTM)          (None, 25)                5800      
_________________________________________________________________
repeat_vector (RepeatVector) (None, 30, 25)            0         
_________________________________________________________________
LSTM_layer_2 (LSTM)          (None, 30, 25)            5100      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 30, 32)            12032     
_________________________________________________________________
time_distributed (TimeDistri (None, 30, 1)             33        
Total params: 23,477
Trainable params: 23,477
Non-trainable params: 0
____________________________________________________

In [33]:
def plot_training_validation_loss():
  '''
  This function plots the training and validation loss curves of the trained model,
  enabling visual diagnosis of underfitting (bias) or overfitting (variance).

  Arguments: 
          N/A.

  Returns:
          fig: a visual representation of the model's training loss and validation
          loss curves.

  '''
  training_validation_loss = pd.DataFrame.from_dict(history.history, orient='columns')

  fig = go.Figure()
  fig.add_trace(go.Scatter(x = training_validation_loss.index, y = training_validation_loss["loss"].round(6),
                           mode = 'lines',
                           name = 'Training Loss',
                           connectgaps=True))
  fig.add_trace(go.Scatter(x = training_validation_loss.index, y = training_validation_loss["val_loss"].round(6),
                           mode = 'lines',
                           name = 'Validation Loss',
                           connectgaps=True))
  
  fig.update_layout(
  title='Training and Validation Loss',
  xaxis_title="Epoch",
  yaxis_title="Loss",
  font=dict(
        family="Arial",
        size=11,
        color="#7f7f7f"
    ))
  
  return fig.show()

In [34]:
# call the above function:
plot_training_validation_loss()

In [35]:
def reconstruction_error(x_train):
  '''
  This function calculates the reconstruction error and displays a histogram of
  the training mean absolute error.

  Arguments: 
          x_train: 3D data to be used in model training (numpy array).

  Returns:
          fig: a visual representation of the training MAE distribution.

  '''

  if isinstance(x_train, np.ndarray) is False:
    raise TypeError("x_train argument should be a numpy array.")

  x_train_pred = model.predict(x_train)
  global train_mae_loss
  train_mae_loss = np.mean(np.abs(x_train_pred - x_train), axis = 1)
  histogram = train_mae_loss.flatten() 
  fig =go.Figure(data = [go.Histogram(x = histogram, 
                                      histnorm = 'probability',
                                      name = 'MAE Loss')])  
  fig.update_layout(
  title='Mean Absolute Error Loss',
  xaxis_title="Training MAE Loss (%)",
  yaxis_title="Number of Samples",
  font=dict(
        family="Arial",
        size=11,
        color="#7f7f7f"
    ))
  
  print("*"*80)
  print("Reconstruction error threshold: {} ".format(np.max(train_mae_loss).round(4)))
  print("*"*80)
  return fig.show()

In [36]:
# now call the above function:
reconstruction_error(x_train)

********************************************************************************
Reconstruction error threshold: 0.8401 
********************************************************************************


In [37]:
def normalise_testing_values(data, training_mean, training_std):
  '''
  This function uses the training mean and standard deviation to normalise
  the testing data, generating a numpy array of test values.

  Arguments: 
          data: the data to be used in model testing (Pandas DataFrame column).
          mean: the training set mean (float).
          std: the training set standard deviation (float).

  Returns:
          values: an array of testing values (numpy array).

  '''
  if isinstance(data, pd.Series) is False:
    raise TypeError("data argument should be a Pandas Series.")

  values = data.to_list()
  values -= training_mean
  values /= training_std
  print("*"*80)
  print("The length of the testing data is: {}".format(data.shape[0]))
  print("The mean of the testing data is: {}".format(data.mean()))
  print("The standard deviation of the testing data is {}".format(data.std()))
  print("*"*80)

  return values

In [38]:
# now call the above function:
test_value = normalise_testing_values(testing_data['Volume'], training_mean, training_std) 

********************************************************************************
The length of the testing data is: 656
The mean of the testing data is: 22911837419.309452
The standard deviation of the testing data is 11630656431.931973
********************************************************************************


In [39]:
def generate_testing_loss(test_value):
  '''
  This function uses the model to predict anomalies within the test set.
  Additionally, this function generates the 'anomalies' global variable,
  containing the outliers identified by the RNN.

  Arguments: 
          test_value: an array of testing values (numpy array).

  Returns:
          fig: a visual representation of the testing MAE distribution.

  '''
  x_test = generate_sequences(test_value)
  print("*"*80)
  print("Test input shape: {}".format(x_test.shape))

  x_test_pred = model.predict(x_test)
  test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis = 1)
  test_mae_loss = test_mae_loss.reshape((-1))

  global anomalies
  anomalies = (test_mae_loss >= np.max(train_mae_loss)).tolist()
  print("Number of anomaly samples: ", np.sum(anomalies))
  print("Indices of anomaly samples: ", np.where(anomalies))
  print("*"*80)

  histogram = test_mae_loss.flatten() 
  fig =go.Figure(data = [go.Histogram(x = histogram, 
                                      histnorm = 'probability',
                                      name = 'MAE Loss')])  
  fig.update_layout(
  title='Mean Absolute Error Loss',
  xaxis_title="Testing MAE Loss (%)",
  yaxis_title="Number of Samples",
  font=dict(
        family="Arial",
        size=11,
        color="#7f7f7f"
    ))
    
  return fig.show()

In [40]:
# call the above function:
generate_testing_loss(test_value)

Training input data shape: (626, 30, 1)
********************************************************************************
Test input shape: (626, 30, 1)
Number of anomaly samples:  471
Indices of anomaly samples:  (array([ 66,  71,  72,  73,  74,  86, 100, 101, 102, 103, 104, 105, 106,
       107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
       120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
       133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
       146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
       172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
       185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
       202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 216,
       217, 224, 225, 226, 227, 237, 238, 258, 259, 260, 261, 269, 270,
       271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282,

In [41]:
def plot_outliers(data):
  '''
  This function determines the position of the outliers within the time-series,
  which are subsequently plotted.

  Arguments: 
          data: the initial dataset (Pandas DataFrame).

  Returns:
          fig: a visual representation of the outliers present in the series, as
          determined by the RNN.

  '''

  outliers = []

  for data_idx in range(TIME_STEPS -1, len(test_value) - TIME_STEPS + 1):
    time_series = range(data_idx - TIME_STEPS + 1, data_idx)
    if all([anomalies[j] for j in time_series]):
      outliers.append(data_idx + len(training_data))

  outlying_data = data.iloc[outliers, :]

  cond = data.index.isin(outlying_data.index)
  no_outliers = data.drop(data[cond].index)

  fig = go.Figure()
  fig.add_trace(go.Scatter(x = no_outliers.index, y = no_outliers["Volume"],
                           mode = 'markers',
                           name = no_outliers["Volume"].name,
                           connectgaps=False))
  fig.add_trace(go.Scatter(x = outlying_data.index, y = outlying_data["Volume"],
                           mode = 'markers',
                           name = outlying_data["Volume"].name + ' Outliers',
                           connectgaps=False))
  
  fig.update_xaxes(rangeslider_visible=True)

  fig.update_layout(
  title='Detected Outliers',
  xaxis_title=data.index.name,
  yaxis_title=no_outliers["Volume"].name,
  font=dict(
        family="Arial",
        size=11,
        color="#7f7f7f"
    ))
  
  return fig.show()

In [42]:
# call the final function:
plot_outliers(dataset)