## Importing necessary libraries and mounting Google Drive

In [None]:
# Let's import necessary libraries
import glob
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm_notebook as tqdm
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Functions for TFR recording

In [None]:
# Functions for TFRecords
def _bytes_feature(value):
    """Wrap a string / byte value in a ``tf.train.Feature`` holding a bytes_list.

    If ``value`` is an eager tensor (same type as ``tf.constant(0)``), its
    underlying value is extracted with ``.numpy()`` first.
    """
    eager_tensor_type = type(tf.constant(0))
    # Unwrap eager tensors; anything else is passed through unchanged.
    raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw_value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double in a ``tf.train.Feature`` holding a float_list."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` holding an int64_list."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def serialize_array(array):
  """Serialize ``array`` with ``tf.io.serialize_tensor`` and return the result."""
  return tf.io.serialize_tensor(array)

In [None]:
# !!! An earlier version of this function applied y = y - 1 before
# y = to_categorical(y, 3). This mixed up the classes: instead of
# 0 - up, 1 - down, 2 - static, the labels became 0 - down, 1 - static, 2 - up.
# All of the following TFR datasets were created with these mixed classes:
# Prepared_Dataset_ZScore_TFR_split_by_9, Prepared_Dataset_ZScore_TFR_split_by_20,
# Prepared_Dataset_MinMax_TFR_split_by_20 and Prepared_Dataset_MinMax_TFR_9_10_split_by_20.
# They have not been re-recorded, to save time.

# Function to prepare sample of data from raw features and labels
def data_preparation_forex(features, labels, T, index):
  """Build one (sample, label) pair ending just before row ``index``.

  Args:
    features: array sliced as ``features[index - T:index, :]``.
    labels: sequence from which ``labels[index - 1]`` is taken.
    T: number of consecutive rows in the sample window.
    index: exclusive end row of the window.

  Returns:
    Tuple ``(X, y)``: ``X`` is the window with a trailing axis of size 1
    appended; ``y`` is the window's last label one-hot encoded over 3 classes.
  """
  # Features: rows [index - T, index), plus a trailing singleton axis.
  window = features[index - T:index, :]
  X = window.reshape(window.shape + (1,))

  # Label: the one belonging to the last row of the window.
  y = to_categorical(labels[index - 1], 3)

  return X, y

In [None]:
def parse_single_sample(sample, label):
  """Build a ``tf.train.Example`` describing one (sample, label) pair.

  The first three dimensions of ``sample`` are stored as int64 features under
  'NTicks', 'NFeatures' and 'NChannels', and the first dimension of ``label``
  under 'DimLabel'. Both arrays are stored as bytes features ('Sample',
  'Label') after being serialized with ``serialize_array``.
  """
  # Shape metadata of the example.
  shape_features = {
      'NTicks': _int64_feature(sample.shape[0]),
      'NFeatures': _int64_feature(sample.shape[1]),
      'NChannels': _int64_feature(sample.shape[2]),
      'DimLabel': _int64_feature(label.shape[0]),
  }
  # Serialized array payloads.
  payload_features = {
      'Sample': _bytes_feature(serialize_array(sample)),
      'Label': _bytes_feature(serialize_array(label)),
  }
  feature_map = {**shape_features, **payload_features}

  # Wrap the individual features into a single Example.
  return tf.train.Example(features=tf.train.Features(feature=feature_map))

In [None]:
# # Function to record all data in one file

# def write_samples_to_tfr(features, labels,T, filepath):
   
#   writer = tf.io.TFRecordWriter(filepath) #create a writer that'll store our data to disk
#   count = 0

#   for index in tqdm(range(T,len(features)+1)):

#     #get the data we want to write
#     current_sample,current_label = data_preparation_forex(features, labels,
#                                                           T, index) 

#     out = parse_single_sample(sample=current_sample, label=current_label)
#     writer.write(out.SerializeToString())
#     count += 1

#   writer.close()
#   print(f"Wrote {count} elements to TFRecord")

In [None]:
def write_dataset_to_tfr_shards(features, labels, T,max_samples_per_file,
                              filename,out_dir):
  """Write every sliding-window sample of a dataset into TFRecord shards.

  One sample is produced for each ``index`` in ``T .. len(features)``
  (inclusive) with ``data_preparation_forex`` and serialized with
  ``parse_single_sample``. Shards are named
  ``"{shard_no}_{n_shards}_{filename}.tfrecords"`` and written to
  ``out_dir + shard_name``.

  Args:
    features: array of raw features; windows are taken along its first axis.
    labels: per-row labels; must contain at least ``len(features)`` entries.
    T: number of rows per sample window (positive integer).
    max_samples_per_file: maximum number of samples per shard (positive integer).
    filename: common suffix used in every shard file name.
    out_dir: output location; concatenated verbatim with the shard name, so it
      has to end with a path separator.

  Raises:
    ValueError: if ``T`` or ``max_samples_per_file`` is smaller than 1, or if
      ``labels`` is shorter than ``features``.
  """
  if T < 1:
    raise ValueError(f"T must be a positive integer, got {T}")
  if max_samples_per_file < 1:
    raise ValueError(
        f"max_samples_per_file must be a positive integer, got {max_samples_per_file}")
  num_rows = len(features)
  # Fail early instead of hitting an IndexError after hours of writing.
  if len(labels) < num_rows:
    raise ValueError(
        f"labels has {len(labels)} entries but features has {num_rows} rows")

  total_num_samples = max(num_rows - T + 1, 0)
  # Number of shards (single TFRecord files) needed: ceil(total / max_per_file).
  splits = -(-total_num_samples // max_samples_per_file)
  print(f"\nUsing {splits} shard(s) for {total_num_samples} samples, with up to {max_samples_per_file} samples per shard")

  samples_written = 0

  # Overall progress vs. total number of samples; the context managers ensure
  # the progress bar and every shard writer are closed even if writing fails.
  with tqdm(desc='Overall samples recorded', total=total_num_samples) as pbar:
    for i in tqdm(range(splits)):
      current_shard_name = "{}_{}_{}.tfrecords".format(i+1, splits, filename)

      # Window end indices covered by this shard; the last shard is clipped
      # at len(features), the final valid (exclusive) window end.
      first_index = i*max_samples_per_file + T
      stop_index = min(first_index + max_samples_per_file, num_rows + 1)

      with tf.io.TFRecordWriter(out_dir+current_shard_name) as writer:
        for index in range(first_index, stop_index):
          # Get the data we want to write.
          current_sample, current_label = data_preparation_forex(
              features, labels, T, index)

          # Create the required Example representation and store it.
          out = parse_single_sample(sample=current_sample, label=current_label)
          writer.write(out.SerializeToString())

          samples_written += 1
          pbar.update(1)

  print(f"\nWrote {samples_written} elements to TFRecord")

# Z-scored Recording

## Z-scored dataset 7:00 - 10:00 & 13:00 - 15:00

In [None]:
file_path_full_features_unprepared='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Full_Features_zscore/full_features_zscore_unprepared.npy'
file_path_full_labels_unprepared='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Full_Features_zscore/full_labels_zscore_unprepared.npy'

full_features_zscore_unprepared=np.load(file_path_full_features_unprepared)
full_labels_zscore_unprepared=np.load(file_path_full_labels_unprepared)

In [None]:
# Let's check data types of both arrays
print(full_features_zscore_unprepared.dtype)
print(full_labels_zscore_unprepared.dtype)

float64
float64


The above types are excessive for storing the data, so to optimize them we will downgrade **features** to float32 and **labels** to int8.

In [None]:
features_dtype='float32'
labels_dtype='int8'

# Downcast both arrays in place of the float64 originals to save memory and disk space.
full_features_zscore_unprepared=full_features_zscore_unprepared.astype(features_dtype)
# The right-hand side must reference the loaded labels array
# (a misspelled name here raises NameError on a fresh kernel).
full_labels_zscore_unprepared=full_labels_zscore_unprepared.astype(labels_dtype)

In [None]:
# Let's check data types of both arrays again
print(full_features_zscore_unprepared.dtype)
print(full_labels_zscore_unprepared.dtype)

float32
int8


In [None]:
# Let's print the shapes of features and labels
print(full_features_zscore_unprepared.shape)
print(full_labels_zscore_unprepared.shape)

(8315560, 20)
(8315560,)


In [None]:
# Number of equity ticks per sample fed to model 
T=100

In [None]:
# Output directory and filename pattern to record TFR shards to
out_dir='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/datasets/forex_full/Prepared_Dataset_TFR_new/'
filename="Prepared_Dataset_Forex"

# Total size of transformed dataset (one sample per window of T rows)
total_num_samples=(len(full_features_zscore_unprepared)-T+1)
print(f'The total size of our data set is {total_num_samples}')
# We want to have ~5% of data per shard (i.e. 20 shards) to make it convenient
# to change the percentage split when reading the data into train, validation
# and test datasets
num_shards=20
max_samples_per_file=int(np.ceil(total_num_samples/num_shards))
print(f'The maximum amount of samples per shard is {max_samples_per_file}')

The total size of our data set is 8315461
The maximum amount of samples per shard is 415774


In [None]:
write_dataset_to_tfr_shards(full_features_zscore_unprepared, 
                            full_labels_zscore_unprepared, T,
                            max_samples_per_file, filename,out_dir)


Using 20 shard(s) for 8315461 samples, with up to 415774 samples per shard


Overall samples recorded:   0%|          | 0/8315461 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]


Wrote 8315461 elements to TFRecord


!!! There was a mistake in the function **data_preparation_forex**, so in the recorded files the labels are 0 - down, 1 - static, 2 - up. For more details, read the explanation above the function. !!!

## Z-scored dataset 7:00 - 10:00 & 13:00 - 15:00 labeled by hours

In [None]:
file_path_full_features_unprepared='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Raw_Features_and_Labels/zscore_7:00_10:00_and_13:00_15:00_labeled_by_hours/full_features_zscore_unprepared.npy'
file_path_full_labels_unprepared='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Raw_Features_and_Labels/zscore_7:00_10:00_and_13:00_15:00_labeled_by_hours/full_labels_zscore_unprepared.npy'

full_features_zscore_unprepared=np.load(file_path_full_features_unprepared)
full_labels_zscore_unprepared=np.load(file_path_full_labels_unprepared)

In [None]:
# Let's check data types of both arrays
print(full_features_zscore_unprepared.dtype)
print(full_labels_zscore_unprepared.dtype)

float64
int64


The above types are excessive for storing the data, so to optimize them we will downgrade **features** to float32 and **labels** to int8.

In [None]:
features_dtype='float32'
labels_dtype='int8'

full_features_zscore_unprepared=full_features_zscore_unprepared.astype(features_dtype)
full_labels_zscore_unprepared=full_labels_zscore_unprepared.astype(labels_dtype)

In [None]:
# Let's check data types of both arrays again
print(full_features_zscore_unprepared.dtype)
print(full_labels_zscore_unprepared.dtype)

float32
int8


In [None]:
# Let's print the shapes of features and labels
print(full_features_zscore_unprepared.shape)
print(full_labels_zscore_unprepared.shape)

(8315479, 20)
(8315479,)


In [None]:
# Number of equity ticks per sample fed to model 
T=100

In [None]:
# Output directory and filename pattern to record TFR shards to
out_dir='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Prepared Datasets for models/Prepared_Dataset_ZScore_TFR_periods_7:00_10:00_and_13:00_15:00_labeled_by_hours_split_by_20/'
filename="Prepared_Dataset_Forex"

# Total size of transformed dataset (one sample per window of T rows)
total_num_samples=(len(full_features_zscore_unprepared)-T+1)
print(f'The total size of our data set is {total_num_samples}')
# We want to have ~5% of data per shard (i.e. 20 shards) to make it convenient
# to change the percentage split when reading the data into train, validation
# and test datasets
num_shards=20
max_samples_per_file=int(np.ceil(total_num_samples/num_shards))
print(f'The maximum amount of samples per shard is {max_samples_per_file}')

The total size of our data set is 8315380
The maximum amount of samples per shard is 415769


In [None]:
write_dataset_to_tfr_shards(full_features_zscore_unprepared, 
                            full_labels_zscore_unprepared, T,
                            max_samples_per_file, filename,out_dir)


```
Using 20 shard(s) for 8315380 samples, with up to 415769 samples per shard
Overall samples recorded: 15%
1265378/8315380 [13:35<1:26:16, 1361.84it/s]
15%
3/20 [13:35<1:16:06, 268.60s/it]
```



# Min Max Recording

## Min Max 7:00 - 10:00 & 13:00 - 15:00

In [None]:
df_raw=pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/df_time_th0_0000028_k_10')

In [None]:
df_raw

Unnamed: 0_level_0,Date,P_a1,V_a1,P_b1,V_b1,P_a2,V_a2,P_b2,V_b2,P_a3,...,P_a4,V_a4,P_b4,V_b4,P_a5,V_a5,P_b5,V_b5,label,Mid_price
Date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-20 07:00:00.137,2023-01-20,1.08247,600000.0,1.08245,100000.0,1.08248,2000000.0,1.08244,2200000.0,1.08249,...,1.08250,10100000.0,1.08242,11200000.0,1.08250,15600000.0,1.08242,15200000.0,1.0,1.082460
2023-01-20 07:00:00.148,2023-01-20,1.08247,600000.0,1.08244,1200000.0,1.08248,2000000.0,1.08243,6300000.0,1.08249,...,1.08250,10100000.0,1.08242,15200000.0,1.08250,15600000.0,1.08241,17200000.0,1.0,1.082455
2023-01-20 07:00:00.157,2023-01-20,1.08247,600000.0,1.08244,1200000.0,1.08248,2000000.0,1.08243,5300000.0,1.08249,...,1.08250,12100000.0,1.08242,13200000.0,1.08250,15600000.0,1.08241,17200000.0,1.0,1.082455
2023-01-20 07:00:00.172,2023-01-20,1.08247,600000.0,1.08244,1200000.0,1.08248,2000000.0,1.08243,5800000.0,1.08249,...,1.08250,11100000.0,1.08242,13200000.0,1.08250,13600000.0,1.08241,17200000.0,1.0,1.082455
2023-01-20 07:00:00.183,2023-01-20,1.08247,600000.0,1.08244,1200000.0,1.08248,2000000.0,1.08243,5800000.0,1.08249,...,1.08250,11100000.0,1.08242,13200000.0,1.08250,12600000.0,1.08241,17200000.0,2.0,1.082455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-23 14:59:59.543,2023-02-23,1.06070,100000.0,1.06069,3000000.0,1.06071,1500000.0,1.06068,4600000.0,1.06072,...,1.06073,12600000.0,1.06067,12800000.0,1.06074,24600000.0,1.06066,15200000.0,2.0,1.060695
2023-02-23 14:59:59.552,2023-02-23,1.06070,100000.0,1.06069,4100000.0,1.06071,1500000.0,1.06068,5600000.0,1.06072,...,1.06073,11600000.0,1.06067,12800000.0,1.06074,23600000.0,1.06066,17200000.0,2.0,1.060695
2023-02-23 14:59:59.556,2023-02-23,1.06070,100000.0,1.06069,4100000.0,1.06071,1500000.0,1.06068,5600000.0,1.06072,...,1.06073,11600000.0,1.06067,12800000.0,1.06074,23600000.0,1.06066,17200000.0,2.0,1.060695
2023-02-23 14:59:59.559,2023-02-23,1.06070,100000.0,1.06069,4100000.0,1.06071,1500000.0,1.06068,5600000.0,1.06072,...,1.06073,12100000.0,1.06067,16800000.0,1.06074,22600000.0,1.06067,17200000.0,2.0,1.060695


In [None]:
# Now that the data is prepared, let's do min-max scaling.
# Keep only the order-book feature columns followed by 'label': the cells below
# scale every column except the last one and then read the features with
# iloc[:, :20] and the label with iloc[:, 20], so the helper columns 'Date' and
# 'Mid_price' shown in df_raw must not be part of this frame.
df_min_max_scaled=df_raw.drop(columns=['Date','Mid_price'],errors='ignore').copy()

In [None]:
# Apply min-max scaling to every column except the last one:
# scaled = (x - min) / (max - min), computed column by column.
for column in df_min_max_scaled.columns[:-1]:
    column_values = df_min_max_scaled[column]
    column_min = column_values.min()
    column_max = column_values.max()
    df_min_max_scaled[column] = (column_values - column_min) / (column_max - column_min)

In [None]:
df_min_max_scaled

Unnamed: 0_level_0,P_a1,V_a1,P_b1,V_b1,P_a2,V_a2,P_b2,V_b2,P_a3,V_a3,...,V_b3,P_a4,V_a4,P_b4,V_b4,P_a5,V_a5,P_b5,V_b5,label
Date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-20 07:00:00.137,0.554828,0.017644,0.554984,0.002806,0.554828,0.054593,0.554984,0.028904,0.554828,0.098456,...,0.107567,0.554828,0.187197,0.554879,0.137551,0.554828,0.241188,0.555114,0.170012,1.0
2023-01-20 07:00:00.148,0.554828,0.017644,0.554750,0.033705,0.554828,0.054593,0.554750,0.082781,0.554828,0.098456,...,0.145184,0.554828,0.187197,0.554879,0.189117,0.554828,0.241188,0.554879,0.193625,1.0
2023-01-20 07:00:00.157,0.554828,0.017644,0.554750,0.033705,0.554828,0.054593,0.554750,0.069640,0.554828,0.147321,...,0.119241,0.554828,0.225994,0.554879,0.163334,0.554828,0.241188,0.554879,0.193625,1.0
2023-01-20 07:00:00.172,0.554828,0.017644,0.554750,0.033705,0.554828,0.054593,0.554750,0.076210,0.554828,0.147321,...,0.119241,0.554828,0.206596,0.554879,0.163334,0.554828,0.208551,0.554879,0.193625,1.0
2023-01-20 07:00:00.183,0.554828,0.017644,0.554750,0.033705,0.554828,0.054593,0.554750,0.076210,0.554828,0.147321,...,0.119241,0.554828,0.206596,0.554879,0.163334,0.554828,0.192232,0.554879,0.193625,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-23 14:59:59.543,0.045827,0.002938,0.045859,0.084267,0.045827,0.040931,0.045859,0.060442,0.045827,0.110672,...,0.088110,0.045827,0.235694,0.045869,0.158178,0.046060,0.388055,0.045869,0.170012,2.0
2023-02-23 14:59:59.552,0.045827,0.002938,0.045859,0.115166,0.045827,0.040931,0.045859,0.073582,0.045827,0.083797,...,0.114053,0.045827,0.216295,0.045869,0.158178,0.046060,0.371736,0.045869,0.193625,2.0
2023-02-23 14:59:59.556,0.045827,0.002938,0.045859,0.115166,0.045827,0.040931,0.045859,0.073582,0.045827,0.108229,...,0.127024,0.045827,0.216295,0.045869,0.158178,0.046060,0.371736,0.045869,0.193625,2.0
2023-02-23 14:59:59.559,0.045827,0.002938,0.045859,0.115166,0.045827,0.040931,0.045859,0.073582,0.045827,0.108229,...,0.165938,0.045827,0.225994,0.045869,0.209744,0.046060,0.355418,0.046103,0.193625,2.0


In [None]:
# Let's check how balanced our dataset is
# (label encoding used here: 0 - up, 1 - down, 2 - static).
label_column=df_min_max_scaled['label']
up=int((label_column==0.0).sum())
down=int((label_column==1.0).sum())
static=int((label_column==2.0).sum())
# Named `total_labels` rather than `all` so the builtin all() is not shadowed
# for the rest of the kernel session.
total_labels=up+down+static

print(f'upward movement is {round(up/total_labels*100,2)}% of all labels')
print(f'downward movement is {round(down/total_labels*100,2)}% of all labels')
print(f'static movement is {round(static/total_labels*100,2)}% of all labels')

upward movement is 32.55% of all labels
downward movement is 32.73% of all labels
static movement is 34.72% of all labels


In [None]:
# Let's get the features and the labels from the data.   
full_features_min_max_unprep=df_min_max_scaled.iloc[:,:20].to_numpy()
full_labels_min_max_unprep=df_min_max_scaled.iloc[:,20].to_numpy()

In [None]:
# Let's record the features for future reference
file_path_full_features_min_max_unprep='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Full_Features_min_max/full_features_min_max_unprep.npy'
file_path_full_labels_min_max_unprep='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Full_Features_min_max/full_labels_min_max_unprep.npy'

np.save(file_path_full_features_min_max_unprep, full_features_min_max_unprep)
np.save(file_path_full_labels_min_max_unprep, full_labels_min_max_unprep)

In [None]:
# Let's check data types of both arrays
print(full_features_min_max_unprep.dtype)
print(full_labels_min_max_unprep.dtype)

float64
float64


The above types are excessive for storing the data, so to optimize them we will downgrade **features** to float32 and **labels** to int8.

In [None]:
features_dtype='float32'
labels_dtype='int8'

full_features_min_max_unprep=full_features_min_max_unprep.astype(features_dtype)
full_labels_min_max_unprep=full_labels_min_max_unprep.astype(labels_dtype)

In [None]:
# Let's check data types of both arrays again
print(full_features_min_max_unprep.dtype)
print(full_labels_min_max_unprep.dtype)

float32
int8


In [None]:
# Let's print the shapes of features and labels
print(full_features_min_max_unprep.shape)
print(full_labels_min_max_unprep.shape)

(8315560, 20)
(8315560,)


In [None]:
# Number of equity ticks per sample fed to model 
T=100

In [None]:
# Output directory and filename pattern to record TFR shards to
out_dir='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Prepared_Dataset_MinMax_TFR_split_by_20/'
filename="Prepared_Dataset_Forex"

# Total size of transformed dataset (one sample per window of T rows)
total_num_samples=(len(full_features_min_max_unprep)-T+1)
print(f'The total size of our data set is {total_num_samples}')
# We want to have ~5% of data per shard (i.e. 20 shards) to make it convenient
# to change the percentage split when reading the data into train, validation
# and test datasets
num_shards=20
max_samples_per_file=int(np.ceil(total_num_samples/num_shards))
print(f'The maximum amount of samples per shard is {max_samples_per_file}')

In [None]:
write_dataset_to_tfr_shards(full_features_min_max_unprep, 
                            full_labels_min_max_unprep, T,
                            max_samples_per_file, filename,out_dir)

!!! There was a mistake in the function **data_preparation_forex**, so in the recorded files the labels are 0 - down, 1 - static, 2 - up. For more details, read the explanation above the function. !!!

## Min Max 9:00 - 10:00

**Let's record the 9:00 - 10:00 dataset to TFR shards, using the labeling approach by Vincent (threshold=0.00001, k=10). The dataset is highly unbalanced: ~75% of the samples belong to the no-movement class and ~12.5% to each of the upward-movement and downward-movement classes.**

In [None]:
df_raw=pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/df_time_th0_00001_k_10')

In [None]:
df_raw

In [None]:
# Now that the data is prepared, let's do min-max scaling.
# Keep only the order-book feature columns followed by 'label': the cells below
# scale every column except the last one and then read the features with
# iloc[:, :20] and the label with iloc[:, 20], so helper columns such as 'Date'
# and 'Mid_price' (if present in df_raw) must not be part of this frame.
df_min_max_scaled=df_raw.drop(columns=['Date','Mid_price'],errors='ignore').copy()

In [None]:
# Apply min-max scaling to every column except the last one:
# scaled = (x - min) / (max - min), computed column by column.
for column in df_min_max_scaled.columns[:-1]:
    column_values = df_min_max_scaled[column]
    column_min = column_values.min()
    column_max = column_values.max()
    df_min_max_scaled[column] = (column_values - column_min) / (column_max - column_min)

In [None]:
df_min_max_scaled

In [None]:
# Let's check how balanced our dataset is
# (label encoding used here: 0 - up, 1 - down, 2 - static).
label_column=df_min_max_scaled['label']
up=int((label_column==0.0).sum())
down=int((label_column==1.0).sum())
static=int((label_column==2.0).sum())
# Named `total_labels` rather than `all` so the builtin all() is not shadowed
# for the rest of the kernel session.
total_labels=up+down+static

print(f'upward movement is {round(up/total_labels*100,2)}% of all labels')
print(f'downward movement is {round(down/total_labels*100,2)}% of all labels')
print(f'static movement is {round(static/total_labels*100,2)}% of all labels')

In [None]:
# Let's get the features and the labels from the data.   
full_features_min_max_unprep=df_min_max_scaled.iloc[:,:20].to_numpy()
full_labels_min_max_unprep=df_min_max_scaled.iloc[:,20].to_numpy()

In [None]:
file_path_full_features_min_max_unprep='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Full_Features_min_max/full_features_min_max_unprep_9_10.npy'
file_path_full_labels_min_max_unprep='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Full_Features_min_max/full_labels_min_max_unprep_9_10.npy'

np.save(file_path_full_features_min_max_unprep, full_features_min_max_unprep)
np.save(file_path_full_labels_min_max_unprep, full_labels_min_max_unprep)

In [None]:
# Let's check data types of both arrays
print(full_features_min_max_unprep.dtype)
print(full_labels_min_max_unprep.dtype)

In [None]:
# Let's print the shapes of features and labels
print(full_features_min_max_unprep.shape)
print(full_labels_min_max_unprep.shape)

In [None]:
# Number of equity ticks per sample fed to model 
T=100

In [None]:
# Output directory and filename pattern to record TFR shards to
out_dir='/content/drive/MyDrive/Colab Notebooks/Limited order book Forecasting/Datasets/Forex/Prepared_Dataset_MinMax_TFR_9_10_split_by_20/'
filename="Prepared_Dataset_Forex"

# Total size of transformed dataset (one sample per window of T rows)
total_num_samples=(len(full_features_min_max_unprep)-T+1)
print(f'The total size of our data set is {total_num_samples}')
# We want to have ~5% of data per shard (i.e. 20 shards) to make it convenient
# to change the percentage split when reading the data into train, validation
# and test datasets
num_shards=20
max_samples_per_file=int(np.ceil(total_num_samples/num_shards))
print(f'The maximum amount of samples per shard is {max_samples_per_file}')

In [None]:
# Record the 9:00 - 10:00 min-max dataset to TFR shards.
# NOTE(review): unlike the other three sections, this section has no cell that
# downcasts the arrays to float32 / int8 before writing, so the arrays are
# presumably written with the dtypes returned by to_numpy() — confirm that this
# is intended and matches what the TFR reader expects.
write_dataset_to_tfr_shards(features=full_features_min_max_unprep,
                            labels=full_labels_min_max_unprep,
                            T=T,
                            max_samples_per_file=max_samples_per_file,
                            filename=filename,
                            out_dir=out_dir)

!!! There was a mistake in the function **data_preparation_forex**, so in the recorded files the labels are 0 - down, 1 - static, 2 - up. For more details, read the explanation above the function. !!!