# Import packages

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 500)

import matplotlib.pyplot as plt

plt.style.use('classic')
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 300)

In [2]:
import sys

sys.path.insert(0, 'D:/BERT_in_intraday_trading')

In [3]:
from src.support import *
from src.backtest import *
from src.models import *

In [4]:
import random
from sklearn.model_selection import train_test_split

In [6]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
# from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

# Import and pre-processing news data

## News_data

In [None]:
with open("D:/BERT_in_intraday_trading/Training/Data/stored_data.pkl", "rb") as f:
    news_data = pickle.load(f)

# cnt = 0

# for new in news_data:
#     if new['CONTENT'] == 'content':
#         cnt += 1

# print(cnt, len(news_data))

news_data = [new for new in news_data if new['CONTENT'] != 'content']
news_data = pd.DataFrame(news_data)
news_data.set_index(keys = 'TIME_POSTED', inplace = True)
news_data.index = pd.to_datetime(news_data.index).tz_localize(None)
news_data = news_data[~news_data.index.isna()].sort_index()

## Interval data

In [None]:
interval_data = pd.read_pickle('D:/BERT_in_intraday_trading/Training/Data/XAUUSDm_M1.pkl')

interval_data = interval_data.set_index('DATE_TIME')
interval_data.index = pd.to_datetime(interval_data.index)

interval_data['DATE'] = pd.to_datetime(interval_data['DATE'])
interval_data['OPEN'] = interval_data['OPEN']
interval_data['HIGH'] = interval_data['HIGH']
interval_data['LOW'] = interval_data['LOW']
interval_data['CLOSE'] = interval_data['CLOSE']


### Prepare_df

In [None]:
df_1_min = prepare_df(df = interval_data, timeframe = '1min', add_indicators = True)

df_1_min['WHOLE_RANGE'] = df_1_min['HIGH'] - df_1_min['LOW']
df_1_min['GRP_WHOLE_RANGE'] = pd.qcut(df_1_min['WHOLE_RANGE'], 10)
df_1_min['GRP_BODY'] = pd.qcut(df_1_min['BODY'], 10)
df_1_min['YEAR'] = df_1_min.index.strftime('%Y')
df_1_min['WEEK'] = df_1_min.index.strftime('%Y%W')  
df_1_min['MONTH'] = df_1_min.index.strftime('%Y%m')

In [None]:
df_1_min.shape

In [None]:
df_1_min.index[0], df_1_min.index[-1]

In [None]:
# plot_df(df_1_min, 
#         path = None,# 'D:/Intraday_trading/Training/Saved_results/plot_df.html', 
#         open_tab = False)

## Labelling

In [None]:
news_data['MEAN_BA'] = news_data.apply(lambda x: df_1_min.loc[(df_1_min.index >= x.name - pd.Timedelta(hours = 4)) & (df_1_min.index <= x.name + pd.Timedelta(hours = 4)), 'Ret(t)'].mean(), axis = 1)
news_data['VAR_BA'] = news_data.apply(lambda x: df_1_min.loc[(df_1_min.index >= x.name - pd.Timedelta(hours = 4)) & (df_1_min.index <= x.name + pd.Timedelta(hours = 4)), 'Ret(t)'].var(), axis = 1)

news_data['MEAN_B'] = news_data.apply(lambda x: df_1_min.loc[(df_1_min.index >= x.name - pd.Timedelta(hours = 4)) & (df_1_min.index <= x.name), 'Ret(t)'].mean(), axis = 1)
news_data['VAR_B'] = news_data.apply(lambda x: df_1_min.loc[(df_1_min.index >= x.name - pd.Timedelta(hours = 4)) & (df_1_min.index <= x.name), 'Ret(t)'].var(), axis = 1)

news_data['MEAN_A'] = news_data.apply(lambda x: df_1_min.loc[(df_1_min.index >= x.name) & (df_1_min.index <= x.name + pd.Timedelta(hours = 4)), 'Ret(t)'].mean(), axis = 1)
news_data['VAR_A'] = news_data.apply(lambda x: df_1_min.loc[(df_1_min.index >= x.name) & (df_1_min.index <= x.name + pd.Timedelta(hours = 4)), 'Ret(t)'].var(), axis = 1)


In [None]:
news_data['RATIO_VAR_A_B'] = news_data['VAR_A']/news_data['VAR_B']
news_data['RATIO_MEAN_A_B'] = news_data['MEAN_A']/news_data['MEAN_B']
news_data['FLAG_HIGH_RISK'] = news_data['RATIO_VAR_A_B'].apply(lambda x: 1 if x >= news_data['RATIO_VAR_A_B'].quantile(0.75) else 0)


In [None]:
news_data['FLAG_HIGH_RISK'].value_counts()

In [None]:
news_data.to_pickle('D:/BERT_in_intraday_trading/Training/Data/news_data_w_labels.pkl')

# Import BERT

In [7]:
news_data = pd.read_pickle("D:/BERT_in_intraday_trading/Training/Data/news_data_w_labels.pkl")

In [8]:
texts = news_data['CONTENT']
labels = news_data[['RATIO_VAR_A_B', 'RATIO_MEAN_A_B', 'FLAG_HIGH_RISK']]

# Split
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size = 0.7, 
    random_state = 12345, shuffle = False
    )

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size = 0.5, 
    random_state = 12345, shuffle = False)

# Wrap into datasets
def prepare_labeled_dataset(texts, labels, batch_size = 32):
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    ds = ds.shuffle(buffer_size=len(texts))
    ds = ds.batch(batch_size).cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds

train_ds = prepare_labeled_dataset(train_texts, train_labels)
val_ds = prepare_labeled_dataset(val_texts, val_labels)
test_ds = prepare_labeled_dataset(test_texts, test_labels)

In [31]:
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8' 

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

# tfhub_handle_encoder = map_name_to_handle[bert_model_name]
# tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

tfhub_handle_encoder = 'https://kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-4-h-512-a-8/2'
tfhub_handle_preprocess = 'https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3'


print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-4-h-512-a-8/2
Preprocess model auto-selected: https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3


In [None]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [33]:
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_word_ids', 'input_mask', 'input_type_ids']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [34]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-4-h-512-a-8/2
Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.76262873  0.9928099  -0.18611842  0.36673835  0.15233713  0.6550445
  0.9681154  -0.94862705  0.00216199 -0.9877732   0.06842697 -0.97630596]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[-0.28946307  0.3432126   0.33231524 ...  0.21300834  0.71020776
  -0.05771159]
 [-0.28742072  0.31980997 -0.23018596 ...  0.5845511  -0.21329741
   0.7269215 ]
 [-0.66157013  0.6887673  -0.8743302  ...  0.10877225 -0.26173237
   0.47855318]
 ...
 [-0.22561178 -0.2892561  -0.07064433 ...  0.4756602   0.83277094
   0.4002539 ]
 [-0.29824233 -0.27473113 -0.05450515 ...  0.48849773  1.0955355
   0.18163365]
 [-0.4437818   0.00930784  0.07223748 ...  0.17290097  1.183325
   0.07897963]]


In [35]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'text_input')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name = 'text_preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable = True, name = 'BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.3)(net)
  FLAG_HIGH_RISK = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'FLAG_high_risk')(net)
  RATIO_MEAN = tf.keras.layers.Dense(1, activation = None, name = 'RATIO_MEAN')(net)
  RATIO_VAR = tf.keras.layers.Dense(1, activation = 'relu', name = 'RATIO_VAR')(net)
  
  return tf.keras.Model(text_input, [FLAG_HIGH_RISK, RATIO_MEAN, RATIO_VAR])

In [37]:
classifier_model = build_classifier_model()

In [38]:
classifier_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None,)]            0           []                               
                                                                                                  
 text_preprocessing (KerasLayer  {'input_type_ids':   0          ['text_input[0][0]']             
 )                              (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [40]:
bert_raw_result = classifier_model(tf.constant([news_data['CONTENT'][0]]))
bert_raw_result

[<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.4708895]], dtype=float32)>,
 <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.57016623]], dtype=float32)>,
 <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.]], dtype=float32)>]

In [42]:
loss = tf.keras.losses.MeanSquaredError()
metrics = tf.metrics.MeanAbsolutePercentageError()

In [43]:
epochs = 1
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5


# optimizer = tf.keras.optimizers.Adam(
#     learning_rate=init_lr,
#     weight_decay=0.01
# )

classifier_model.compile(optimizer = 'adam',
                         loss=loss,
                         metrics=metrics)

In [44]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

Training model with https://kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-4-h-512-a-8/2


In [45]:
loss, accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')



ValueError: too many values to unpack (expected 2)

In [None]:
dataset_name = 'news_data'
saved_model_path = 'D:/BERT_in_intraday_trading/Training/Saved_results/{}_bert'.format(dataset_name.replace('/', '_'))

classifier_model.save(saved_model_path, include_optimizer=False)

In [None]:
# reloaded_model = tf.saved_model.load(saved_model_path)

# def print_my_examples(inputs, results):
#   result_for_printing = \
#     [f'input: {inputs[i]:<30} : score: {results[i][0]:.6f}'
#                          for i in range(len(inputs))]
#   print(*result_for_printing, sep='\n')
#   print()


# examples = [
#     'this is such an amazing movie!',  # this is the same sentence tried earlier
#     'The movie was great!',
#     'The movie was meh.',
#     'The movie was okish.',
#     'The movie was terrible...'
# ]

# reloaded_results = tf.sigmoid(reloaded_model(tf.constant(examples)))
# original_results = tf.sigmoid(classifier_model(tf.constant(examples)))

# print('Results from the saved model:')
# print_my_examples(examples, reloaded_results)
# print('Results from the model in memory:')
# print_my_examples(examples, original_results)


# serving_results = reloaded_model \
#             .signatures['serving_default'](tf.constant(examples))

# serving_results = tf.sigmoid(serving_results['classifier'])

# print_my_examples(examples, serving_results)