<a href="https://colab.research.google.com/github/DRSNAJ/BERT-LSTM-sentiment-trader/blob/main/BERT_LSTM_news_sentiment_trader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime. The dataset and saved-model
# paths used further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
from datetime import datetime
import time
import plotly.express as px

import tensorflow as tf
import torch
from keras.models import Sequential
from keras.layers import LSTM, Dense
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TFBertModel, BertTokenizer
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.nn.functional import softmax

from IPython.display import clear_output

#### Model Training and Loading Configuration

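The flags in the next cell control how the rest of the notebook runs: `process_data` switches the tweet preprocessing and sentiment-scoring step on or off, while `train_model` and `load_model` decide whether a new model is trained or a previously saved one is loaded from the given path.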

In [None]:
# Run-mode switches for the notebook.
# NOTE(review): presumably True trains a new model instead of loading one;
# the code that reads this flag is outside the visible section - confirm.
train_model = False
# When True, the preprocessing cell below cleans the raw tweet CSVs and scores
# them with FinBERT; when False that whole step is skipped.
process_data = False
# Path of a saved Keras model on Google Drive.
# NOTE(review): presumably loaded when train_model is False - confirm against the loading cell.
load_model = '/content/drive/MyDrive/Colab Notebooks/Saved Models/log_pert_LSTM-model-2024-05-11_1035.keras'

In [None]:
# Hugging Face checkpoint used to score the sentiment of each tweet.
model_name = "ProsusAI/finbert"
# Tokenizer and sequence-classification model are both loaded from the same
# checkpoint; the classification head is configured for 3 labels.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
dataset_src = '/content/drive/MyDrive/Colab Notebooks/datasets/news_tweets/raw_csv' # locations of the tweet csv files
processed_data_dest = '/content/drive/MyDrive/Colab Notebooks/datasets/news_tweets/processed'

# Two-stage, resumable preprocessing (only runs when process_data is True):
#   1. every raw CSV without a matching pickle is cleaned and pickled;
#   2. every pickle is scored row by row with FinBERT, skipping rows that
#      already carry non-zero sentiment values, and re-saved periodically.
if process_data:
  # Columns that receive the three class probabilities, in logit order.
  # NOTE(review): assumes class indices 0/1/2 of the checkpoint mean
  # positive/negative/neutral - confirm against model.config.id2label.
  sentiment_cols = ['positive', 'negative', 'neutral']

  # ---- Stage 1: clean raw CSV files that have not been processed yet ----
  dest_files = {os.path.splitext(name)[0] for name in os.listdir(processed_data_dest)}

  # Regular expression pattern to match URLs (together with the whitespace character in front of them)
  tweet_link_format = r'(\s)http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

  for folderitem in os.listdir(dataset_src):
    file_stem = os.path.splitext(folderitem)[0]
    if file_stem in dest_files:
      continue  # a processed pickle with this name already exists

    news_df = pd.read_csv(os.path.join(dataset_src, folderitem))
    news_df['timestamp'] = pd.to_datetime(news_df['date'] + ' ' + news_df['time'])

    # Replace URLs with an empty string
    news_df['tweet'] = news_df['tweet'].str.replace(tweet_link_format, '', regex=True)
    # Literal match on purpose: as a regex the leading '.' would match any character.
    news_df['tweet'] = news_df['tweet'].str.replace('. Follow live updates:', '', regex=False)

    # Adding the sentiment columns. Float zeros mean "not scored yet" and keep
    # the dtype compatible with the probabilities written in stage 2.
    news_df[sentiment_cols] = 0.0

    pd.to_pickle(news_df, os.path.join(processed_data_dest, file_stem + ".pkl"))

  # ---- Stage 2: score every processed file with FinBERT ----
  for folderitem in os.listdir(processed_data_dest):
    pickle_path = os.path.join(processed_data_dest, folderitem)
    news_df = pd.read_pickle(pickle_path)  # pickles written by stage 1 / earlier runs of this cell
    length_file = news_df.shape[0]

    # Pickles from earlier runs may still hold integer zeros; cast once so the
    # float probabilities can be written without a dtype conflict.
    news_df[sentiment_cols] = news_df[sentiment_cols].astype(float)
    sentiment_idx = news_df.columns.get_indexer(sentiment_cols)  # positional, loop-invariant

    for (idx, text) in enumerate(news_df['tweet']):
      # Rows whose three sentiment values are not all zero were scored on a previous run.
      if not (news_df.iloc[idx, sentiment_idx] == 0).all():
        continue

      clear_output(wait=True)

      print("File: " + folderitem)
      print("idx: " + str(idx) + "/" + str(length_file) + " (" + str(round(idx/length_file*100,2)) + "%)")

      # truncation=True keeps unusually long texts within the model's maximum input length.
      encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

      with torch.no_grad():
        logits = model(**encoded_input).logits

      # One text per forward pass, so row 0 holds its class probabilities.
      probabilities = softmax(logits, dim=1)
      news_df.iloc[idx, sentiment_idx] = probabilities[0].tolist()

      # Checkpoint periodically so an interrupted session can resume.
      if (idx%100 == 0):
        print("Saving")
        pd.to_pickle(news_df, pickle_path)

    # Final save for the file once every row has been visited.
    pd.to_pickle(news_df, pickle_path)

In [None]:
# Read the scored BBC and CNN tweet pickles and stack them into a single frame
# with a fresh 0..n-1 index. The tweets_eco.pkl source is currently left out.
test_data = pd.concat(
    (
        pd.read_pickle(processed_data_dest + pickle_name)
        for pickle_name in ("/tweets_bbc.pkl", "/tweets_cnn.pkl")
    ),
    ignore_index=True,
    sort=False,
)

# Quick sanity check of what was loaded.
print(test_data.shape)
print(test_data.columns)


(89783, 40)
Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest', 'timestamp', 'positive', 'negative', 'neutral'],
      dtype='object')


In [None]:
# Keep only the timestamp, text, sentiment scores, engagement counts and
# timezone, then drop exact duplicate rows and rows with any missing value.
# TODO: timestamps are not timezone-normalised yet (an earlier draft tried
# .dt.tz_localize('US/Eastern').dt.tz_convert('US/Central')).
test_data = (
    test_data
    .loc[:, ['timestamp', 'tweet', 'positive', 'negative', 'neutral', 'replies_count', 'retweets_count', 'likes_count', 'timezone']]
    .drop_duplicates()
    .dropna()
)


In [None]:
# Row/column count after the column selection, de-duplication and NaN removal.
print(test_data.shape)

In [None]:
# Length of one prediction time block, in minutes
# (e.g. 30 min, 1 hour, 4 hours, daily, weekly).
resolution = 60 * 4


In [None]:
data_path = '/content/drive/MyDrive/Colab Notebooks/datasets/forex_data/DAT_MT_GBPUSD_M1' # location of the forex data

column_names = ['date','time','open','high','low','close','na']

# Read every file once and concatenate a single time afterwards; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each pass.
forex_frames = []
for f in os.listdir(data_path):
  data = pd.read_csv(os.path.join(data_path, f), names=column_names)

  # Formatting data and creating timestamps.
  # regex=False makes '.' a literal dot on every pandas version (as a regex it
  # would match any character).
  data['date'] = data['date'].str.replace('.', '-', regex=False)
  data['timestamp'] = pd.to_datetime(data['date'] + ' ' + data['time'])

  forex_frames.append(data)

forex_data = pd.concat(forex_frames)

# Removing duplicates and sorting by time.
# NOTE(review): only fully identical rows are dropped; two rows sharing a
# timestamp but differing in price would make asfreq below fail on a
# duplicate index - confirm the source files never overlap that way.
forex_data = forex_data[['timestamp','open','high','low','close']].drop_duplicates().sort_values(by='timestamp')

# Adding in missing timestamps (1-minute grid) and interpolating the forex prices between those values.
forex_data = forex_data.set_index('timestamp')[['open','high','low','close']].asfreq(freq='60s').interpolate()

# Smoothing out closing data over 4H to remove noise using Exponential Moving Average and Simple Moving Average
period = 60*4
half_period = int(np.round(period/2))
forex_data['4hemw'] = forex_data['close'].ewm(span=period, adjust=False).mean() # Exponential Moving Average
forex_data['ma'] = forex_data['close'].rolling(window=period).mean() # Simple Moving Average
# Shift back by half a window so the trailing average is centred on its timestamp.
forex_data['ma'] = forex_data['ma'].shift(-half_period) # Smoothing out stock prices

# Calculating the rate of change of the average, then smoothing and re-centring it the same way.
# NOTE(review): np.gradient yields the absolute per-step change of 'ma', not a
# percentage, despite the column name - confirm this is intended.
forex_data['pert_change'] = np.gradient(forex_data['ma'])
forex_data['pert_change'] = forex_data['pert_change'].rolling(window=period).mean() # can try ema here as well
forex_data['pert_change'] = forex_data['pert_change'].shift(-half_period)

forex_data['log_change']  = np.log(1 + forex_data['pert_change'])

# Loading BERT Model

In [None]:
# Start from an empty list.
# NOTE(review): the code that fills or reads `sentiments` is outside the visible section.
sentiments = list()

