# FinBert - Pre-trained NLP model to analyze sentiment of financial text
- https://huggingface.co/ProsusAI/finbert
- https://www.researchgate.net/post/Do_you_need_to_preprocess_text_for_BERT (preprocessing is not needed)



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import re

In [3]:
# Read the per-coin sentiment CSV exports from the Colab Drive import folder.
bnb_all_sentiment = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Import/bnb_sentiment.csv'
)
ada_all_sentiment = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Import/ada_sentiment.csv'
)
# Disabled alternative for the BTC file (read without a header row and name the columns explicitly):
# ,header=None, names=['coin','section','title','text','url','source','date','time','language','word_count','char_count','average_word_length','stopword_count','stopword_rate','preprocessed_text','polarity_TextBlob','sentiment_TextBlob','subjectivity_TextBlob','polarity_scores_vader','compound','neg_score','neu_score','pos_score','sentiment_vader']
btc_all_sentiment = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Import/btc_sentiment.csv'
)
eth_all_sentiment = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Import/eth_sentiment.csv'
)
xrp_all_sentiment = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Import/xrp_sentiment.csv'
)

The maximum sequence length of BERT is 512 tokens. In https://arxiv.org/pdf/1905.05583.pdf, the head+tail truncation strategy (empirically selecting the first 128 and the last 382 tokens) achieved the best performance on the IMDb dataset (task type: sentiment).

In [2]:
def head_tail(text, head=128, tail=382):
    """Clean a raw text and shorten it with a head+tail truncation.

    Cleaning steps, in order: HTML tags are removed, hyperlinks are removed,
    every character that is not a letter (A-Z, a-z, À-ž) or a space is
    replaced by a space, and runs of whitespace are collapsed to one space.

    The cleaned text is returned unchanged when it is at most
    ``head + tail`` characters long; otherwise only its first ``head`` and
    last ``tail`` characters are kept. Note that the slicing is done on
    characters, not on model tokens.

    Args:
        text: Raw text. Non-string values (e.g. NaN for a missing CSV cell)
            are treated as empty text.
        head: Number of leading characters to keep (default 128).
        tail: Number of trailing characters to keep (default 382).

    Returns:
        The cleaned, possibly shortened string.
    """
    #https://data-dive.com/german-nlp-binary-text-classification-of-reviews-part1
    # Whitespace-run pattern
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)
    # Html tags pattern
    RE_TAGS = re.compile(r"<[^>]+>")
    # Special character pattern
    RE_ASCII = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
    # Hyperlinks pattern
    RE_HYPERLINKS = re.compile(r'https?://\S+',re.IGNORECASE)

    # A missing value cannot be cleaned with re.sub; treat it as empty text.
    if not isinstance(text, str):
        return ""

    text = re.sub(RE_TAGS, " ", text)
    # Hyperlinks must be removed before the special-character pass: that pass
    # replaces ':' and '/', after which the 'https?://' pattern can never match.
    text = re.sub(RE_HYPERLINKS, " ", text)
    text = re.sub(RE_ASCII, " ", text)
    # Collapse whitespace last so the spaces inserted above are merged too.
    text = re.sub(RE_WSPACE, " ", text)

    # Short texts are kept whole; slicing them would make head and tail
    # overlap and duplicate content.
    if len(text) <= head + tail:
        return text

    # select first `head` and last `tail` characters
    return text[:head] + text[len(text) - tail:]

In [4]:
# Derive the FinBERT input column for each coin by running head_tail over the raw text column.
bnb_all_sentiment['finBert_text'] = bnb_all_sentiment['text'].apply(head_tail)
ada_all_sentiment['finBert_text'] = ada_all_sentiment['text'].apply(head_tail)
btc_all_sentiment['finBert_text'] = btc_all_sentiment['text'].apply(head_tail)
eth_all_sentiment['finBert_text'] = eth_all_sentiment['text'].apply(head_tail)
xrp_all_sentiment['finBert_text'] = xrp_all_sentiment['text'].apply(head_tail)

In [5]:
# Load the pre-trained FinBERT checkpoint: first its tokenizer, then the
# sequence-classification model that produces the sentiment logits.
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")


## < 512 Token (text)

In [6]:
# Function to calculate sentiment
def sentiment_score(text):
  """Score one text with the module-level FinBERT `model` and `tokenizer`.

  Args:
    text: The text to classify.

  Returns:
    A tuple ``(positive, negative, neutral, result)``: the softmax
    probabilities at output indices 0, 1 and 2, rounded to three decimals,
    and ``result``, the index of the largest probability.
  """
  # Truncate at tokenization time so an over-long text can never exceed the
  # model's 512-token limit (which would cause indexing errors).
  inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
  # Inference only: no_grad avoids building the autograd graph for every call.
  with torch.no_grad():
    output = model(**inputs)
  probs = torch.nn.functional.softmax(output[0], dim=-1)
  # Average over the batch dimension (a single row here).
  mean = probs.mean(dim=0)
  positive, negative, neutral = (round(p, 3) for p in mean.tolist())
  result = torch.argmax(mean).item()
  return positive, negative, neutral, result


In [7]:
%%time
bnb_all_sentiment = bnb_all_sentiment.join(bnb_all_sentiment.finBert_text.apply(lambda x: sentiment_score(x[:512]))
                                            .apply(pd.Series)
                                            .rename(columns={0: 'fin_sentiment_positive',
                                                             1: 'fin_sentiment_negative',
                                                             2: 'fin_sentiment_neutral',
                                                             3: 'fin_sentiment_label'}))

CPU times: user 2min 2s, sys: 7.49 s, total: 2min 10s
Wall time: 2min 12s


In [8]:
# Write the BNB frame to the Drive export folder, omitting the index column.
bnb_all_sentiment.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Export/bnb_all_sentiment.csv',
    index=False,
)

In [9]:
%%time
ada_all_sentiment = ada_all_sentiment.join(ada_all_sentiment.finBert_text.apply(lambda x: sentiment_score(x[:512]))
                                            .apply(pd.Series)
                                            .rename(columns={0: 'fin_sentiment_positive',
                                                             1: 'fin_sentiment_negative',
                                                             2: 'fin_sentiment_neutral',
                                                             3: 'fin_sentiment_label'}))

CPU times: user 11min 39s, sys: 29.7 s, total: 12min 8s
Wall time: 12min 11s


In [10]:
# Write the ADA frame to the Drive export folder, omitting the index column.
ada_all_sentiment.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Export/ada_all_sentiment.csv',
    index=False,
)

In [11]:
%%time
xrp_all_sentiment = xrp_all_sentiment.join(xrp_all_sentiment.finBert_text.apply(lambda x: sentiment_score(x[:512]))
                                            .apply(pd.Series)
                                            .rename(columns={0: 'fin_sentiment_positive',
                                                             1: 'fin_sentiment_negative',
                                                             2: 'fin_sentiment_neutral',
                                                             3: 'fin_sentiment_label'}))

CPU times: user 27min 34s, sys: 50.1 s, total: 28min 24s
Wall time: 28min 26s


In [12]:
# Write the XRP frame to the Drive export folder, omitting the index column.
xrp_all_sentiment.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Export/xrp_all_sentiment.csv',
    index=False,
)

In [13]:
%%time
eth_all_sentiment = eth_all_sentiment.join(eth_all_sentiment.finBert_text.apply(lambda x: sentiment_score(x[:512]))
                                            .apply(pd.Series)
                                            .rename(columns={0: 'fin_sentiment_positive',
                                                             1: 'fin_sentiment_negative',
                                                             2: 'fin_sentiment_neutral',
                                                             3: 'fin_sentiment_label'}))

CPU times: user 35min 34s, sys: 57.9 s, total: 36min 32s
Wall time: 36min 36s


In [14]:
# Write the ETH frame to the Drive export folder, omitting the index column.
eth_all_sentiment.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Export/eth_all_sentiment.csv',
    index=False,
)

In [15]:
%%time
btc_all_sentiment = btc_all_sentiment.join(btc_all_sentiment.finBert_text.apply(lambda x: sentiment_score(x[:512]))
                                            .apply(pd.Series)
                                            .rename(columns={0: 'fin_sentiment_positive',
                                                             1: 'fin_sentiment_negative',
                                                             2: 'fin_sentiment_neutral',
                                                             3: 'fin_sentiment_label'}))

CPU times: user 3h 58min 2s, sys: 3min 15s, total: 4h 1min 17s
Wall time: 4h 1min 58s


In [16]:
# Write the BTC frame to the Drive export folder, omitting the index column.
btc_all_sentiment.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/FINTECH/Colab/Export/btc_all_sentiment.csv',
    index=False,
)

## > 512 Token (text) Test
- https://www.youtube.com/watch?v=yDGo9z_RlnE
- https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f


In [None]:
# def sentiment_score_all_token(text):
#   if not text.strip():
#     return {}

#   tokens = tokenizer.encode_plus(text, add_special_tokens=False,
#                                return_tensors='pt')
#   input_id_chunks = tokens['input_ids'][0].split(510)
#   mask_chunks = tokens['attention_mask'][0].split(510)
  
#   input_dict = preprocess_inputs_and_run_model(input_id_chunks,mask_chunks )
#   return input_dict

In [None]:
# def preprocess_inputs_and_run_model(input_id_chunks,mask_chunks ):
#   chunksize = 512

#   # convert from tuple to list, because tuples are immutable
#   input_id_chunks = list(input_id_chunks)
#   mask_chunks = list(mask_chunks)

#   for i in range(len(input_id_chunks)):
#     input_id_chunks[i] = torch.cat([
#         torch.Tensor([101]), input_id_chunks[i], torch.Tensor([102])
#     ])
#     # attention mask can only be 0 or 1
#       # 1 = real token -> pay attention to it
#       # 0 = padding token (ignore padding)
#     mask_chunks[i] = torch.cat([
#         torch.Tensor([1]), mask_chunks[i], torch.Tensor([1])
#     ])
    
#     pad_len = chunksize - input_id_chunks[i].shape[0]
#     if pad_len > 0:
#       input_id_chunks[i] = torch.cat([
#           input_id_chunks[i], torch.Tensor([0] * pad_len)
#       ])
#       mask_chunks[i] = torch.cat([
#           mask_chunks[i], torch.Tensor([0] * pad_len)
#       ])

#   input_ids = torch.stack(input_id_chunks)
#   attention_mask = torch.stack(mask_chunks)

#   input_dict= {
#       'input_ids': input_ids.long(),
#       'attention_mask': attention_mask.int()
#   }
    
#   return input_dict


In [None]:
# %%time
# #sentiment_score(bnb_all_sentiment.text.iloc[0])
# input_dict = ada_all_sentiment.text.apply(lambda x:sentiment_score_all_token(x))

Token indices sequence length is longer than the specified maximum sequence length for this model (1962 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 21.7 s, sys: 24.6 ms, total: 21.7 s
Wall time: 22.2 s


In [None]:
# %%time
# bnb_finbert_values = []
# bnb_finbert_labels = []


# for i in range(0, 3): # range(len(input_dict)):
#   output = model(**input_dict.get(i))
#   probs = torch.nn.functional.softmax(output[0], dim=-1)
#   mean = probs.mean(dim=0)
#   bnb_finbert_values.append(mean)
#   result = torch.argmax(mean).item()
#   bnb_finbert_labels.append(result)

# #batch_size = 10
# # results = []
# # for i in range(0, len(input_dict), batch_size):
# #   input_batch = input_dict[i:i+batch_size][i]
# #   output_batch = model(**input_batch)
# #   results.append(output_batch)

# # output = torch.cat(results)


CPU times: user 20.4 s, sys: 519 ms, total: 20.9 s
Wall time: 21.2 s


In [None]:
# bnb_finbert_values

[tensor([0.0549, 0.0675, 0.8776], grad_fn=<MeanBackward1>),
 tensor([0.0424, 0.0203, 0.9373], grad_fn=<MeanBackward1>),
 tensor([0.6172, 0.0120, 0.3708], grad_fn=<MeanBackward1>)]

In [None]:
# # 0 = positive
# # 1 = negative
# # 2 = neutral
# bnb_finbert_labels

[2, 2, 0]