### Long-Form Sentiment Analysis with Text Chunking

In [None]:
from transformers import  BertForSequenceClassification, BertTokenizer, RobertaTokenizer, RobertaForSequenceClassification
from torch.nn.functional import softmax
import pandas as pd
import torch
import numpy as np

In [None]:

path = "<path to pre-processed csv file>"

# Read the CSV once, trying candidate encodings in order and keeping the first
# one that decodes cleanly. windows-1252 goes first because it is the stricter
# codec (a few byte values are undefined in it); ISO-8859-1 maps every byte to
# a character, so it serves as the fallback that cannot raise
# UnicodeDecodeError. Previously both reads always ran and the second one
# silently overwrote the first.
for encoding in ('windows-1252', 'ISO-8859-1'):
    try:
        data = pd.read_csv(path, encoding=encoding)
    except UnicodeDecodeError as e:
        print(f"failed to decode {encoding}:", e)
    else:
        df = pd.DataFrame(data)
        break
else:
    # No candidate worked: fail here instead of with a NameError on `df` below.
    raise ValueError(f"could not decode {path!r} with any supported encoding")
df


### Format pre-processed data for tokenization

In [None]:
import re

def format_for_tokenization(text):
    """Lower-case *text* and strip URLs, angle-bracket tags and [bracketed] spans.

    Args:
        text: The raw cell value. Non-string values (e.g. NaN or None for a
            missing cell) are returned unchanged so that a later ``fillna``
            can still deal with them, instead of failing on ``.lower()``.

    Returns:
        The cleaned string, or the original object when it is not a string.
        Whitespace that surrounded a removed span is left in place.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    # Applied in this order: URLs (http(s)://... or www....), then anything
    # between '<' and '>', then anything between '[' and ']'.
    for pattern in (r'https?://\S+|www\.\S+', r'<.*?>', r'\[.*?\]'):
        text = re.sub(pattern, '', text)
    return text

# Clean every value of the 'Content' column and write the result back to the same column.
df['Content'] = df['Content'].apply(format_for_tokenization)
# Preview the cleaned text of the second row (positional index 1).
df['Content'].iloc[1]

### Chunking Methods

In [None]:
def chunk_and_tokenize(tokenizer, content, chunk_size=510):
    """Tokenize *content* without special tokens and cut it into fixed-size pieces.

    Args:
        tokenizer: Object exposing ``encode_plus``; its result is indexed with
            the ``'input_ids'`` and ``'attention_mask'`` keys.
        content: The text to encode.
        chunk_size: Maximum number of tokens per piece (the last piece may be
            shorter).

    Returns:
        A pair ``(input_ids_chunks, attention_mask_chunks)``: the results of
        ``split(chunk_size)`` on the first row of each encoded tensor.
    """
    encoded = tokenizer.encode_plus(content, add_special_tokens=False, return_tensors='pt')

    # Row 0 is the only sequence in the encoded batch.
    ids_row = encoded['input_ids'][0]
    mask_row = encoded['attention_mask'][0]

    return ids_row.split(chunk_size), mask_row.split(chunk_size)

def prepare_chunks(tokenizer, input_ids_chunks, attention_mask_chunks, chunk_size=512):
    """Wrap each chunk in CLS/SEP tokens, pad it to *chunk_size* and stack the rows.

    Args:
        tokenizer: Object providing ``cls_token_id``, ``sep_token_id`` and
            ``pad_token_id``.
        input_ids_chunks: Iterable of 1-D token-id tensors.
        attention_mask_chunks: Iterable of 1-D mask tensors, paired
            position-by-position with ``input_ids_chunks``.
        chunk_size: Target row length including the two special tokens.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        A pair ``(input_ids, attention_mask)`` of stacked 2-D int64 tensors
        with one row per chunk. Rows shorter than ``chunk_size`` are padded
        with ``pad_token_id`` (mask 0); longer rows are left as they are.
    """
    # Built once instead of on every iteration.
    cls_token = torch.tensor([tokenizer.cls_token_id])
    sep_token = torch.tensor([tokenizer.sep_token_id])
    attended = torch.tensor([1])

    prepared_input_ids, prepared_attention_masks = [], []

    for input_ids, attention_mask in zip(input_ids_chunks, attention_mask_chunks):
        # Force integer dtype: an empty chunk can arrive as a float tensor
        # (e.g. torch.tensor([])), and concatenating it would promote the
        # whole row of ids to float.
        input_ids = input_ids.long()
        attention_mask = attention_mask.long()

        input_ids_padded = torch.cat([cls_token, input_ids, sep_token])
        attention_mask_padded = torch.cat([attended, attention_mask, attended])

        pad_length = chunk_size - len(input_ids_padded)
        if pad_length > 0:
            input_ids_padded = torch.cat([input_ids_padded, torch.tensor([tokenizer.pad_token_id] * pad_length)])
            # Padding positions are masked out with 0.
            attention_mask_padded = torch.cat([attention_mask_padded, torch.tensor([0] * pad_length)])

        prepared_input_ids.append(input_ids_padded)
        prepared_attention_masks.append(attention_mask_padded)

    return torch.stack(prepared_input_ids), torch.stack(prepared_attention_masks)

In [None]:
def analysis_results_to_df(df, column_name, results):
    """Store *results* in ``df[column_name]`` and hand the same frame back.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to create or overwrite.
        results: One value per row of ``df``.

    Returns:
        The ``df`` object that was passed in.

    Raises:
        ValueError: If ``results`` and ``df`` differ in length.
    """
    n_results, n_rows = len(results), len(df)
    if n_results == n_rows:
        df[column_name] = results
        return df

    raise ValueError("Length of results list must match df")

### Analysis: distilroBERTa Model

In [None]:
# NOTE: I have used torch.nn.functional.softmax and mean to convert the model's output logits into probabilities.
# Depending on specific requirements, additional torch or NumPy methods can be used to further manipulate and analyze the tensor outputs.
# The above applies to all Analysis cells.

# Load the tokenizer and the classifier from the same checkpoint. `.eval()`
# switches the module to evaluation mode and returns the module itself.
tokenizer = RobertaTokenizer.from_pretrained('mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis')
model = RobertaForSequenceClassification.from_pretrained(
    'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'
).eval()

def distil_roberta_analyzer(content):
    """Return the chunk-averaged class probabilities for one piece of text.

    The text is split with ``chunk_and_tokenize``, batched with
    ``prepare_chunks`` and passed through the module-level ``model`` without
    gradient tracking. A softmax over the last axis of the logits is then
    averaged over axis 0.

    NOTE: ``tokenizer`` and ``model`` are looked up as globals at call time,
    so this uses whichever pair was assigned most recently.

    Args:
        content: The text to score.

    Returns:
        A tensor holding the mean of the per-row softmax outputs.
    """
    id_chunks, mask_chunks = chunk_and_tokenize(tokenizer, content)
    batch_ids, batch_mask = prepare_chunks(tokenizer, id_chunks, mask_chunks)

    with torch.no_grad():
        chunk_logits = model(batch_ids, attention_mask=batch_mask).logits

    # Normalise each row, then average across rows (presumably one row per chunk).
    return torch.nn.functional.softmax(chunk_logits, dim=-1).mean(dim=0)

# Score every row of 'Content'; missing cells are replaced by '' before analysis.
distil_roberta_results = []
for content in df['Content'].fillna(''):  
    mean_probs = distil_roberta_analyzer(content)
    print(mean_probs)  # echo each row's averaged probabilities as it is computed
    distil_roberta_results.append(mean_probs)

# Store the per-row results in a new column; the helper raises ValueError when
# the number of results does not match the number of rows.
column_name = "distilroBERTa_sentiment"
analysis_results_to_df(df, column_name, distil_roberta_results)


### Analysis: HuggingFace ahmedrachid fine-tuned finbert model

In [None]:
# Load the tokenizer and the classifier from the same checkpoint.
# `num_labels` is a model-configuration option and means nothing to a
# tokenizer, so it is no longer passed to BertTokenizer. The classifier is
# loaded exactly as before.
tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
model = BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
model.eval()  # evaluation mode for inference



def ft_finbert_analyzer(content):
    """Return the chunk-averaged class probabilities for one piece of text.

    The text is split with ``chunk_and_tokenize``, batched with
    ``prepare_chunks`` and passed through the module-level ``model`` without
    gradient tracking. A softmax over the last axis of the logits is then
    averaged over axis 0.

    NOTE: ``tokenizer`` and ``model`` are looked up as globals at call time,
    so this uses whichever pair was assigned most recently.

    Args:
        content: The text to score.

    Returns:
        A tensor holding the mean of the per-row softmax outputs.
    """
    id_chunks, mask_chunks = chunk_and_tokenize(tokenizer, content)
    batch_ids, batch_mask = prepare_chunks(tokenizer, id_chunks, mask_chunks)

    with torch.no_grad():
        chunk_logits = model(batch_ids, attention_mask=batch_mask).logits

    # Normalise each row, then average across rows (presumably one row per chunk).
    return torch.nn.functional.softmax(chunk_logits, dim=-1).mean(dim=0)

# Score every row of 'Content'; missing cells are replaced by '' before analysis.
_ahmedrachid_results = []
for content in df['Content'].fillna(''):  
    mean_probs = ft_finbert_analyzer(content)
    _ahmedrachid_results.append(mean_probs)

# Store the per-row results in a new column; the helper raises ValueError when
# the number of results does not match the number of rows.
column_name = "ft_finbert_sentiment"
analysis_results_to_df(df, column_name, _ahmedrachid_results)

### Analysis: HuggingFace RogerKam fine-tuned roberta model

In [None]:
# Load the tokenizer and the classifier from the same checkpoint. `.eval()`
# switches the module to evaluation mode and returns the module itself.
tokenizer = RobertaTokenizer.from_pretrained("RogerKam/roberta_fine_tuned_sentiment_financial_news")
model = RobertaForSequenceClassification.from_pretrained(
    "RogerKam/roberta_fine_tuned_sentiment_financial_news"
).eval()


def rk_analyzer(content):
    """Return the chunk-averaged class probabilities for one piece of text.

    The text is split with ``chunk_and_tokenize``, batched with
    ``prepare_chunks`` and passed through the module-level ``model`` without
    gradient tracking. A softmax over the last axis of the logits is then
    averaged over axis 0.

    NOTE: ``tokenizer`` and ``model`` are looked up as globals at call time,
    so this uses whichever pair was assigned most recently.

    Args:
        content: The text to score.

    Returns:
        A tensor holding the mean of the per-row softmax outputs.
    """
    id_chunks, mask_chunks = chunk_and_tokenize(tokenizer, content)
    batch_ids, batch_mask = prepare_chunks(tokenizer, id_chunks, mask_chunks)

    with torch.no_grad():
        chunk_logits = model(batch_ids, attention_mask=batch_mask).logits

    # Normalise each row, then average across rows (presumably one row per chunk).
    return torch.nn.functional.softmax(chunk_logits, dim=-1).mean(dim=0)

# Score every row of 'Content'; missing cells are replaced by '' before analysis.
roger_kam_roberta = []
for content in df['Content'].fillna(''):  
    mean_probs = rk_analyzer(content)
    roger_kam_roberta.append(mean_probs)

# Store the per-row results in a new column; the helper raises ValueError when
# the number of results does not match the number of rows.
column_name = "rk_roBERTa_sentiment"
analysis_results_to_df(df, column_name, roger_kam_roberta)

### Analysis: Original finBERT model

In [None]:
# Load the tokenizer and the classifier from the same checkpoint. `.eval()`
# switches the module to evaluation mode and returns the module itself.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained(
    'ProsusAI/finbert'
).eval()

def finbert_analyzer(content):
    """Return the chunk-averaged class probabilities for one piece of text.

    The text is split with ``chunk_and_tokenize``, batched with
    ``prepare_chunks`` and passed through the module-level ``model`` without
    gradient tracking. A softmax over the last axis of the logits is then
    averaged over axis 0.

    NOTE: ``tokenizer`` and ``model`` are looked up as globals at call time,
    so this uses whichever pair was assigned most recently.

    Args:
        content: The text to score.

    Returns:
        A tensor holding the mean of the per-row softmax outputs.
    """
    id_chunks, mask_chunks = chunk_and_tokenize(tokenizer, content)
    batch_ids, batch_mask = prepare_chunks(tokenizer, id_chunks, mask_chunks)

    with torch.no_grad():
        chunk_logits = model(batch_ids, attention_mask=batch_mask).logits

    # Normalise each row, then average across rows (presumably one row per chunk).
    return torch.nn.functional.softmax(chunk_logits, dim=-1).mean(dim=0)

# Score every row of 'Content'; missing cells are replaced by '' before analysis.
fin_results = []
for content in df['Content'].fillna(''):  
    mean_probs = finbert_analyzer(content)
    fin_results.append(mean_probs)

# Store the per-row results in a new column; the helper raises ValueError when
# the number of results does not match the number of rows.
column_name = "original_finBERT_sentiment"
analysis_results_to_df(df, column_name, fin_results)