In [1]:
import pandas as pd 
import numpy as np
import torch
import tensorflow as tf

In [3]:
# Load the compiled dataset, discard every row that contains a missing value,
# and then drop the rows whose MDA text is only the "unavailable" placeholder.
data = (
    pd.read_csv("compiled_dataset.csv")
    .dropna(axis=0)
    .loc[lambda frame: frame['MDA Data'].ne('Data Not Available in SEC-Filings')]
)
# Both names refer to the same filtered frame.
df_cleaned = data

In [4]:
# Pull the MDA text column out of the frame as a NumPy array so that later
# cells can address documents by position.
mda_column = np.asarray(data.loc[:, "MDA Data"])

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# The tokenizer and the classifier are loaded from the same checkpoint id so
# that the vocabulary and the model weights stay in sync.
FINBERT_CHECKPOINT = 'ProsusAI/finbert'

tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

In [7]:
def get_input_ids_and_attention_mask_chunk(tokens, chunksize=512, cls_token_id=101,
                                           sep_token_id=102, pad_token_id=0):
    """Split one tokenized document into fixed-length chunks.

    The first row of ``tokens['input_ids']`` and ``tokens['attention_mask']``
    is cut into pieces of at most ``chunksize - 2`` entries. Each piece is
    wrapped with a leading ``cls_token_id`` and a trailing ``sep_token_id``
    (attention value 1 for both) and right-padded to exactly ``chunksize``
    entries with ``pad_token_id`` (attention value 0).

    Args:
        tokens: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries
            whose first row is a 1-D torch tensor. The tokens are expected to
            have been encoded *without* special tokens, because this function
            adds them per chunk.
        chunksize: Total length of every returned chunk, including the two
            special tokens. Must be at least 3.
        cls_token_id: Id placed at the start of every chunk.
        sep_token_id: Id placed after the last real token of every chunk.
        pad_token_id: Id used to fill a short chunk up to ``chunksize``.

    Returns:
        A pair ``(input_id_chunks, attention_mask_chunks)`` of equally long
        lists of 1-D tensors, each of length ``chunksize``. Every chunk keeps
        the dtype and device of the tensor it was cut from.

    Raises:
        ValueError: If ``chunksize`` leaves no room for at least one token
            between the two special tokens.
    """
    body_length = chunksize - 2
    if body_length < 1:
        raise ValueError("chunksize must be at least 3 to fit CLS, one token and SEP")

    id_pieces = tokens['input_ids'][0].split(body_length)
    mask_pieces = tokens['attention_mask'][0].split(body_length)

    input_id_chunks = []
    attention_mask_chunks = []
    for id_piece, mask_piece in zip(id_pieces, mask_pieces):
        pad_length = body_length - id_piece.shape[0]

        # The new_* constructors inherit dtype and device from the piece.
        # The previous torch.Tensor([0] * n) padding was float32, which made
        # padded chunks float while full chunks stayed integer.
        input_id_chunks.append(torch.cat([
            id_piece.new_tensor([cls_token_id]),
            id_piece,
            id_piece.new_tensor([sep_token_id]),
            id_piece.new_full((pad_length,), pad_token_id),
        ]))
        attention_mask_chunks.append(torch.cat([
            mask_piece.new_ones(1),
            mask_piece,
            mask_piece.new_ones(1),
            mask_piece.new_zeros(pad_length),
        ]))

    return input_id_chunks, attention_mask_chunks

In [None]:
# Re-run this cell if it raises an error on the first attempt.

def get_sentiment_analysis(i):
    """Return the predicted sentiment class index for the i-th MDA document.

    The document ``mda_column[i]`` is tokenized without special tokens, split
    into fixed-length chunks by ``get_input_ids_and_attention_mask_chunk``,
    and all chunks are sent through ``model`` as one batch. The per-chunk
    softmax probabilities are averaged and the index of the largest mean
    probability is returned.

    Relies on the module-level names ``mda_column``, ``tokenizer`` and
    ``model``.

    Args:
        i: Position of the document inside ``mda_column``.

    Returns:
        The winning class index as a plain Python int. The meaning of each
        index is defined by the loaded checkpoint (presumably available as
        ``model.config.id2label`` - confirm before mapping to label names).
    """
    tokens = tokenizer.encode_plus(mda_column[i], add_special_tokens = False, return_tensors = 'pt')
    input_id_chunks, attention_mask_chunks = get_input_ids_and_attention_mask_chunk(tokens)

    # Stack the chunks into one batch; the casts make sure the model receives
    # integer tensors whatever dtype the chunking helper produced.
    input_dict = {
        'input_ids' : torch.stack(input_id_chunks).long(),
        'attention_mask' : torch.stack(attention_mask_chunks).int()
    }

    # Inference only: skip autograd bookkeeping so long documents (many
    # chunks) do not keep every intermediate activation alive.
    with torch.no_grad():
        outputs = model(**input_dict)

    # outputs[0] is treated as the per-chunk class logits.
    probabilities = torch.nn.functional.softmax(outputs[0], dim = -1)
    mean_probabilities = probabilities.mean(dim = 0)

    return torch.argmax(mean_probabilities).item()


In [None]:
# Smoke test: classify the first few documents. The bound is clamped to the
# number of available documents so that a dataset with fewer than 10 rows
# left after cleaning does not raise IndexError.
for i in range(min(10, len(mda_column))):
    print(get_sentiment_analysis(i))