In [1]:
import os
import pickle

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Checkpoint identifier handed to both from_pretrained() calls below, so the
# model and the tokenizer are always loaded from the same source.
model_name = "ProsusAI/finbert"
# Sequence-classification model; predict_sentiment reads this module-level name.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Matching tokenizer; predict_sentiment reads this module-level name as well.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [2]:
import time

# Reference point read by tac(). A monotonic clock is used instead of
# time.time() so that system clock adjustments (NTP sync, DST, manual changes)
# during a long run cannot skew or negate the reported duration.
_start_time = time.monotonic()

def tic():
    """Start (or restart) the stopwatch that tac() reports on."""
    global _start_time
    _start_time = time.monotonic()

def tac():
    """Print the time elapsed since the last tic() call.

    The elapsed time is rounded to whole seconds and printed as
    'Time passed: <h>hour:<m>min:<s>sec'. If tic() was never called, the
    reference point is the moment this cell was executed. Returns None.
    """
    elapsed_sec = round(time.monotonic() - _start_time)
    minutes, seconds = divmod(elapsed_sec, 60)
    hours, minutes = divmod(minutes, 60)
    print('Time passed: {}hour:{}min:{}sec'.format(hours, minutes, seconds))

In [3]:
def predict_sentiment(text, max_length=512, chunk_batch_size=8):
    """Classify the sentiment of ``text`` with the module-level model/tokenizer.

    Texts longer than ``max_length`` tokens are split by the tokenizer into
    consecutive chunks (each chunk gets its own special tokens), every chunk is
    scored, and the per-chunk class probabilities are averaged.

    Parameters
    ----------
    text : str
        The document to score.
    max_length : int, optional
        Maximum number of tokens per chunk, special tokens included.
    chunk_batch_size : int, optional
        Number of chunks sent through the model per forward pass; bounds the
        memory needed for very long documents.

    Returns
    -------
    tuple
        ``(label, score, negative_prob, neutral_prob, positive_prob)`` where
        ``label`` is the class with the highest averaged probability and
        ``score`` is that probability.
    """
    # Check if CUDA is available and set the device accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model to the device (no-op when it is already there)
    model.to(device)

    # Take the logit order from the checkpoint's own config instead of
    # hard-coding it: a hard-coded order that disagrees with the checkpoint
    # silently swaps both the label and the per-class probabilities.
    # NOTE(review): the published ProsusAI/finbert config appears to order its
    # classes positive/negative/neutral - confirm against the loaded config.
    class_labels = {
        int(idx): str(label).lower()
        for idx, label in model.config.id2label.items()
    }
    if set(class_labels.values()) != {"negative", "neutral", "positive"}:
        # Config carries no usable names (e.g. LABEL_0); keep the legacy order.
        class_labels = {0: "negative", 1: "neutral", 2: "positive"}
    label_index = {label: idx for idx, label in class_labels.items()}

    # Tokenize into chunks of at most `max_length` tokens. With
    # return_overflowing_tokens=True a fast tokenizer returns one row per
    # chunk instead of discarding everything past the first `max_length`
    # tokens. padding=True pads only up to the longest chunk, so short texts
    # are not inflated to `max_length`.
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding=True,
        return_overflowing_tokens=True,
        return_tensors="pt",
    )

    # Keep only the tensors the model accepts (this drops bookkeeping entries
    # such as overflow_to_sample_mapping) and move them to the device.
    inputs = {
        name: encoded[name].to(device)
        for name in tokenizer.model_input_names
        if name in encoded
    }

    # Score the chunks a few at a time and collect their probabilities
    chunk_probs = []
    num_chunks = inputs["input_ids"].size(0)
    with torch.no_grad():
        for start in range(0, num_chunks, chunk_batch_size):
            batch = {
                name: tensor[start:start + chunk_batch_size]
                for name, tensor in inputs.items()
            }
            logits = model(**batch).logits
            chunk_probs.append(logits.softmax(dim=-1))

    # Average the probabilities across all chunks -> one value per class
    avg_probs = torch.cat(chunk_probs, dim=0).mean(dim=0)

    # Class with the highest average probability, and that probability
    class_idx = int(avg_probs.argmax().item())
    sentiment_score = avg_probs[class_idx].item()

    # Individual probabilities, looked up by label name rather than position
    probs = avg_probs.tolist()
    negative_prob = probs[label_index["negative"]]
    neutral_prob = probs[label_index["neutral"]]
    positive_prob = probs[label_index["positive"]]

    # Return the predicted class label, the sentiment score, and the individual probabilities
    return class_labels[class_idx], sentiment_score, negative_prob, neutral_prob, positive_prob

In [4]:
def apply_sentiment_analysis(df):
    """Score every row of ``df['content']`` and attach the results as columns.

    predict_sentiment is applied to each value of the 'content' column and
    its five return values are written to the columns 'sentiment',
    'sentiment_score', 'negative_prob', 'neutral_prob' and 'positive_prob'.

    The frame is modified in place and the same object is returned; nothing
    is written to disk here. An empty frame is handled and simply gains the
    five (empty) columns.
    """
    result_columns = [
        'sentiment',
        'sentiment_score',
        'negative_prob',
        'neutral_prob',
        'positive_prob',
    ]

    # One 5-tuple per row, in row order
    results = df['content'].apply(predict_sentiment)

    # Build the columns from the list of tuples. Unlike zip(*results), this
    # also works when there are no rows to unpack.
    scored = pd.DataFrame(results.tolist(), index=df.index, columns=result_columns)

    # Assign positionally (plain arrays) so a non-unique index cannot
    # trigger label alignment.
    for column in result_columns:
        df[column] = scored[column].to_numpy()

    return df


In [5]:
# Score one cleaned pickle per year (2006-2023) and save the scored frame as
# finBERT_S&P<year>.pkl, timing each year with tic()/tac().
# A year whose input file is absent is reported and skipped rather than
# aborting the remaining years; combine_pickles tolerates missing years too.
# NOTE: pd.read_pickle can execute arbitrary code - only load files you produced.
for i in range(2006, 2024):
    input_path = f"Clean_S&P{i}.pkl"
    if not os.path.exists(input_path):
        print(f"Pickle file for year {i} does not exist.")
        continue

    tic()
    data = pd.read_pickle(input_path)
    proc = apply_sentiment_analysis(data)
    proc.to_pickle(f'finBERT_S&P{i}.pkl')
    print(f"Successfully Processed finBERT S&P{i}")
    tac()

Successfully Processed finBERT S&P2006
Time passed: 0hour:5min:10sec
Successfully Processed finBERT S&P2007
Time passed: 0hour:11min:27sec
Successfully Processed finBERT S&P2008
Time passed: 0hour:22min:19sec
Successfully Processed finBERT S&P2009
Time passed: 0hour:23min:39sec
Successfully Processed finBERT S&P2010
Time passed: 0hour:26min:35sec
Successfully Processed finBERT S&P2011
Time passed: 0hour:28min:56sec
Successfully Processed finBERT S&P2012
Time passed: 0hour:29min:37sec
Successfully Processed finBERT S&P2013
Time passed: 0hour:29min:41sec
Successfully Processed finBERT S&P2014
Time passed: 0hour:30min:12sec
Successfully Processed finBERT S&P2015
Time passed: 0hour:30min:30sec
Successfully Processed finBERT S&P2016
Time passed: 0hour:31min:15sec
Successfully Processed finBERT S&P2017
Time passed: 0hour:31min:36sec
Successfully Processed finBERT S&P2018
Time passed: 0hour:32min:13sec
Successfully Processed finBERT S&P2019
Time passed: 0hour:32min:40sec
Successfully Processe

In [6]:
import os
import pandas as pd

def combine_pickles(start_year=2006, end_year=2023):
    """Concatenate the per-year result pickles into one combined pickle.

    For every year in ``start_year..end_year`` (inclusive) the file
    ``finBERT_S&P<year>.pkl`` in the current working directory is loaded if
    it exists; missing years are reported and skipped. The frames are
    concatenated in year order with their original index values, and the
    result is written to ``finBERT_S&P.pkl``. If no yearly file exists, an
    empty DataFrame is written. Returns None.

    NOTE: pd.read_pickle can execute arbitrary code - only load files you produced.
    """
    yearly_frames = []

    # Loop over the range of years
    for year in range(start_year, end_year + 1):
        pickle_file = f"finBERT_S&P{year}.pkl"

        # Check if the pickle file exists
        if os.path.exists(pickle_file):
            yearly_frames.append(pd.read_pickle(pickle_file))
        else:
            print(f"Pickle file for year {year} does not exist.")

    # Concatenate once at the end: growing a frame with concat inside the
    # loop re-copies all previously accumulated rows on every iteration.
    # pd.concat rejects an empty list, hence the explicit empty fallback.
    combined_df = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

    # Save the combined dataframe to a new pickle file
    combined_df.to_pickle("finBERT_S&P.pkl")
    print("Combined pickle file has been saved as finBERT_S&P.pkl")

# Merge the per-year result pickles using the default year range (2006-2023).
combine_pickles()


Combined pickle file has been saved as finBERT_S&P.pkl
