In [1]:
import numpy as np
import pandas as pd 
import transformers
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings("ignore")

In [2]:
transformers.BertConfig()

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [3]:
import time

_start_time = time.time()

def tic():
    global _start_time 
    _start_time = time.time()

def tac():
    t_sec = round(time.time() - _start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))

In [4]:
import re
import string

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation, remove words containing numbers, and remove dots (except .com).'''
    text = text.lower()
    text = re.sub(r"[\[\]]", " ", text)
    text = re.sub('[%s]' % re.escape('!"#&()*+,-/:;<=>?@^_`{|}~'), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub(r'(?<!\w)(\.)((?!\bcom\b)\w)', r' \2', text)  # Remove dots except for .com

    return text


cleaner = lambda x: clean_text(x)

In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import BertTokenizerFast, BertForSequenceClassification
import torch

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize outside the function to avoid loading every time
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = BertTokenizerFast.from_pretrained(model_name)
sid = SentimentIntensityAnalyzer()

def financial_insights(transcript):
    # Set the maximum sequence length
    max_seq_length = 512
    
    # Tokenize the transcript
    tokenized_text = tokenizer.tokenize(transcript)

    # Account for [CLS] and [SEP] tokens
    max_tokens = max_seq_length - 2
    if len(tokenized_text) > max_tokens:
        tokenized_text = tokenized_text[:max_tokens]

    # Add special tokens and convert to input IDs
    input_ids = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokenized_text) + [tokenizer.sep_token_id]
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)

    # Get the model's predictions
    with torch.no_grad():
        outputs = model(input_ids)
    predicted_labels = torch.argmax(outputs.logits, dim=1)

    # Extract the relevant information from the transcript
    entities = {
        'positive_indicators': ['growth', 'revenue', 'profit'],
        'negative_indicators': ['decline', 'loss', 'risk'],
    }

    # Find sentences containing the keywords
    sentences = transcript.split('. ')
    result = {}

    for entity, keywords in entities.items():
        result[entity] = []
        for sentence in sentences:
            if any(keyword in sentence.lower() for keyword in keywords):
                result[entity].append(sentence)
    return result

def sentiment_analysis(data):
    sentiment_score = []
    for i in range(len(data)):
        sentence = "".join(data[i])
        sentiment = sid.polarity_scores(sentence)
        sentiment_score.append(sentiment)
    return sentiment_score


def dicts_operation(positive_indicators_dict, negative_indicators_dict):
    combined_dict, result_dict = [], {}
    for i in range(len(positive_indicators_dict)):
        for key in positive_indicators_dict[i]:
            result_dict[key] = (positive_indicators_dict[i][key] + negative_indicators_dict[i][key])/2
        combined_dict.append(result_dict)
        result_dict = {}
    return combined_dict


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
for i in range(2006, 2024):
    tic()
    data = pd.read_pickle(f"Clean_S&P{i}.pkl")
    transcripts = data["content"].apply(cleaner) # List of transcripts
    results = [financial_insights(transcript) for transcript in transcripts]

    # Extract positive_indicators and negative_indicators from results
    positive_indicators = [results[i]["positive_indicators"] for i in range(len(results))]
    negative_indicators = [results[i]["negative_indicators"] for i in range(len(results))]
    data["positive_indicators"], data["negative_indicators"] = positive_indicators, negative_indicators
    data["sentiment"] = dicts_operation(sentiment_analysis(data['positive_indicators']), sentiment_analysis(data['negative_indicators']))
    diff_value = []
    for u in range(len(data["sentiment"])): 
        diff = data["sentiment"][u]["pos"] - data["sentiment"][u]["neg"] 
        diff_value.append(diff)
    data["pos_neg_diff"] = pd.Series(diff_value)
    data.to_pickle(f'VADERBERT_S&P{i}.pkl')
    print(f"Successfully Processed VADERBERT S&P{i}")
    tac()

Token indices sequence length is longer than the specified maximum sequence length for this model (9869 > 512). Running this sequence through the model will result in indexing errors


Successfully Processed VADERBERT S&P2006
Time passed: 0hour:4min:1sec
Successfully Processed VADERBERT S&P2007
Time passed: 0hour:9min:35sec
Successfully Processed VADERBERT S&P2008
Time passed: 0hour:24min:16sec
Successfully Processed VADERBERT S&P2009
Time passed: 0hour:24min:45sec
Successfully Processed VADERBERT S&P2010
Time passed: 0hour:27min:49sec
Successfully Processed VADERBERT S&P2011
Time passed: 0hour:30min:33sec
Successfully Processed VADERBERT S&P2012
Time passed: 0hour:31min:11sec
Successfully Processed VADERBERT S&P2013
Time passed: 0hour:31min:38sec
Successfully Processed VADERBERT S&P2014
Time passed: 0hour:32min:0sec
Successfully Processed VADERBERT S&P2015
Time passed: 0hour:32min:13sec
Successfully Processed VADERBERT S&P2016
Time passed: 0hour:32min:54sec
Successfully Processed VADERBERT S&P2017
Time passed: 0hour:33min:42sec
Successfully Processed VADERBERT S&P2018
Time passed: 0hour:34min:29sec
Successfully Processed VADERBERT S&P2019
Time passed: 0hour:34min:43

In [9]:
import os
import pandas as pd

def combine_pickles(start_year=2006, end_year=2023):
    # Initialize an empty dataframe
    combined_df = pd.DataFrame()

    # Loop over the range of years
    for year in range(start_year, end_year + 1):
        pickle_file = f"VADERBERT_S&P{year}.pkl"

        # Check if the pickle file exists
        if os.path.exists(pickle_file):
            # Load the data from the pickle file
            df = pd.read_pickle(pickle_file)
            
            # Append the data to the combined dataframe
            combined_df = pd.concat([combined_df, df])
        else:
            print(f"Pickle file for year {year} does not exist.")
    
    # Save the combined dataframe to a new pickle file
    combined_df.to_pickle("VADERBERT_S&P.pkl")
    print("Combined pickle file has been saved as VADERBERT_S&P.pkl")

# Call the function
combine_pickles()


Combined pickle file has been saved as VADERBERT_S&P.pkl
