**Interpreting Earnings Calls - Advancing Portfolio Management with LLMs**

**Author: Or Cohen**

In this notebook we will implement a methodology developed by S&P Global Research for deriving a couple of qualtative alpha factors from the transcripts of earnings calls. It's highly recommended to switch to a GPU runtime before running this notebook. For a full explanation of the methodology please read the [blog post](https://cohen-or.github.io/posts/ec_nlp)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
!pip install -q langchain
!pip install -U -q langchain-community
!pip install -q sentence-transformers
!pip install -q faiss-cpu
!pip install -q spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
!pip install -q transformers
!pip install -q accelerate
!pip install -U -q bitsandbytes

!pip install -q opendatasets
!pip install -q fpdf

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installa

In [3]:
# IF YOU DON'T HAVE A KAGGLE ACCOUNT - TRY ENTERING "O" FOR BOTH username and password
import opendatasets as od
dataset_url = "https://www.kaggle.com/datasets/tpotterer/motley-fool-scraped-earnings-call-transcripts/data"
od.download(dataset_url)

Skipping, found downloaded files in "./motley-fool-scraped-earnings-call-transcripts" (use force=True to force download)


In [4]:
df = pd.read_pickle('/content/motley-fool-scraped-earnings-call-transcripts/motley-fool-data.pkl')

### Part 1 - spliting the transcript to dictionary of contextual blcoks

In [5]:
slctd_ec = df.loc[18491,'transcript']

In [6]:
count_words = lambda line: len(line.split())

def parse_qa_section(text):
    # enquirer_pattern =
    lines = text.strip().split('\n')
    qa_pairs = {}
    current_qa = None
    parsing_state = 'looking_for_question'
    pair_counter = 1

    for line in lines:
        line = line.strip()

        # Check for question speaker line with word count validation
        if parsing_state == 'looking_for_question' and (line.count('--') == 2 and count_words(line) <= 10):
            current_qa = {
                'enquirer': line,
                'question_lines': [],
                'responder': None,
                'answer_lines': []
            }
            parsing_state = 'collecting_question'
            continue

        # Collect question lines
        if parsing_state == 'collecting_question':
            # Check if this is a new speaker line (responder)
            if  line.count('--') >= 1 and count_words(line.split('--')[0]) == 2:
                  current_qa['responder'] = line
                  parsing_state = 'collecting_answer'
                  continue

            # Collect question lines
            current_qa['question_lines'].append(line)
            continue

        # Collect answer lines
        if parsing_state == 'collecting_answer':
            # Check if this is the end of the answer
            if line == "Operator\n" or (("--" in line) and ("Investor Relations" in line)) or (line.count('--') == 2 and count_words(line) <= 10):
                # Save current Q&A pair
                current_qa['question'] = '\n'.join(current_qa['question_lines']).strip()
                current_qa['answer'] = '\n'.join(current_qa['answer_lines']).strip()

                # Remove intermediate line lists
                del current_qa['question_lines']
                del current_qa['answer_lines']


                qa_pairs[pair_counter] = current_qa
                pair_counter += 1


                if line == "Operator\n" or (("--" in line) and ("Investor Relations" in line)):
                  parsing_state = 'looking_for_question'
                else:
                  parsing_state = 'collecting_question'
                  # Reset for new Q&A pair
                  current_qa = {
                      'enquirer': line,
                      'question_lines': [],
                      'responder': None,
                      'answer_lines': []
                  }
                continue

            # Collect answer lines
            current_qa['answer_lines'].append(line)

    # Handle the last Q&A pair
    if current_qa and current_qa['question_lines']:
        current_qa['question'] = '\n'.join(current_qa['question_lines']).strip()
        current_qa['answer'] = '\n'.join(current_qa['answer_lines']).strip()

        # Remove intermediate line lists
        del current_qa['question_lines']
        del current_qa['answer_lines']

        qa_pairs[pair_counter] = current_qa

    return qa_pairs

In [7]:
import re
import json
import spacy
nlp = spacy.load("en_core_web_sm")

# Remove RTF formatting if needed (basic cleanup)
slctd_ec = re.sub(r'\\[a-zA-Z0-9]+', '', slctd_ec)  # Remove RTF commands
slctd_ec = re.sub(r'\{\}', '', slctd_ec)  # Remove empty braces
slctd_ec = re.sub(r'\n+', '\n', slctd_ec)  # Normalize newlines

# Define improved regex patterns
prepared_remarks_pattern = r"Prepared Remarks:(.*?)Questions (&|and) Answers:"
qa_section_pattern = r"Questions (&|and) Answers:(.*)Call participants:"

# Extract prepared remarks
prepared_remarks_match = re.search(prepared_remarks_pattern, slctd_ec, re.DOTALL)
prepared_remarks = prepared_remarks_match.group(1).strip() if prepared_remarks_match else ""

# Extract Q&A section
qa_section_match = re.search(qa_section_pattern, slctd_ec, re.DOTALL)
qa_section = qa_section_match.group(2).strip() if qa_section_match else ""

# Parse Q&A section
qa_dict = parse_qa_section(qa_section)

# Store results in a dictionary
result = {
    "Prepared Remarks": prepared_remarks,
    "Q&A": qa_dict
}


### Part 2 - "Cleaning" the transcript and summarizing the questions

In [8]:
# filter out short exchanges (expressions of gratitude and politeness)
qa_dict_filtered = {}
remove_keys = []
for key, value in qa_dict.items():
  question_sentences = list(nlp(value["question"]).sents)
  answer_sentences = list(nlp(value["answer"]).sents)
  if len(question_sentences) > 2 and len(answer_sentences) > 2 \
  and '\nOperator\n' not in value["answer"] and "Investor Relations" not in value['responder']:
    qa_dict_filtered[key] = value
qa_dict = qa_dict_filtered
result['Q&A'] = qa_dict_filtered


In [9]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_question(question):
    summary = summarizer(question, max_length=60, do_sample=False)
    return summary[0]['summary_text']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [10]:
for key, value in qa_dict.items():
  qa_dict[key]['question_summary'] = summarize_question(value['question'])

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


### Part 3 - Generate RAG answers

**Converting 'prepared remarks' section to embddeings and storing in a vector database (FAISS)**

In [11]:
import faiss
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [12]:
# Process the text with spacy to get sentences
doc = nlp(result['Prepared Remarks'])
remarks_sentences = [sent.text for sent in doc.sents]

# Load the pre-trained sentence transformer model
embedding_model = "all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model)

# Encode the sentences into embeddings
embeddings = model.encode(remarks_sentences)
embeddings = np.array(embeddings).astype('float32')

# Prepare FAISS index
# We will use the L2 distance index (faiss.IndexFlatL2) since we're working with dense vectors.
emb_dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(emb_dimension)
faiss_index.add(embeddings)

In [13]:
# from huggingface_hub import login

# # Set your token programmatically
# login(token="YOUR_TOKEN_HERE")

# def load_llm_model():
#     # Configure quantization
#     quantization_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_compute_dtype=torch.float16,
#         llm_int8_enable_fp32_cpu_offload=True
#     )

#     # Load tokenizer and model for Llama 3.1 8B
#     model_id = "meta-llama/Llama-3.1-8B"  # Update model ID for Llama 3.1 8B
#     tokenizer = AutoTokenizer.from_pretrained(model_id)
#     model = AutoModelForCausalLM.from_pretrained(
#         model_id,
#         device_map="auto",
#         torch_dtype=torch.float16,
#         quantization_config=quantization_config
#     )

#     return tokenizer, model


In [15]:
def load_llm_model():
    # Configure quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    # Load tokenizer and model
    model_id = "microsoft/phi-2"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16
    )

    return tokenizer, model

In [16]:
# Initialize RAG system
class RAGSystem:
    def __init__(self, faiss_index, remarks_sentences, embedding_model_name, tokenizer, llm_model):
        self.faiss_index = faiss_index
        self.context = remarks_sentences
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.tokenizer = tokenizer
        self.model = llm_model

    def retrieve_context(self, query, previous_answer, top_k=10):
            if previous_answer:
              previous_answer_sents = [span.text for span in previous_answer]
              # Add the previous answers to the context (prepared remarks and all question before)
              self.context += previous_answer_sents

              # Embed the previous answer and add to FAISS index
              previous_answer_embeddings = self.embedding_model.encode(previous_answer_sents)
              previous_answer_embeddings = np.array(previous_answer_embeddings).astype('float32')
              faiss_index.add(previous_answer_embeddings)

            # Embed the query
            query_embedding = self.embedding_model.encode([query])[0]
            query_embedding = np.array([query_embedding]).astype('float32')

            # Perform the FAISS search
            D, I = faiss_index.search(query_embedding, top_k)

            # Retrieve the corresponding sentences (context) from FAISS
            retrieved_texts = [self.context[idx] for idx in I[0]]

            return retrieved_texts

    def generate_answer(self, query, context):
        # Format prompt with context and query
        prompt = f"""
        From the perspective of a top executive, answer the following question raised by a financial analyst during an earnings conference call based on the
        following information.
        Information:
        {' '.join(context)}
        Question: {query}
        """

        # Tokenize the prompt
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=128,
                temperature=0.01,
                top_p=0.9,
                do_sample=True
            )

        # Decode and return the response
        response = self.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        return response

    def answer_question(self, query, previous_answers):
        # Retrieve relevant context (prepared remarks + previous answers)
        context = self.retrieve_context(query, previous_answers)

        # Generate answer based on retrieved context
        answer = self.generate_answer(query, context)

        return answer, context

In [17]:
# Create RAG system
tokenizer, model = load_llm_model()
rag = RAGSystem(faiss_index, remarks_sentences, embedding_model, tokenizer, model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
previous_answer = []  # Initialize an empty list to keep the sentences from the previous answer

for q_id in qa_dict.keys():
    query = qa_dict[q_id]['question']
    answer, context = rag.answer_question(query, previous_answer)
    qa_dict[q_id]['rag_answer'] = answer
    qa_dict[q_id]['context'] = context
    # Store the sentences in the current answer to be passed in the next iteration
    previous_answer = list(nlp(qa_dict[q_id]['answer']).sents)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

### Part 5 - Calculate Cosine Similarity

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

def avg_cosine_similarity(question, answer):
    """
    Compute the average cosine similarity between each pair of sentences
    from the question and answer inputs using spaCy.

    Parameters:
    - question (str): First text block.
    - answer (str): Second text block.

    Returns:
    - float: The average cosine similarity between all sentence pairs.
    """

    nlp = spacy.load("en_core_web_md")  # Use medium-sized model for better accuracy
    # Process both text blocks using spaCy
    doc1 = nlp(question)
    doc2 = nlp(answer)

    # Split both text blocks into sentences
    sentences1 = [sent.text.strip() for sent in doc1.sents]
    sentences2 = [sent.text.strip() for sent in doc2.sents]

    # Generate sentence embeddings for each sentence
    embeddings1 = np.array([sent.vector for sent in doc1.sents])
    embeddings2 = np.array([sent.vector for sent in doc2.sents])

    # Compute cosine similarity between each pair of sentence embeddings
    similarities = cosine_similarity(embeddings1, embeddings2)

    # Calculate the average similarity
    avg_similarity = np.mean(similarities)

    return avg_similarity

In [30]:
for q_id in qa_dict.keys():
  qa_dict[q_id]['on_topic_score'] = avg_cosine_similarity(qa_dict[q_id]['question_summary'], qa_dict[q_id]['answer'])
  qa_dict[q_id]['proactive_score'] = avg_cosine_similarity(qa_dict[q_id]['question_summary'], qa_dict[q_id]['rag_answer'])

In [31]:
# Calculate the call-wise average scores
result['on_topic_score'] = np.mean([qa_dict[q_id]['on_topic_score'] for q_id in qa_dict.keys()])
result['proactive_score'] = np.mean([qa_dict[q_id]['proactive_score'] for q_id in qa_dict.keys()])

### Export to PDF

In [21]:
from fpdf import FPDF

def export_to_pdf(result_dict, filename="report2.pdf"):
    """Exports the content of a dictionary to a PDF report.

    Args:
        result_dict (dict): The dictionary containing the content to be exported.
        filename (str, optional): The name of the PDF file. Defaults to "report.pdf".
    """

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for key, value in result_dict.items():
        if isinstance(value, dict):  
            if key == "Q&A":  # Special handling for Q&A section
                for question_number, question_data in value.items():
                    # Iterate through each part (question, answer, etc.) inside the question dictionary
                    for part, part_value in question_data.items():
                        combined_key = f"{question_number}-{part}"
                        pdf.set_text_color(255,0,0)
                        pdf.cell(200, 10, txt=combined_key + ":", ln=True)
                        pdf.ln()  
                        pdf.set_text_color(0,0,0)
                        pdf.multi_cell(0, 10, txt=str(part_value))
                        pdf.ln()  

        else:
            # Handle other regular key-value pairs
            pdf.set_text_color(0,255,0)
            pdf.cell(200, 10, txt=str(key) + ":", ln=True)
            pdf.ln()  # Blank line after the key

            pdf.set_text_color(0,0,0)
            pdf.multi_cell(0, 10, txt=str(value))
            pdf.ln()

    # Output the PDF
    pdf.output(filename)


export_to_pdf(result)
