In [9]:
pip install pdfplumber transformers faiss-cpu==1.7.4 tqdm yfinance

Note: you may need to restart the kernel to use updated packages.


In [13]:
import os
import pdfplumber
import pickle
from tqdm import tqdm
import yfinance as yf

## 1. Splitter

In [None]:
# documents_path = "Documents\\"
# files = os.listdir(documents_path)

# docs = {}
# for file in files:
#     file_path = documents_path + file
#     with pdfplumber.open(file_path) as pdf:
#         all_text = ''
#         # Loop through each page in the document
#         for page in pdf.pages:
#             # Extract text from the page
#             all_text += page.extract_text()           
#     docs[file] = all_text

# docs = {k.replace(".pdf", ""): v for k, v in docs.items()}

# # Saving the dictionary to a pickle file
# with open('Docs.pkl', 'wb') as f:
#     pickle.dump(docs, f)

In [15]:
# Loading the documents from a pickle file
# Docs.pkl is the cache written by the commented-out extraction cell above:
# a dict whose keys are the PDF file names with ".pdf" removed and whose
# values are the concatenated page text of each PDF.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only load a Docs.pkl that you produced yourself.
with open('Docs.pkl', 'rb') as f:
    docs = pickle.load(f)

In [17]:
def split_text_into_chunks(text, tokenizer, max_tokens=512, chunk_size=100, overlap=25):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    The text is encoded once with ``tokenizer.encode``; a window of tokens is
    then slid over the token list and every window is turned back into a
    string with ``tokenizer.decode``.

    Args:
        text: The raw text to split.
        tokenizer: Object exposing ``encode(text) -> sequence`` and
            ``decode(sequence) -> str``.
        max_tokens: Upper bound on the window size. ``chunk_size`` is capped
            at this value; pass ``None`` to disable the cap.
        chunk_size: Requested number of tokens per chunk.
        overlap: Number of tokens shared by two consecutive chunks.

    Returns:
        A list of decoded chunk strings, in document order. A non-empty token
        list always yields at least one chunk, even when it is shorter than
        ``overlap``.

    Raises:
        ValueError: If the effective window is not positive, or if ``overlap``
            is negative or not strictly smaller than the window (the window
            could never advance, so the loop would not terminate).
    """
    # Honour the model limit: never build a window larger than max_tokens.
    window = chunk_size if max_tokens is None else min(chunk_size, max_tokens)
    if window <= 0:
        raise ValueError("chunk_size and max_tokens must be positive")
    if not 0 <= overlap < window:
        raise ValueError(
            f"overlap ({overlap}) must be >= 0 and smaller than the chunk window ({window})"
        )

    tokens = tokenizer.encode(text)
    total = len(tokens)
    stride = window - overlap  # how far the window start moves each step

    chunks = []
    start = 0
    while start < total:
        end = min(start + window, total)

        # Convert the token window back to text
        chunks.append(tokenizer.decode(tokens[start:end]))

        # The window reached the end of the document: nothing left to cover.
        if end == total:
            break
        start += stride

    return chunks

In [19]:
from transformers import AutoTokenizer
from tqdm import tqdm

# FinBERT checkpoint; its tokenizer defines how chunk lengths are measured.
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Chunking parameters, all expressed in tokens:
#   max_tokens - BERT model max token limit
#   chunk_size - target chunk size
#   overlap    - tokens shared between consecutive chunks
max_tokens, chunk_size, overlap = 512, 100, 25

# Map each stock to the list of text chunks cut from its document.
paragraphs = {}
for stock in docs:
    print(f"{stock}...")
    paragraphs[stock] = split_text_into_chunks(
        docs[stock], tokenizer, max_tokens, chunk_size, overlap
    )

AMZN...
GOOGL...
META...
MSFT...
NVDA...


In [23]:
# Report how many chunks were produced for each stock.
for stock, stock_chunks in paragraphs.items():
    print(f"{stock}: {len(stock_chunks)} paragraphs")

AMZN: 855 paragraphs
GOOGL: 975 paragraphs
META: 1345 paragraphs
MSFT: 1303 paragraphs
NVDA: 925 paragraphs


## 2. Create Embeddings

In [25]:
from transformers import AutoModel
import torch

# Load FinBERT model
# `model_name` is the checkpoint name assigned in the tokenizer cell, so the
# model and tokenizer come from the same checkpoint. generate_embeddings()
# mean-pools this model's `last_hidden_state` to build the chunk vectors.
model = AutoModel.from_pretrained(model_name)

In [27]:
def _resolve_max_length(model, tokenizer):
    """Return an explicit truncation length, or None to defer to the tokenizer.

    When the tokenizer already carries a realistic ``model_max_length`` it is
    left in charge. Otherwise (attribute missing, or set to a huge sentinel
    meaning "no limit") fall back to ``model.config.max_position_embeddings``
    when the model exposes it.
    """
    tokenizer_limit = getattr(tokenizer, "model_max_length", None)
    if isinstance(tokenizer_limit, int) and 0 < tokenizer_limit <= 1_000_000:
        return None
    config = getattr(model, "config", None)
    return getattr(config, "max_position_embeddings", None)


def generate_embeddings(chunks, model, tokenizer):
    """Embed each text chunk by mean-pooling the model's last hidden state.

    Args:
        chunks: Iterable of strings; each one is embedded on its own.
        model: Callable model with ``eval()`` whose output exposes
            ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a mapping of tensors that is
            unpacked into ``model(**inputs)``.

    Returns:
        A list with one NumPy array per chunk, each obtained from
        ``last_hidden_state.mean(dim=1)`` (so the leading axis has size 1 for
        a single input string).
    """
    embeddings = []

    # Evaluation mode only needs to be set once, not once per chunk.
    model.eval()

    # Without an explicit length `truncation=True` is ignored when the
    # tokenizer has no predefined maximum, so over-long input would reach the
    # model untruncated.
    max_length = _resolve_max_length(model, tokenizer)

    for chunk in tqdm(chunks):
        # Tokenize the chunk
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        # Inference only: no gradients needed
        with torch.no_grad():
            outputs = model(**inputs)

        # Mean pooling over the entire sequence of contextualized embeddings
        chunk_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.append(chunk_embeddings)

    return embeddings

In [29]:
# vectors = {}
# for stock, chunks in paragraphs.items():
#     print(f"{stock}...")
#     vectors[stock] = generate_embeddings(chunks, model, tokenizer)

# # Saving the dictionary to a pickle file
# with open('Vectors.pkl', 'wb') as f:
#     pickle.dump(vectors, f)

## 3. Building the Vector Database

In [31]:
# Loading the dictionary from a pickle file
# Vectors.pkl is the cache written by the commented-out embedding cell above:
# a dict mapping each stock to the list returned by generate_embeddings() for
# that stock's chunks.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only load a Vectors.pkl that you produced yourself.
with open('Vectors.pkl', 'rb') as f:
    vectors = pickle.load(f)

In [33]:
import numpy as np

# Add metadata to vectors
#
# Builds three parallel, row-aligned structures:
#   metadata_vectors - list of {"vector", "metadata"} records
#   company_vector   - ticker of every row (company identifier)
#   vectors          - 2-D array, one embedding per row
# Row i of all three refers to the same text chunk.

# This cell rebinds `vectors` from the loaded dict to an array (see the end of
# the cell), so it cannot simply be executed twice in a row.
if not isinstance(vectors, dict):
    raise RuntimeError(
        "`vectors` has already been flattened into an array; "
        "re-run the cell that loads Vectors.pkl before running this cell again."
    )

metadata_vectors = []
for stock, company_vectors in tqdm(vectors.items()):
    stock_paragraphs = paragraphs[stock]

    # Each vector is paired with a paragraph by position, so the two lists
    # must have the same length or the stored text would describe the wrong
    # embedding.
    if len(company_vectors) != len(stock_paragraphs):
        raise ValueError(
            f"{stock}: {len(company_vectors)} vectors but "
            f"{len(stock_paragraphs)} paragraphs; regenerate Vectors.pkl."
        )

    for i, (vector, paragraph_text) in enumerate(zip(company_vectors, stock_paragraphs)):
        # Append the structured record for this chunk
        metadata_vectors.append({
            "vector": vector,
            "metadata": {
                "vector_id": f"{stock}-{i}",
                "ticker": stock,
                "paragraph_number": i,
                "paragraph_text": paragraph_text
            }
        })

# Company identifier
company_vector = [entry['metadata']['ticker'] for entry in metadata_vectors]

# Stack all embeddings into one (n_rows, embedding_dim) matrix
vectors = np.array([entry['vector'] for entry in metadata_vectors])
vectors = vectors.reshape(len(metadata_vectors), -1)

100%|██████████| 5/5 [00:00<00:00, 28.08it/s]


In [35]:
# Ticker -> keywords whose presence in a query points at that company.
# Letter case of the keywords is irrelevant: the retrieval cell lower-cases
# both the keyword and the query before comparing.
company_name_map = {
    'GOOGL': ['google', 'alphabet', 'youtube', 'android'],
    'AMZN': ['amazon', 'aws'],
    'NVDA': ['nvidia'],
    'MSFT': ['Microsoft', 'Windows', 'Azure'],
    'META': ['Meta', 'Facebook', 'Instagram'],
}

## 4. Retrieval

In [38]:
# Single-element list holding the question; the leading and trailing newlines
# are part of the string.
user_query = ["\nTalk about Microsoft stock-based compensation\n"]

In [40]:
import faiss

# If there are any mentions to a specific company, narrow the similarity search to the 10K documents of those companies only
# NOTE: this is a plain substring test, so a keyword also matches inside a
# longer word (e.g. 'meta' is found in 'metadata').
query_text = user_query[0].lower()
focus_tickers = [
    ticker
    for ticker, words in company_name_map.items()
    if any(word.lower() in query_text for word in words)
]
focused_vectors_idx = [i for i, ticker in enumerate(company_vector) if ticker in focus_tickers]

# Rows of `vectors` that take part in the search: the focused rows when a
# company was mentioned, otherwise every row.
if len(focused_vectors_idx) > 0:
    search_idx = np.asarray(focused_vectors_idx, dtype=np.int64)
else:
    search_idx = np.arange(len(vectors), dtype=np.int64)
vectors_focused = vectors[search_idx]

# Normalize the vectors to unit length (FAISS expects contiguous float32)
normalized_vectors = vectors_focused / np.linalg.norm(vectors_focused, axis=1, keepdims=True)
normalized_vectors = np.ascontiguousarray(normalized_vectors, dtype=np.float32)

# Encode the query to get embeddings
query_vector = generate_embeddings(user_query, model, tokenizer)[0].reshape(1, -1)
# Normalize the query vector
query_vector = query_vector / np.linalg.norm(query_vector)
query_vector = np.ascontiguousarray(query_vector, dtype=np.float32)

# Create a FAISS index for the normalized vectors
index = faiss.IndexFlatIP(normalized_vectors.shape[1])  # Use Inner Product (IP), which is equivalent to cosine similarity after normalization
index.add(normalized_vectors)

# Perform the nearest neighbor search
distances, local_indices = index.search(query_vector, k=1)  # k=n for top N nearest neighbors

# FAISS numbers results by position inside the searched subset. Translate
# them back to row numbers of `vectors` / `metadata_vectors` so later cells
# can look up the metadata directly. FAISS reports missing neighbours as -1;
# keep those as -1.
indices = np.where(local_indices >= 0, search_idx[local_indices], -1)

# The reported "distance" is the inner product of unit vectors, i.e. a cosine
# similarity: larger means more similar.
for idx, dist in zip(indices[0], distances[0]):
    print(f"Paragraph {idx} with distance {dist}")

  0%|          | 0/1 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 1/1 [00:01<00:00,  1.58s/it]

Paragraph 3937 with distance 0.6368639469146729





In [42]:
# Concatenate the text of every retrieved paragraph, separated by a blank
# line (no separator after the last one).
relevant_texts = "\n\n".join(
    metadata_vectors[hit]['metadata']['paragraph_text'] for hit in indices[0]
)
print(relevant_texts)

and savings plans we grant stock - based compensation to employees and directors. awards that expire or are canceled without delivery of shares generally become available for issuance under the plans. we issue new shares of microsoft common stock to satisfy vesting of awards granted under our stock plans. we also have an espp for all eligible employees. stock - based compensation expense and related income tax benefits were as follows : ( in millions ) year ended june 30, 2024 2023 2022 stock - based compensation expense $ 10, 734 $ 9


## 5. Generation

In [44]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForCausalLM, pipeline

# Load DistilGPT-2 model and tokenizer
# NOTE: this rebinds `model_name`, `model` and `tokenizer`, which up to this
# point referred to the FinBERT checkpoint used for chunking and embeddings.
# The retrieval cell calls generate_embeddings(user_query, model, tokenizer),
# so re-running it after this cell would embed the query with DistilGPT-2
# instead of FinBERT. Re-run the FinBERT tokenizer/model cells first if you
# need to retrieve again.
model_name = "distilgpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

#### RAG Prompt (With Context)

In [45]:
# RAG prompt: the retrieved text wrapped in double quotes under a
# " Context:" heading, followed by the user's question.
prompt = f'\n\n Context:\n\n" {relevant_texts}"\n{user_query[0]}'

In [46]:
# Tokenize the prompt; the result carries both input_ids and attention_mask.
inputs = tokenizer(prompt, return_tensors="pt")

# Sampling is random; fix the seed so a re-run reproduces the same answer.
torch.manual_seed(0)

outputs = model.generate(
    inputs.input_ids,
    # pad_token_id is set to the EOS token below, so the mask cannot be
    # inferred from the ids; pass it explicitly.
    attention_mask=inputs.attention_mask,
    max_length=500,  # total length in tokens, prompt included
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id to eos_token_id for consistency
    temperature=0.7,  # Control randomness
    top_p=0.9,  # Nucleus sampling
    num_beams=10,  # Beam search for better answers
    no_repeat_ngram_size=2,  # Avoid repeating n-grams
    do_sample=True
)

# Decode and print the answer (the decoded sequence starts with the prompt)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.




 Context:

" and savings plans we grant stock - based compensation to employees and directors. awards that expire or are canceled without delivery of shares generally become available for issuance under the plans. we issue new shares of microsoft common stock to satisfy vesting of awards granted under our stock plans. we also have an espp for all eligible employees. stock - based compensation expense and related income tax benefits were as follows : ( in millions ) year ended june 30, 2024 2023 2022 stock - based compensation expense $ 10, 734 $ 9"

Talk about Microsoft stock-based compensation
If you are a Microsoft employee, you may be eligible to receive a share of the company’s stock. If you have not received the share, the stock will not be offered to you. However, if you do not receive the shares, it will be subject to the terms and conditions of this agreement.
How do you get your share?
We offer a variety of options for you to choose from. For example, we offer the option of 

#### Non-RAG Prompt (NO Context)

In [54]:
# Baseline prompt: the bare user question with no retrieved context, so the
# output below can be compared against the RAG answer above.
prompt =  user_query[0]

In [56]:
# Tokenize the prompt; the result carries both input_ids and attention_mask.
inputs = tokenizer(prompt, return_tensors="pt")

# Sampling is random; fix the seed so a re-run reproduces the same answer.
torch.manual_seed(0)

outputs = model.generate(
    inputs.input_ids,
    # pad_token_id is set to the EOS token below, so the mask cannot be
    # inferred from the ids; pass it explicitly.
    attention_mask=inputs.attention_mask,
    max_length=500,  # total length in tokens, prompt included
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id to eos_token_id for consistency
    temperature=0.7,  # Control randomness
    top_p=0.9,  # Nucleus sampling
    num_beams=10,  # Beam search for better answers
    no_repeat_ngram_size=2,  # Avoid repeating n-grams
    do_sample=True
)

# Decode and print the answer (the decoded sequence starts with the prompt)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)


Talk about Microsoft stock-based compensation

What do you think?
Share your thoughts in the comments below.
This article was originally published on The Conversation. Read the original article.
