In [8]:
pip install torch transformers langchain langchain_community pypdf numpy pandas faiss-cpu opencv-python-headless



In [9]:
'''
author: Ayaan
This file is to load the pdfs, which will later be tokenized and turned into vectors
please note, to load the file, the filename must end with ".pdf"
'''

import os, re
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def loadPDF():
    '''
    opens pdfs, and format as needed
    '''
    data_path = 'sample_data/' #finances folder with document statements
    all_docs = []

    for filename in os.listdir(data_path):
        #re init at 0 for each pdg
        pages_num = 0
        total_token_count = 0
        total_char_count = 0

        file_path = os.path.join(data_path, filename)

        if filename.endswith('.pdf'): #only if its a pdf - double check to avoid errors
            file_path = os.path.join(data_path, filename)

            loader = PyPDFLoader(file_path)
            doc = loader.load()

            pages_num = len(doc)

            for page in doc:
                page.page_content = format_doc(page.page_content)
                total_char_count += len(page.page_content)
                total_token_count += len(page.page_content) / 4 #since 1 token is 4 chars

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=35) #chunk pdf for better results, overlap to maintain context
            split_docs = text_splitter.split_documents(doc)
            all_docs.extend(split_docs)

            print(f"-------- PDF INFO for {filename} --------")
            print(f'''Number of Pages in PDF: {pages_num}
Number of Characters in PDF: {total_char_count}
Number of Tokens in PDF: {total_token_count}''')

    return all_docs

def format_doc(doc):
    '''
    format each page of pdf for easier reading
    '''
    doc = re.sub(r'\n+', '\n', doc) ##replace multiple newlines with single
    doc = re.sub(r'\s+', ' ', doc) ##replace multiple spaces with single space
    doc = re.sub(r'[^\x00-\x7F]+', ' ', doc)  # Remove non-ASCII chars - assume they can't be processed by text
    doc = doc.strip()
    return doc



In [12]:
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import pandas as pd

MODEL_NAME = "ProsusAI/finbert" #replace with if any other model used later
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def getEmbeddings(texts):
    '''
    takes in text, tokenizes them, generates FinBert embeddings
    '''
    embeddings_list = []
    batch_size = 16 #process texts in chunks of 4 for memory, processes 100 texts per 30 secondsish (depends on GPU, mine wasn't very good so did this)

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512) #generate tokens

        with torch.no_grad():
            outputs = model(**tokens) #pass tokens in model to get embeddings

        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy() #extract CLS, first token of each sentence, which represents entire sentence
        embeddings_list.extend(cls_embeddings)

        if i % 100 == 0: #print statement to verify
            print(f"Processed {i+1}/{len(texts)} texts")

    return np.array(embeddings_list)

def saveEmbeddingsFaiss():
    '''
    process pdf documents, generate embeddings using external function, and feed those generated vector embeddings into
    faiss vector database
    '''
    texts = [doc.page_content for doc in loadPDF()] #get text
    embeddings = getEmbeddings(texts)

    # Create FAISS index
    dimention = embeddings.shape[1] #dimension = length of vector
    index = faiss.IndexFlatL2(dimention) #use L2 Eucdlian distance to get all points between vectors
    index.add(embeddings)  #add embeddings to index

    # Save FAISS index
    faiss.write_index(index, "faiss_db.idx")
    print("Embeddings saved to FAISS index!")

def visualizeIndex():
    '''
    visualize the faiss index database to get a better idea of whats happening
    save database to dataframe, export to retain knowledge
    '''
    index = faiss.read_index("faiss_db.idx")
    texts = [doc.page_content for doc in loadPDF()] #reload the texts for comparison

    stored_embeddings = np.array([index.reconstruct(i) for i in range(index.ntotal)]) #store embeddings in array

    df = pd.DataFrame(stored_embeddings)

    df.insert(0, "Text", texts) #put original texts in data frame

    df.to_csv('visualizedIndex.csv')

    print(df.head()) #print first few rows

if __name__ == "__main__":
    saveEmbeddingsFaiss()
    visualizeIndex()

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

-------- PDF INFO for Building Financial Models (John Tjia) (Z-Library).pdf --------
Number of Pages in PDF: 353
Number of Characters in PDF: 484201
Number of Tokens in PDF: 121050.25
-------- PDF INFO for Financial Statement Analysis (Martin S. Fridson and Fernando Alvarez) (Z-Library).pdf --------
Number of Pages in PDF: 447
Number of Characters in PDF: 1013162
Number of Tokens in PDF: 253290.5
-------- PDF INFO for Financial Simulation Modeling in Excel  A Step-by-Step Guide (KEITH ALLMAN, JOSH LAURITO and MICHAEL LOH) (Z-Library).pdf --------
Number of Pages in PDF: 211
Number of Characters in PDF: 429289
Number of Tokens in PDF: 107322.25
Processed 1/12029 texts
Processed 401/12029 texts
Processed 801/12029 texts
Processed 1201/12029 texts
Processed 1601/12029 texts
Processed 2001/12029 texts
Processed 2401/12029 texts
Processed 2801/12029 texts
Processed 3201/12029 texts
Processed 3601/12029 texts
Processed 4001/12029 texts
Processed 4401/12029 texts
Processed 4801/12029 texts
Pr



Embeddings saved to FAISS index!
-------- PDF INFO for Building Financial Models (John Tjia) (Z-Library).pdf --------
Number of Pages in PDF: 353
Number of Characters in PDF: 484201
Number of Tokens in PDF: 121050.25
-------- PDF INFO for Financial Statement Analysis (Martin S. Fridson and Fernando Alvarez) (Z-Library).pdf --------
Number of Pages in PDF: 447
Number of Characters in PDF: 1013162
Number of Tokens in PDF: 253290.5
-------- PDF INFO for Financial Simulation Modeling in Excel  A Step-by-Step Guide (KEITH ALLMAN, JOSH LAURITO and MICHAEL LOH) (Z-Library).pdf --------
Number of Pages in PDF: 211
Number of Characters in PDF: 429289
Number of Tokens in PDF: 107322.25
                                                Text         0         1  \
0                                           TLFeBOOK -0.169885  0.246148   
1  BUILDING FINANCIAL MODELS A Guide to Creating ... -0.321837  0.539561   
2  City Milan New Delhi San Juan Seoul Singapore ... -0.156787  0.009771   
3  Copyrigh