### 0. Import libraries, load data and define constants

In [2]:
import json
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from tqdm import tqdm

# All paths are relative and therefore resolved against the kernel's working directory.
# Input: JSON file with the filtered PDF data that gets embedded.
INPUT_FILE_PATH = "../datascraping/data/flyers_wo_outliers.json"
# Output: JSON file receiving the embeddings computed per section of each PDF.
OUTPUT_PER_SECTION_FILE_PATH = "./data/flyers_embeddings.json"
# Output: JSON file receiving one embedding per whole PDF.
OUTPUT_PER_PDF_FILE_PATH = "./data/pdf_embeddings.json"

In [3]:
# Load the filtered PDF data. The notebook iterates it as
# {file_name: {header: section}}.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that decoding
# does not depend on the platform's locale default, which would otherwise
# mis-decode non-ASCII characters on e.g. Windows.
with open(INPUT_FILE_PATH, encoding="utf-8") as file:
    pdf_data = dict(json.load(file))

In [4]:
# Name of the OpenAI embedding model used for both the per-section and the
# per-PDF embeddings; kept in one place so it is easy to spot and swap.
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

# Embeddings client. No API key is passed here, so credentials must come from
# the client's own configuration (presumably the OPENAI_API_KEY environment
# variable -- TODO confirm).
embedder = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

### 1. Generate and store embeddings at section level

In [7]:
# Section-level embeddings, keyed as {file_name: {header: embedding}}
embeddings = dict()

# Checkpoint interval: the accumulated results are written to disk after
# every `files_no` embedded PDFs (and once more after the last PDF)
files_no = 50

# Total number of PDFs, used to detect the final iteration
pdfs_no = len(pdf_data)

# Iterate over PDFs
for i, (file_name, data) in enumerate(tqdm(pdf_data.items())):
    # Embeddings of the current file, keyed by section header
    embeddings_current_pdf = dict()

    # Embed every section together with its file name and header so the
    # vector carries that context as well
    for header, section in data.items():
        text = f"Filename: {file_name}\nTitle: {header}\n{section}"

        # One embedding request per section
        embedding = embedder.embed_query(text)
        embeddings_current_pdf[header] = embedding

    # Register the file only once all of its sections have been embedded, so a
    # checkpoint never contains a partially processed PDF
    embeddings[file_name] = embeddings_current_pdf

    # `i` is zero-based, so `i + 1` is the number of PDFs embedded so far.
    # Checkpoint after every `files_no` PDFs and after the final one.
    if (i + 1) % files_no == 0 or i == pdfs_no - 1:
        # Serialize into a temporary file and swap it in afterwards: opening
        # the real output with "w" would truncate the previous checkpoint
        # before the new one is fully written, so an interruption during the
        # dump would lose all saved progress.
        tmp_path = OUTPUT_PER_SECTION_FILE_PATH + ".tmp"
        with open(tmp_path, "w") as file:
            # json.dump streams to the file instead of first building the
            # whole document as one string in memory
            json.dump(embeddings, file, indent=4)
        os.replace(tmp_path, OUTPUT_PER_SECTION_FILE_PATH)

  0%|          | 0/1288 [00:00<?, ?it/s]

100%|██████████| 1288/1288 [1:13:18<00:00,  3.42s/it]


### 2. Generate and store embeddings at PDF level

In [5]:
# PDF-level embeddings, keyed as {file_name: embedding}
embeddings = dict()

# Checkpoint interval: the accumulated results are written to disk after
# every `files_no` embedded PDFs (and once more after the last PDF)
files_no = 50

# Total number of PDFs, used to detect the final iteration
pdfs_no = len(pdf_data)

# Iterate over PDFs
for i, (file_name, data) in enumerate(tqdm(pdf_data.items())):
    # Build one document per PDF: the file name followed by every
    # "Title: <header>" / section pair, each terminated by a blank line
    text = f"Filename: {file_name}\n" + "".join(
        f"Title: {header}\n{section}\n\n" for header, section in data.items()
    )

    # One embedding request for the whole PDF
    embedding = embedder.embed_query(text)
    embeddings[file_name] = embedding

    # `i` is zero-based, so `i + 1` is the number of PDFs embedded so far.
    # Checkpoint after every `files_no` PDFs and after the final one.
    if (i + 1) % files_no == 0 or i == pdfs_no - 1:
        # Serialize into a temporary file and swap it in afterwards: opening
        # the real output with "w" would truncate the previous checkpoint
        # before the new one is fully written, so an interruption during the
        # dump would lose all saved progress.
        tmp_path = OUTPUT_PER_PDF_FILE_PATH + ".tmp"
        with open(tmp_path, "w") as file:
            # json.dump streams to the file instead of first building the
            # whole document as one string in memory
            json.dump(embeddings, file, indent=4)
        os.replace(tmp_path, OUTPUT_PER_PDF_FILE_PATH)

100%|██████████| 1288/1288 [09:06<00:00,  2.36it/s]
