In [1]:
import os
import glob

# Make the repository root (two levels above the directory the kernel started
# in) the working directory, so relative paths such as "data/..." resolve.
#
# The starting directory is captured only once per kernel session. Without
# this guard, re-running the cell would read the *already changed* working
# directory and climb two more levels on every execution.
if "project_dir" not in globals():
    project_dir = os.path.abspath('.')

# Parent of the parent of the starting directory.
WORK_DIR = os.path.abspath(os.path.join(project_dir, '../../'))

# Switch to it and echo the result so the reader can verify the location.
os.chdir(WORK_DIR)
print("Current Working Directory:", os.getcwd())

Current Working Directory: /Users/david.amat/Documents/david/pdf-search-llm-rag


In [2]:
import pandas as pd

In [3]:
# Read the input table from parquet (path is relative to the working directory).
df_filtered = pd.read_parquet(
    "data/debug_read_pdf.parquet",
    engine="pyarrow",
)

In [4]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
df_text_processed = df_filtered.copy(deep=True)

# Sentence Transformers

In [5]:
from sentence_transformers import SentenceTransformer
import torch

  from tqdm.autonotebook import tqdm, trange


In [6]:
# Identifier of the sentence-transformers model to load.
MODEL_SENTENCE_TRANSFORMER = 'all-MiniLM-L6-v2'

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [7]:
# Load the SentenceTransformer model directly onto the selected device.
# The constructor's `device=` argument is used instead of a follow-up
# `.to(DEVICE)` call: the weights are placed once, and the library itself is
# told which device the model is meant to run on.
model = SentenceTransformer(
    MODEL_SENTENCE_TRANSFORMER,
    device=DEVICE,
)



In [8]:
# Display the working table as it stands before any embeddings are added.
df_text_processed

Unnamed: 0,file_name,page_number,paragraph,text
0,attention_is_all_you_need,1,1,"Provided proper attribution is provided, Googl..."
1,attention_is_all_you_need,1,2,Attention Is All You Need\nAshish Vaswani∗\nGo...
2,attention_is_all_you_need,1,3,∗Equal contribution. Listing order is random. ...
3,attention_is_all_you_need,1,4,†Work performed while at Google Brain
4,attention_is_all_you_need,1,5,‡Work performed while at Google Research
...,...,...,...,...
138,attention_is_all_you_need,15,2,<EOS>\n<pad>\nThe\nLaw\nwill\nnever\nbe\nperfe...
139,attention_is_all_you_need,15,3,<EOS>\n<pad>\nInput-Input Layer5\nThe\nLaw\nwi...
140,attention_is_all_you_need,15,4,<EOS>\n<pad>\nThe\nLaw\nwill\nnever\nbe\nperfe...
141,attention_is_all_you_need,15,5,<EOS>\n<pad>Figure 5: Many of the attention he...


In [9]:
# Character count of the text stored at row position 3 (quick sanity check).
len(df_text_processed["text"].iloc[3])

37

In [10]:
# Embed every text with the sentence-transformer model.
#
# `encode` is documented to take a string or a list of strings. A pandas
# Series resolves integer lookups by *label*, so handing the column over
# directly is only safe while the index is exactly 0..n-1 (it would break or
# misalign after any filtering that leaves gaps in the index). Converting to a
# plain list removes that dependency on the index.
paragraph_texts = df_text_processed["text"].tolist()
paragraph_vectors = model.encode(paragraph_texts, show_progress_bar=True)

# Store one Python list of floats per row, aligned explicitly on the frame's
# index so each vector lands on the row its text came from.
df_text_processed["embeddings"] = pd.Series(
    paragraph_vectors.tolist(),
    index=df_text_processed.index,
)

Batches: 100%|██████████| 5/5 [00:01<00:00,  4.69it/s]


In [11]:
# Add a 'context_id' column numbering the rows 0, 1, ..., n_rows - 1 in their current order
df_text_processed['context_id'] = list(range(len(df_text_processed)))

In [12]:
# Display the table again; it now also carries the 'embeddings' and 'context_id' columns.
df_text_processed

Unnamed: 0,file_name,page_number,paragraph,text,embeddings,context_id
0,attention_is_all_you_need,1,1,"Provided proper attribution is provided, Googl...","[-0.017848478630185127, 0.014465652406215668, ...",0
1,attention_is_all_you_need,1,2,Attention Is All You Need\nAshish Vaswani∗\nGo...,"[-0.07263379544019699, -0.1257372349500656, 0....",1
2,attention_is_all_you_need,1,3,∗Equal contribution. Listing order is random. ...,"[-0.08534739911556244, -0.09007889777421951, -...",2
3,attention_is_all_you_need,1,4,†Work performed while at Google Brain,"[-0.0912894532084465, -0.0209952425211668, 0.0...",3
4,attention_is_all_you_need,1,5,‡Work performed while at Google Research,"[-0.11088573932647705, 0.03885276988148689, 0....",4
...,...,...,...,...,...,...
138,attention_is_all_you_need,15,2,<EOS>\n<pad>\nThe\nLaw\nwill\nnever\nbe\nperfe...,"[-0.04332689568400383, 0.022461799904704094, -...",138
139,attention_is_all_you_need,15,3,<EOS>\n<pad>\nInput-Input Layer5\nThe\nLaw\nwi...,"[-0.06781154125928879, -0.0022821910679340363,...",139
140,attention_is_all_you_need,15,4,<EOS>\n<pad>\nThe\nLaw\nwill\nnever\nbe\nperfe...,"[-0.04332689568400383, 0.022461799904704094, -...",140
141,attention_is_all_you_need,15,5,<EOS>\n<pad>Figure 5: Many of the attention he...,"[0.019566060975193977, -0.022429874166846275, ...",141


In [13]:
# Ask the model for the length of the sentence vectors it produces and report it.
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Embedding Dimension: {embedding_dim}")

Embedding Dimension: 384


In [14]:
# Write the table (texts, embeddings and context ids) to disk as parquet.
df_text_processed.to_parquet(
    "data/debug_embeddings_create.parquet",
    engine="pyarrow",
)