In [1]:
import os
import glob

# Resolve the working directory as two levels above the directory the kernel
# started in, and switch to it so the relative paths used later in the
# notebook (e.g. "data/...") resolve.
#
# The guard keeps this cell idempotent: without it, every re-run would start
# from the already-changed working directory and climb two more levels.
if "WORK_DIR" not in globals():
    # Absolute path of the directory the kernel is currently in
    project_dir = os.path.abspath('.')

    # Parent of the parent directory
    WORK_DIR = os.path.abspath(os.path.join(project_dir, '../../'))

# Switch to (or stay in) the resolved working directory
os.chdir(WORK_DIR)

# Verify the change by printing the current working directory
print("Current Working Directory:", os.getcwd())

Current Working Directory: /Users/david.amat/Documents/david/pdf-search-llm-rag


In [2]:
import pandas as pd

In [3]:
# Read the stored table from parquet (path is relative to the working
# directory) using the pyarrow engine.
parquet_path = "data/debug_read_pdf.parquet"
df_filtered = pd.read_parquet(parquet_path, engine="pyarrow")

In [9]:
# Work on an independent (deep) copy so the loaded frame itself is not
# modified by the column assignments made further down.
df_text_processed = df_filtered.copy(deep=True)

# Sentence Transformers

In [6]:
from sentence_transformers import SentenceTransformer
import torch

In [7]:
# Name of the sentence-embedding model to load
MODEL_SENTENCE_TRANSFORMER = 'all-MiniLM-L6-v2'

# Use CUDA when torch reports it as available; otherwise fall back to CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [8]:
# Load the SentenceTransformer model on the selected device.
# The device is passed to the constructor so the library places the model
# there itself, rather than constructing it first and moving it afterwards
# with `.to(DEVICE)`.
model = SentenceTransformer(
    MODEL_SENTENCE_TRANSFORMER,
    device=DEVICE,
)



In [11]:
# Display the working frame (the rendered output is truncated for long frames)
df_text_processed

Unnamed: 0,file_name,page_number,paragraph,text
0,attention_is_all_you_need,1,1,"Provided proper attribution is provided, Googl..."
1,attention_is_all_you_need,1,2,reproduce the tables and figures in this paper...
2,attention_is_all_you_need,1,3,scholarly works.
3,attention_is_all_you_need,1,4,Attention Is All You Need
4,attention_is_all_you_need,1,5,Ashish Vaswani∗
...,...,...,...,...
826,attention_is_all_you_need,15,109,<EOS>
827,attention_is_all_you_need,15,110,<pad>Figure 5: Many of the attention heads exh...
828,attention_is_all_you_need,15,111,"sentence. We give two such examples above, fro..."
829,attention_is_all_you_need,15,112,at layer 5 of 6. The heads clearly learned to ...


In [18]:
# Character count of the text at row position 3, read positionally instead of
# converting the entire column to a Python list just to pick one element.
len(df_text_processed["text"].iloc[3])

25

In [20]:
# Encode every text into a vector and store one list of floats per row.
# The texts are handed to `encode` as a plain Python list: its input is a
# string or a list of strings, and integer indexing on a pandas Series is a
# label lookup, which matches row order only for a default 0..n-1 index.
texts = df_text_processed["text"].tolist()

# Re-attach the vectors using the frame's own index so rows stay aligned
df_text_processed["embeddings"] = pd.Series(
    model.encode(texts, show_progress_bar=True).tolist(),
    index=df_text_processed.index,
)

Batches: 100%|██████████| 26/26 [00:00<00:00, 26.92it/s]


In [22]:
# Number the rows sequentially (0, 1, ..., n_rows - 1) in a new 'context_id' column
n_rows = len(df_text_processed)
df_text_processed['context_id'] = list(range(n_rows))

In [23]:
# Display the frame again, now including the 'embeddings' and 'context_id' columns
df_text_processed

Unnamed: 0,file_name,page_number,paragraph,text,embeddings,context_id
0,attention_is_all_you_need,1,1,"Provided proper attribution is provided, Googl...","[-0.11509871482849121, 0.010262911207973957, 0...",0
1,attention_is_all_you_need,1,2,reproduce the tables and figures in this paper...,"[0.010889335535466671, 0.07457642257213593, -0...",1
2,attention_is_all_you_need,1,3,scholarly works.,"[-0.07605524361133575, 0.06901659071445465, -0...",2
3,attention_is_all_you_need,1,4,Attention Is All You Need,"[0.05313875898718834, -0.019583694636821747, -...",3
4,attention_is_all_you_need,1,5,Ashish Vaswani∗,"[-0.03129125386476517, 0.046448078006505966, 0...",4
...,...,...,...,...,...,...
826,attention_is_all_you_need,15,109,<EOS>,"[-0.015310936607420444, 0.07826308161020279, -...",826
827,attention_is_all_you_need,15,110,<pad>Figure 5: Many of the attention heads exh...,"[0.029949873685836792, -0.02654660865664482, 0...",827
828,attention_is_all_you_need,15,111,"sentence. We give two such examples above, fro...","[0.033357515931129456, -0.000603160064201802, ...",828
829,attention_is_all_you_need,15,112,at layer 5 of 6. The heads clearly learned to ...,"[-0.021467424929142, -0.008490338921546936, 0....",829


In [24]:
# Ask the loaded model for the size of the vectors it produces and report it
embedding_dim = model.get_sentence_embedding_dimension()
dimension_report = f"Embedding Dimension: {embedding_dim}"
print(dimension_report)

Embedding Dimension: 384


# Hopsworks

In [25]:
import hopsworks

# Log in to Hopsworks; the returned object is the project handle used below
project = hopsworks.login()

# Feature store handle obtained from the logged-in project
fs = project.get_feature_store() 

Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/963732
Connected. Call `.close()` to terminate connection gracefully.


In [26]:
from hsfs import embedding

# Column that holds the vectors, and the vector size reported by the model
embedding_column = "embeddings"
vector_size = model.get_sentence_embedding_dimension()

# Build the embedding index and register that column with its dimension
emb = embedding.EmbeddingIndex()
emb.add_embedding(embedding_column, vector_size)

In [27]:
# Human-readable description stored with the feature group
fg_description = 'Information from various files, presenting details like file names, source links, and structured text excerpts from different pages and paragraphs.'

# Fetch the 'documents_fg' feature group (version 1), creating it if needed,
# keyed on 'context_id', online-enabled, and with the embedding index attached
documents_fg = fs.get_or_create_feature_group(
    name="documents_fg",
    version=1,
    description=fg_description,
    primary_key=['context_id'],
    online_enabled=True,
    embedding_index=emb,
)

# Write the processed frame (text, embeddings, ids) into the feature group
documents_fg.insert(df_text_processed)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/963732/fs/957507/fg/1102155


Uploading Dataframe: 100.00% |██████████| Rows 831/831 | Elapsed Time: 00:09 | Remaining Time: 00:00


Launching job: documents_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/963732/jobs/named/documents_fg_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x2f20cd760>, None)

In [28]:
# Columns exposed through the feature view
selected_features = documents_fg.select(
    ["file_name", "page_number", "paragraph", "text"]
)

# Fetch the 'documents' feature view (version 1), creating it from the
# selection above if needed
feature_view = fs.get_or_create_feature_view(
    name="documents",
    version=1,
    description='Chunked context for RAG system',
    query=selected_features,
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/963732/fs/957507/fv/documents/version/1


# Retrieve Feature Store data

In [29]:
# Look up version 1 of the 'documents' feature view by name
feature_view = fs.get_feature_view(name='documents', version=1)

In [30]:
# Read the feature view's batch data into `data`
# (the output displayed further down shows it as a table with the view's columns)
data = feature_view.get_batch_data()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.32s) 


In [36]:
# Order rows by page and then paragraph. `data` is rebound to the sorted
# result instead of sorting with inplace=True, so the frame object returned
# by the read above is not mutated behind any other reference to it.
data = data.sort_values(["page_number", "paragraph"])

# From the page-2 rows, show the last 10 of the first 50 in sorted order
data.loc[data["page_number"] == 2].head(50).tail(10)

Unnamed: 0,file_name,page_number,paragraph,text
237,attention_is_all_you_need,2,41,entirely on self-attention to compute represen...
435,attention_is_all_you_need,2,42,aligned RNNs or convolution. In the following ...
580,attention_is_all_you_need,2,43,self-attention and discuss its advantages over...
820,attention_is_all_you_need,2,44,3 Model Architecture
111,attention_is_all_you_need,2,45,Most competitive neural sequence transduction ...
323,attention_is_all_you_need,2,46,"Here, the encoder maps an input sequence of sy..."
386,attention_is_all_you_need,2,47,"of continuous representations z= (z1, ..., z n..."
649,attention_is_all_you_need,2,48,"sequence (y1, ..., y m)of symbols one element ..."
770,attention_is_all_you_need,2,49,"[10], consuming the previously generated symbo..."
61,attention_is_all_you_need,2,50,2
