# Data Indexing

There are two central steps involved:

1. Documents are stored, prepared and split into smaller text chunks.
2. Text chunks are converted into vector embeddings and stored in a vector database (Vector DB) next to their respective text chunks.


*** 
**Background information**

* All files, chunks and embeddings are stored on a local **Supabase** server (open source Firebase alternative; based on Postgres, which is a relational database management system), see: https://supabase.com/


***
**Coding sources**

I extend the code provided and explained in the following YouTube Video: 

- RAG Langchain Python Project: Easy AI/Chat For Your Docs: https://www.youtube.com/watch?v=tcqEUSNCn8I
    + GitHub: https://github.com/pixegami/langchain-rag-tutorial




## If you are facing issues running your code:

It could be the case that chroma and langchain cause import issues, see: https://github.com/langchain-ai/langchain/issues/7509

In [3]:
## run in your terminal:
# pip install pydantic==1.10.8
# pip install chromadb==0.3.26

## Get API, local supabase server key(s)

In [4]:
import os
import sys

# 'src' is reached by going two directory levels up from the current directory
path_to_src = os.path.join('../..','src')  # '../..' climbs two levels up, then enters the 'src' folder

# Add the path to sys.path so modules inside 'src' become importable
sys.path.append(path_to_src)

# Now you can import your API_key module
import API_key as key

## include self-written functions

In [None]:
import src.forDataIndexing as di

In [None]:
# Print the current working directory
print("Current working directory:", os.getcwd())

# Connect to our local supabase server

In [7]:
from supabase import create_client

# Client built from the URL and key stored in the API_key module; reused by the cells below
supabase = create_client(key.SUPABASE_URL, key.SUPABASE_KEY)

# Data Preparation: Documents are stored, prepared and split into smaller text chunks

## Enter your search / how you have found the PDF files

In [None]:
# Fetch the 'topic' column of every row in the 'searches' table
res = supabase.table("searches").select("topic").execute()
# The rows are available as a list of dictionaries on the response's 'data' attribute
data_items = res.data
# Extract topics from the list of dictionaries within 'data'
topics = [entry['topic'] for entry in data_items]
print("search topics in your DB:\n", topics)

In [None]:
# Description of the literature search that produced the PDFs
entry_search = {
    'topic': "AI regulation",
    'subtopic': "trust, risk, benefit",
    'search_query': '"artificial intelligence" AND (trust OR risk* OR benef*) AND "regulation"',
    'search_plattform': "Google Scholar",
    'comment': "only retrieved the first 30 entries (2 excluded, because 1x not downloadable); N=29",
}

# A topic that is already listed gets its row(s) updated; an unknown topic is inserted
topic_known = entry_search['topic'] in topics
searches_table = supabase.table('searches')
if not topic_known:
    data = searches_table.insert(entry_search).execute()
    print("new search entry added")
else:
    data = searches_table.update(entry_search).eq('topic', entry_search['topic']).execute()
    print("search entry updated")

## Upload your PDFs in the local DB

define folder path to your PDFs:

In [10]:
path_to_PDF = os.path.join('PDFs/AIregulation/')  # relative path to the folder with the PDFs; join() with a single argument returns it unchanged

upload PDFs to storage **AND** creates an entry in the DB:

Remark: this function takes a while, because the PDFs are temporarily loaded to get their number of pages

In [None]:
# Search metadata passed to the upload helper together with the PDF folder
args_Search = dict(topic="AI regulation", subtopic="trust, risk, benefit")

di.upload_PDFs(
    folder_path=path_to_PDF,
    supabase_DB=supabase,
    args_Search=args_Search,
    verbose=False,
)

PDFs in your DB:

In [None]:
# List what is stored under 'files' in the storage
res = supabase.storage.from_('files').list()

# Keep only the 'name' field of every listed entry
file_names = [item['name'] for item in res]

# Show the names, followed by how many there are
print(file_names)
print(len(file_names))

## PDFs in the local DB are prepared

set a folder path for the temporary download of your PDFs:

In [13]:
path_to_PDFs = os.path.join('tmp_downloads')  # relative path to the temporary download folder; join() with a single argument returns it unchanged

**GROBID (experimental section, start)**

https://grobid.readthedocs.io/en/latest/Grobid-docker/#grobid-and-docker-containers

docker run --rm --gpus all --init --ulimit core=0 -p 8070:8070 grobid/grobid:0.8.1

In [None]:
ERROR

see: https://grobid.readthedocs.io/en/latest/Grobid-service/

In [49]:
import requests
from pathlib import Path

# PDF from the temporary download folder that is sent to the local GROBID service
pdf_file = Path(path_to_PDFs) / "10.1007_s00146-023-01650-z.pdf"

# Open the PDF in a context manager so the file handle is closed once the upload has finished
with pdf_file.open('rb') as pdf_handle:
    files = {
        'input': pdf_handle,
    }
    response = requests.post('http://localhost:8070/api/processFulltextDocument', files=files)

In [None]:
vars(response)

In [None]:
response.content

In [44]:
from grobid_client import Client

client = Client(base_url="https://cloud.science-miner.com/grobid/api")

In [None]:
client

In [None]:
from grobid_client.grobid_client import GrobidClient

In [33]:
from grobid_client import Client

client = Client(base_url="http://localhost:8070/")

In [46]:
from pathlib import Path
from grobid_client.api.pdf import process_fulltext_document
from grobid_client.models import Article, ProcessForm
from grobid_client.types import TEI, File

# PDF from the temporary download folder that is sent to GROBID
pdf_file = Path(path_to_PDFs) / "10.1007_s00146-023-01650-z.pdf"

with pdf_file.open("rb") as fin:
    form = ProcessForm(
        segment_sentences="1",
        input_=File(file_name=pdf_file.name, payload=fin, mime_type="application/pdf"),
    )
    # The request is sent inside the 'with' block because the form holds the open file handle
    r = process_fulltext_document.sync_detailed(client=client, multipart_data=form)
    if r.is_success:
        # Parse the returned TEI document; figures are not parsed
        article: Article = TEI.parse(r.content, figures=False)
        # Sanity check: the parsed article should carry a title
        assert article.title

In [None]:
r

**GROBID (experimental section, end)**

get the names of all PDFs, which have not been processed (chunks + embeddings):

In [None]:
non_processed_PDFs = di.non_processed_PDFs(supabase_DB=supabase, verbose=False)
print("non_processed_PDFs:\n", non_processed_PDFs)

In [None]:
# Chunk size and overlap settings passed on to load_split_embed
args_Split = {'chunk_size': 800, 'chunk_overlap': 150}

di.load_split_embed(supabase_DB=supabase, path_to_PDFs=path_to_PDFs, args_Split=args_Split, LMM='all-MiniLM-L6-v2')
# NOTE(review): this comment originally read "delete all downloaded files", but no cleanup code follows here; confirm whether load_split_embed removes the temporary downloads itself

In [None]:
# Select every column of the rows in 'documents_chunks' whose 'order_chunks' equals 0
first_chunks_query = supabase.table("documents_chunks").select("*").eq("order_chunks", 0)
response = first_chunks_query.execute()

# Number of matching rows, followed by the rows themselves
print(len(response.data))
print(response.data)

# Outcomes

In [None]:
data = supabase.rpc('hello_world3').execute()
print("Hello World:", data)

In [None]:
from collections import namedtuple

# Lightweight record type with a single 'page_content' field
Document = namedtuple('Document', 'page_content')

# Wrap the query text in that record
doc = Document(page_content="How can AI Regulation be defined?")

# create_embeddings is given a list, so the single record is wrapped in one
embedding = di.create_embeddings([doc], LMM='all-MiniLM-L6-v2', verbose=False)
print("embedding:\n", embedding)

In [30]:
match_threshold = 0.7  # Replace with your desired threshold

# Arguments handed to the 'match_documents_chunks' RPC function
rpc_params = dict(
    embedding=embedding[0].tolist(),
    match_threshold=match_threshold,
    match_count=22,
)

# Call the RPC function
data = supabase.rpc('match_documents_chunks', rpc_params).execute()

In [None]:
len(data.data)

In [None]:
data.data

In [None]:
ERROR

**load_pdfs_by_filename**: Loads and stores PDF pages by filename:

In [None]:



pdf_pages = di.load_pdfs_by_filename(path_to_PDFs, verbose=False)

# Optional: report the number of loaded pages for each filename
for filename, pages in pdf_pages.items():
    print(f"\nPDF: {filename}\nTotal Pages: {len(pages)}")
    # print(pages[0])

In [None]:
# pdf_pages is keyed by PDF filename and holds the loaded pages of each file
first_key = list(pdf_pages)[0]  # filename of the first PDF
print("first PDF of folder:", first_key)
first_pdf_pages = pdf_pages[first_key]  # pages of that PDF

# Show the first page
print("First Page:", first_pdf_pages[0], "\n\n")

**split_pdf_pages_into_chunks**: Splits and stores PDF pages into chunks by filename:

On average, one token is roughly 4 characters in English text. So, each chunk of 800 characters roughly corresponds to 200 tokens.


**The maximal number of tokens of `all-MiniLM-L6-v2` is 512.**

In [None]:
pdf_chunks = di.split_pdf_pages_into_chunks(pdf_pages, chunk_size=800, chunk_overlap=150, verbose=False)

# Optional: report the number of chunks created for each filename
for filename, chunks in pdf_chunks.items():
    print(f"\nPDF: {filename}\nTotal Chunks: {len(chunks)}")

In [None]:
# pdf_chunks is keyed by PDF filename and holds the chunks of each file
first_key = list(pdf_chunks)[0]  # filename of the first PDF
print("first PDF of folder:", first_key)
first_pdf_chunks = pdf_chunks[first_key]  # chunks of that PDF

# Pick the first two chunks
first_chunk, second_chunk = first_pdf_chunks[0], first_pdf_chunks[1]

# Show both chunks
print("\nFirst Chunk:", first_chunk, "\n\n")
print("Second Chunk:", second_chunk)

In [None]:
print("page content:", first_chunk.page_content, "\n\n")
print("metadata:", first_chunk.metadata)

In [None]:
print(second_chunk.page_content)

In [None]:
len(first_pdf_chunks)
first_pdf_chunks[0].page_content

In [None]:
pdf_chunks.keys()

In [None]:
list(pdf_chunks.keys())

In [50]:



# Collect the text (page_content) of every chunk of the first PDF, in order
tmp_chunks = [chunk.page_content for chunk in first_pdf_chunks]


In [None]:
len(tmp_chunks)

# Data Storage: Text chunks are converted into vector embeddings and stored in a vector database (Vector DB) next to their respective text chunks.

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd


# Texts to embed: the chunk contents held in tmp_chunks
sentences = tmp_chunks

# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all texts with the model
features = model.encode(sentences)

# Display the encoded features as a pandas dataframe (shown as the cell output)
pd.DataFrame(features)

In [None]:
len(features)
features[0]

In [None]:
similarities = model.similarity(features, features)
print(similarities)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sentence_transformers import SentenceTransformer

# Initialize the model and encode the corpus
embedder = SentenceTransformer("all-MiniLM-L6-v2")
corpus = sentences
corpus_embeddings = embedder.encode(corpus)

# Calculate similarity matrix using cosine similarity
similarity_matrix = cosine_similarity(corpus_embeddings)

# Convert similarity matrix to a distance matrix; clipping removes tiny negative
# values caused by floating-point round-off, and the diagonal is forced to exactly 0
distance_matrix = np.clip(1 - similarity_matrix, 0.0, None)
np.fill_diagonal(distance_matrix, 0.0)

# linkage() interprets a 2-D array as raw observation vectors, not as distances,
# so the square matrix is converted to the condensed form used for precomputed distances
condensed_distances = squareform(distance_matrix, checks=False)

# Perform hierarchical clustering; 'average' linkage is used because Ward's method
# is only correctly defined for Euclidean distances, not for cosine distances
linked = linkage(condensed_distances, method='average')

# Plot the dendrogram
plt.figure(figsize=(10, 8))
dendrogram(linked, labels=corpus, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title("Dendrogram of Sentence Clustering")
plt.xlabel("Sentences")
plt.ylabel("Distance")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
aaaaaaaaaaaa

In [24]:
import os
import json
#from dotenv import load_dotenv
from supabase import create_client, Client
from faker import Faker
import faker_commerce


def add_entries_to_vendor_table(supabase, vendor_count):
    """Insert ``vendor_count`` fake vendors into the 'vendor2' table.

    Args:
        supabase: Client object whose ``table(...).insert(...).execute()`` chain
            performs the insert.
        vendor_count: Number of fake vendor rows to generate.

    Returns:
        List with the ``vendor_id`` (converted to int) of every row contained
        in the insert response.
    """
    fake = Faker()
    fake.add_provider(faker_commerce.Provider)
    main_list = [
        {
            'vendor_name': fake.company(),
            'total_employees': fake.random_int(40, 169),
            'vendor_location': fake.country(),
        }
        for _ in range(vendor_count)
    ]
    # All rows are sent in one insert call
    data = supabase.table('vendor2').insert(main_list).execute()
    # Read the returned rows directly from the response instead of serialising
    # the whole response to JSON and parsing it again
    return [int(row['vendor_id']) for row in data.data]


def add_entries_to_product_table(supabase, vendor_id):
    """Insert a random number of fake products for ``vendor_id`` into the 'Product' table.

    The number of products is drawn with ``fake.random_int(1, 15)``; every
    product row carries the given ``vendor_id``. Nothing is returned.
    """
    fake = Faker()
    fake.add_provider(faker_commerce.Provider)
    product_count = fake.random_int(1, 15)
    products = [
        {
            'vendor_id': vendor_id,
            'product_name': fake.ecommerce_name(),
            'inventory_count': fake.random_int(1, 100),
            'price': fake.random_int(45, 100),
        }
        for _ in range(product_count)
    ]
    # All rows are sent in one insert call
    supabase.table('Product').insert(products).execute()


def main():
    """Create a client and fill the vendor table with 10 fake vendors."""
    db_client: Client = create_client(key.SUPABASE_URL, key.SUPABASE_KEY)
    n_vendors = 10
    fk_list = add_entries_to_vendor_table(db_client, n_vendors)
    # Product seeding per returned vendor id is currently disabled:
    #for i in range(len(fk_list)):
    #    add_entries_to_product_table(supabase, fk_list[i])


main()


In [None]:
from supabase import create_client, Client

supabase: Client = create_client(key.SUPABASE_URL, key.SUPABASE_KEY)

# Call the 'hello_world' RPC function and show the raw response
data = supabase.rpc('hello_world').execute()
print("Hello World:", data)


# Call the 'get_vendors' RPC function, keeping only rows with total_employees > 160
data = supabase.rpc('get_vendors').gt('total_employees', 160).execute()
print("Vendors:", data)
# NOTE: only the last expression of a notebook cell is displayed, so this value is not shown
vars(data)
# The filter may match no vendor at all, so guard against indexing an empty result
data.data[0] if data.data else None