In [2]:
import requests
import pdfplumber
import os
from io import BytesIO

from pinecone import Pinecone, ServerlessSpec

In [None]:
#----------GET PDF------------#

In [None]:
# Location of the PDF whitepaper that is downloaded, split into sentences and embedded below.
url = "https://services.google.com/fh/files/misc/ai_adoption_framework_whitepaper.pdf"

In [None]:
# Download the PDF. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() surfaces HTTP errors here instead
# of letting an error page be handed to the PDF parser later on.
rq = requests.get(url, timeout=30)
rq.raise_for_status()

In [None]:
#----------TOKENIZE AND VECTORIZE------------#

In [None]:
# The spaCy tokenizer model can be downloaded by installing the requirements
# or by uncommenting the line below (run `import spacy` first).

# spacy.cli.download("en_core_web_sm")

In [3]:
from sentence_transformers import SentenceTransformer
import spacy

# Sentence-embedding model used to turn text into vectors.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v1')
# spaCy English pipeline; used below to split page text into sentences.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Embed every sentence of the downloaded PDF.
# Each record is a dict of the form later passed to index.upsert():
#   {"id": "page_<P>_sentence_<S>", "values": [float, ...], "metadata": {"text": <sentence>}}
vectors = []

with pdfplumber.open(BytesIO(rq.content)) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        # Depending on the pdfplumber version, extract_text() gives None or ""
        # for pages without a text layer (e.g. scanned images); nlp(None)
        # would raise, so such pages are skipped.
        page_text = page.extract_text()
        if not page_text:
            continue

        # Keep the 1-based position of each sentence within the page so the
        # ids stay the same as before, but drop whitespace-only spans, which
        # carry nothing worth embedding.
        numbered_sentences = [
            (sentence_number, sentence.text)
            for sentence_number, sentence in enumerate(nlp(page_text).sents, start=1)
            if sentence.text.strip()
        ]
        if not numbered_sentences:
            continue

        # One encode() call per page instead of one per sentence.
        embeddings = model.encode([text for _, text in numbered_sentences])

        for (sentence_number, sent_text), embedding in zip(numbered_sentences, embeddings):
            vectors.append(
                {
                    "id": f"page_{page_number}_sentence_{sentence_number}",
                    # Plain Python floats, so the record is JSON-serialisable.
                    "values": embedding.tolist(),
                    "metadata": {"text": sent_text},
                }
            )


In [None]:
#----------PINECONE DATA UPLOAD------------#

In [None]:
# Pinecone client. The API key is read from the `pinecone_api_key`
# environment variable; os.environ[...] raises KeyError if it is not set.

pc = Pinecone(api_key=os.environ["pinecone_api_key"])

In [None]:
# Create a serverless index whose dimension (384) corresponds to the
# sentence-transformers/all-MiniLM-L6-v1 embeddings, using the dotproduct metric.
# The existence check makes this cell safe to re-run: asking Pinecone to
# create an index under a name that is already taken is rejected.
index_name = "whitepaper-embeddings-dotproduct"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [None]:
# Open a handle to the index by name and display its statistics
# (the call is the cell's last expression, so its return value is rendered).
index_name = "whitepaper-embeddings-dotproduct"
index = pc.Index(index_name)

index.describe_index_stats()

In [None]:
# Upsert the sentence embeddings into the Pinecone index.
index_name = "whitepaper-embeddings-dotproduct"

index = pc.Index(index_name)
# Send the records in batches: a whole document in a single request can
# exceed Pinecone's per-request upsert limits.
index.upsert(vectors=vectors, batch_size=100)

In [None]:
#-----------QUERYING-----------#

In [None]:
# Build the query: embed the search sentence with the same model that was
# used for the document sentences, then convert it to plain Python floats.

sentence_to_search = "Organizations at this phase should look to develop the foundational skill set for core datawrangling and descriptive analytics"

vector_to_query = list(map(float, model.encode(sentence_to_search)))

In [None]:
# Retrieve the 3 stored sentences closest to the query vector.
# `index` is already bound to the target index, so no index name is passed
# here (`index=` is not a documented parameter of Index.query).
results = index.query(
    vector=vector_to_query,
    top_k=3,
    # Add include_values=True to also get the stored embeddings back.
    include_metadata=True,  # return the sentence text stored with each vector
)

In [None]:
# Display the query response (matches together with their stored metadata).
results