In [3]:
import requests
import pdfplumber
import os
from io import BytesIO

from pinecone import Pinecone, ServerlessSpec

In [None]:
#----------GET PDF------------#

In [2]:
url = "https://services.google.com/fh/files/misc/ai_adoption_framework_whitepaper.pdf"

In [3]:
rq = requests.get(url)

In [None]:
#----------TOKENIZE AND VECTORIZE------------#

In [3]:
# tokenizer lib can be donwloaded by installing requirements or by uncommenting below

# spacy.cli.download("en_core_web_sm")

In [7]:
from sentence_transformers import SentenceTransformer
import spacy

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v1')
nlp = spacy.load('en_core_web_sm')

In [6]:
# Vectorize downloaded pdf file tokenized by sentences
vectors = []

with pdfplumber.open(BytesIO(rq.content)) as pdf:
    for i, page in enumerate(pdf.pages):
        page_text = page.extract_text()
        sentences = nlp(page_text).sents

        for j, sentence in enumerate(sentences):
            id = f"page_{i+1}_sentence_{j+1}"
            sent_text = str(sentence)
            vector = model.encode(sent_text)
            

            vectors.append({"id": id, "values": [float(i) for i in vector], "metadata": {"text": sent_text}})


In [10]:
#----------PINECONE DATA UPLOAD------------#

In [4]:
# pinecone client

pc = Pinecone(api_key=os.environ["pinecone_api_key"])

In [17]:
# create index with correspoinding to sentence-transformers/all-MiniLM-L6-v1 dimension
# and dotproduct metric

pc.create_index(
    name="whitepaper-embeddings-dotproduct", 
    dimension=384, 
    metric="dotproduct", 
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': 'b98583175a0cdbceba25179e87c334ab', 'Date': 'Mon, 11 Nov 2024 23:19:29 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [5]:
index_name = "whitepaper-embeddings-dotproduct"
index = pc.Index(index_name)

index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 373}},
 'total_vector_count': 373}

In [19]:
#vectors = [{'id': vector["id"], 'values': vector["values"], 'metadata':{"text": vector["text"]}} for vector in vectors]


In [33]:
# upsert data to pinecone index
index_name = "whitepaper-embeddings-dotproduct"

index = pc.Index(index_name)
index.upsert(vectors=vectors)

{'upserted_count': 373}

In [None]:
#-----------QUERING-----------#

In [11]:
# now we can query 

sentence_to_search = "Organizations at this phase should look to develop the foundational skill set for core datawrangling and descriptive analytics"

vector_to_query = [float(i) for i in model.encode(sentence_to_search)]

In [14]:
results = index.query(
    index="whitepaper-embeddings-dotproduct", 
    vector=vector_to_query,
    top_k=3,
    #include_values=True,
    include_metadata=True,
)

In [16]:
results

{'matches': [{'id': 'page_17_sentence_1',
              'metadata': {'text': '15\n'
                                   'Organizations at this phase should look to '
                                   'develop the foundational skill set for '
                                   'core data\n'
                                   'wrangling and descriptive analytics.'},
              'score': 0.891715467,
              'values': []},
             {'id': 'page_8_sentence_3',
              'metadata': {'text': 'What data and ML skill sets\n'
                                   'are required in the organization?'},
              'score': 0.638456106,
              'values': []},
             {'id': 'page_18_sentence_5',
              'metadata': {'text': 'At this phase, teams have skills in data '
                                   'wrangling and descriptive and predictive '
                                   'analytics;\n'
                                   'they use existing frameworks, method