In [7]:
!pip install --quiet --upgrade google_cloud_firestore google_cloud_aiplatform langchain langchain-google-vertexai langchain_community langchain_experimental pymupdf

In [8]:
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.generative_models import GenerativeModel

import pickle
from IPython.display import display, Markdown

from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from google.cloud import firestore
from google.cloud.firestore_v1.vector import Vector
from google.cloud.firestore_v1.base_vector_query import DistanceMeasure



In [10]:
# Vertex AI configuration: region and project used to initialise the SDK.
LOCATION = "us-central1"  # @param {type:"string"}
PROJECT_ID = "qwiklabs-gcp-00-3a6c7247b9bd"
print(PROJECT_ID)
vertexai.init(location=LOCATION, project=PROJECT_ID)

qwiklabs-gcp-00-3a6c7247b9bd


In [11]:
# Populate a variable named embedding_model with an instance of the
# langchain_google_vertexai class VertexAIEmbeddings.

from langchain_google_vertexai import VertexAIEmbeddings
embedding_model = VertexAIEmbeddings(model_name="text-embedding-004")

# Download the New York City Department of Health and Mental Hygiene's Food
# Protection Training Manual. This document will serve as our RAG source content.
!gcloud storage cp gs://<bucket>/nyc_food_safety_manual.pdf .

# Use the LangChain class PyMuPDFLoader to load the contents of the PDF
from langchain_community.document_loaders import PyMuPDFLoader
loader = PyMuPDFLoader("./nyc_food_safety_manual.pdf")
data = loader.load()

# Create a function to do some basic cleaning on artifacts found in this particular document.
def clean_page(page):
  """Return the text of ``page`` with this manual's extraction artifacts removed.

  Args:
    page: Object exposing a ``page_content`` string (the notebook passes the
      items produced by ``loader.load()``).

  Returns:
    The page text after applying, in order: joining words hyphenated across a
    line break, turning remaining newlines into spaces, dropping the ``\\x02``
    and ``\\x03`` control characters, and deleting the two letter-spaced
    running headers.
  """
  # Order matters: "-\n" has to be handled before the generic "\n" replacement,
  # and the headers are matched after newlines have become spaces.
  substitutions = (
    ("-\n", ""),
    ("\n", " "),
    ("\x02", ""),
    ("\x03", ""),
    ("fo d P R O T E C T I O N  T R A I N I N G  M A N U A L", ""),
    ("N E W  Y O R K  C I T Y  D E P A R T M E N T  O F  H E A L T H  &  M E N T A L  H Y G I E N E", ""),
  )
  text = page.page_content
  for needle, replacement in substitutions:
    text = text.replace(needle, replacement)
  return text


/bin/bash: line 1: bucket: No such file or directory


In [12]:
# cleaned_pages: one cleaned text string per item of `data`, in the same order,
# produced by running clean_page over every loaded page.
cleaned_pages = [clean_page(page) for page in data]

In [13]:
# Use LangChain's SemanticChunker with the embedding_model created earlier to split the first five pages of cleaned_pages into text chunks.
text_splitter = SemanticChunker(embedding_model)
# The slice end is exclusive, so [0:5] selects indices 0-4, i.e. five pages.
docs = text_splitter.create_documents(cleaned_pages[0:5])
# Keep only the raw chunk text of each resulting document.
chunked_content = [doc.page_content for doc in docs]


In [14]:
# Use the embedding_model to generate embeddings of the text chunks, saving them to a list called chunked_embeddings.
# The whole chunked_content list is passed to embed_documents in a single call.
# NOTE(review): presumably the result holds one embedding per chunk, in the same
# order (later cells zip chunked_content with chunked_embeddings) — confirm.
chunked_embeddings = embedding_model.embed_documents(chunked_content)



In [15]:
!gsutil cp gs://partner-genai-bucket/genai069/chunked_content.pkl .
!gsutil cp gs://partner-genai-bucket/genai069/chunked_embeddings.pkl .

chunked_content = pickle.load(open("chunked_content.pkl", "rb"))
chunked_embeddings = pickle.load(open("chunked_embeddings.pkl", "rb"))

Copying gs://partner-genai-bucket/genai069/chunked_content.pkl...
/ [1 files][280.7 KiB/280.7 KiB]                                                
Operation completed over 1 objects/280.7 KiB.                                    
Copying gs://partner-genai-bucket/genai069/chunked_embeddings.pkl...
/ [1 files][  1.8 MiB/  1.8 MiB]                                                
Operation completed over 1 objects/1.8 MiB.                                      


In [16]:
# The code above chunks and creates embeddings for only a short section of the
# document, for demo purposes. To get the chunks and corresponding embeddings
# for the full document, run the following code to download the pre-created
# chunks and embeddings.
!gsutil cp gs://<bucket>/chunked_content.pkl .
!gsutil cp gs://<bucket>/chunked_embeddings.pkl .
chunked_content = pickle.load(open("chunked_content.pkl", "rb"))
chunked_embeddings = pickle.load(open("chunked_embeddings.pkl", "rb"))



/bin/bash: line 1: bucket: No such file or directory
/bin/bash: line 1: bucket: No such file or directory


In [23]:
# Create a Firestore database using console with the default name of (default)
# in Native Mode and leave the other settings to default.

# Initialize Firestore client, pinned explicitly to the project configured in
# PROJECT_ID instead of relying on whatever project the environment infers.
db = firestore.Client(project=PROJECT_ID)

# Create reference to "food-safety" collection
collection = db.collection('food-safety')

In [28]:
# Firestore client bound to the lab project. PROJECT_ID is the constant set in
# the Vertex AI initialisation cell, so the project is spelled out in one place.
db = firestore.Client(project=PROJECT_ID)

In [29]:
# Re-bind `collection` to the 'food-safety' collection of the current `db` client.
collection = db.collection('food-safety')

In [32]:
!pip install --upgrade google-cloud-firestore



In [35]:
import uuid  # NOTE(review): unused now that IDs are deterministic; kept in case other cells rely on the name

db = firestore.Client(project=PROJECT_ID)
collection = db.collection('food-safety')

# Upload one document per (chunk, embedding) pair.
# - The embedding is wrapped in Vector so Firestore stores it as a vector value,
#   which is what the find_nearest query on the "embedding" field operates on;
#   a plain Python list would be stored as an ordinary array.
# - IDs are positional (doc_0, doc_1, ...) so re-running the cell overwrites the
#   same documents instead of adding a fresh copy of every chunk each time.
for i, (content, embedding) in enumerate(zip(chunked_content, chunked_embeddings)):
    doc_ref = collection.document(f"doc_{i}")
    doc_ref.set({
        "content": content,
        "embedding": Vector(embedding),
    })

In [37]:
# Handle to the food-safety collection that receives one document per text chunk.
collection = db.collection('food-safety')

# Pair each chunk with its embedding (zip stops at the shorter list) and store
# the pair under a positional ID: doc_0, doc_1, ...
for position, pair in enumerate(zip(chunked_content, chunked_embeddings)):
    chunk_text, chunk_vector = pair
    payload = {"content": chunk_text, "embedding": Vector(chunk_vector)}
    collection.document(f"doc_{position}").set(payload)


In [38]:
# Create a vector index for your collection using your embedding field using gcloud firestore indexes command
!gcloud firestore indexes composite create \
--collection-group=food-safety \
--query-scope=COLLECTION \
--field-config field-path=embedding,vector-config='{"dimension":"768", "flat": "{}"}' \
--project=PROJECT_ID



[1;31mERROR:[0m (gcloud) The project property must be set to a valid project ID, not the project name [PROJECT_ID]
To set your project, run:

  $ gcloud config set project PROJECT_ID

or to unset it, run:

  $ gcloud config unset project


In [49]:

# Create a function to receive a query, get its embedding, and compile a context
# consisting of the text from the 5 documents with the most similar embeddings.
def search_vector_database(query: str, limit: int = 5):
  """Return the text of the stored chunks whose embeddings are nearest to ``query``.

  The query is embedded with the notebook-level ``embedding_model`` and matched
  against the ``embedding`` field of the notebook-level ``collection`` using
  Euclidean distance.

  Args:
    query: Natural-language question to search for.
    limit: Maximum number of documents to request (defaults to 5).

  Returns:
    A list containing the ``content`` string of each document yielded by the
    nearest-neighbour query, in the order the query streams them.
  """
  query_embedding = embedding_model.embed_query(query)
  vector_query = collection.find_nearest(
    vector_field="embedding",
    query_vector=Vector(query_embedding),
    distance_measure=DistanceMeasure.EUCLIDEAN,
    limit=limit,
  )
  # Each streamed result is converted to a dict and only its text is kept.
  return [result.to_dict()['content'] for result in vector_query.stream()]




In [50]:
# Request a composite index on the food-safety collection group (COLLECTION scope)
# that combines the content field in ascending order with a 768-dimension flat
# vector config on the embedding field.
# NOTE(review): the recorded output for this cell is ALREADY_EXISTS — confirm
# whether this cell is still needed.
!gcloud firestore indexes composite create \
  --collection-group=food-safety \
  --query-scope=COLLECTION \
  --field-config field-path=content,order=ASCENDING \
  --field-config field-path=embedding,vector-config='{"dimension":"768", "flat": "{}"}' \
  --project=qwiklabs-gcp-00-3a6c7247b9bd

[1;31mERROR:[0m (gcloud.firestore.indexes.composite.create) ALREADY_EXISTS: index already exists


In [51]:
 # Create a function to receive a query, get its embedding, and compile a context
# consisting of the text from the 5 documents with the most similar embeddings.
def search_vector_database(query: str, limit: int = 5):
  """Return the text of the stored chunks whose embeddings are nearest to ``query``.

  The query is embedded with the notebook-level ``embedding_model`` and matched
  against the ``embedding`` field of the notebook-level ``collection`` using
  Euclidean distance.

  Args:
    query: Natural-language question to search for.
    limit: Maximum number of documents to request (defaults to 5).

  Returns:
    A list containing the ``content`` string of each document yielded by the
    nearest-neighbour query, in the order the query streams them.
  """
  query_embedding = embedding_model.embed_query(query)
  vector_query = collection.find_nearest(
    vector_field="embedding",
    query_vector=Vector(query_embedding),
    distance_measure=DistanceMeasure.EUCLIDEAN,
    limit=limit,
  )
  # Each streamed result is converted to a dict and only its text is kept.
  return [result.to_dict()['content'] for result in vector_query.stream()]


In [53]:
# Request a single-field vector index (768 dimensions, flat) on the embedding
# field of the food-safety collection group, with the project given explicitly.
# The recorded output for this cell reports that the index was created.
!gcloud firestore indexes composite create \
  --project=qwiklabs-gcp-00-3a6c7247b9bd \
  --collection-group=food-safety \
  --query-scope=COLLECTION \
  --field-config=vector-config='{"dimension":"768","flat": "{}"}',field-path=embedding

Create request issued
Created index [CICAgJiUpoMJ].


In [54]:
# Call the function with a sample query to confirm its functionality.
# The bare call is the last expression of the cell, so its return value (a list
# of chunk texts) is displayed as the cell output.
search_vector_database("How should I store food?")


[' Store foods away from dripping condensate , at least six inches above the floor and with enough space between items to encourage air circulation. Freezer Storage Freezing is an excellent method for prolonging the shelf life of foods. By keeping foods frozen solid, the bacterial growth is minimal at best. However, if frozen foods are thawed and then refrozen, then harmful bacteria can reproduce to dangerous levels when thawed for the second time. In addition to that, the quality of the food is also affected. Never refreeze thawed foods, instead use them immediately. Keep the following rules in mind for freezer storage:  Use First In First Out method of stock rotation. All frozen foods should be frozen solid with temperature at 0°F or lower. Always use clean containers that are clearly labeled and marked, and have proper and secure lids. Allow adequate spacing between food containers to allow for proper air circulation. Never use the freezer for cooling hot foods. * * Tip: When receiv