In [None]:
# !pip install -qU langchain
# !pip install -qU langchain-google-genai
# !pip install -q sentence-transformers
# !pip install -q pinecone
# !pip install -q pypdf
# !pip install -q opencv-python
# !pip install -q python-dotenv


hi


In [9]:
# testing gemini model through langchain integration
from langchain_google_genai import ChatGoogleGenerativeAI
import os

# the key is read with os.environ[...] so a missing 'GEMINI' variable fails right here with a KeyError
api_gemini = os.environ['GEMINI']

# smoke test: the system turn pins the answer, so the printed reply shows whether the call works end to end
test = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=api_gemini)
system_turn = ("system", "to every question you get, you just respond: 'I'm allergic to cats'")
user_turn = ("user", "What do you think about Donald Trump?")
messages = [system_turn, user_turn]

response = test.invoke(messages)
print(response.content)

I'm allergic to cats


In [3]:
# processing data
from pypdf import PdfReader
from langchain_google_genai import ChatGoogleGenerativeAI
import cv2
import os

api_gemini = os.environ['GEMINI']

# text from the PDF (only the first page is read)
extracted = PdfReader("/home/lucas/srh/CS1/learning_rag/data/virtual_kaffe.pdf")
text = extracted.pages[0].extract_text()
print("Extracted text: ", text)

## using agentic chunking to chunk the text in semantically coherent chunks
chunking_model = ChatGoogleGenerativeAI(google_api_key=api_gemini, model='gemini-2.5-flash')
# the backslash in '\\n' is escaped on purpose: inside this non-raw string a bare '\n' would turn
# into a real line break, and the instruction would no longer name the character it forbids
messages = [("system", """You're an AI Agent specialized in the pre-processing of data before being used to create embeddings.

                          Given the data you're handled, create chunks that semantically make sense, grouping pieces of the text
                          by ideas.

                          Output the data in this format: n_chunk_here * n+1_chunk_here * n+2_chunk_here * ...

                          IMPORTANT:
                          - Make sure to use asterisks as delimitators between chunks
                          - Make sure not to include text formatting characters like '\\n' or bullet points"""),
            ("user", text)]

response = chunking_model.invoke(messages)

# split on the delimiter, trim the padding around each chunk and drop empty pieces
# (e.g. from a leading/trailing asterisk) so no blank chunk is embedded later
raw_chunks = response.content.split(sep="*")
text_list = [chunk.strip() for chunk in raw_chunks if chunk.strip()]
print("After agentic chunking: ", text_list)

# frames from the video: save every 30th frame, from frame 0 up to and including frame 120
video = cv2.VideoCapture('/home/lucas/srh/CS1/learning_rag/data/virtual_kaffe_video.mp4')
frame_counter = 0

try:
  # stop once frame 120 has been handled instead of decoding the rest of the video
  while frame_counter <= 120:
    ret, frame = video.read()

    # ret is False when no frame could be read (end of video or unreadable file)
    if not ret:
      break

    if frame_counter % 30 == 0:
      filename = f"/home/lucas/srh/CS1/learning_rag/data/frame_{frame_counter}.jpg"
      cv2.imwrite(filename, frame)
      print(f"Frame {frame_counter} saved.")

    frame_counter += 1
finally:
  # always free the capture handle, even if reading or writing raised
  video.release()



Extracted text:  Virtual  Kaffe  –  Hours  &  Events  
Opening  Hours  
●  Monday:  07:00  –  20:00  
 ●  Tuesday:  07:00  –  20:00  
 ●  Wednesday:  07:00  –  22:00  
 ●  Thursday:  07:00  –  22:00  
 ●  Friday:  07:00  –  23:00  
 ●  Saturday:  08:00  –  23:00  
 ●  Sunday:  08:00  –  18:00  
Weekly  Events  
Monday  –  Slow  Brew  Sessions  
Learn  the  basics  of  pour-over,  Aeropress,  and  filter  brewing.  
 
Time:
 
17:00
 
–
 
18:00
 
Wednesday  –  Open  Laptop  Night  
Coworking  atmosphere  with  discounted  refills.  
 
Time:
 
18:00
 
–
 
22:00
 
Friday  –  Live  Acoustic  Evening  
Local  artists  play  guitar,  piano,  indie  sets.  
 
Time:
 
19:00
 
–
 
22:00
 
Saturday  –  Barista  for  a  Day  
Hands-on  espresso  workshop  (beginner-friendly).  
 
Time:
 
14:00
 
–
 
16:00
 
Sunday  –  Virtual  Reality  Coffee  Tour  
Guided  VR  experience  of  coffee  farms  around  the  world.  
 
Time:
 
10:00
 
–
 
12:00
 
After agentic chunking:  ['Virtual Kaffe – Hours & Eve

In [61]:
# setting up models for embeddings
from sentence_transformers import SentenceTransformer
from PIL import Image

# CLIP checkpoint: chosen because it understands both images and text
CLIP_MODEL_NAME = 'clip-ViT-B-32'
model = SentenceTransformer(CLIP_MODEL_NAME)

In [62]:
# making the embeddings with the model
img_emb = model.encode(Image.open('/home/lucas/srh/CS1/learning_rag/data/virtual_kaffe_menu.jpg'))
text_emb = model.encode(text_list)

# creating a list of the images for the frames of the video
# (the files are named after their frame number: frame_0.jpg, frame_30.jpg, ... frame_120.jpg)
video_frame_paths = [
    f'/home/lucas/srh/CS1/learning_rag/data/frame_{frame_number}.jpg'
    for frame_number in range(0, 121, 30)
]
video_list = [Image.open(frame_path) for frame_path in video_frame_paths]

# creating embeddings for the frames of the videos
videos_emb = model.encode(video_list)

print("Single Image Embedding:\n", img_emb.shape, "\nEmbeddings for text:\n", text_emb.shape,"\nEmbeddings for video:\n",videos_emb.shape)

Single Image Embedding:
 (512,) 
Embeddings for text:
 (8, 512) 
Embeddings for video:
 (5, 512)


In [None]:
# Sending our embeddings to the database with the correct metadata
from pinecone import Pinecone
from dotenv import find_dotenv, load_dotenv
import os

dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

pinecone_api = os.getenv('PINECONE')
pc = Pinecone(api_key=pinecone_api)

index = pc.Index(host='https://secondrag-4hezoud.svc.aped-4627-b74a.pinecone.io')

# NOTE: ndarray.tolist() returns a new list and leaves the array untouched, so the
# conversion is done on each vector at the point where it is put into the payload.

# upsert text vectors with raw text metadata
text_vectors = [
    {
        "id": str(n),
        "values": embedding.tolist(),
        "metadata": {"original_text": chunk},
    }
    for n, (embedding, chunk) in enumerate(zip(text_emb, text_list))
]
index.upsert(text_vectors, namespace="text")

# upsert single image vector with pathname as metadata
image_vectors = [{
    "id": "8",
    "values": img_emb.tolist(),
    "metadata": {"pathname_image": "/home/lucas/srh/CS1/learning_rag/data/virtual_kaffe_menu.jpg"},
}]
index.upsert(image_vectors, namespace="img")

# upsert video frames vectors with pathname as metadata
# ids continue after the menu image ("8"); the n-th frame file is named after frame number n * 30
frame_vectors = [
    {
        "id": str(n + 9),
        "values": embedding.tolist(),
        "metadata": {"pathname_image": f"/home/lucas/srh/CS1/learning_rag/data/frame_{n * 30}.jpg"},
    }
    for n, embedding in enumerate(videos_emb)
]

# left as the last expression so the cell still displays the upsert response
index.upsert(frame_vectors, namespace="img")

UpsertResponse(upserted_count=5, _response_info={'raw_headers': {'date': 'Sun, 07 Dec 2025 18:55:08 GMT', 'content-type': 'application/json', 'content-length': '19', 'connection': 'keep-alive', 'x-pinecone-request-lsn': '2', 'x-pinecone-request-logical-size': '10609', 'x-pinecone-request-latency-ms': '146', 'x-pinecone-request-id': '5693768271665595421', 'x-envoy-upstream-service-time': '128', 'grpc-status': '0', 'server': 'envoy'}})

In [74]:
# performing similarity search based on the user input
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.messages import HumanMessage, SystemMessage
from dotenv import find_dotenv, load_dotenv
import os
from helpers import load_image_base64

# load the .env file located by find_dotenv() so the 'GEMINI' key read with
# os.getenv in the functions below is available in the environment
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)



def similarity_search(query: str, top_k : int, namespace : str):
    """Embed ``query`` and return the ``top_k`` closest matches from ``namespace``.

    Uses the notebook-level ``model`` for the embedding and the notebook-level
    ``index`` for the lookup. Matches are requested with their metadata but
    without their vector values. The raw query response is returned unchanged.
    """
    # the query is embedded with the same model as the stored vectors, then sent as a plain list
    query_vector = model.encode(query).tolist()

    return index.query(
        vector=query_vector,
        namespace=namespace,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
    )

def get_relevant_metadata(result):
    """Collect the first metadata value of every match in a query result.

    Args:
        result: object with a ``matches`` sequence; each match must support
            ``.get('metadata')`` returning a mapping (or nothing).

    Returns:
        list with one entry per match that has metadata: the first value of
        that match's metadata mapping. Matches whose metadata is missing or
        empty are skipped instead of raising.
    """
    first_values = []
    for match in result.matches:
        metadata = match.get('metadata')
        if not metadata:
            # nothing to extract for this match (metadata absent, None or empty)
            continue
        # each stored vector carries a single metadata entry, so the first value is the payload
        first_values.append(next(iter(metadata.values())))

    return first_values


def gemini_call_normal(prompt: str):
    """Send ``prompt`` to Gemini with no retrieved context and return the reply content.

    The API key is read from the 'GEMINI' environment variable at call time.
    """
    chat_model = ChatGoogleGenerativeAI(model='gemini-2.5-flash', google_api_key=os.getenv('GEMINI'))
    conversation = [
        ("system", "You're a helpful assistant"),
        ("user", prompt),
    ]
    return chat_model.invoke(conversation).content

def gemini_call_rag(prompt:str):
    """Answer ``prompt`` with Gemini, adding retrieved text chunks and images as context.

    Up to 5 matches are requested from the "text" namespace and up to 3 from
    the "img" namespace. Every retrieved text chunk is attached as a text
    part and every retrieved image path is attached as a base64 JPEG data
    URL. The message is built from however many matches come back, so a
    search that returns fewer results no longer fails with an IndexError.

    Args:
        prompt: the user question; used both as the search query and as the
            first part of the message sent to the model.

    Returns:
        The ``content`` of the model response.
    """
    # get metadata from similarity search based on user prompt
    result_text = get_relevant_metadata(similarity_search(prompt, 5, "text"))
    result_pathnames_images = get_relevant_metadata(similarity_search(prompt, 3, "img"))

    # prepare to send extra content for RAG response
    system_msg = SystemMessage("You're a helpful assistant")

    # encoding data into base64 format because that's how the gemini-langchain interface works
    # NOTE(review): assumes load_image_base64 returns the base64 text of the file at the given path - confirm in helpers
    images_base64 = [load_image_base64(pathname) for pathname in result_pathnames_images]

    # message layout: the question first, then the retrieved text, then the retrieved images
    content = [{"type" : "text", "text" : prompt}]
    content.extend({"type" : "text", "text": chunk} for chunk in result_text)
    content.extend(
        {"type": "image_url", "image_url": {"url" : f"data:image/jpeg;base64,{image_data}"}}
        for image_data in images_base64
    )
    human_msg = HumanMessage(content=content)

    # instantiate the model
    gemini = ChatGoogleGenerativeAI(google_api_key = os.getenv('GEMINI'), model='gemini-2.5-flash')
    response = gemini.invoke([system_msg, human_msg])

    return response.content

# demo: ask a question whose answer depends on the retrieved menu image
rag_answer = gemini_call_rag("what does the menu of the virtual kaffe looks like?")
print(rag_answer)



The menu for the Virtual Kaffe is displayed on a stylish, chalkboard-style sign. It features a black background with elegant white and gold lettering and decorative flourishes, giving it a classic yet inviting look.

At the top, the name "Virtual Kaffe" is prominently displayed in gold.

The menu is divided into three clear sections, each with a small illustrative icon:

1.  **COFFEE** (illustrated with a steaming coffee cup)
    *   Espresso: $3.00
    *   Americano: $4.50
    *   Latte: $4.80
    *   Mocha: $5.80
    *   Cold Brew: $4.00

2.  **PASTRIES** (illustrated with a croissant and a swirl pastry)
    *   Croissant: $3.80
    *   Pain au Chocolat: $4.20
    *   Almond Croissant: $5.00
    *   Cinnamon Roll: $3.50
    *   Blueberry Muffin: (Price partially obscured but listed as an item)

3.  **SNACKS** (illustrated with a bowl of granola/nuts)
    *   Granola Bar: $2.90
    *   Mixed Nuts: $3.20
    *   Cookie: $2.50
    *   Fruit Cup: $4.00
    *   Yogurt Parfait: $4.90
