In [80]:
# Debug pinecone installation
!pip show pinecone

Name: pinecone
Version: 6.0.2
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: c:\Users\kadda\github_repos\rag-gen-ai\.venv\Lib\site-packages
Requires: certifi, pinecone-plugin-interface, python-dateutil, typing-extensions, urllib3
Required-by: 


In [81]:
import sys
print(sys.executable)
print(sys.path)

c:\Users\kadda\github_repos\rag-gen-ai\.venv\Scripts\python.exe
['C:\\Users\\kadda\\AppData\\Local\\Programs\\Python\\Python313\\python313.zip', 'C:\\Users\\kadda\\AppData\\Local\\Programs\\Python\\Python313\\DLLs', 'C:\\Users\\kadda\\AppData\\Local\\Programs\\Python\\Python313\\Lib', 'C:\\Users\\kadda\\AppData\\Local\\Programs\\Python\\Python313', 'c:\\Users\\kadda\\github_repos\\rag-gen-ai\\.venv', '', 'c:\\Users\\kadda\\github_repos\\rag-gen-ai\\.venv\\Lib\\site-packages', 'c:\\Users\\kadda\\github_repos\\rag-gen-ai\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\kadda\\github_repos\\rag-gen-ai\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\kadda\\github_repos\\rag-gen-ai\\.venv\\Lib\\site-packages\\Pythonwin']


In [82]:
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from bs4 import BeautifulSoup

import hashlib
import os
from dotenv import load_dotenv

from datetime import datetime

from tqdm import tqdm

In [83]:
load_dotenv()
# Retrieve the Pinecone API key from user data
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

print(f"API key loaded: {pinecone_api_key[:8]}..." if pinecone_api_key else "No API key found")

# Initialize the OpenAI client
openai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Define constants [vector size, embedding engine] small: 1536 medium: 4096 large: 12288
ENGINE = 'text-embedding-3-small'

# Initialize Pinecone client
pc = Pinecone(
    pinecone_api_key=pinecone_api_key
)
print(pc)

API key loaded: pcsk_pxt...
<pinecone.control.pinecone.Pinecone object at 0x000002673D9A16E0>


In [84]:
INDEX_NAME = 'semantic-search-rag-index' # name of the collection of documents
NAMESPACE = 'default' # split the documents into namespaces. default is 'default'

def get_embeddings(text, engine=ENGINE):
    """
    Get the embeddings for a given text using OpenAI's API.
    """
    response = openai_client.embeddings.create(
            input=text,
            model=engine
        )
    return [data.embedding for data in list(response.data)]

def get_embedding(text, engine=ENGINE):
    """
    Get the embedding for a given text using OpenAI's API.
    Use the get_embeddings function to get the embedding for a single text
    """
    return get_embeddings(text, engine)[0]

len(get_embedding("Hello world!")), len(get_embeddings(["Hello world!", "Servus Welt"]))


(1536, 2)

In [85]:
if INDEX_NAME not in pc.list_indexes().names():
    # Create the index if it doesn't exist
    pc.create_index(
        name=INDEX_NAME, # name of the index
        dimension=1536, # dimension of the embedding for OpenAI's text-embedding-3-small
        metric='cosine', # metric for similarity search
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        )
    )

# Store the index as variable
index = pc.Index(INDEX_NAME)
index

<pinecone.data.index.Index at 0x2673b7cfbd0>

In [86]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [87]:
def my_hash(s):
    # Return the MD5 hash of the input string as a hexadecimal string
    # depending the vectorDB, We need to know how to upload the data to Pinecone and read
    # Compressed version of the text
    return hashlib.md5(s.encode()).hexdigest()

my_hash('I love to hash it')

'ae76cc4dfd345ecaeea9b8ba0d5c3437'

In [88]:
def prepare_data_for_pinecone(texts, engine=ENGINE, urls=None):
    """
    Prepare the data for Pinecone by creating a list of tuples with the ID, embedding, and metadata.
    """

    now = datetime.now()

    # Generate vector embeddings for each string in the input list, using the specified engine
    embeddings = get_embeddings(texts, engine)

    # Create tuples of (hash, embedding, metadata) for each input string and its corresponding vector embedding
    # The my_hash() function is used to generate a unique hash for each string, and the datetime.utcnow() 
    # function is used to generate the current UTC date and time
    responses = [
        (
            my_hash(text),  # A unique ID for each string, generated using the my_hash() function
            embedding,  # The vector embedding of the string
            dict(text=text, date_uploaded=now.isoformat())  # A dictionary of metadata, including the original text and the current UTC date and time
        )
        for text, embedding in zip(texts, embeddings)  # Iterate over each input string and its corresponding vector embedding
    ]

    if urls and len(urls) == len(texts):
        for response, url in zip(responses, urls):
            response[-1]['url'] = url

    return responses
    

In [89]:
texts = ['hi']

In [90]:
_id, embedding, metadata = prepare_data_for_pinecone(texts)[0]

print('ID:  ',_id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

ID:   49f68a5c8493ec2c0bf489821c21fc3b 
LEN:  1536 
META: {'text': 'hi', 'date_uploaded': '2025-05-16T13:26:45.756376'}


In [91]:
urls = ['fake.url']
_id, embedding, metadata = prepare_data_for_pinecone(texts, urls=urls)[0]

print('ID:  ',_id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

ID:   49f68a5c8493ec2c0bf489821c21fc3b 
LEN:  1536 
META: {'text': 'hi', 'date_uploaded': '2025-05-16T13:26:46.077622', 'url': 'fake.url'}


In [92]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False, urls=None):
    # Call the prepare_data_for_pinecone function to prepare the input texts for indexing
    total_upserted = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        text_batch = texts[i: i + batch_size]
        if urls:
            url_batch = urls[i: i + batch_size]
            prepared_texts = prepare_data_for_pinecone(text_batch, urls=url_batch)
        else:
            prepared_texts = prepare_data_for_pinecone(text_batch)


        # Use the upsert() method of the index object to upload the prepared texts to Pinecone
        total_upserted += index.upsert(
            vectors=prepared_texts,
            namespace=namespace
        )['upserted_count']


    return total_upserted

In [93]:
# Call the upload_texts_to_pinecone() function with the input texts
upload_texts_to_pinecone(texts)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'default': {'vector_count': 1}},
 'total_vector_count': 1,
 'vector_type': 'dense'}

In [94]:
def query_from_pinecone(query, top_k=3, include_metadata=True):
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding(query, engine=ENGINE)

    return index.query(
      vector=query_embedding,
      top_k=top_k,
      namespace=NAMESPACE,
      include_metadata=include_metadata   # gets the metadata (dates, text, etc)
    ).get('matches')

In [95]:
query_from_pinecone('hello')


[{'id': '49f68a5c8493ec2c0bf489821c21fc3b',
  'metadata': {'date_uploaded': '2025-05-16T13:26:46.436875', 'text': 'hi'},
  'score': 0.808478475,
  'values': []}]

In [96]:
def delete_texts_from_pinecone(texts, namespace=NAMESPACE):
    # Compute the hash (id) for each text
    hashes = [hashlib.md5(text.encode()).hexdigest() for text in texts]

    # The ids parameter is used to specify the list of IDs (hashes) to delete
    return index.delete(ids=hashes, namespace=namespace)

# delete our text
delete_texts_from_pinecone(texts)

{}

In [97]:
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [167]:
# specific of the website strucutre munchen.de
# https://www.muenchen.de/en/events/
base_url = 'https://www.muenchen.de'
event_url = f'{base_url}/en/events/'
print(base_url)
print(event_url)

https://www.muenchen.de
https://www.muenchen.de/en/events/


In [170]:
import requests
import re

def find_links_with_pattern(url, pattern):
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Create a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Compile the regex pattern
    regex = re.compile(pattern)
    
    # Find all 'a' tags with href attribute matching the pattern
    links = soup.find_all('a', href=regex)
    
    # Extract and return the href values
    return [link.get('href') for link in links]

# Example usage
pattern = r'^/en/node/'

matching_links = find_links_with_pattern(event_url, pattern)
len(matching_links)

20

In [171]:
urls = []

from bs4 import BeautifulSoup
import requests

for matching_link in tqdm(matching_links):
    r = requests.get(base_url + matching_link)
    soup = BeautifulSoup(r.content, 'html.parser')
    for link in soup.find_all('a'):
        if 'href' in link.attrs:
            if link['href'].startswith('/') and 'sehenswuerdigkeiten' in link['href']:
                urls.append(base_url + link['href'])
    
urls = list(set([u.lower().strip() for u in urls]))
print(urls)
len(urls)

100%|██████████| 20/20 [00:03<00:00,  5.46it/s]

['https://www.muenchen.de/sehenswuerdigkeiten/museen/museum-urban-and-contemporary-art-muca', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/infopoint-museen-schloesser-bayern', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/kunsthalle-muenchen', 'https://www.muenchen.de/sehenswuerdigkeiten/tourismus', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/ns-dokumentationszentrum-muenchen', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/zentralinstitut-fuer-kunstgeschichte', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/bayerische-staatsbibliothek', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/museum-brandhorst', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/amuseum-of-contemporary-art', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/muenchner-stadtmuseum', 'https://www.muenchen.de/sehenswuerdigkeiten/museum-mineralogia-muenchen', 'https://www.muenchen.de/sehenswuerdigkeiten/museen/galerie-bezirk-oberbayern', 'https://www.muenchen.de/sehenswuerdigke




18

In [172]:
for url in urls:
    print(url)

https://www.muenchen.de/sehenswuerdigkeiten/museen/museum-urban-and-contemporary-art-muca
https://www.muenchen.de/sehenswuerdigkeiten/museen/infopoint-museen-schloesser-bayern
https://www.muenchen.de/sehenswuerdigkeiten/museen/kunsthalle-muenchen
https://www.muenchen.de/sehenswuerdigkeiten/tourismus
https://www.muenchen.de/sehenswuerdigkeiten/museen/ns-dokumentationszentrum-muenchen
https://www.muenchen.de/sehenswuerdigkeiten/museen/zentralinstitut-fuer-kunstgeschichte
https://www.muenchen.de/sehenswuerdigkeiten/museen/bayerische-staatsbibliothek
https://www.muenchen.de/sehenswuerdigkeiten/museen/museum-brandhorst
https://www.muenchen.de/sehenswuerdigkeiten/museen/amuseum-of-contemporary-art
https://www.muenchen.de/sehenswuerdigkeiten/museen/muenchner-stadtmuseum
https://www.muenchen.de/sehenswuerdigkeiten/museum-mineralogia-muenchen
https://www.muenchen.de/sehenswuerdigkeiten/museen/galerie-bezirk-oberbayern
https://www.muenchen.de/sehenswuerdigkeiten/museen/archaeologische-staatssamm

In [173]:
import re

def clean_string(text):  # courtesy of Claude 3.5 Sonnet
    # Remove extra whitespace, including newlines
    text = re.sub(r'\s+', ' ', text)
    # Remove any leading/trailing whitespace
    text = text.strip()
    # Remove common filler phrases
    filler_phrases = [
        "Skip to content", "Skip to main content", "Menu", "Sign in", "Print", "Views:", "Comments \(0\)",
        "Footer menu", "Give us Feedback.", "Did this answer your question?", "No Yes No", "Thanks for your feedback.",
        "City Hall", "Events", "Culture, Leisure", "Restaurants", "Shopping", "Sights", "Tourism", 
        "Hotels", "Business", "Living", "Transportation", "Citizen service", "English", "Branchenbuch", 
        "Anzeige", "Search", "muenchen.de", "Das offizielle Stadtportal", "Home"
    ]
    for phrase in filler_phrases:
        text = re.sub(phrase, '', text, flags=re.IGNORECASE)
    # Remove any remaining brackets and their contents
    text = re.sub(r'\[.*?\]', '', text)
    # Remove any remaining parentheses and their contents
    text = re.sub(r'\(.*?\)', '', text)
    # Remove any URLs
    text = re.sub(r'http\S+', '', text)
    # Remove any remaining special characters
    text = re.sub(r'[^\w\s\.\,\?\!]', '', text)
    # Remove extra spaces that may have been created
    text = re.sub(r'\s+', ' ', text).strip()
    return text

  "Skip to content", "Skip to main content", "Menu", "Sign in", "Print", "Views:", "Comments \(0\)",


In [174]:
import requests

event_docs = []
for url in tqdm(urls):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    body = clean_string(soup.find('body').get_text())
    event_docs.append(dict(text=body, url=url))

100%|██████████| 18/18 [00:21<00:00,  1.20s/it]


In [175]:
event_docs[0]

{'text': 'Suche Close Suchbegriff eingeben Suche Menü Close Navigation Sprache Rathaus Zum Rathaus Politik und Beteiligung Stadtverwaltung Presse und Medien Karriere bei der Stadt Finanzen und Vergabe München im Portrait Projekte und Initiativen Veranstaltungen Freizeit Sehenswertes Verkehr Jobs Leben Bürgerservice Zu den Services Wohnen und Meldewesen Verkehr und Mobilität Ausland und Migration Gesundheit und Soziales Familie und Kind Schule und Bildung Wirtschaft und Gewerbe Branchen und Berufe Engagement und Freizeit Aktuelle SpracheDeutsch Français Italiano Español Русский العربية 中文 Sehenswürdigkeiten Museen Museum of Urban and Contemporary Art Museum of Urban and Contemporary Art Entdeckt StreetArt im MUCA Museum of Urban and Contemporary Art Infos zu Ausstellungen und mehr Das Museum of Urban and Contemporary Art zeigt auf drei Etagen und in einem angrenzenden Bunker die Werke internationaler Künstlerinnen der Urban Art Szene. Im MUCA läuft derzeit die Ausstellung Strata mit Wer

In [176]:
BATCH_SIZE = 16
upload_texts_to_pinecone(
    texts=[g['text'] for g in event_docs],
    batch_size=BATCH_SIZE, show_progress_bar=True, 
    urls=[g['url'] for g in event_docs]
)

100%|██████████| 2/2 [00:04<00:00,  2.34s/it]


18

In [177]:
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'default': {'vector_count': 18}},
 'total_vector_count': 18,
 'vector_type': 'dense'}

In [183]:
query = 'Ich will die Blaue Reiter bild von Franz Marc besuchen'

results = query_from_pinecone(query, top_k=10)
for result in results:
    print(result['metadata']['url'], result['score'], result['metadata']['text'][-50:])

https://www.muenchen.de/sehenswuerdigkeiten/museen/lenbachhaus-muenchen 0.483780771 hauptstadt München und der Stadtwerke München GmbH
https://www.muenchen.de/sehenswuerdigkeiten/museen/museum-brandhorst 0.428482056 hauptstadt München und der Stadtwerke München GmbH
https://www.muenchen.de/sehenswuerdigkeiten/museen/museum-urban-and-contemporary-art-muca 0.399327487 hauptstadt München und der Stadtwerke München GmbH
https://www.muenchen.de/sehenswuerdigkeiten/haus-der-kunst 0.391690969 hauptstadt München und der Stadtwerke München GmbH
https://www.muenchen.de/sehenswuerdigkeiten/museen/amuseum-of-contemporary-art 0.385513335 hauptstadt München und der Stadtwerke München GmbH
https://www.muenchen.de/sehenswuerdigkeiten/museen/galerie-bezirk-oberbayern 0.383822173 hauptstadt München und der Stadtwerke München GmbH
https://www.muenchen.de/sehenswuerdigkeiten/museen/zentralinstitut-fuer-kunstgeschichte 0.377721131 hauptstadt München und der Stadtwerke München GmbH
https://www.muenchen.de/s