Data preparation: consolidate the raw event and place CSV exports into one row per event / place

In [None]:
from pathlib import Path

import modin.pandas as pd
import numpy as np
from bs4 import BeautifulSoup
    
def first_non_null(values):
    """Return the first non-null entry of ``values``, or '' when every entry is null."""
    present = values.dropna()
    return present.iloc[0] if present.size > 0 else ''


def strip_html(value):
    """Return the text content of an HTML fragment; missing values pass through unchanged."""
    return BeautifulSoup(value, "html.parser").get_text() if pd.notna(value) else value


def consolidate(source_csv, key, prefixes, target_csv, html_columns=(), sort=True):
    """Collapse a raw CSV export to one labelled row per ``key`` and save it.

    Parameters
    ----------
    source_csv : str
        Path of the raw CSV; every column is read as a string.
    key : str
        Column identifying an entity. Rows sharing a key are merged and the
        key ends up in a trailing ``url`` column.
    prefixes : dict[str, str]
        Maps each column to keep to the text prepended to its values.
        The dict order is the column order of the output.
    target_csv : str
        Path of the CSV to write; its parent directory is created if missing.
    html_columns : iterable of str, optional
        Columns whose (already prefixed) values are reduced to plain text
        with BeautifulSoup.
    sort : bool, optional
        Forwarded to ``groupby``: ``True`` orders the rows by key, ``False``
        keeps the order in which keys first appear.

    Returns
    -------
    DataFrame
        The consolidated frame that was written to ``target_csv``.
    """
    frame = pd.read_csv(source_csv, dtype=str)

    # Keep, for every column, the first value that is not null within each key group.
    frame = frame.groupby(key, as_index=False, sort=sort).agg(
        {column: first_non_null for column in prefixes}
    )

    # Rename the key column to 'url' by re-adding it last, so it is the final column.
    frame['url'] = frame[key]
    frame.pop(key)

    for column, prefix in prefixes.items():
        # NOTE: first_non_null has already replaced all-null groups with '', so
        # the notna guard never triggers here and an empty field is stored as
        # the bare prefix. Kept as is: the indexing stage re-reads these CSVs
        # and therefore never sees a missing value.
        frame[column] = frame[column].apply(
            lambda value, prefix=prefix: prefix + str(value) if pd.notna(value) else value
        )
        if column in html_columns:
            frame[column] = frame[column].apply(strip_html)

    print(frame.head())

    # Save the result to a new CSV file (to_csv does not create directories itself).
    Path(target_csv).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(target_csv, index=False)
    return frame


dfe = consolidate(
    './oldData/3cixty_cotedazur_events.csv',
    key='event',
    prefixes={
        'label': 'Event name: ',
        'description': ' Event description: ',
        'category': 'Event category: ',
        'subject': 'Event subject: ',
        'placeLabel': 'Event place: ',
        'placeLocality': 'Event place location: ',
        'timeBegin': 'Event time begin: ',
        'timeEnd': 'Event time end: ',
    },
    target_csv='./data/events.csv',
    html_columns=('description',),
)

dfp = consolidate(
    './oldData/3cixty_cotedazur_places.csv',
    key='place',
    prefixes={
        'label': 'Place name: ',
        'description': 'Place description: ',
        'businessTypeLabel': 'Place business type: ',
        'locality': 'Place location: ',
    },
    target_csv='./data/places.csv',
    sort=False,
)


In [1]:
import glob
from llama_index import VectorStoreIndex, ServiceContext, Document, StorageContext, set_global_service_context
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding
import chromadb
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
import modin.pandas as pd
from llama_index.node_parser import SentenceSplitter
from llama_index.indices.prompt_helper import PromptHelper
from langchain_openai import ChatOpenAI
import re
from llama_index.chat_engine import CondensePlusContextChatEngine
from llama_index.indices.vector_store.retrievers import VectorIndexRetriever

In [3]:
# Persona handed to the LLM as its system prompt (also reused by the chat engine).
system_prompt = (
    "You are a knowledgeable Tourism Assistant designed to provide visitors with "
    "information, recommendations, and tips for exploring and enjoying their destination. "
    "The assistant is familiar with a wide range of topics including historical sites, "
    "cultural events, local cuisine, accommodations, transportation options, and hidden gems. "
    "It offers up-to-date and personalized information to help tourists make the most of their trip."
)

# Chat model that writes the answers. Alternatives that were tried before:
#   GPT4All(model="./mistral-7b-openorca.Q4_0.gguf", device='nvidia', n_threads=12, use_mlock=True, n_predict= 2000, temp=0.9)
#   ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.9)
#   GoogleGenerativeAI(model="gemini-pro")
llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.6)

# Sentence-embedding model, wrapped for llama_index and pinned to the first CUDA device.
embedding_model = LangchainEmbedding(
    HuggingFaceEmbeddings(
        model_name='dangvantuan/sentence-camembert-large',
        model_kwargs={'device': 'cuda:0'},
    )
)

# Bundle LLM, embeddings, splitter and prompt into one context and make it the global default.
service_context = ServiceContext.from_defaults(
    llm=llm,
    prompt_helper=PromptHelper(),
    embed_model=embedding_model,
    node_parser=SentenceSplitter(),
    system_prompt=system_prompt,
)
set_global_service_context(service_context)

# Open (or create) the on-disk Chroma collection that holds the vectors.
# Not used: embedding_functions.HuggingFaceEmbeddingFunction(model_name="dangvantuan/sentence-camembert-large")
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection("tourism_db")

# Expose the collection to llama_index as the vector store of the storage context.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

No sentence-transformers model found with name C:\Users\chris/.cache\torch\sentence_transformers\dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


If we need to compute the index:

In [3]:
import ray

# Make sure a Ray runtime is up exactly once for this cell. ignore_reinit_error
# turns the call into a no-op when Ray is already running (for example because
# modin started it earlier in this kernel) instead of raising.
ray.init(ignore_reinit_error=True)

documents = []

# Walk the consolidated CSV files in a stable order so document order is reproducible.
for file in sorted(glob.glob('./data/*.csv')):
    # Read the CSV file into a DataFrame, keeping every value as a string.
    df = pd.read_csv(file, dtype=str)

    # Convert the modin frame to a plain pandas frame before iterating row by row.
    df = df._to_pandas()

    # Every column except the long free-text 'description' is attached as metadata.
    metadata_columns = [col for col in df.columns if col != 'description']

    # One Document per row. The id combines the file path with the row index,
    # because the row index alone restarts at 0 for every file and would give
    # rows from different files the same id.
    documents.extend(
        Document(
            doc_id=f"{file}:{i}",
            text=row.to_string(),
            extra_info={col: row[col] for col in metadata_columns},
        )
        for i, row in df.iterrows()
    )

2024-01-15 15:57:15,610	INFO worker.py:1724 -- Started a local Ray instance.


In [None]:
batch_size = 5461  # Maximum batch size

# Feed the documents to the vector store one slice at a time. Every call builds
# a VectorStoreIndex on the same storage_context; `index` is left pointing at
# the one created for the last slice.
total_documents = len(documents)
for start in range(0, total_documents, batch_size):
    chunk = documents[start:start + batch_size]
    index = VectorStoreIndex.from_documents(
        chunk,
        service_context=service_context,
        storage_context=storage_context,
        show_progress=True,
    )

# Alternatives left disabled by the author:
#   index = VectorStoreIndex.from_vector_store(ChromaVectorStore(chroma_collection=chroma_collection), storage_context=storage_context, show_progress=True).refresh_ref_docs(batch)
#   storage_context.persist(persist_dir=f"./chroma_db")

If the index has already been computed:

In [4]:
# Re-open the index on top of the existing Chroma collection instead of re-embedding the documents.
persisted_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    persisted_store,
    storage_context=storage_context,
    service_context=service_context,
)

In [7]:
# Optional prompt overrides that were tried and left disabled:
#   context_prompt = "Base the reply to the user question mainly on the Description field of the context "
#   condense_prompt = " "


def clean_response(text):
    """Return ``text`` without echoed user lines and assistant role prefixes.

    Lines starting with "user: " are blanked, then every occurrence of
    "AI: ", "AI Assistant: " or "assistant: " is removed.
    """
    without_user_lines = re.sub(r"^user: .*$", "", text, flags=re.MULTILINE)
    return re.sub(r"(AI: |AI Assistant: |assistant: )", "", without_user_lines)


# Chat engine that retrieves the 5 most similar nodes for each (condensed) question.
chatEngine = CondensePlusContextChatEngine.from_defaults(
    retriever=VectorIndexRetriever(index, similarity_top_k=5),
    query_engine=index.as_query_engine(service_context=service_context, retriever=VectorIndexRetriever(index, similarity_top_k=5)),
    service_context=service_context,
    system_prompt=system_prompt,
)

# Simple console loop: "exit" ends the session, "reset" clears the conversation.
while True:
    try:
        question = input("Please enter your question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the session cleanly
        # instead of aborting the cell with a traceback.
        break

    if not question:
        # Nothing was typed: ask again rather than sending an empty message to the LLM.
        continue

    command = question.lower()
    if command == "exit":
        break
    if command == "reset":
        chatEngine.reset()
        print("The conversation has been reset.")
        continue

    response = chatEngine.chat(question)
    print(clean_response(str(response)))

print("Goodbye!")

The "Séjour : le Brame du Cerf à la Réserve des Monts d'Azur" is an exclusive VIP experience that offers a unique insight into the world of the European red deer, known as the Cerf d'Europe. This special excursion allows visitors to observe and photograph the magnificent red deer in their natural habitat. The experience includes a guided safari in a horse-drawn carriage to witness the red deer during their mating season, known as the "brame." Additionally, there is an opportunity for a guided walking safari to approach the red deer and observe the fauna of the reserve. The package also includes accommodation in a bioclimatic room, an aperitif, and dinner. The experience is available from September 15th to October 30th on Wednesdays, Thursdays, Fridays, and Saturdays, with limited availability for 6 to 8 participants at certain times. It's a truly immersive and intimate encounter with the majestic red deer in the stunning setting of the Réserve des Monts d'Azur in Andon.
The "Séjour : l