In [None]:
import pandas as pd
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,)
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

try:
    from google.colab.userdata import get as getenv
    print("Running in colab")
except ImportError:
    from os import getenv
    import dotenv
    dotenv.load_dotenv()



In [None]:
# Read the OpenAI key through `getenv` (bound by the import cell to either
# Colab's `userdata.get` or `os.getenv`) and stop early when it is missing.
OPENAI_API_KEY = getenv('OPENAI_API_KEY')
assert OPENAI_API_KEY, "An API key for OpenAI is required to be set as <OPENAI_API_KEY>."

## Set up LangSmith

In [5]:
import os

# Credentials for LangSmith tracing, read through the same `getenv` used for
# the OpenAI key (Colab `userdata.get` or `os.getenv`).
LANGCHAIN_API_KEY = getenv('LANGCHAIN_API_KEY')
LANGCHAIN_ENDPOINT = getenv('LANGCHAIN_ENDPOINT')
assert LANGCHAIN_API_KEY, "An API key for LangSmith is required to be set as <LANGCHAIN_API_KEY>."

# LangChain picks up its tracing configuration from the process environment.
# Values fetched through Colab's `userdata` are not in `os.environ`, so
# export them here. `setdefault` keeps anything that is already configured
# (for example by the .env file loaded in the import cell).
os.environ.setdefault("LANGCHAIN_API_KEY", LANGCHAIN_API_KEY)
if LANGCHAIN_ENDPOINT:
    os.environ.setdefault("LANGCHAIN_ENDPOINT", LANGCHAIN_ENDPOINT)
os.environ.setdefault("LANGCHAIN_TRACING_V2", "true")

## Constants

In [None]:
# Dataset files
DATASET_PATH = "Amap-results_NJU-Gulou-3000m.csv"

# Model names
LLM_MODEL_NAME = "gpt-4"
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

# Embeddings
EMBEDDINGS_CACHE_STORE = "./cache/"

# Faiss
FAISS_REVIEWS_PATH_COSINE = "faiss_index_cosine"
FAISS_INDEX_NAME = "index"
# Must equal a value of LangChain's `DistanceStrategy` string enum; the cosine
# member's value is "COSINE" ("COSINE_DISTANCE" matches no member).
FAISS_DISTANCE_STRATEGY_COSINE = "COSINE"

## Load Dataset

Here we load a single CSV file (`DATASET_PATH`) containing information about places (restaurants, bars, ...) such as name, address, type, tags, cost and rating.

In [None]:
def get_documents(content_func=lambda row: row['name'] + '\n' + row['tag'],
                  source_func=lambda row: row['address'],
                  metadata_fields=None):
  """Load the dataset at `DATASET_PATH` and convert its rows to documents.

  Args:
    content_func: Called with each DataFrame row; its result is stored in the
      'page_content' column, which becomes the document text. Note that the
      default concatenates 'name' and 'tag' as strings, so it raises if
      either value is not a string (e.g. a missing tag).
    source_func: Called with each DataFrame row; its result is stored in the
      'source' column.
    metadata_fields: Optional iterable of additional dataset column names to
      keep alongside 'page_content' and 'source'. `None` (the default) keeps
      no extra columns.

  Returns:
    The result of `DataFrameLoader.load()` over the selected columns, with
    'page_content' used as the page content column.
  """
  # Load the dataset and drop exact duplicate rows
  dataset_df = pd.read_csv(DATASET_PATH).drop_duplicates()

  # Add page_content and source columns using their corresponding functions
  dataset_df['page_content'] = dataset_df.apply(content_func, axis=1)
  dataset_df['source'] = dataset_df.apply(source_func, axis=1)

  # De-duplicate while preserving order, so that the column order (and hence
  # the metadata key order) is the same on every run; a plain `set` would
  # make it depend on string hashing.
  columns = list(dict.fromkeys([*(metadata_fields or ()), 'page_content', 'source']))

  loader = DataFrameLoader(dataset_df[columns], page_content_column='page_content')
  return loader.load()

In [None]:
def content_func(row) -> str:
  """Render one dataset row as newline-separated `key=value` lines.

  Only the fields listed below are considered, in this order, and a field is
  left out when its value is missing according to `pd.notna`.
  """
  lines = []
  for field in ("name", "address", "type", "tag",
                "cost", "rating", "opentime_today", "tel"):
    value = row[field]
    if pd.notna(value):
      lines.append(f"{field}={value}")
  return '\n'.join(lines)

# Extra dataset columns to keep next to each document's text.
metadata_fields = ["location", "opentime_week"]

# Build the documents with the custom `content_func` formatter.
documents = get_documents(content_func, metadata_fields=metadata_fields)

In [9]:
## Take a look at a sample document
print(documents[0].page_content)
print(documents[0].metadata)



## Load Embeddings model

In [None]:
# OpenAI embedding client; it is passed to the FAISS helpers below.
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME, openai_api_key=OPENAI_API_KEY)

In [11]:
result = embedding_model.embed_query("One sample query!")

In [12]:
import numpy as np
array = np.array(result)
print(f"embedding shape: {array.shape}\nembedding norm: {np.linalg.norm(array, ord=2)}")



## Create FAISS (Vector Database)

In [13]:
def get_vector_database(documents, embedding_model, distance_strategy):
  """Build a FAISS vector store from `documents`.

  Thin wrapper around `FAISS.from_documents` that forwards
  `distance_strategy` as a keyword argument and returns the new store.
  """
  return FAISS.from_documents(documents, embedding_model, distance_strategy=distance_strategy)

In [10]:
import time

BATCH_SIZE = 100          # documents embedded per batch
BATCH_PAUSE_SECONDS = 10  # pause between batches to avoid hitting rate limits

doclen = len(documents)
vector_db = None
# Step through the documents in fixed-size slices. Using the step form of
# `range` never yields an empty trailing batch (the previous
# `doclen // 100 + 1` did whenever doclen was a multiple of 100).
for start in range(0, doclen, BATCH_SIZE):
    if vector_db is not None:
        # Only wait between batches, not after the last one.
        time.sleep(BATCH_PAUSE_SECONDS)
    batch_db = get_vector_database(documents[start:start + BATCH_SIZE],
                                   embedding_model,
                                   FAISS_DISTANCE_STRATEGY_COSINE)
    if vector_db is None:
        vector_db = batch_db
    else:
        vector_db.merge_from(batch_db)

assert vector_db is not None, "No documents were loaded, so no index could be built."

In [11]:
# Write the merged index to the FAISS_REVIEWS_PATH_COSINE folder on disk.
vector_db.save_local(folder_path=FAISS_REVIEWS_PATH_COSINE, index_name=FAISS_INDEX_NAME)

In [6]:
# Reload the index written by the `save_local` cell: same folder, same index
# name, and the same embedding model that produced the stored vectors.
# (FAISS_REVIEWS_PATH_EUCLIDEAN is not defined at this point of the notebook
# and names a different index.)
vector_db = FAISS.load_local(folder_path=FAISS_REVIEWS_PATH_COSINE,
                             embeddings=embedding_model,
                             index_name=FAISS_INDEX_NAME,
                             # Loading unpickles the stored docstore; only do
                             # this for index files you created yourself.
                             allow_dangerous_deserialization=True)

In [11]:
docs = vector_db.similarity_search("Give me information about some of the best pizza restaurant in the city?", k = 5)
for doc in docs:
    print(doc, end="\n\n")



In [4]:
question = "where is the Enoteca Barcollo located? and what is its phone number?"

docs = vector_db.similarity_search(question, k = 5)

# Iterate over what was actually returned: the search may yield fewer than
# `k` documents, in which case indexing 0..4 would raise IndexError.
for doc in docs:
  print(doc, end="\n\n")



## Load Vector Database

In [15]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from os import getenv
import dotenv
dotenv.load_dotenv()


# Faiss
FAISS_REVIEWS_PATH_EUCLIDEAN = "faiss_index_euclidean"
FAISS_INDEX_NAME = "index"
FAISS_DISTANCE_STRATEGY = 'EUCLIDEAN_DISTANCE'
EMBEDDING_MODEL_NAME = "models/text-embedding-004"
EMBEDDINGS_CACHE_STORE = "./cache/"

# Fail early with a clear message when the key is missing, as done for the
# other API keys in this notebook, and pass it to the client explicitly
# instead of leaving the variable unused.
GOOGLE_API_KEY = getenv('GOOGLE_API_KEY')
assert GOOGLE_API_KEY, "An API key for Google Generative AI is required to be set as <GOOGLE_API_KEY>."

# Wrap the Google embedding client in a file-backed cache stored under
# EMBEDDINGS_CACHE_STORE.
google_embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME,
                                                 google_api_key=GOOGLE_API_KEY)
store = LocalFileStore(EMBEDDINGS_CACHE_STORE)
embedding_model = CacheBackedEmbeddings.from_bytes_store(google_embeddings, store)

# Load the previously saved index from disk.
# NOTE(review): the index must have been built with the same embedding model
# as `embedding_model` above — confirm how "faiss_index_euclidean" was created.
vector_db = FAISS.load_local(folder_path=FAISS_REVIEWS_PATH_EUCLIDEAN,
                             embeddings=embedding_model,
                             index_name=FAISS_INDEX_NAME,
                             # Loading unpickles the stored docstore; only do
                             # this for index files you created yourself.
                             allow_dangerous_deserialization=True)

## Load LLM

In [None]:
# Chat model used as the answering step of the RAG chain and for direct calls.
llm = ChatOpenAI(model=LLM_MODEL_NAME, openai_api_key=OPENAI_API_KEY)

In [12]:
llm.invoke("Hi")



## Create LangChain pipeline

In [14]:
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,)
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough

In [17]:
# System prompt template; `{context}` is filled with the retriever's output.
# NOTE(review): the text mentions Google Map reviews, while the dataset loaded
# in this notebook is named "Amap-results_..." — confirm the wording is intended.
review_template_str = """
Your job is to use Google Map restaurants and bars reviews to help people find best places to go for a meal or a drink.
Use the following information and reviews to answer the questions. if the context is not about restaurants,
then kindly tell the user that you can only provide assistance and answer questions related to restaurants.
If you don't know an answer based on the context, say you don't know. Answer context:
{context}
"""

# System message: the instructions plus the retrieved context.
system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context"], template=review_template_str
    )
)

# Human message: the user's question, passed through verbatim.
human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(input_variables=["question"], template="{question}")
)
messages = [system_prompt, human_prompt]

# Chat prompt combining both messages; it expects `context` and `question`.
review_prompt_template = ChatPromptTemplate(
    input_variables=["context", "question"], messages=messages
)

# Retriever over the vector store, configured with k=20.
reviews_retriever = vector_db.as_retriever(search_kwargs={'k': 20,})

# Chain: the input string is sent to the retriever (as `context`) and passed
# through unchanged (as `question`), then prompt -> LLM -> string output.
# NOTE(review): the retriever's raw output is placed into `{context}` without
# an explicit formatting step — confirm the rendered prompt looks as intended.
review_chain = (
    {"context": reviews_retriever, "question": RunnablePassthrough()}
    | review_prompt_template
    | llm
    | StrOutputParser()
)

## Sample usage

In [24]:
question = """Where can I find delicious pizzas?"""
print(review_chain.invoke(question))



In [19]:
question = """Where can I find delicious pizzas?"""
print(review_chain.invoke(question))



In [18]:
question = """What are the pros and cons of Napoli Centrale?"""
print(review_chain.invoke(question))



In [19]:
question = """Give the name, address and phone number of some good steak houses for a romantic dinner."""
print(review_chain.invoke(question))



In [67]:
question = """Give the name, address and phone number of the best steak houses with a 50 euro budget?"""

print(review_chain.invoke(question))



In [26]:
question = """Give the name, address and phone number of the some good sandwich places?"""
print(review_chain.invoke(question))



In [37]:
question = """What are the most affordable but high-quality restaurants in City?"""
result = review_chain.invoke(question)
print(result)



In [72]:
question = """How can I make a roast beef sandwich at home?"""
print(review_chain.invoke(question))



In [None]:
question = """What is RAG?"""
print(review_chain.invoke(question))



In [None]:
question = """What is Natural Language Processing?"""
print(review_chain.invoke(question))



In [None]:
question = """Explain Natural Language Processing."""
print(review_chain.invoke(question))



## Evaluation on Synthetic Questions

In [34]:
from pprint import pprint
import random

In [76]:
def get_question_answer_pairs(documents, generator_llm, num_pairs=30, seed=None):
  """Generate synthetic question/answer pairs from randomly chosen documents.

  For each pair, one document is drawn at random (with replacement), the LLM
  is asked to write a single question about its `page_content`, and is then
  asked to answer that question from the same passage.

  Args:
    documents: Sequence of objects exposing a `page_content` attribute.
    generator_llm: Object whose `invoke(prompt)` returns a value with a
      `content` attribute.
    num_pairs: Number of pairs to generate.
    seed: Optional seed for the document sampling. When given, the same
      documents are drawn on every call, making the evaluation set
      reproducible. When `None` (default) the module-level `random` state is
      used, exactly as before.

  Returns:
    A list of dicts with the keys "question", "answer" and "document".

  Raises:
    IndexError: If `documents` is empty and `num_pairs` is positive
      (raised by `choice`).
  """
  # A private generator keeps seeded runs from disturbing the global state.
  rng = random if seed is None else random.Random(seed)

  question_answer_pairs = []
  for _ in range(num_pairs):
    document = rng.choice(documents)
    page_content = document.page_content
    prompt = f"This is a factual text passage: {page_content}. Write only one question about the restaurant based on the provided text passage. only write the quesion and noting else."

    question = generator_llm.invoke(prompt).content
    answer = generator_llm.invoke(f"From the following passage, answer the question: {question}\n{page_content}").content
    question_answer_pairs.append({"question": question, "answer": answer, "document": document})

  return question_answer_pairs

In [77]:
# LLM_MODEL_NAME holds an OpenAI model name ("gpt-4"), so the generator must
# be the OpenAI chat client; passing that name to the Google Generative AI
# client requests a model that provider does not serve.
generator_llm = ChatOpenAI(model=LLM_MODEL_NAME, openai_api_key=OPENAI_API_KEY)
question_answer_pairs = get_question_answer_pairs(documents, generator_llm, num_pairs=30)
df = pd.DataFrame(question_answer_pairs)

In [16]:
# Answer every synthetic question with the RAG chain, sending only the first
# line of each generated question.
rag_answers = [
    review_chain.invoke(raw_question.split("\n")[0])
    for raw_question in df["question"]
]

# Store the answers next to their questions and write the table to disk.
df["rag_answer"] = rag_answers
df.to_csv('question_answer_pairs.csv', index=False)

In [21]:
df.to_csv('question_answer_pairs.csv', index=False)
df.head()



In [None]:
df = pd.read_csv('question_answer_pairs.csv')
df.head()



In [40]:
# Show up to the first ten questions. `head(10)` also works when the frame
# has fewer than ten rows, where `.loc[i]` over range(10) raised KeyError.
for sample_question in df["question"].head(10):
    print(sample_question)



## Limitations of classic LLMs

In [None]:
# Baseline: send the question straight to the LLM, without retrieved context.
question = "Does Bar Fortuna Sas in Padova city offer delivery services?"
answer = llm.invoke(question).content
pprint(question)
pprint(answer)



In [24]:
# Same question through the RAG chain, which supplies retrieved documents
# as context to the LLM.
question = "Does Bar Fortuna Sas in Padova city offer delivery services?"
answer = review_chain.invoke(question)
print(question)
print(answer)



In [41]:
question = "What kind of food is served at Veni Vidi Vino Enoteca?"
answer = llm.invoke(question).content
pprint(question)
pprint(answer)



In [43]:
question = "What kind of food is served at Veni Vidi Vino Enoteca?"
answer = review_chain.invoke(question)
pprint(question)
pprint(answer)

