## Import Libraries

In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import HuggingFaceHub
from dotenv import load_dotenv
import os
from pinecone import Pinecone, ServerlessSpec
from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
from langchain_openai import ChatOpenAI
import streamlit as st
import json

  from tqdm.autonotebook import tqdm


## Load and split documents

In [2]:
# Read the Toronto travel-guide text file into LangChain documents.
loader = TextLoader(
    '../Chatbot/materials/torontoTravelAssistant.txt'
)
documents = loader.load()

# Cut the loaded documents into chunks; these chunks are what gets embedded
# and stored in the vector index further down.
text_splitter = CharacterTextSplitter(
    chunk_overlap=4,
    chunk_size=1000,
)
docs = text_splitter.split_documents(documents)

## Initialize embeddings

In [3]:
# Embedding model handed to the Pinecone vector store when the chunks are
# indexed. No model name is passed, so the library's default model is used.
# NOTE(review): the Pinecone index is created with dimension=768 — confirm the
# default model's output size matches, and consider pinning the model name.
embeddings = HuggingFaceEmbeddings()

  warn_deprecated(





## Initialize Pinecone instance

In [4]:
# Load variables from a local .env file (if one exists) so the API keys are
# visible to os.getenv. `load_dotenv` is imported at the top of the notebook
# but was never called; it does not override variables that are already set.
load_dotenv()

pinecone_api_key = os.getenv('PINECONE_API_KEY')
if not pinecone_api_key:
    # Fail here with an actionable message instead of deep inside the client.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to a .env file."
    )

pc = Pinecone(api_key=pinecone_api_key)

index_name = "langchain-demo"

# Create the serverless index only on first use.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # must equal the size of the vectors `embeddings` produces
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
index = pc.Index(index_name)

# Ingest the chunks only when the index is empty. Calling `from_documents` on
# every run upserts the same chunks again under new ids, so repeated runs fill
# the index with duplicates and the top-k retrieval returns copies of one chunk.
# To re-ingest after editing the source text, delete the index first.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)
else:
    docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

## Initialize ChatOpenAI

In [5]:
model_name = "gpt-3.5-turbo"

# Take the OpenAI organization from the environment when it is provided, so the
# notebook can run under another account without editing code. The previously
# hard-coded id stays as the fallback to keep existing behaviour.
openai_organization = os.getenv('OPENAI_ORG_ID') or 'org-G8UtpAEtkeLatwCgEhQGaPOw'

# NOTE(review): no temperature is passed, so the answers scored later in this
# notebook may differ between runs — consider fixing it for reproducibility.
llm = ChatOpenAI(model_name=model_name, organization=openai_organization)

## Define prompt template

In [6]:
# Prompt text for the RAG chain. It has two placeholders, {context} and
# {question}, which are declared as the prompt's input variables when the
# PromptTemplate is built.
template = """
You are a Toronto travel assistant. Users will ask you questions about their trip to Toronto. Use the following piece of context to answer the question.
If you don't know the answer, just say you don't know.
Your answer should be short and concise, no longer than 2 sentences.

Context: {context}
Question: {question}
Answer:
"""

## RAG Pipeline

In [7]:
# Wrap the raw template text, declaring the two placeholders it contains.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Question-answering chain: documents come from the vector store's retriever
# and are passed to the LLM through the prompt defined just above.
rag_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type_kwargs={"prompt": prompt},
    retriever=docsearch.as_retriever(),
)

## List of queries for evaluation

In [8]:
# Fixed set of user questions used to evaluate the pipeline. Each one is run
# through `generate_evaluation_data`, and the list is also passed to the
# context-relevance scorer, so its order determines the order of all results.
queries = [
    "What are the best places to visit in Toronto?",
    "Can you suggest some good restaurants in Toronto?",
    "What is the best time of year to visit Toronto?",
    "Are there any special events happening in Toronto this summer?",
    "Where can I find affordable accommodation in Toronto?"
]

## Function to retrieve context and generate an answer

In [9]:
def generate_evaluation_data(query):
    """Collect the retrieved contexts and the generated answer for one query.

    Args:
        query: The question text to run through the pipeline.

    Returns:
        A dict with three keys:
            "query":    the question exactly as passed in,
            "contexts": list of ``page_content`` strings, one per document
                        returned by ``docsearch.similarity_search(query)``,
            "answer":   the ``"result"`` entry of ``rag_chain.invoke(query)``.
    """
    # NOTE(review): this search is separate from the retrieval the chain runs
    # internally; presumably both return the same documents — confirm.
    contexts = []
    for hit in docsearch.similarity_search(query):
        contexts.append(hit.page_content)

    chain_output = rag_chain.invoke(query)

    record = {"query": query}
    record["contexts"] = contexts
    record["answer"] = chain_output["result"]
    return record

## Generate and store evaluation data

In [10]:
# One record (query, contexts, answer) per evaluation query, in query order.
evaluation_data = list(map(generate_evaluation_data, queries))

In [11]:
# Spot-check the first record: its query, its first retrieved context and
# the generated answer.
first_record = evaluation_data[0]
first_context = first_record['contexts'][0]

print("Query: " + first_record['query'] + "\n")
print("Context: " + first_context + "\n")
print("Answer: " + first_record['answer'])


Query: What are the best places to visit in Toronto?

Context: Document: Comprehensive Toronto Travel Guide
Table of Contents
1.	Introduction
2.	Top Attractions
o	CN Tower
o	Royal Ontario Museum
o	Toronto Islands
o	Ripley's Aquarium of Canada
o	Distillery District
o	Casa Loma
o	Art Gallery of Ontario
o	Toronto Zoo
o	High Park
3.	Food and Dining
o	St. Lawrence Market
o	Kensington Market
o	Chinatown
o	Little Italy
o	Yorkville
o	Greektown
o	The Danforth
4.	Accommodation Options
o	Luxury Hotels
o	Mid-Range Hotels
o	Budget Hotels
o	Vacation Rentals
5.	Sample Itineraries
o	3-Day Itinerary
o	5-Day Itinerary
o	7-Day Itinerary
6.	Seasonal Events and Festivals
o	Summer
o	Fall
o	Winter
o	Spring
7.	Transportation
o	Public Transit
o	Taxis and Ride-Sharing
o	Bike Rentals
o	Car Rentals
8.	Neighborhood Guides
o	Downtown
o	West End
o	East End
o	Midtown
9.	Outdoor Activities
o	Parks and Gardens
o	Beaches
o	Hiking Trails
10.	Preferred Times, Days, and Seasons for Activities
11.	FAQs
_____________________

## Save evaluation data to a JSON file for human evaluation

In [12]:
# Write the per-query records as indented JSON so they can be read and
# annotated by hand.
with open('evaluation_data.json', 'w') as f:
    f.write(json.dumps(evaluation_data, indent=4))

print("Evaluation data has been generated and saved to evaluation_data.json")

Evaluation data has been generated and saved to evaluation_data.json


## Evaluation function for context relevance

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_context_relevance(questions, retrieved_contexts):
    """Score how similar each retrieved context is to its question.

    All contexts and questions are vectorised together with one
    ``TfidfVectorizer`` so they share a vocabulary; the score for pair ``i``
    is the cosine similarity between context ``i`` and question ``i``.

    Args:
        questions: Sequence of question strings.
        retrieved_contexts: Sequence of context strings, where
            ``retrieved_contexts[i]`` belongs to ``questions[i]``.

    Returns:
        A list of floats, one similarity score per question/context pair.
        An empty list when both inputs are empty.

    Raises:
        ValueError: If the two sequences do not have the same length.
    """
    questions = list(questions)
    retrieved_contexts = list(retrieved_contexts)

    if len(questions) != len(retrieved_contexts):
        raise ValueError(
            "questions and retrieved_contexts must have the same length "
            f"(got {len(questions)} and {len(retrieved_contexts)})"
        )
    if not questions:
        # Nothing to score; the vectoriser cannot be fitted on an empty corpus.
        return []

    # Row layout of the matrix: every context first, then every question.
    tfidf_matrix = TfidfVectorizer().fit_transform(retrieved_contexts + questions)
    vectors = tfidf_matrix.toarray()

    # Question rows start after the context rows, so the offset is the number
    # of contexts (the original used len(questions), which only matched by
    # coincidence when both lists were the same length).
    question_offset = len(retrieved_contexts)

    relevance_scores = []
    for i in range(len(questions)):
        context_vector = vectors[i]
        question_vector = vectors[question_offset + i]
        cosine_sim = cosine_similarity([context_vector], [question_vector])[0][0]
        relevance_scores.append(float(cosine_sim))

    return relevance_scores

## Evaluation function for answer relevance

In [15]:
from rouge_score import rouge_scorer

def evaluate_answer_relevance(generated_answers, reference_answers):
    """Compute ROUGE-1 and ROUGE-L F-measures for generated answers.

    Answers are paired positionally with ``zip``, so pairing stops at the
    shorter of the two sequences.

    Args:
        generated_answers: Answers produced by the pipeline.
        reference_answers: Expected answers, in the same order.

    Returns:
        A dict ``{'rouge1': [...], 'rougeL': [...]}`` holding one F-measure
        per answer pair under each metric.
    """
    metric_names = ['rouge1', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # The reference is passed first and the generated answer second.
    pair_scores = [
        scorer.score(reference, generated)
        for generated, reference in zip(generated_answers, reference_answers)
    ]

    return {
        'rouge1': [pair['rouge1'].fmeasure for pair in pair_scores],
        'rougeL': [pair['rougeL'].fmeasure for pair in pair_scores],
    }

## Example reference answers

In [16]:
# Hand-written reference answers, one per entry in `queries` and in the same
# order. They are compared positionally against the generated answers by
# `evaluate_answer_relevance`, so the order here must not change independently.
reference_answers = [
    "The best places to visit in Toronto include the CN Tower, Royal Ontario Museum, and the Distillery District. You can also explore Toronto Islands and the Art Gallery of Ontario for a mix of nature and culture.",
    "Some highly recommended restaurants in Toronto are Alo for fine dining, Pai Northern Thai Kitchen for Thai cuisine, and Richmond Station for a great farm-to-table experience. St. Lawrence Market is also a great spot for diverse food options.",
    "The best time to visit Toronto is from late April to early June and from September to early November when the weather is pleasant, and the city hosts various festivals. These periods also offer fewer crowds compared to the peak summer season.",
    "Yes, Toronto hosts several special events in the summer, including the Toronto International Film Festival (TIFF) in September and the Toronto Caribbean Carnival in August. The city also features numerous music festivals and outdoor activities.",
    "Affordable accommodation in Toronto can be found in areas like Kensington Market, the Annex, and Queen Street West. Options include budget hotels, hostels, and Airbnb rentals that offer a range of prices and amenities."
]

## Retrieve context and generate answers

In [31]:
# Flatten the evaluation records into two parallel lists: the first retrieved
# context of each query and the answer generated for it.
retrieved_contexts = []
generated_answers = []

for data in evaluation_data:
    retrieved_contexts.append(data['contexts'][0])
    generated_answers.append(data['answer'])

## Evaluate context relevance

In [33]:
# Similarity between each query and the first context retrieved for it
# (one score per query, in query order).
context_relevance_scores = evaluate_context_relevance(queries, retrieved_contexts)

# Bare expression so the notebook displays the scores.
context_relevance_scores

[0.2722216541941082,
 0.06539651144388514,
 0.36628252812040546,
 0.08463996755913449,
 0.06560117653352551]

## Evaluate answer relevance

In [34]:
# ROUGE-1 / ROUGE-L F-measures of each generated answer against its
# hand-written reference answer.
answer_relevance_scores = evaluate_answer_relevance(generated_answers, reference_answers)

# Bare expression so the notebook displays the scores.
answer_relevance_scores

{'rouge1': [0.6875,
  0.2711864406779661,
  0.5428571428571429,
  0.456140350877193,
  0.39999999999999997],
 'rougeL': [0.5625,
  0.23728813559322032,
  0.5142857142857143,
  0.3508771929824561,
  0.29090909090909084]}

## Display and save results

In [37]:
# Gather the inputs and both sets of scores into one report structure.
evaluation_results = {
    "questions": queries,
    "retrieved_contexts": retrieved_contexts,
    "generated_answers": generated_answers,
    "context_relevance_scores": context_relevance_scores,
    "answer_relevance_scores": answer_relevance_scores
}

# Serialise once and reuse the text for both the display and the file.
results_json = json.dumps(evaluation_results, indent=4)
print(results_json)

# Write to a separate file: 'evaluation_data.json' already holds the per-query
# records saved earlier for human evaluation, and writing the scored report
# there replaced them with a different structure.
with open('evaluation_results.json', 'w') as f:
    f.write(results_json)

print("Evaluation results have been saved to evaluation_results.json")

{
    "questions": [
        "What are the best places to visit in Toronto?",
        "Can you suggest some good restaurants in Toronto?",
        "What is the best time of year to visit Toronto?",
        "Are there any special events happening in Toronto this summer?",
        "Where can I find affordable accommodation in Toronto?"
    ],
    "retrieved_contexts": [
        "Document: Comprehensive Toronto Travel Guide\nTable of Contents\n1.\tIntroduction\n2.\tTop Attractions\no\tCN Tower\no\tRoyal Ontario Museum\no\tToronto Islands\no\tRipley's Aquarium of Canada\no\tDistillery District\no\tCasa Loma\no\tArt Gallery of Ontario\no\tToronto Zoo\no\tHigh Park\n3.\tFood and Dining\no\tSt. Lawrence Market\no\tKensington Market\no\tChinatown\no\tLittle Italy\no\tYorkville\no\tGreektown\no\tThe Danforth\n4.\tAccommodation Options\no\tLuxury Hotels\no\tMid-Range Hotels\no\tBudget Hotels\no\tVacation Rentals\n5.\tSample Itineraries\no\t3-Day Itinerary\no\t5-Day Itinerary\no\t7-Day Itinerary\