In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
import faiss
import random
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from IPython.display import display
from elasticsearch import Elasticsearch
from ipywidgets import interact, widgets
from sentence_transformers import SentenceTransformer

In [13]:
# Load the sentence transformer model
model = SentenceTransformer('all-mpnet-base-v2')



## Load Data

In [4]:
activities_df = pd.read_csv("../data/activities.csv")

In [5]:
file_path = "../data/travel_iterenary.json"

with open(file_path, "rt") as f_in:
    docs_raw = json.load(f_in)
    
def extract_text(data_entry):
    docs_fin = {}
    plan = ""
    
    for day in data_entry['Iterenary']:  # Fixed 'days' to 'day'
        itinerary = "Travel itinerary for " + data_entry.get('duration', '') + " days."
        
        plan += day['Plan']
        plan += "\n"
        
        docs_fin["itinerary"] = itinerary
        docs_fin["plan"] = plan
        
    return docs_fin
docs_fin = [extract_text(doc) for doc in docs_raw]

## Ingest Data

### ElasticSearch

In [6]:
#check that elastic search works
!curl http://localhost:9200/

{
  "name" : "082f13835804",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "4qyEGnycTq--Lo33EHJHNw",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


In [7]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch("http://localhost:9200")
es_client.info()

ObjectApiResponse({'name': '082f13835804', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4qyEGnycTq--Lo33EHJHNw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [8]:
#define index mapping
index_mapping = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "itinerary": {"type": "text"},  
            "plan": {"type": "text"}        
            }
    }
}

index_name = "tip-index"

# Check if the index already exists
# Check if the index already exists
es_client.indices.delete(index=index_name, ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

#### The two curl commands below are to solve some memory issues

In [9]:
!curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_cluster/settings -d '{ "transient": { "cluster.routing.allocation.disk.threshold_enabled": false } }'

{"acknowledged":true,"persistent":{},"transient":{"cluster":{"routing":{"allocation":{"disk":{"threshold_enabled":"false"}}}}}}

In [67]:
!curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'

{"error":{"root_cause":[{"type":"index_not_found_exception","reason":"no such index [null] and no indices exist","resource.type":"index_expression","resource.id":"_all"}],"type":"index_not_found_exception","reason":"no such index [null] and no indices exist","resource.type":"index_expression","resource.id":"_all"},"status":404}

In [10]:
#create index
es_client.indices.create(index=index_name, body=index_mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'tip-index'})

In [11]:
#index document
for duration in tqdm(docs_fin):
    es_client.index(index=index_name, document=duration)

  0%|          | 0/7 [00:00<?, ?it/s]

### ElasticSearch as VectorDB

In [14]:
doc_vec = []
for doc in docs_fin:
    doc["itin_vector"] = model.encode(doc['itinerary']).tolist()
    doc_vec.append(doc)

In [15]:
#define index mapping
vec_index_mapping = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "itinerary": {"type":"text"},  
            "plan": {"type":"text"},
            "itin_vector": {"type":"dense_vector", "dims":768, "index":True, "similarity":"cosine"} 
            }
    }
}

index_name = "vec-index"

# Check if the index already exists
es_client.indices.delete(index=index_name, ignore_unavailable=True)
#create index
es_client.indices.create(index=index_name, body=vec_index_mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vec-index'})

In [16]:
for doc in tqdm(doc_vec):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/7 [00:00<?, ?it/s]

### FAISS (Facebook AI Simlarity Search)

In [17]:
# Function to extract relevant text from each dictionary entry
def extract_text(data_entry):

    result = data_entry.get('itinerary', '') + "\n" + data_entry.get('plan', '')
    return result.strip()

# Extract all text data to be indexed
text_data = [extract_text(entry) for entry in docs_fin]

# Generate embeddings for each text using the model
embeddings = model.encode(text_data)

# Convert embeddings to a numpy array and ensure it's float32 (FAISS requires this format)
embeddings = np.array(embeddings, dtype='float32')  # Ensure embeddings are float32

# Initialize the FAISS index
dimension = embeddings.shape[1]  # Dimension of the embedding vector
index = faiss.IndexFlatL2(dimension)  # L2 distance metric (Euclidean)

# Add the embeddings to the FAISS index
index.add(embeddings)

# Save the index for future use
faiss.write_index(index, 'faiss_index_file.index')

print(f"Indexed {len(embeddings)} items into FAISS.")


Indexed 7 items into FAISS.


### User input

In [19]:
# Define the month in the dropdown
month_dropdown = widgets.Dropdown(
    options=["January", "February", "March", "April", "May", "June", 
             "July", "August", "September", "October", "November", "December"],
    value='January',  # Default value
    description='Month:',
)

# Display the dropdown widget
display(month_dropdown)

# Function to handle the value change and store the result in a variable
def on_value_change(change):
    selected_option = change['new']  # Save the selected option to a variable
    print(f"Selected option: {selected_option}")

# Observe value change in the dropdown
month_dropdown.observe(on_value_change, names='value')


Dropdown(description='Month:', options=('January', 'February', 'March', 'April', 'May', 'June', 'July', 'Augus…

In [20]:
# To directly access the selected value, you can also just do:
month = month_dropdown.value
month

'October'

In [21]:
# Define the duration in the dropdown
days_dropdown = widgets.Dropdown(
    options=["One", "Two", "Three", "Four", "Five", "Six", "Seven"],
    value='One',  # Default value
    description='Duration:',
)

# Display the dropdown widget
display(days_dropdown)

# Function to handle the value change and store the result in a variable
def on_value_change(change):
    selected_option = change['new']  # Save the selected option to a variable
    print(f"Selected option: {selected_option}")

# Observe value change in the dropdown
days_dropdown.observe(on_value_change, names='value')


Dropdown(description='Duration:', options=('One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven'), value='One'…

In [22]:
# To directly access the selected value, you can also just do:
query = days_dropdown.value.lower()
query

'three'

In [23]:
# Define the activities in the dropdown
dropdown = widgets.SelectMultiple(
    options=['sightseeing', 'dinning', 'adventure', 'relaxation', 'culture'],
    value=['sightseeing'],  # Default value
    description='Activities:',
)

# Display the dropdown widget
display(dropdown)

# Function to handle the value change and store the result in a variable
def on_value_change(change):
    selected_option = change['new']  # Save the selected option to a variable
    print(f"Selected option: {selected_option}")

# Observe value change in the dropdown
dropdown.observe(on_value_change, names='value')


SelectMultiple(description='Activities:', index=(0,), options=('sightseeing', 'dinning', 'adventure', 'relaxat…

In [24]:
# To directly access the selected value, you can also just do:
activities = dropdown.value
activities

('sightseeing', 'adventure')

In [25]:
activity_list = activities_df[activities_df['activity'].isin(activities)]

## Retrieval

### Keyword Search with Elastic

In [193]:
def elastic_search(query):
    query_body = {
        "size": 1,
        "query": {
            "match": {
                "itinerary": query
            }
        },
        "fields": ["plan"]
    }
    response = es_client.search(index="tip-index", body=query_body)
    
    response_docs = []
    for hit in response["hits"]["hits"]:
        response_docs.append(hit["_source"])
        
    plan = response_docs[0]['plan']

    return plan.strip(), response_docs[0]['itinerary']

### Semantic Search with Elastic (Cosine Similarity)

In [190]:
def elastic__vector_search(query):

    vec_query = model.encode(query).tolist()
    
    query = {
        "knn": {
            "field": "itin_vector",
            "query_vector": vec_query,
            "k": 1,
            "num_candidates": 1000
            
            },
        "fields": ["plan"]
        }
    response = es_client.search(index="vec-index", body=query)

    response_docs = []
    for hit in response["hits"]["hits"]:
        response_docs.append(hit["_source"])
        
    plan = response_docs[0]['plan']

    return plan.strip(), response_docs[0]['itinerary']

### Semantic Search with FAISS (L2 Distance)

In [185]:
def fais__vector_search(query):
    
    query_embedding = model.encode([query]).astype(np.float32)
    
    # Perform the search (k=5 means returning the 5 most similar entries)
    D, I = index.search(query_embedding, k=1)

    # D contains the distances, I contains the indices of the nearest neighbors
    plan = text_data[I[0][0]]
    return plan.strip(), I[0][0]
    

### Retrieval evaluation 

In [53]:
## Generate unique id for each document
import hashlib
def generate_id(doc):
    id_source = f"{doc['itinerary']}-{doc['plan'][:10]}-{doc['itin_vector'][0]}"
    hash_object = hashlib.md5(id_source.encode())
    hash_hex = hash_object.hexdigest()
    doc_id = hash_hex[:8]
    
    return doc_id

for doc in docs_fin:
    doc["id"] = generate_id(doc)

In [54]:
# Save file
with open("../data/travel_itirenary_with_ids.json", "wt") as f_out:
    json.dump(docs_fin, f_out)

In [151]:
system_prompt = """
Act as a traveler planning a short trip to Barcelona. 
Based on the following search query, generate 10 similar traveler searches that have the 'SAME NUMBER OF DAYS' as the one provided. 
The searches should be varied but similar in nature.

search: {itinerary}

Provide the output as a list of strings.
["search","search2",...,"search10"]

Example:
search: Travel itinerary for three days.
output: ["Three-day food tour itinerary in Barcelona", 
"Three-day adventure itinerary in Barcelona",
"Travel itinerary for three days with kids in Barcelona",
"Three-day art and architecture itinerary in Barcelona"]
""".strip()

In [152]:
def generate_questions(prompt):
    response = client.chat.completions.create(
    model = "gpt-4o-mini",
    messages = [{"role": 'system', 'content': system_prompt}, {'role': 'user', 'content': f"search: {prompt}"}]
        )
    return response.choices[0].message.content

In [153]:
searches = {}
for doc in docs_fin:
    doc_id = doc["id"]
    prompt = doc["itinerary"]
    questions = generate_questions(prompt)
    searches[doc_id] = questions

In [145]:
import pickle
with open("../data/searches.bin", "wb") as f_out:
    pickle.dump(searches, f_out)

In [160]:
doc_index = {d["id"]: d for d in docs_fin}

In [163]:
ground_truth_list = []
for key, value in parsed_searches.items():
    itinerary = doc_index[key]['itinerary']
    for question in value:
        ground_truth_list.append((question, itinerary, key))

In [168]:
df = pd.DataFrame(ground_truth_list, columns=["question", "itinerary", "id"])
df.to_csv("../data/ground_truth.csv", index=False)
df["index"] = [i for i in range(7) for _ in range(10)]

In [167]:
parsed_searches = {}

for k,v in searches.items(): 
    parsed_searches[k] = itinerary_list = json.loads(v)


In [177]:
df["index"] = [i for i in range(7) for _ in range(10)]

In [197]:
stic_search, stic_vector, faiss_vector = [], [], []

for question in df.question:
    stic_search.append(elastic_search(question)[1])
    stic_vector.append(elastic__vector_search(question)[1])
    faiss_vector.append(fais__vector_search(question)[1])
    
df["stic_search"] = stic_search
df["stic_vector"] = stic_vector
df["faiss_vector"] = faiss_vector

#### Hit rate for Keyword Search with Elastic

In [222]:
round(sum(df.itinerary == df.stic_search)/len(df), 2)

0.99

#### Hit rate for Keyword Search with Elastic

In [223]:
round(sum(df.itinerary == df.stic_vector)/len(df), 2)

0.99

#### Hit rate for Semantic Search with FAISS (L2 Distance)

In [225]:
round(sum(df.index == df.faiss_vector)/len(df), 2)

0.0

From the evaluation results, we will proceed with the **keyword search with Elastic** for retrieval.

## RAG

In [227]:
full_query = f"Travel itinerary for {query} days"
plan = elastic_search(full_query)[0]

In [229]:
params = {"days":query, "activities":activities, "month":month, "plan":plan, "activity_list":activity_list}

In [233]:
def build_prompt(params):
    prompt = f"""
    You are a travel assistant specialized in creating personalized short-trip itineraries for Barcelona.
    Your task is to plan a {params["days"]}-day itinerary by referring to a list of sample travel plans, taking into account the traveler’s activity preferences and the time of year.
    
    - Prioritize activities from the ACTIVITY-CATALOGUE that align with the traveler’s interests.
    - Adapt the itinerary to be suitable for the weather and season during the month of {params["month"]}.
    - Ensure that the chosen activities are spread across the duration of the trip to provide a balanced and enjoyable experience.
    
    Below are the details for your reference:
    
    DURATION: {params["days"]} days
    ACTIVITIES: {params["activities"]}
    MONTH: {params["month"]}
    
    SAMPLE PLANS: 
    {params["plan"]}
    
    ACTIVITY-CATALOGUE:
    {params["activity_list"]}
    
    Please create a thoughtful and well-structured itinerary considering these details.
    """.strip()
    
    return prompt


In [231]:
prompt = build_prompt(params)

### OpenAI 40-mini

In [235]:
def gpt(prompt):
    response = client.chat.completions.create(
        model = "gpt-4o-mini",
        messages = [{"role":"user", "content": prompt}]
    )
    return response.choices[0].message.content

In [236]:
gpt_responce = gpt(prompt)

### LLM as a Judge

In [243]:
def build_evaluation_prompt(itinerary, params):
    prompt = f"""
    You are tasked with evaluating a travel itinerary for a short trip to Barcelona. The itinerary was generated based on the following criteria:

    1. The trip duration is {params["days"]} days.
    2. The traveler prefers the following types of activities: {params["activities"]}.
    3. The trip will take place in the month of {params["month"]}, so the itinerary should account for the typical weather and seasonal activities in that month.
    4. The activities should be spread across the {params["days"]} days to provide a balanced and enjoyable experience.

    Here is the generated itinerary for evaluation:

    {itinerary}

    Please evaluate the itinerary based on the following aspects:
    - Relevance of activities to the traveler’s preferences
    - Appropriateness of activities for the weather/season
    - Distribution and balance of activities across the trip duration
    - Diversity of activities (if applicable)
    - Completeness of the itinerary for the given days

    Provide a single numerical score (between 1 and 10) that reflects the average score across these five aspects. Only return the average score.
    """.strip()

    return prompt

In [261]:
acts = ['sightseeing', 'dining', 'adventure', 'relaxation', 'culture']
days = ["one", "two", "three", "four", "five", "six", "seven"]
months = ["February", "May", "August", "November"]
eval_score = []

for day in tqdm(days):
    for month in months:
        for i in range(1, 6):
            act_list = random.sample(acts, i)
            
            query = day
            activities = act_list
            activity_list = activities_df[activities_df['activity'].isin(activities)]
            
            params = {
                "days": query, 
                "activities": activities, 
                "month": month, 
                "plan": plan, 
                "activity_list": activity_list
            }
            
            full_query = f"Travel itinerary for {query} days"
            plan = elastic_search(full_query)[0]
            
            prompt = build_prompt(params)
            gpt_response = gpt(prompt)
            
            eval_prompt = build_evaluation_prompt(gpt_response, params)
            score = gpt(eval_prompt)
            
            eval_score.append(score)


  0%|          | 0/7 [00:00<?, ?it/s]

In [268]:
rag_score = np.average([float(i) for i in eval_score])
print(f"Using LLM as a judge, the performance of our RAG on {len(eval_score)} prompts is {round(rag_score, 2)} out of 10.")

Using LLM as a judge, the performance of our RAG on 140 prompts is 8.58 out of 10.


In [283]:
check = os.getenv("$OPENAI_API_KEY")

In [286]:
check