# Data Preparation

## Import libraries

In [36]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
import openai 
from dotenv import load_dotenv
import shutil
import os
import csv

## Load the data

In [10]:
# Location of the travel dataset CSV that is loaded into documents below.
DATA_PATH = "./data/TravelDataset.csv" # Relative path: the CSV is expected inside the local 'data' directory

def read_city_information(headers, row):
    """
    Turn one CSV row into readable text plus a header-to-value lookup.

    Each header is paired with the value at the same position in the row
    (pairing stops at the shorter of the two). Underscores in header names
    are shown as spaces in the text, while the lookup keeps the original
    header names as keys.

    Returns:
        A tuple (page_content, row_data): page_content has one
        "header: value" line per column; row_data maps header -> value.
    """
    pairs = list(zip(headers, row))

    # One "header: value" line per column, with underscores shown as spaces
    lines = [f"{name.replace('_', ' ')}: {cell}" for name, cell in pairs]

    # The dict keeps the raw header names so callers can look up e.g. the city
    return "\n".join(lines), dict(pairs)

def load_and_process_csv_into_documents(file_path: str) -> list[Document]:
    """
    Load a CSV file and turn every data row into a LangChain Document.

    The first row is treated as the header. Each following non-blank row
    becomes one Document whose page_content is built by
    read_city_information (column names included in the text) and whose
    metadata records the source file name, the row number, the city name
    and, when present, the row id.

    Args:
        file_path: Path to the CSV file to read.

    Returns:
        One Document per non-blank data row, in file order. An empty list
        is returned (after printing a warning) if the file has no header row.
    """
    documents = []

    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain newlines. 'utf-8-sig' decodes
    # plain UTF-8 unchanged but drops a leading BOM, so the first header is
    # read as 'id' rather than '\ufeffid'.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
        csv_reader = csv.reader(csvfile)

        # Read the header row
        headers = next(csv_reader, None)
        if headers is None:
            print(f"Warning: CSV file '{file_path}' is empty.")
            return []

        # The header is row 1, so the first data row is row 2
        for row_number, row in enumerate(csv_reader, start=2):
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # creating a Document with empty content
                continue

            page_content, row_data = read_city_information(headers, row)

            # Extract city name for metadata (assuming 'city' is one of the columns)
            city_name = row_data.get('city', 'Unknown City')
            document_id = row_data.get('id', None) # Assuming 'id' is a unique identifier

            metadata = {
                "source_file": os.path.basename(file_path), # Name of the CSV file
                "row_number": row_number,
                "city_name": city_name,
                "document_type": "city_data"
            }
            if document_id:
                metadata["id"] = document_id # Only stored when the row has a non-empty id

            documents.append(
                Document(page_content=page_content, metadata=metadata)
            )

    return documents

In [11]:
# Build the list of Documents from the CSV at DATA_PATH (one per data row).
processed_documents = load_and_process_csv_into_documents(DATA_PATH)

In [12]:
def display_documents_info(documents: list[Document], num_to_display: int = 3):
    """
    Print the total number of documents and a preview of the first few.

    Args:
        documents: Documents to summarise.
        num_to_display: Maximum number of documents whose content and
            metadata are printed.
    """
    total = len(documents)
    print(f"Total documents (chunks) created: {total}")

    # zip with range stops after num_to_display items, or sooner if fewer exist
    for position, doc in zip(range(1, num_to_display + 1), documents):
        print(f"\n--- Chunk {position} ---")
        print(f"Content:\n{doc.page_content}")
        print(f"Metadata: {doc.metadata}")

    # Tell the reader when the preview was cut short
    if total > num_to_display:
        print(f"\n... (displaying first {num_to_display} chunks only)")


In [13]:
# Preview the first three documents to sanity-check their content and metadata.
display_documents_info(processed_documents, num_to_display=3)

Total documents (chunks) created: 560

--- Chunk 1 ---
Content:
id: c54acf38-3029-496b-8c7a-8343ad82785c
city: Milan
country: Italy
region: europe
short description: Chic streets lined with fashion boutiques, historic architecture, and lively piazzas create a sophisticated yet welcoming atmosphere, perfect for leisurely exploration.
latitude: 45.4641943
longitude: 9.1896346
avg temp monthly: {"1":{"avg":3.7,"max":7.8,"min":0.4},"2":{"avg":7.1,"max":12,"min":2.8},"3":{"avg":10.5,"max":15.5,"min":5.5},"4":{"avg":13.8,"max":18.9,"min":8.7},"5":{"avg":17.9,"max":22.5,"min":13.4},"6":{"avg":23.5,"max":28.5,"min":18.1},"7":{"avg":25.8,"max":30.8,"min":20.5},"8":{"avg":25.2,"max":30.4,"min":20.2},"9":{"avg":20.8,"max":26,"min":16.1},"10":{"avg":15.2,"max":19.6,"min":11.5},"11":{"avg":8.8,"max":12.5,"min":5.6},"12":{"avg":4.7,"max":8.2,"min":1.9}}
ideal durations: ["Short trip","One week"]
budget level: Luxury
culture: 5
adventure: 2
nature: 2
beaches: 1
nightlife: 4
cuisine: 5
wellness: 3
urb

## Create vector database

In [14]:
# Load environment variables via python-dotenv (presumably from a local .env file).
load_dotenv()
# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key = os.environ['OPENAI_API_KEY']

In [15]:
# Embedding model name; the resulting embedding_function is passed to Chroma
# both when the store is built and when it is re-opened for querying.
embedding_model = "text-embedding-3-small"
embedding_function = OpenAIEmbeddings(model=embedding_model)

In [16]:
# Directory used as Chroma's persist_directory (relative to the working directory).
CHROMA_PATH = "chroma"

In [17]:
# Start from a clean slate: drop any previous database directory, then make
# sure an empty directory exists for the new one.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

os.makedirs(CHROMA_PATH, exist_ok=True)

In [18]:
# Embed every processed document and persist the resulting store in CHROMA_PATH.
db = Chroma.from_documents(
    documents=processed_documents,
    embedding=embedding_function,
    persist_directory=CHROMA_PATH,
)
print(f"Saved {len(processed_documents)} documents to the database {CHROMA_PATH}")

## Query relevant data

In [21]:
# Re-open the persisted Chroma store from CHROMA_PATH, supplying the same
# embedding function that was used when the store was built.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

In [65]:
# Question used for the similarity search and later inserted into the prompt.
query_text = "What are the most beautiful destinations I could travel to considering the architecture?"

In [66]:
# Fetch up to 3 matches; each entry is unpacked later as a (document, score) pair.
results = db.similarity_search_with_relevance_scores(query_text, k=3)


In [67]:
# Show each match with its relevance score, or a notice when nothing matched.
if results:
    print("Results:\n")
    for doc, score in results:
        print(f"score: {score}\n, the city: {doc}\n--------\n")
else:
    print("Unable to find any matching results")

Results:

score: 0.17251878489324635
, the city: page_content='id: 784d64bc-ad12-4648-8094-07151548001d
city: Vienna
country: Austria
region: europe
short description: Grand architecture and cobblestone streets set a charming backdrop for cozy coffeehouses and vibrant cultural scenes, inviting leisurely exploration.
latitude: 48.2083537
longitude: 16.3725042
avg temp monthly: {"1":{"avg":2.5,"max":4.7,"min":0.5},"2":{"avg":5,"max":7.9,"min":2.5},"3":{"avg":7,"max":12.3,"min":3.6},"4":{"avg":12,"max":16.7,"min":7.8},"5":{"avg":16.2,"max":20.6,"min":12.2},"6":{"avg":21.6,"max":26.6,"min":16.9},"7":{"avg":23.3,"max":28.2,"min":18.7},"8":{"avg":23,"max":27.8,"min":18.7},"9":{"avg":18,"max":22.3,"min":14.3},"10":{"avg":12.7,"max":16.1,"min":10},"11":{"avg":7.3,"max":9.7,"min":5.2},"12":{"avg":3.9,"max":6,"min":2}}
ideal durations: ["One week","Short trip","Weekend"]
budget level: Mid-range
culture: 5
adventure: 2
nature: 3
beaches: 1
nightlife: 4
cuisine: 5
wellness: 4
urban: 5
seclusion: 2

## Craft response

In [68]:
# Prompt skeleton: {query} receives the user's question and {context} the
# retrieved document text when the template is formatted.
PROMPT_TEMPLATE = """
Answer the following question about travelling {query} based on the following context:
{context}
"""

In [69]:
# Build the template first, then fill it with the question and the text of
# every retrieved document (documents are separated by a '---' divider).
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
context_text = "\n\n---\n\n".join(doc.page_content for doc, _ in results)
prompt = prompt_template.format(context=context_text, query=query_text)

In [70]:
# Send the formatted prompt to the chat model. Despite its name, response_text
# is a reply object whose text is read from its .content attribute when printed.
model = ChatOpenAI(model="gpt-4o-mini")
response_text = model.invoke(prompt)

In [82]:
# Print the text of the model's reply, prefixed with a "Response: " label.
formatted_response = f"Response: {response_text.content}"
print(formatted_response)

Response: Considering architecture as the main focus of your travels, here are three beautiful destinations you could consider:

1. **Vienna, Austria**  
   - **Description**: Known for its grand architecture and charming cobblestone streets, Vienna offers a cozy atmosphere with vibrant cultural scenes and numerous coffeehouses. The city's rich history is reflected in its stunning buildings, including the Schönbrunn Palace and St. Stephen's Cathedral.
   - **Architecture Highlights**: Baroque and Gothic buildings, historic palaces, and modern designs coexist beautifully in the urban landscape.

2. **Moscow, Russia**  
   - **Description**: Moscow features a dynamic atmosphere where grand architectural landmarks connect history with modern life. The vibrant streets are filled with cultural experiences and stunning structures such as the Red Square, Kremlin, and Saint Basil's Cathedral.
   - **Architecture Highlights**: A mix of traditional Russian architecture with modern elements, show

In [87]:
# Collect (source file, city name, id) from the metadata of each retrieved document.
source_rows = [
    (
        doc.metadata.get("source_file"),
        doc.metadata.get("city_name"),
        doc.metadata.get("id"),
    )
    for doc, _score in results
]

# zip(*[]) yields nothing, so unpacking it into three names raises ValueError
# when the search returned no results; fall back to three empty tuples instead.
if source_rows:
    sources, city_name, city_id = zip(*source_rows)
else:
    sources, city_name, city_id = (), (), ()

# Iterate the rows directly and use a distinct loop name (row_id) so the
# module-level city_id tuple is not shadowed inside the generator.
formatted_sources = "Sources:\n" + "\n".join(
    f"{source}, City: {city}, City_id: {row_id}"
    for source, city, row_id in source_rows
)

print(formatted_sources)

Sources:
TravelDataset.csv, City: Vienna, City_id: 784d64bc-ad12-4648-8094-07151548001d
TravelDataset.csv, City: Moscow, City_id: 5e7318b5-3cc9-462e-b9df-7c6f7bec1885
TravelDataset.csv, City: Frankfurt, City_id: 9a4f8eec-e269-42bb-96bd-4c0e0a82df57
