# Data Preparation

## Import libraries

In [2]:
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os
import csv

## Load the data

In [14]:
DATA_PATH = "./data/TravelDataset.csv"  # Relative path to the travel dataset CSV inside the local 'data' directory

def read_city_information(headers, row):
    """
    Turn one CSV row into readable text plus a header -> value lookup.

    Headers and values are paired positionally (extra items on the longer
    side are ignored). The text has one "header: value" line per pair, with
    underscores in the header shown as spaces; the lookup keeps the original
    header spelling as its keys.

    Returns a (page_content, row_data) tuple.
    """
    pairs = list(zip(headers, row))

    # Keys stay un-prettified so callers can look up columns such as 'city'.
    row_data = dict(pairs)

    page_content = "\n".join(
        f"{column.replace('_', ' ')}: {cell}" for column, cell in pairs
    )
    return page_content, row_data

def load_and_process_csv_into_documents(file_path: str) -> list[Document]:
    """
    Load a CSV file and build one LangChain Document per data row.

    The first row is treated as the header. Every following non-blank row
    becomes a Document whose page_content is the "column: value" text built by
    read_city_information, and whose metadata holds the source file name, the
    row number, the city name and (when present and non-empty) the row's id.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The Documents in file order, or an empty list (after printing a
        warning) when the file has no header row.
    """
    documents = []

    # newline='' is required by the csv module so that line breaks inside
    # quoted fields are parsed as part of the field. 'utf-8-sig' strips a
    # leading byte-order mark if one is present (otherwise it behaves like
    # 'utf-8'), so the first column name is not prefixed with U+FEFF and
    # lookups such as row_data.get('id') keep working.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
        csv_reader = csv.reader(csvfile)

        # Read the header row
        try:
            headers = next(csv_reader)
        except StopIteration:
            print(f"Warning: CSV file '{file_path}' is empty.")
            return []

        # Same for every row, so compute it once.
        source_file = os.path.basename(file_path)

        # Process each data row
        for i, row in enumerate(csv_reader):
            # csv.reader yields an empty list for a blank line; skip it rather
            # than emit an empty "Unknown City" document. The row still counts
            # towards row_number because enumerate has already advanced.
            if not row:
                continue

            page_content, row_data = read_city_information(headers, row)

            # Extract city name for metadata (assuming 'city' is one of your columns)
            city_name = row_data.get('city', 'Unknown City')
            document_id = row_data.get('id')  # Assuming 'id' is a unique identifier

            metadata = {
                "source_file": source_file,  # Name of the CSV file
                # 1-based CSV row: the header is row 1, so the first data row
                # (i == 0) is row 2.
                "row_number": i + 2,
                "city_name": city_name,
                "document_type": "city_data",
            }
            if document_id:
                metadata["id"] = document_id  # Only added when non-empty

            documents.append(
                Document(page_content=page_content, metadata=metadata)
            )

    return documents

In [15]:
# Parse the CSV at DATA_PATH into a list of Document objects, one per data row.
processed_documents = load_and_process_csv_into_documents(DATA_PATH)

In [16]:
def display_documents_info(documents: list[Document], num_to_display: int = 3):
    """
    Print a short preview of the given documents (chunks).

    Shows the total count, then the content and metadata of at most
    `num_to_display` documents from the start of the list, followed by a
    note when some documents were left out.
    """
    total = len(documents)
    print(f"Total documents (chunks) created: {total}")

    for index, document in enumerate(documents):
        if index >= num_to_display:
            break
        # One call, three lines: chunk banner, content, metadata.
        print(
            f"\n--- Chunk {index + 1} ---",
            f"Content:\n{document.page_content}",
            f"Metadata: {document.metadata}",
            sep="\n",
        )

    if total > num_to_display:
        print(f"\n... (displaying first {num_to_display} chunks only)")


In [17]:
# Preview the first three documents to sanity-check their content and metadata.
display_documents_info(processed_documents, num_to_display=3)

Total documents (chunks) created: 560

--- Chunk 1 ---
Content:
id: c54acf38-3029-496b-8c7a-8343ad82785c
city: Milan
country: Italy
region: europe
short description: Chic streets lined with fashion boutiques, historic architecture, and lively piazzas create a sophisticated yet welcoming atmosphere, perfect for leisurely exploration.
latitude: 45.4641943
longitude: 9.1896346
avg temp monthly: {"1":{"avg":3.7,"max":7.8,"min":0.4},"2":{"avg":7.1,"max":12,"min":2.8},"3":{"avg":10.5,"max":15.5,"min":5.5},"4":{"avg":13.8,"max":18.9,"min":8.7},"5":{"avg":17.9,"max":22.5,"min":13.4},"6":{"avg":23.5,"max":28.5,"min":18.1},"7":{"avg":25.8,"max":30.8,"min":20.5},"8":{"avg":25.2,"max":30.4,"min":20.2},"9":{"avg":20.8,"max":26,"min":16.1},"10":{"avg":15.2,"max":19.6,"min":11.5},"11":{"avg":8.8,"max":12.5,"min":5.6},"12":{"avg":4.7,"max":8.2,"min":1.9}}
ideal durations: ["Short trip","One week"]
budget level: Luxury
culture: 5
adventure: 2
nature: 2
beaches: 1
nightlife: 4
cuisine: 5
wellness: 3
urb

## Create vector database