In [1]:
import json
from typing import Dict, List

In [2]:
def preprocess_travel_data(data: Dict) -> List[Dict]:
    """
    Convert nested travel JSON into flat documents suitable for ChromaDB.

    Args:
        data: Mapping of destination name to an info dict. Each info dict is
            read for 'BestFor' (joined with ', ' and also kept as-is),
            'Duration', and 'Iternary' (the key is spelled this way in the
            lookup; a mapping of day -> dict with optional 'Morning',
            'Afternoon' and 'Evening' entries). 'State' is optional and
            defaults to 'N/A'.

    Returns:
        One dict per destination with the keys 'destination', 'best_for',
        'state', 'duration', 'interests', 'content' and 'itinerary_summary'.

    Raises:
        KeyError: If an info dict lacks 'BestFor', 'Duration' or 'Iternary'.
    """
    documents = []

    for destination, info in data.items():
        state = info.get('State', 'N/A')
        best_for = ', '.join(info['BestFor'])
        duration = info['Duration']

        # One "Day N: Morning - ..., Afternoon - ..., Evening - ..." fragment
        # per day; a missing time slot is rendered as an empty string.
        itinerary_summary = ' '.join(
            f"Day {day}: "
            f"Morning - {activities.get('Morning', '')}, "
            f"Afternoon - {activities.get('Afternoon', '')}, "
            f"Evening - {activities.get('Evening', '')}"
            for day, activities in info['Iternary'].items()
        )

        # Build the searchable text line by line so that no source-code
        # indentation leaks into the text that gets embedded.
        content = '\n'.join([
            f"Destination: {destination}",
            f"State: {state}",
            f"Best for: {best_for}",
            f"Recommended Duration: {duration} days",
            "",
            "Itinerary:",
            itinerary_summary,
        ]).strip()

        documents.append({
            'destination': destination,
            'best_for': best_for,
            'state': state,
            'duration': duration,
            'interests': info['BestFor'],  # Keep as list for filtering
            'content': content,
            'itinerary_summary': itinerary_summary,
        })

    return documents

In [3]:
# Load the raw destination data. Decode explicitly as UTF-8 so non-ASCII
# place names are read the same way on every platform, instead of relying
# on the locale's default encoding.
with open('./data/Output.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

In [4]:
# Flatten the nested JSON into one document dict per destination
processed_docs = preprocess_travel_data(data=raw_data)

## ChromaDB Ingestion with Metadata

In [5]:
#!pip install -U langchain-community

In [6]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from typing import List

In [7]:
class TravelDataIngestion:
    """
    Build and reload a persisted Chroma vector store of travel destinations.

    Embeddings come from the HuggingFace "all-MiniLM-L6-v2" model; the store
    is written to / read from ``persist_directory``.
    """

    def __init__(self, persist_directory="./chroma_travel_db"):
        """
        Args:
            persist_directory: Directory handed to Chroma for persistence.
        """
        self.embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        self.persist_directory = persist_directory
        # Set by ingest_data() or load_existing().
        self.vectorstore = None

    def ingest_data(self, processed_docs: List[Dict]):
        """
        Ingest processed travel data into ChromaDB.

        Each document is stored under a deterministic id derived from its
        destination name. Without explicit ids the Chroma wrapper assigns
        fresh random ids, so re-running this method against the same
        ``persist_directory`` would append a second copy of every
        destination; stable ids let a re-run overwrite the existing entries.

        Args:
            processed_docs: Dicts with the keys 'content', 'destination',
                'duration', 'state', 'best_for', 'interests' (iterable of
                strings) and 'itinerary_summary'.

        Returns:
            The Chroma vector store, also kept on ``self.vectorstore``.

        Raises:
            KeyError: If a document is missing one of the keys above.
        """
        documents = []
        ids = []
        used_ids = set()

        for doc in processed_docs:
            # Create LangChain Document with metadata
            langchain_doc = Document(
                page_content=doc['content'],
                metadata={
                    'destination': doc['destination'],
                    'duration': doc['duration'],
                    'state': doc['state'],
                    'best_for': doc['best_for'],  # String for filtering
                    'interests': ','.join(doc['interests']),  # Comma-separated
                    'itinerary_summary': doc['itinerary_summary']
                }
            )
            documents.append(langchain_doc)

            # Stable id per destination; ids must be unique within one batch,
            # so a repeated destination name gets a numeric suffix.
            base_id = str(doc['destination'])
            doc_id = base_id
            suffix = 1
            while doc_id in used_ids:
                suffix += 1
                doc_id = f"{base_id}#{suffix}"
            used_ids.add(doc_id)
            ids.append(doc_id)

        # Create ChromaDB vectorstore
        self.vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            ids=ids,
            persist_directory=self.persist_directory
        )

        print(f"✅ Ingested {len(documents)} destinations into ChromaDB")
        return self.vectorstore

    def load_existing(self):
        """
        Load existing vectorstore from ``persist_directory``.

        Returns:
            The Chroma vector store, also kept on ``self.vectorstore``.
        """
        self.vectorstore = Chroma(
            persist_directory=self.persist_directory,
            embedding_function=self.embeddings
        )
        return self.vectorstore

In [8]:
# Build the vector store from the flattened destination documents
ingestion = TravelDataIngestion()
vectorstore = ingestion.ingest_data(processed_docs=processed_docs)

  self.embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


✅ Ingested 168 destinations into ChromaDB


## Summary:

#### Transformation Process
We converted nested, semi-structured JSON into flat, searchable documents in three steps:
- Text Content Generation: Combined all fields into a coherent, natural language paragraph
- Metadata Extraction: Separated structured data for filtering
- Itinerary Summarization: Flattened daily activities into searchable text