In [1]:
!pip install chromadb==0.4.24 sentence-transformers transformers joblib

Collecting chromadb==0.4.24
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb==0.4.24)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb==0.4.24)
  Downloading chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb==0.4.24)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb==0.4.24)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb==0.4.24)
  Downloading posthog-3.25.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting pulsar-client>=3.1.0 (from chromadb==0.4.24)
  Downloading pulsar_client-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting onnxruntime>=1.14.1 (from chromadb==0.4.24)
  Downloading onnxrunt

In [2]:
import pandas as pd
import re
import os
from typing import List, Dict, Optional
import uuid
import warnings
import logging
import joblib
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
from transformers import T5Tokenizer, T5ForConditionalGeneration
import shutil

# Logging and warning configuration for the whole notebook:
# INFO-and-above records are emitted with a timestamp and level prefix, and
# RuntimeWarnings raised from pandas' formatting module are silenced.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
warnings.filterwarnings(
    action='ignore',
    module='pandas.io.formats.format',
    category=RuntimeWarning,
)

def clean_text(text: str, preserve_case: bool = False) -> str:
    """Normalize a free-text value into a compact, punctuation-light string.

    Args:
        text: Value to clean. ``None`` and pandas missing values yield ``""``;
            anything else is converted with ``str()`` first.
        preserve_case: When True the original casing is kept; otherwise the
            result is lower-cased.

    Returns:
        The text with URL-like tokens (``http...``, ``www...``) and e-mail-like
        tokens removed, every character other than word characters,
        whitespace, ``.``, ``,`` and ``-`` dropped, and whitespace runs
        collapsed to single spaces with no leading/trailing whitespace.
    """
    if text is None or pd.isna(text):
        return ""

    # Applied in order: strip URLs/e-mails, strip symbols, collapse whitespace.
    substitutions = (
        (r'http\S+|www\S+|[\w\.-]+@[\w\.-]+', ''),
        (r'[^\w\s.,-]', ''),
        (r'\s+', ' '),
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    if preserve_case:
        return cleaned
    return cleaned.lower()

def extract_features(description: str, category: str) -> Dict[str, bool]:
    """Infer dietary/taste flags for a menu item from keyword heuristics.

    Both inputs are normalized with ``clean_text`` (lower-cased) before
    matching. Keywords are matched as substrings, except for a few short
    ones (``ham``, ``hot``, ``rice``) that are matched as whole words because
    they otherwise fire inside unrelated words such as "champagne", "shot"
    or "price".

    Args:
        description: Free-text description of the item.
        category: Category label of the item; when it contains "coffee" the
            word "hot" is not treated as a spiciness hint.

    Returns:
        Dict with boolean keys ``vegetarian``, ``vegan``, ``gluten_free`` and
        ``spicy``. ``vegan`` being True always implies ``vegetarian`` is True.
    """
    description = clean_text(description, preserve_case=False)
    category = clean_text(category, preserve_case=False)

    whole_word_keywords = {'ham', 'hot', 'rice'}

    def mentions(keywords: List[str]) -> bool:
        """Return True when the description mentions any of the keywords."""
        for keyword in keywords:
            if keyword in whole_word_keywords:
                if re.search(r'\b' + re.escape(keyword) + r'\b', description):
                    return True
            elif keyword in description:
                return True
        return False

    features = {'vegetarian': False, 'vegan': False, 'gluten_free': False, 'spicy': False}
    vegetarian_keywords = ['vegetarian', 'vegan', 'plant-based', 'tofu', 'mushroom', 'eggplant', 'jackfruit', 'vegetable', 'amaranth', 'curry']
    vegan_keywords = ['vegan', 'plant-based', 'no dairy', 'coconut milk']
    non_veg_keywords = ['chicken', 'prawn', 'fish', 'mutton', 'pork', 'bacon', 'ham', 'choriz', 'beef', 'lamb']

    # Any meat/seafood keyword rules out both vegetarian and vegan.
    if not mentions(non_veg_keywords):
        explicitly_vegan = mentions(vegan_keywords)
        # A vegan dish is vegetarian by definition, so keep the flags consistent.
        features['vegetarian'] = explicitly_vegan or mentions(vegetarian_keywords)
        # Whole-word match: a plain 'egg' substring test would also reject
        # "eggplant", which is itself one of the vegetarian keywords.
        has_egg = re.search(r'\beggs?\b', description) is not None
        has_cheese = 'cheese' in description
        features['vegan'] = explicitly_vegan or (
            features['vegetarian'] and not has_cheese and not has_egg
        )

    gluten_free_keywords = ['gluten-free', 'rice', 'bhakri', 'no wheat']
    features['gluten_free'] = mentions(gluten_free_keywords)

    spicy_keywords = ['spicy', 'chilli', 'masala', 'pepper', 'picante']
    # "hot" describes temperature, not heat level, for coffee items.
    if 'coffee' not in category:
        spicy_keywords.append('hot')
    features['spicy'] = mentions(spicy_keywords)

    return features

def preprocess_menu_data(df: pd.DataFrame, column_mapping: Dict[str, str], restaurant_name: str = "Unknown", location: str = "Unknown") -> List[Dict]:
    """Convert one raw menu DataFrame into a list of menu-item records.

    Steps: map source columns onto the canonical names (``section``,
    ``category``, ``item_name``, ``description``, ``price``), log rows that
    are duplicates after normalization, drop exact duplicate rows, clean the
    text columns, derive feature flags and a per-row ``combined_text``.

    Args:
        df: Raw menu table. ``None`` or an empty frame yields ``[]``.
        column_mapping: Maps canonical column name -> column name in ``df``.
            Canonical names absent from the mapping, and mapped columns
            absent from ``df``, are filled with empty strings.
        restaurant_name: Stored verbatim on every record.
        location: Stored verbatim on every record.

    Returns:
        One dict per remaining row with keys ``id`` (random UUID string),
        ``restaurant_name``, ``location``, ``section``, ``category``,
        ``item_name``, ``description``, ``price``, ``features`` and
        ``combined_text``.
    """
    if df is None or df.empty:
        logging.error("DataFrame is empty or None")
        return []
    logging.info(f"Raw DataFrame rows: {len(df)}")
    logging.info("First 5 rows of raw DataFrame:\n" + df.head().to_string())

    expected_columns = ['section', 'category', 'item_name', 'description', 'price']
    text_columns = ['section', 'category', 'item_name', 'description']
    df_processed = df.copy()

    # Always apply the mappings that were provided; previously a single
    # missing mapping key caused all of them to be skipped.
    for expected_col, actual_col in column_mapping.items():
        if actual_col in df_processed.columns:
            df_processed[expected_col] = df_processed[actual_col]
        else:
            logging.warning(f"Column '{actual_col}' not found in DataFrame. Using empty strings.")
            df_processed[expected_col] = ''
    missing_columns = [col for col in expected_columns if col not in column_mapping]
    if missing_columns:
        logging.warning(f"Missing column mappings for {missing_columns}. Using empty strings.")
        for col in missing_columns:
            df_processed[col] = ''

    # Diagnostic only: report rows that become identical once normalized.
    df_normalized = df_processed.copy()
    for col in expected_columns:
        df_normalized[col] = df_normalized[col].map(clean_text)
    duplicates = df_normalized[df_normalized.duplicated(subset=expected_columns, keep=False)]
    if not duplicates.empty:
        logging.warning(f"Found {len(duplicates)} normalized duplicate rows:\n" + duplicates.head().to_string())

    df_processed = df_processed.drop_duplicates(
        subset=expected_columns,
        keep='first',
        ignore_index=True
    )
    logging.info(f"After removing duplicates: {len(df_processed)}")

    # Item and section names keep their casing for display; the rest is lower-cased.
    case_preserved = {'item_name', 'section'}
    for col in text_columns:
        keep_case = col in case_preserved
        df_processed[col] = df_processed[col].map(
            lambda value, keep_case=keep_case: clean_text(value, preserve_case=keep_case)
        )

    def format_price(value) -> str:
        """Render a price cell as stripped text, with a placeholder when missing/blank."""
        if pd.isna(value):
            return 'Price not available'
        # str() first so numeric price columns do not break on .strip().
        text = str(value).strip()
        return text or 'Price not available'

    df_processed['price'] = df_processed['price'].map(format_price)

    df_processed['Features'] = [
        extract_features(description, category)
        for description, category in zip(df_processed['description'], df_processed['category'])
    ]

    # Per-row list of enabled feature names. Joining the whole column into one
    # string would append every row's features to every row's text.
    feature_text = df_processed['Features'].map(
        lambda flags: ' '.join(name for name, enabled in flags.items() if enabled)
    )
    df_processed['combined_text'] = (
        df_processed['item_name'] + ' ' +
        df_processed['description'] + ' ' +
        df_processed['category'] + ' ' +
        df_processed['section'] + ' ' +
        feature_text
    ).apply(clean_text)

    processed_data = []
    for row in df_processed.to_dict(orient='records'):
        processed_data.append({
            'id': str(uuid.uuid4()),
            'restaurant_name': restaurant_name,
            'location': location,
            'section': row['section'],
            'category': row['category'],
            'item_name': row['item_name'],
            'description': row['description'],
            'price': row['price'],
            'features': row['Features'],
            'combined_text': row['combined_text']
        })
    return processed_data

def preprocess_all_csvs(directory: str = "/kaggle/input/restaurant", default_column_mapping: Optional[Dict[str, str]] = None, restaurant_name: str = "Unknown", location: str = "Unknown") -> List[Dict]:
    """Preprocess every CSV file in ``directory`` and cache the combined result.

    Each file is read with pandas and passed to ``preprocess_menu_data``; the
    restaurant name of its records is ``"{restaurant_name}_{file stem}"``.
    A file that fails to load or process is logged and skipped.

    When at least one record was produced, the records are also written to
    ``combined_processed_data.csv`` (current directory) and to
    ``/kaggle/working/cache/preprocessed_data.pkl``. Failure to write either
    file is logged and does not discard the processed records.

    Args:
        directory: Folder scanned (non-recursively) for ``.csv`` files.
        default_column_mapping: Canonical name -> CSV column name. Defaults to
            ``Section`` / ``Category`` / ``Item Name`` / ``Description`` / ``Price``.
        restaurant_name: Prefix for the per-file restaurant name.
        location: Stored on every record.

    Returns:
        All records from all files, or ``[]`` when the directory is missing
        or contains no CSV files.
    """
    if default_column_mapping is None:
        default_column_mapping = {
            'section': 'Section',
            'category': 'Category',
            'item_name': 'Item Name',
            'description': 'Description',
            'price': 'Price'
        }

    if not os.path.isdir(directory):
        logging.error(f"Input directory not found: {directory}")
        return []

    # Sorted so the record order does not depend on the file system's listing order.
    csv_files = sorted(f for f in os.listdir(directory) if f.lower().endswith('.csv'))
    if not csv_files:
        logging.error(f"No CSV files found in {directory}")
        return []
    logging.info(f"Found {len(csv_files)} CSV files: {csv_files}")

    all_processed_data = []
    for csv_file in csv_files:
        file_path = os.path.join(directory, csv_file)
        logging.info(f"Processing {file_path}")
        try:
            df = pd.read_csv(
                file_path,
                encoding='utf-8',
                engine='python',
                keep_default_na=True,
                na_values=['', 'NaN']
            )
            # splitext removes only the extension; str.replace would also
            # strip '.csv' occurring in the middle of a file name.
            file_stem = os.path.splitext(csv_file)[0]
            processed_data = preprocess_menu_data(
                df=df,
                column_mapping=default_column_mapping,
                restaurant_name=f"{restaurant_name}_{file_stem}",
                location=location
            )
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            logging.error(f"Error processing {csv_file}: {str(e)}")
            continue
        if processed_data:
            all_processed_data.extend(processed_data)
            logging.info(f"Successfully processed {len(processed_data)} items from {csv_file}")
        else:
            logging.warning(f"No data processed from {csv_file}")

    logging.info(f"Total processed items from all CSVs: {len(all_processed_data)}")
    if all_processed_data:
        try:
            pd.DataFrame(all_processed_data).to_csv('combined_processed_data.csv', index=False)
            os.makedirs('/kaggle/working/cache', exist_ok=True)
            cache_file = '/kaggle/working/cache/preprocessed_data.pkl'
            joblib.dump(all_processed_data, cache_file)
            logging.info(f"Saved preprocessed data to {cache_file}")
        except OSError as e:
            # The in-memory result is still valid even if it cannot be persisted.
            logging.warning(f"Could not persist preprocessed data: {e}")
    return all_processed_data

# Single source of truth for the on-disk Chroma store location.
CHROMA_PATH = "/kaggle/working/chroma/"

# Wipe whatever a previous run left behind so the client starts from an
# empty store; a missing directory is not an error.
shutil.rmtree(CHROMA_PATH, ignore_errors=True)
logging.info("Cleaned up existing Chroma data directory")

# Module-level client shared by the indexer defined further down.
client = chromadb.PersistentClient(path=CHROMA_PATH)
logging.info("Initialized Chroma client with persistent storage")


2025-04-21 03:34:13.586660: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745206453.800081      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745206453.859706      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
class MenuIndexer:
    """Index menu records in a ChromaDB collection and search them by text.

    The collection is created on the module-level ``client`` and rebuilt from
    scratch every time an instance is constructed.
    """

    COLLECTION_NAME = "menu_knowledge_base"
    EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
    # Boolean flags are stored in the metadata as the strings 'true'/'false'.
    FEATURE_KEYS = ('vegetarian', 'vegan', 'gluten_free', 'spicy')
    # Metadata keys that can be filtered by exact value.
    EXACT_MATCH_KEYS = ('category', 'item_name')
    # Records are added in chunks so one call never carries an unbounded batch.
    ADD_BATCH_SIZE = 1000

    def __init__(self, processed_data: list):
        """Store the records and build the collection immediately.

        Args:
            processed_data: Menu-item dicts; each needs ``id``,
                ``combined_text``, ``features`` and the metadata fields read
                in ``build_index``.

        Raises:
            ValueError: If ``processed_data`` is empty.
        """
        self.data = processed_data
        # Kept as a public attribute; the collection itself embeds documents
        # through the embedding function configured in build_index.
        self.embedding_model = SentenceTransformer(self.EMBEDDING_MODEL_NAME)
        self.client = client  # Use the global client
        self.collection = None
        self.build_index()

    def build_index(self):
        """Recreate the collection and load every record into it.

        Raises:
            ValueError: If there are no records to index.
        """
        if not self.data:
            raise ValueError("No data to index")

        # Drop a collection left over from an earlier build; a failure here
        # (typically: nothing to delete) is not fatal.
        try:
            self.client.delete_collection(name=self.COLLECTION_NAME)
            logging.info("Deleted existing collection to avoid conflicts")
        except Exception as exc:
            logging.debug(f"No existing collection removed: {exc}")

        sentence_embed = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.EMBEDDING_MODEL_NAME
        )
        self.collection = self.client.get_or_create_collection(
            name=self.COLLECTION_NAME,
            embedding_function=sentence_embed
        )

        ids = [item['id'] for item in self.data]
        texts = [item['combined_text'] for item in self.data]
        metadata = [{
            'restaurant_name': item['restaurant_name'],
            'location': item['location'],
            'section': item['section'],
            'category': item['category'],
            'item_name': item['item_name'],
            'description': item['description'],
            'price': item['price'],
            'vegetarian': str(item['features']['vegetarian']).lower(),
            'vegan': str(item['features']['vegan']).lower(),
            'gluten_free': str(item['features']['gluten_free']).lower(),
            'spicy': str(item['features']['spicy']).lower()
        } for item in self.data]

        for start in range(0, len(ids), self.ADD_BATCH_SIZE):
            stop = start + self.ADD_BATCH_SIZE
            self.collection.add(
                ids=ids[start:stop],
                documents=texts[start:stop],
                metadatas=metadata[start:stop]
            )
        logging.info(f"ChromaDB collection built with {len(self.data)} items")

    def search(self, query: str, top_k: int = 5, filters: Optional[Dict] = None) -> list:
        """Return the records most similar to ``query``, best match first.

        Args:
            query: Free-text search string.
            top_k: Maximum number of results requested from the collection.
            filters: Optional metadata constraints. Feature keys
                (``vegetarian``, ``vegan``, ``gluten_free``, ``spicy``) are
                compared as lower-cased strings; ``category`` and
                ``item_name`` are compared by exact value; other keys are
                ignored. Multiple constraints are combined with ``$and``.

        Returns:
            Shallow copies of the matching records, in the order returned by
            the collection, each with an added float ``distance`` key. The
            records held in ``self.data`` are not modified.
        """
        conditions = []
        for key, value in (filters or {}).items():
            if key in self.FEATURE_KEYS:
                conditions.append({key: str(value).lower()})
            elif key in self.EXACT_MATCH_KEYS:
                conditions.append({key: value})

        if len(conditions) > 1:
            where_clause = {"$and": conditions}
        elif conditions:
            where_clause = conditions[0]
        else:
            where_clause = None

        results = self.collection.query(
            query_texts=[query],
            n_results=top_k,
            where=where_clause
        )

        retrieved_ids = results['ids'][0]
        distances = results['distances'][0]

        # Look records up by id in the order the collection returned them, so
        # each record is paired with its own distance and relevance order is kept.
        records_by_id = {item['id']: item for item in self.data}
        retrieved_items = []
        for item_id, dist in zip(retrieved_ids, distances):
            record = records_by_id.get(item_id)
            if record is None:
                continue
            hit = dict(record)
            hit['distance'] = float(dist)
            retrieved_items.append(hit)
        return retrieved_items

class MenuRAGChatbot:
    """RAG-based chatbot: retrieves menu items from an indexer and answers with FLAN-T5."""

    # Token budget for the prompt passed to the model.
    MAX_INPUT_TOKENS = 512
    # Spellings that switch on each feature filter when found in a query.
    _FEATURE_TERMS = {
        'vegetarian': ('vegetarian',),
        'vegan': ('vegan',),
        'gluten_free': ('gluten_free', 'gluten-free', 'gluten free', 'glutenfree'),
        'spicy': ('spicy',),
    }
    # "$10", "$10-20", "$10 - $20", "$10 to $20", with optional decimals.
    _PRICE_RANGE_RE = re.compile(
        r'\$\s*(\d+(?:\.\d+)?)(?:\s*(?:-|to)\s*\$?\s*(\d+(?:\.\d+)?))?'
    )
    _OFF_TOPIC_KEYWORDS = ('weather', 'news', 'stock')
    _INTENT_KEYWORDS = ('what', 'which', 'price', 'diet', 'feature', 'compare')

    def __init__(self, indexer):
        """Load the FLAN-T5 tokenizer/model and start with an empty history.

        Args:
            indexer: Object exposing ``search(query, top_k=..., filters=...)``.
        """
        self.indexer = indexer
        self.tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base')
        self.model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')
        self.history = []
        self.max_history = 5

    def add_to_history(self, query: str, response: str):
        """Append one exchange, discarding the oldest beyond ``max_history``."""
        self.history.append({'query': query, 'response': response})
        if len(self.history) > self.max_history:
            self.history.pop(0)

    @staticmethod
    def _parse_price(price) -> Optional[float]:
        """Return the first number found in a price string, or None if there is none."""
        match = re.search(r'\d+(?:\.\d+)?', str(price).replace(',', ''))
        return float(match.group()) if match else None

    def retrieve(self, query: str, filters: Optional[Dict] = None) -> list:
        """Retrieve relevant menu items based on query and filters.

        Feature words found in the query add ``True`` filters, and the word
        "espresso" adds an ``item_name`` filter. These are merged on top of
        the caller's ``filters`` (query-derived values win on conflicts).
        When the query contains a dollar amount or range, ten candidates are
        fetched and only those whose price parses to a number inside the
        range are returned; a single amount ``$N`` is read as ``N`` to ``N + 10``.

        Args:
            query: User query; pass it with ``$`` signs intact for price ranges.
            filters: Optional extra metadata filters.

        Returns:
            List of menu-item dicts as returned by the indexer.
        """
        lowered = query.lower()
        # NOTE(review): a negated request such as "not spicy" still enables the filter.
        query_filters = {
            feature: True
            for feature, terms in self._FEATURE_TERMS.items()
            if any(term in lowered for term in terms)
        }
        # NOTE(review): the indexer compares item_name by exact value, so this
        # only matches items named exactly 'espresso' — confirm intent.
        if 'espresso' in lowered:
            query_filters['item_name'] = 'espresso'

        merged_filters = {**(filters or {}), **query_filters}
        active_filters = merged_filters or None

        price_range = self._PRICE_RANGE_RE.search(query)
        if price_range:
            low_text, high_text = price_range.groups()
            min_price = float(low_text)
            max_price = float(high_text) if high_text else min_price + 10
            if max_price < min_price:
                min_price, max_price = max_price, min_price
            # Price filtering is done here, after retrieval, on the price text.
            candidates = self.indexer.search(query, top_k=10, filters=active_filters)
            in_range = []
            for item in candidates:
                amount = self._parse_price(item['price'])
                if amount is not None and min_price <= amount <= max_price:
                    in_range.append(item)
            return in_range

        return self.indexer.search(query, top_k=5, filters=active_filters)

    def _build_prompt(self, query: str, items: list, include_history: bool) -> str:
        """Assemble the model prompt from menu items, optional history and the query."""
        context = "\n".join([
            f"{item['item_name']} ({item['restaurant_name']}): {item['description']} "
            f"(Price: {item['price']}, Features: {', '.join([k for k, v in item['features'].items() if v])})"
            for item in items
        ])
        if include_history:
            history_context = "\n".join(
                [f"Previous query: {h['query']}, Response: {h['response']}" for h in self.history[-2:]]
            )
            if history_context:
                context = f"{context}\nPrevious context:\n{history_context}"
        return f"Answer the query based on the following menu items:\n{context}\n\nQuery: {query}\nAnswer:"

    def generate_response(self, query: str, retrieved_items: list) -> str:
        """Generate a response based on retrieved items.

        The query sits at the end of the prompt, so tail truncation would cut
        it off. To avoid that, the history and then the last-listed items are
        dropped until the prompt fits ``MAX_INPUT_TOKENS`` (at least one item
        is always kept; tokenizer truncation remains as a final safety net).

        Args:
            query: The user's question.
            retrieved_items: Menu-item dicts to use as context.

        Returns:
            The decoded model output, or a fixed apology when there are no items.
        """
        if not retrieved_items:
            return "Sorry, I couldn't find any relevant menu items for your query. Try asking about specific dishes or dietary preferences!"

        items = list(retrieved_items)
        include_history = bool(self.history)
        prompt = self._build_prompt(query, items, include_history)
        while len(self.tokenizer(prompt)['input_ids']) > self.MAX_INPUT_TOKENS:
            if include_history:
                include_history = False
            elif len(items) > 1:
                items.pop()
            else:
                break
            prompt = self._build_prompt(query, items, include_history)

        inputs = self.tokenizer(
            prompt, return_tensors='pt', max_length=self.MAX_INPUT_TOKENS, truncation=True
        )
        outputs = self.model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=200,
            num_beams=5,
            early_stopping=True
        )
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

    def handle_query(self, query: str) -> str:
        """Answer one user query, or return a fixed message when it is out of scope.

        The cleaned query is used for keyword checks, generation and history.
        The raw query is passed to ``retrieve`` because cleaning strips the
        ``$`` sign that price-range detection relies on.

        Args:
            query: Raw user input.

        Returns:
            The generated answer, an off-topic notice, or a clarification request.
        """
        raw_query = "" if query is None else str(query)
        cleaned = clean_text(raw_query)
        if any(keyword in cleaned for keyword in self._OFF_TOPIC_KEYWORDS):
            return "Sorry, I can only assist with menu-related queries."

        # Accept anything retrieve() can act on: an intent word, a dietary
        # feature word, or a dollar amount.
        has_intent = any(word in cleaned for word in self._INTENT_KEYWORDS)
        has_feature = any(
            term in cleaned for terms in self._FEATURE_TERMS.values() for term in terms
        )
        has_price = self._PRICE_RANGE_RE.search(raw_query) is not None
        if not (has_intent or has_feature or has_price):
            return "Could you please clarify? For example, ask about menu items, prices, or dietary options."

        retrieved_items = self.retrieve(raw_query)
        response = self.generate_response(cleaned, retrieved_items)
        self.add_to_history(cleaned, response)
        return response

    def get_history(self) -> list:
        """Return a shallow copy of the stored exchanges, oldest first."""
        return self.history.copy()


In [None]:
def main():
    """Run the interactive menu chatbot.

    Loads preprocessed records from the joblib cache when possible, otherwise
    (cache missing, unreadable or empty) rebuilds them from the CSV files.
    Then builds the index and the chatbot and answers queries read from
    standard input until the user types 'exit' or input ends.
    """
    # Load cached preprocessed data or process if not available
    cache_file = '/kaggle/working/cache/preprocessed_data.pkl'
    try:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code — only load cache files produced by this notebook.
        processed_data = joblib.load(cache_file)
        logging.info(f"Loaded preprocessed data from {cache_file}")
    except Exception as e:
        logging.warning(f"Cache load failed: {e}. Processing CSVs...")
        processed_data = None

    # An empty cache is treated like a missing one, so the indexer never
    # receives an empty list.
    if not processed_data:
        processed_data = preprocess_all_csvs(
            directory="/kaggle/input/restaurant",
            default_column_mapping=None,
            restaurant_name="O Pedra",
            location="Unknown"
        )
    if not processed_data:
        logging.error("No data processed. Exiting.")
        return

    # Initialize indexer
    indexer = MenuIndexer(processed_data)

    # Initialize chatbot
    chatbot = MenuRAGChatbot(indexer)

    # Interactive mode
    while True:
        try:
            user_input = input("Enter your query (or 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupted cell ends the session cleanly.
            break
        if user_input.lower() == 'exit':
            break
        response = chatbot.handle_query(user_input)
        print(f"Response: {response}")
        print(f"History: {chatbot.get_history()}")

if __name__ == "__main__":
    main()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Enter your query (or 'exit' to quit):  Suggest some good restaurants


Response: Could you please clarify? For example, ask about menu items, prices, or dietary options.
History: []


Enter your query (or 'exit' to quit):  Do you know O Pedra?


Response: Could you please clarify? For example, ask about menu items, prices, or dietary options.
History: []


Enter your query (or 'exit' to quit):  Where is O pedro?


Response: Could you please clarify? For example, ask about menu items, prices, or dietary options.
History: []


Enter your query (or 'exit' to quit):  Veronica's Coffee


Response: Could you please clarify? For example, ask about menu items, prices, or dietary options.
History: []
