Cell 1: Install Dependencies

In [1]:
# Install required libraries
# Each line below is a separate pip invocation, so dependencies are resolved
# per command rather than across the whole set of pins.
!pip install -q --upgrade --force-reinstall numpy==1.26.4
# PyTorch stack is taken from the CUDA 12.1 wheel index.
!pip install -q torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers==4.37.2 sentence-transformers==2.6.1
!pip install -q langchain==0.1.14 langchain-community==0.0.24
!pip install -q pydantic==1.10.13
# NOTE(review): the packages on the next line are installed without version pins.
!pip install -q faiss-cpu streamlit pypdf unstructured jq
# NOTE(review): langchain-huggingface is unpinned; presumably it can pull a
# langchain-core release that conflicts with the langchain==0.1.14 and
# pydantic==1.10.13 pins above — confirm against the resolver output.
!pip install -q langchain-huggingface



import importlib

# Expected versions of the pinned packages. ``None`` means "only check that
# the module imports"; no particular version is required.
packages = {
    'numpy': '1.26.4',
    'torch': '2.2.2',
    'transformers': '4.37.2',
    'sentence_transformers': '2.6.1',
    'langchain': '0.1.14',
    'faiss': None,
    'pydantic': '1.10.13'
}
for pkg, version in packages.items():
    try:
        module = importlib.import_module(pkg)
    except ImportError:
        print(f"❌ Failed to import {pkg}")
        continue
    installed = getattr(module, '__version__', None)
    if installed is None:
        # Not every module exposes __version__; report instead of raising AttributeError.
        print(f"✅ {pkg} imported (no __version__ attribute to verify)")
        continue
    # Wheels from the PyTorch index carry a local build suffix such as "+cu121";
    # compare only the public part so a correct install is not reported as a mismatch.
    public_version = str(installed).split('+')[0]
    if version and public_version != version:
        print(f"Warning: {pkg} version {installed} does not match expected {version}")
    else:
        print(f"✅ {pkg} is correctly installed with version {installed}")

# Check langchain_huggingface
try:
    from langchain_huggingface import HuggingFaceEmbeddings
    print("✅ Successfully imported langchain_huggingface")
except ImportError:
    print("❌ Failed to import langchain_huggingface")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.5.0 requires fsspec[http]<=2024.12.0,>=2023.1.0, but you have fsspec 2025.3.2 which is incompatible.
nilearn 0.11.1 requires scikit-learn>=1.4.0, but you have scikit-learn 1.2.2 which is incompatible.
google-colab 1.0.0 requires notebook==6.5.5, but you have notebook 6.5.4 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.15.2 which is incompatible.
dopamine-rl 4.1.2 requires gymnasium>=1.0.0, but you have gymna

Cell 2: Utility Functions

In [2]:
import re
import pandas as pd
import logging
import warnings

# Setup logging
# Root logger: INFO level, records formatted as "timestamp - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# RuntimeWarning messages are suppressed for the rest of the session.
warnings.filterwarnings(action='ignore', category=RuntimeWarning)

def clean_text(text: str, preserve_case: bool = False) -> str:
    """Normalise a free-text value for indexing.

    Missing values (``None`` / NaN) become an empty string. Otherwise the value
    is converted to ``str``, URLs and e-mail addresses are removed, every
    character other than word characters, whitespace, ``.``, ``,`` and ``-`` is
    dropped, and runs of whitespace are collapsed to a single space.

    Args:
        text: Value to clean; non-string values are stringified.
        preserve_case: When False (default) the result is lower-cased.

    Returns:
        The cleaned, stripped string.
    """
    if text is None or pd.isna(text):
        return ""

    cleaned = str(text)
    # Applied in order: strip URLs/e-mails, strip disallowed punctuation,
    # then collapse whitespace.
    substitutions = (
        (r'http\S+|www\S+|[\w\.-]+@[\w\.-]+', ''),
        (r'[^\w\s.,-]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    if preserve_case:
        return cleaned
    return cleaned.lower()

def extract_features(description: str, category: str) -> dict:
    """Infer dietary/spice flags for a menu item from its description.

    Both inputs are normalised with ``clean_text`` (lower-cased) and then
    matched against keyword lists using plain substring tests.

    Args:
        description: Free-text description of the item.
        category: Menu category; only used to decide whether the word "hot"
            counts as a spicy keyword (it does not for coffee categories).

    Returns:
        Dict with boolean keys ``vegetarian``, ``vegan``, ``gluten_free`` and
        ``spicy``.
    """
    description = clean_text(description, preserve_case=False)
    category = clean_text(category, preserve_case=False)
    features = {'vegetarian': False, 'vegan': False, 'gluten_free': False, 'spicy': False}

    def mentions(keywords) -> bool:
        """True when any keyword occurs as a substring of the description."""
        return any(keyword in description for keyword in keywords)

    vegetarian_keywords = ['vegetarian', 'vegan', 'plant-based', 'tofu', 'mushroom', 'eggplant', 'jackfruit', 'vegetable', 'amaranth', 'curry']
    vegan_keywords = ['vegan', 'plant-based', 'no dairy', 'coconut milk']
    non_veg_keywords = ['chicken', 'prawn', 'fish', 'mutton', 'pork', 'bacon', 'ham', 'choriz', 'beef', 'lamb']

    # Any meat/fish keyword disqualifies both the vegetarian and vegan flags.
    if not mentions(non_veg_keywords):
        if mentions(vegetarian_keywords):
            features['vegetarian'] = True
        # "eggplant" is itself a vegetarian keyword, so it must not be mistaken
        # for an egg ingredient when deciding whether the dish can be vegan.
        has_egg = 'egg' in description.replace('eggplant', '')
        if mentions(vegan_keywords) or (
            features['vegetarian'] and 'cheese' not in description and not has_egg
        ):
            features['vegan'] = True

    gluten_free_keywords = ['gluten-free', 'rice', 'bhakri', 'no wheat']
    if mentions(gluten_free_keywords):
        features['gluten_free'] = True

    spicy_keywords = ['spicy', 'chilli', 'masala', 'pepper', 'picante']
    # "hot" describes temperature rather than spice for coffee categories.
    if 'coffee' not in category:
        spicy_keywords.append('hot')
    if mentions(spicy_keywords):
        features['spicy'] = True
    return features

Cell 3: Data Preprocessing

In [3]:
import os
import uuid
from typing import List, Dict, Optional

def preprocess_menu_data(df: pd.DataFrame, column_mapping: Dict[str, str], restaurant_name: str, location: str) -> List[Dict]:
    """Turn a raw menu table into retrieval-ready records.

    Args:
        df: Raw menu table, one row per menu item.
        column_mapping: Maps the expected field names (``category``,
            ``item_name``, ``description``, ``price``) to the header used in
            ``df``. Headers are matched exactly first, then case-insensitively.
        restaurant_name: Name written into every record.
        location: Location string written into every record.

    Returns:
        A list of ``{'content': str, 'metadata': dict}`` records, one per unique
        menu row. An empty list is returned when ``df`` is ``None`` or empty.
    """
    if df is None or df.empty:
        logging.error("Menu DataFrame is empty or None")
        return []
    logging.info(f"Raw Menu DataFrame rows: {len(df)}")
    df_processed = df.copy()
    expected_columns = ['category', 'item_name', 'description', 'price']
    missing_columns = [col for col in expected_columns if col not in column_mapping]
    if missing_columns:
        logging.warning(f"Missing column mappings for {missing_columns}. Using empty strings.")
        for col in missing_columns:
            df_processed[col] = ''

    # Fallback lookup so a mapping such as 'category' still finds a 'Category'
    # header; built from the original frame so it only contains real headers.
    headers_by_lower = {str(header).lower(): header for header in df.columns}
    for expected_col, actual_col in column_mapping.items():
        if actual_col in df_processed.columns:
            source_col = actual_col
        else:
            source_col = headers_by_lower.get(str(actual_col).lower())
        if source_col is not None:
            df_processed[expected_col] = df_processed[source_col]
        else:
            logging.warning(f"Column '{actual_col}' not found in Menu DataFrame. Using empty strings.")
            df_processed[expected_col] = ''

    for col in ['category', 'item_name', 'description']:
        if col in df_processed.columns:
            # Item names keep their original casing; other text is lower-cased.
            keep_case = (col == 'item_name')
            df_processed[col] = df_processed[col].apply(
                lambda value, keep_case=keep_case: clean_text(value, preserve_case=keep_case)
            )

    if 'price' in df_processed.columns:
        # Prices are often parsed as numbers, where the .str accessor raises;
        # stringify value by value instead.
        df_processed['price'] = df_processed['price'].map(
            lambda value: 'Price not available' if pd.isna(value) else str(value).strip()
        )
    else:
        df_processed['price'] = 'Price not available'

    df_processed = df_processed.drop_duplicates(
        subset=['category', 'item_name', 'description', 'price'],
        keep='first',
        ignore_index=True
    )
    logging.info(f"After removing duplicates: {len(df_processed)}")

    processed_data = []
    for _, row in df_processed.iterrows():
        features = extract_features(row['description'], row['category'])
        features_text = ', '.join([name for name, enabled in features.items() if enabled])
        content = (
            f"Restaurant: {restaurant_name}\n"
            f"Location: {location}\n"
            f"Category: {row['category']}\n"
            f"Item: {row['item_name']}\n"
            f"Description: {row['description']}\n"
            f"Price: {row['price']}\n"
            f"Features: {features_text if features_text else 'None'}"
        )
        metadata = {
            'id': str(uuid.uuid4()),
            'restaurant_name': restaurant_name,
            'location': location,
            'category': row['category'],
            'item_name': row['item_name'],
            'price': row['price'],
            # Flags are stored as the strings 'true' / 'false'.
            'vegetarian': str(features['vegetarian']).lower(),
            'vegan': str(features['vegan']).lower(),
            'gluten_free': str(features['gluten_free']).lower(),
            'spicy': str(features['spicy']).lower()
        }
        processed_data.append({'content': content, 'metadata': metadata})
    return processed_data

def preprocess_info_data(df: pd.DataFrame, restaurant_name: str) -> Dict:
    """Build a single "restaurant info" record from the first row of ``df``.

    Columns are assigned to fields by keyword match on the header. E-mail
    headers are checked first, so "Email Address" or "Contact Email" is not
    taken for the address or phone field. When several columns match the same
    field, the last one wins.

    Values are only whitespace-normalised. They are deliberately not passed
    through ``clean_text``, which removes e-mail addresses and strips
    characters such as ``:``, ``+`` and parentheses that opening hours and
    phone numbers need.

    Args:
        df: Raw info table; only its first row is read.
        restaurant_name: Name written into the record.

    Returns:
        ``{'content': str, 'metadata': dict}``, or an empty dict when ``df`` is
        ``None`` or empty.
    """
    if df is None or df.empty:
        logging.error("Info DataFrame is empty or None")
        return {}
    logging.info(f"Raw Info DataFrame rows: {len(df)}")

    def _first_value(column) -> str:
        """First-row value of ``column`` as a whitespace-normalised string."""
        value = df[column].iloc[0]
        if value is None or pd.isna(value):
            return ''
        return ' '.join(str(value).split())

    info_dict = {
        'restaurant_name': restaurant_name,
        'address': '',
        'phone': '',
        'email': '',
        'opening_hours': ''
    }
    # Checked in order; the first field whose keywords match the header wins.
    field_keywords = (
        ('email', ('email', 'e-mail')),
        ('address', ('address', 'location')),
        ('phone', ('phone', 'contact')),
        ('opening_hours', ('hours', 'opening', 'operation')),
    )
    for col in df.columns:
        col_lower = str(col).lower()
        for field, keywords in field_keywords:
            if any(keyword in col_lower for keyword in keywords):
                info_dict[field] = _first_value(col)
                break

    content = (
        f"Restaurant: {restaurant_name}\n"
        f"Address: {info_dict['address']}\n"
        f"Phone: {info_dict['phone']}\n"
        f"Email: {info_dict['email']}\n"
        f"Opening Hours: {info_dict['opening_hours']}"
    )
    metadata = {
        'id': str(uuid.uuid4()),
        'restaurant_name': restaurant_name,
        'type': 'info'
    }
    return {'content': content, 'metadata': metadata}

Cell 4: Document Loading and Chunking

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

def _resolve_menu_columns(df_menu, default_mapping):
    """Map each expected menu field to the real header found in ``df_menu``.

    Matching is case-insensitive, but the header is returned exactly as it
    appears in the frame so it can be used to index it. Falls back to the
    default header name when nothing matches.
    """
    resolved = {}
    for expected_col, default_col in default_mapping.items():
        resolved[expected_col] = next(
            (
                col for col in df_menu.columns
                if expected_col in str(col).lower() or default_col.lower() in str(col).lower()
            ),
            default_col,
        )
    return resolved


def _read_restaurant_info(base_directory, restaurant_name):
    """Load ``<restaurant>_Info.csv`` (preferred) or ``<restaurant>_Info.xlsx``.

    Returns:
        ``(document, location)``, where ``document`` is a
        ``{'page_content', 'metadata'}`` dict or ``None`` when no usable info
        file exists, and ``location`` is the address line ('' when unknown).
    """
    readers = (
        (f"{restaurant_name}_Info.csv", lambda path: pd.read_csv(path, encoding='utf-8', engine='python')),
        (f"{restaurant_name}_Info.xlsx", pd.read_excel),
    )
    for file_name, reader in readers:
        info_path = os.path.join(base_directory, file_name)
        if not os.path.exists(info_path):
            continue
        info_data = preprocess_info_data(reader(info_path), restaurant_name)
        if not info_data:
            # An empty info table yields {}; the menu can still be indexed.
            return None, ''
        location = ''
        for line in info_data['content'].split('\n'):
            if line.startswith('Address: '):
                location = line[len('Address: '):]
                break
        document = {
            'page_content': info_data['content'],
            'metadata': info_data['metadata']
        }
        return document, location
    return None, ''


def load_and_process_documents(base_directory: str = "/kaggle/input/restaurant"):
    """Load every ``*_Menu.csv`` in ``base_directory`` and return chunked documents.

    For each menu file the matching ``<restaurant>_Info.csv`` / ``.xlsx`` (if
    any) is loaded first: it contributes one info document and the location
    used for the menu items. The restaurant name is the file name without the
    ``_Menu.csv`` suffix, so names containing underscores are preserved.

    Args:
        base_directory: Folder holding the ``*_Menu.csv`` and ``*_Info.*`` files.

    Returns:
        The documents produced by ``RecursiveCharacterTextSplitter``
        (chunk_size=512, chunk_overlap=30). An empty list is returned when the
        directory is missing or holds no CSV/Excel files.
    """
    documents = []
    menu_column_mapping = {
        'category': 'Category',
        'item_name': 'Item Name',
        'description': 'Description',
        'price': 'Price'
    }
    menu_suffix = '_Menu.csv'
    if not os.path.isdir(base_directory):
        logging.error(f"Data directory not found: {base_directory}")
        return []
    # Sorted so the document order does not depend on the filesystem.
    file_list = sorted(f for f in os.listdir(base_directory) if f.endswith(('.csv', '.xlsx')))
    if not file_list:
        logging.error(f"No CSV or Excel files found in {base_directory}")
        return []
    logging.info(f"Found {len(file_list)} files: {file_list}")
    for file in file_list:
        if not file.endswith(menu_suffix):
            continue
        file_path = os.path.join(base_directory, file)
        # Extract restaurant name from filename (remove the _Menu.csv suffix)
        restaurant_name = file[:-len(menu_suffix)]
        try:
            df_menu = pd.read_csv(file_path, encoding='utf-8', engine='python', keep_default_na=True, na_values=['', 'NaN'])
            dynamic_mapping = _resolve_menu_columns(df_menu, menu_column_mapping)
            try:
                info_document, location = _read_restaurant_info(base_directory, restaurant_name)
            except Exception as e:
                # A broken info file should not discard the restaurant's menu.
                logging.warning(f"Could not load info for {restaurant_name}: {str(e)}")
                info_document, location = None, ''
            if info_document is not None:
                documents.append(info_document)
            processed_menu = preprocess_menu_data(
                df=df_menu,
                column_mapping=dynamic_mapping,
                restaurant_name=restaurant_name,
                location=location
            )
            for item in processed_menu:
                documents.append({
                    'page_content': item['content'],
                    'metadata': item['metadata']
                })
            logging.info(f"Processed {len(processed_menu)} menu items from {restaurant_name}")
        except Exception as e:
            logging.error(f"Error processing menu for {restaurant_name}: {str(e)}")
    if not documents:
        logging.warning(f"No documents were produced from {base_directory}")
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30)
    # create_documents takes the texts and metadata directly, so no stand-in
    # Document objects have to be fabricated.
    chunked_docs = splitter.create_documents(
        [doc['page_content'] for doc in documents],
        metadatas=[doc['metadata'] for doc in documents],
    )
    return chunked_docs

# Load and process documents
# Runs at cell execution time against the function's default base directory;
# the resulting chunks are passed to create_vector_store() in the vector store cell.
chunked_docs = load_and_process_documents()

Cell 5: Vector Store Creation

In [7]:
# NOTE(review): unpinned upgrade. The recorded output shows it pulling
# langchain-community 0.3.21 together with langchain 0.3.23, which no longer
# matches the langchain==0.1.14 / langchain-community==0.0.24 pins from the
# install cell — confirm this upgrade is intended.
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain-community)
  Downloading langchain_core-0.3.54-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.23 (from langchain-community)
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain<1.0.0,>=0.3.23->langchain-community)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading

In [8]:
# Reset and install compatible versions of dependencies
# NOTE(review): torchvision/torchaudio from the install cell are not reinstalled
# here; the recorded output reports torchvision 0.17.2+cu121 requiring torch==2.2.2.
# NOTE(review): modules already imported in this kernel are presumably not
# reloaded after the reinstall, so a kernel restart is likely needed before the
# imports below — confirm.
!pip uninstall -y numpy torch transformers sentence-transformers langchain faiss-cpu
!pip install -q numpy==1.26.4 torch==2.5.1 transformers==4.44.2 sentence-transformers==2.7.0 langchain==0.2.10 faiss-cpu

# Verify installations
import importlib

# Expected versions after the reset; ``None`` means "import check only".
packages = {
    'numpy': '1.26.4',
    'torch': '2.5.1',
    'transformers': '4.44.2',
    'sentence_transformers': '2.7.0',
    'langchain': '0.2.10',
    'faiss': None
}
for pkg, version in packages.items():
    try:
        module = importlib.import_module(pkg)
    except ImportError as exc:
        print(f"Failed to import {pkg}. Installation may have failed.")
        # Stop the cell: nothing below can work without this package.
        raise ImportError(f"Failed to import {pkg}. Please restart the kernel and reinstall dependencies.") from exc
    if not version:
        print(f"Successfully imported {pkg}")
        continue
    installed = getattr(module, '__version__', None)
    if installed is None:
        # Report instead of raising AttributeError on modules without __version__.
        print(f"Warning: {pkg} does not expose __version__; cannot verify expected {version}")
        continue
    # Ignore a local build suffix such as "+cu121" when comparing versions.
    if str(installed).split('+')[0] != version:
        print(f"Warning: {pkg} version {installed} does not match expected {version}")
    else:
        print(f"Successfully verified {pkg} version {installed}")

# Import required modules
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import streamlit as st

@st.cache_resource
def create_vector_store(_chunked_docs, embedding_model="BAAI/bge-base-en-v1.5"):
    """Embed the chunked documents and build a FAISS index from them.

    Args:
        _chunked_docs: Documents to index. The leading underscore tells
            ``st.cache_resource`` not to hash this argument.
        embedding_model: Hugging Face model name passed to
            ``HuggingFaceEmbeddings``; defaults to the model previously
            hard-coded here.

    Returns:
        The FAISS vector store built from the documents.

    Raises:
        ValueError: If there are no documents to index.
    """
    if not _chunked_docs:
        # Fail with an actionable message before any embedding work starts.
        raise ValueError(
            "No documents to index: chunked_docs is empty. "
            "Check the data directory and the menu files."
        )
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    return FAISS.from_documents(_chunked_docs, embeddings)

# Create vector store
db = create_vector_store(chunked_docs)
# Retriever over the store: similarity search with k=4 results per query.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: torch 2.5.1
Uninstalling torch-2.5.1:
  Successfully uninstalled torch-2.5.1
Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Found existing installation: sentence-transformers 2.7.0
Uninstalling sentence-transformers-2.7.0:
  Successfully uninstalled sentence-transformers-2.7.0
Found existing installation: langchain 0.3.23
Uninstalling langchain-0.3.23:
  Successfully uninstalled langchain-0.3.23
Found existing installation: faiss-cpu 1.10.0
Uninstalling faiss-cpu-1.10.0:
  Successfully uninstalled faiss-cpu-1.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.17.2+cu121 requires torch==2.2.2, but you have torch 2.5.1 which 

2025-04-21 03:19:06.714 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]



RuntimeError: Failed to import transformers.models.aria.configuration_aria because of the following error (look up to see its traceback):
No module named 'transformers.models.aria.configuration_aria'

Cell 6: Model and Tokenizer Loading

In [6]:
# Install or verify bitsandbytes and triton
try:
    import bitsandbytes
    import triton
    print("bitsandbytes and triton are already installed.")
except ImportError:
    print("bitsandbytes or triton not found. Attempting to install...")
    # Install compatible versions
    !pip install -q --force-reinstall bitsandbytes==0.43.1 triton==2.3.0
    try:
        import bitsandbytes
        import triton
        print("Successfully installed bitsandbytes and triton.")
    except ImportError:
        print("bitsandbytes or triton installation failed. Falling back to model without 4-bit quantization...")
        # Fallback: Remove bitsandbytes dependency
        bnb_config = None

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

@st.cache_resource
def load_model_and_tokenizer(model_name, _bnb_config):
    if _bnb_config is not None:
        model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=_bnb_config)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)  # Load without quantization
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Load model and tokenizer
model_name = "HuggingFaceH4/zephyr-7b-beta"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
) if 'bnb_config' in locals() and bnb_config is not None else None
model, tokenizer = load_model_and_tokenizer(model_name, bnb_config)

if bnb_config is None:
    print("Warning: Running model without 4-bit quantization due to bitsandbytes/triton issues. Memory usage will be higher.")

bitsandbytes or triton not found. Attempting to install...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.2/779.2 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m:00:0

NameError: name 'st' is not defined

Cell 7: RAG Pipeline Setup

In [8]:
from langchain.prompts import PromptTemplate

# Define prompt template
# The template has two placeholders, {context} and {question}; run_model()
# fills both through prompt.format(...). The text ends with "Answer:" so the
# model's generation continues from there.
prompt_template = """
Answer the question based on the following context about restaurant menus and information. Provide accurate and helpful details about menu items, dietary options, prices, or restaurant details as requested. If the query is unclear or out of scope, politely ask for clarification or state that the information is unavailable.

Context:
{context}

Question:
{question}

Answer:
"""
# input_variables must list exactly the placeholders used in the template above.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

def run_model(inputs):
    """Generate an answer for one question and its retrieved context.

    Args:
        inputs: Dict with the keys ``'context'`` and ``'question'``.

    Returns:
        The text generated by the model, without the prompt, stripped of
        surrounding whitespace.
    """
    context = inputs['context']
    question = inputs['question']
    formatted_input = prompt.format(context=context, question=question)
    # Move the encoded prompt to the device the model lives on; generation
    # fails if the two are on different devices.
    encoded = tokenizer(formatted_input, return_tensors="pt").to(model.device)
    # Only max_new_tokens is passed: the prompt already contains the retrieved
    # context, so a total-length cap (max_length) would conflict with it.
    outputs = model.generate(**encoded, max_new_tokens=150)
    # The output sequence starts with the prompt tokens; decode only what the
    # model added so the caller does not get the prompt echoed back.
    prompt_length = encoded["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

def run_rag_chain(question):
    """Answer a restaurant question with retrieval-augmented generation.

    Two keyword gates run before any retrieval: clearly off-topic questions
    get a fixed refusal, and questions with no recognised restaurant keyword
    (including empty or non-string input) get a request for clarification.

    Args:
        question: The user's question.

    Returns:
        The fixed refusal/clarification message, or the model's response.
    """
    clarification = "Could you please clarify? For example, ask about menu items, prices, dietary options, or restaurant details."
    # Treat missing/blank input like an unclear question instead of raising.
    if not isinstance(question, str) or not question.strip():
        return clarification
    question_lower = question.lower()

    if any(keyword in question_lower for keyword in ['weather', 'news', 'stock']):
        return "Sorry, I can only assist with restaurant-related queries."

    # Superset of the original keyword list, extended with the dietary and
    # contact terms this notebook indexes, so questions such as
    # "Do you have vegan dishes?" reach the retriever.
    in_scope_keywords = [
        'what', 'which', 'price', 'diet', 'feature', 'compare', 'address', 'hours', 'contact',
        'menu', 'vegetarian', 'vegan', 'gluten', 'spicy', 'location', 'phone', 'email', 'open',
    ]
    if not any(word in question_lower for word in in_scope_keywords):
        return clarification

    context_docs = retriever.get_relevant_documents(question)
    context = "\n".join([doc.page_content for doc in context_docs])
    response = run_model({"context": context, "question": question})
    return response



RuntimeError: no validator found for <class 'langchain_core.prompts.prompt.PromptTemplate'>, see `arbitrary_types_allowed` in Config

Cell 8: Streamlit Interface

In [None]:
import streamlit as st

# Streamlit interface
# NOTE(review): the launch cell runs streamlit_interface.py, but no cell visible
# in this notebook writes this code (or run_rag_chain) to that file — confirm.

# Conversation history lives in session state so it survives script reruns.
if 'history' not in st.session_state:
    st.session_state.history = []

st.title("Restaurant RAG-based Chatbot")
st.write("Ask about menu items, dietary options, prices, or restaurant details (e.g., opening hours, address).")

question = st.text_input("Enter your question:")
if question:
    with st.spinner("Generating response..."):
        response = run_rag_chain(question)
        st.write(response)
    # Record the exchange once the answer has been produced and shown.
    st.session_state.history.append({"question": question, "response": response})

# Display conversation history (the five most recent exchanges)
st.subheader("Conversation History")
recent_entries = st.session_state.history[-5:]
for entry in recent_entries:
    st.write(f"**Q:** {entry['question']}")
    st.write(f"**A:** {entry['response']}")
    st.write("---")

Run Streamlit

In [None]:
# Install the localtunnel package that the npx command below runs.
!npm install localtunnel
# One shell line, three commands: start Streamlit in the background with its
# output redirected to /kaggle/working/logs.txt, start localtunnel on port 8501
# in the background, then fetch ipv4.icanhazip.com (presumably to print this
# machine's public IPv4 address — confirm).
# NOTE(review): streamlit_interface.py is not created by any cell visible in
# this notebook — confirm the file exists before running this cell.
!streamlit run streamlit_interface.py &>/kaggle/working/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com