# Wilmington Restaurant Recommendation Chatbot


# Install Dependencies

In [None]:
!pip3 install transformers sentence-transformers -q



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


# Import Libraries

In [None]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

## Prepare a Dataset

In [None]:
# Build one flat table in which every Yelp review row carries the details of
# the business it was written about, then save that table as a CSV.

# Locations of the two raw Yelp dumps and of the CSV this cell writes
business_path = "C:/Users/User/Desktop/yelp_dataset/yelp_academic_dataset_business.json"
review_path = "C:/Users/User/Desktop/yelp_dataset/yelp_academic_dataset_review.json"
output_csv_path = "yelp_business_reviews_combined.csv"

# Both files are read as JSON Lines (lines=True: one record per line)
print("Loading business.json...")
business_df = pd.read_json(business_path, lines=True)
print("Loading review.json...")
review_df = pd.read_json(review_path, lines=True)

print("Business rows:", len(business_df))
print("Review rows:", len(review_df))

# Business attributes that are carried into the join
business_cols_to_keep = [
    "business_id", "name", "address", "city", "state", "postal_code",
    "latitude", "longitude", "stars", "review_count", "is_open",
    "categories", "hours",
]
business_small = business_df.loc[:, business_cols_to_keep]

# Inner join on business_id. The overlapping "stars" column comes out of the
# merge as stars_x (review rating) and stars_y (business rating).
print("Merging review and business data...")
merged_df = pd.merge(review_df, business_small, how="inner", on="business_id")
print("Merged rows:", len(merged_df))

# Columns written to the final CSV, in output order
rag_cols = [
    "review_id", "user_id", "business_id",
    "name", "address", "city", "state", "postal_code", "categories",
    "stars_x", "stars_y",
    "text", "date", "useful", "funny", "cool",
]

# Select those columns and give the two star columns self-explanatory names
merged_df = merged_df.loc[:, rag_cols].rename(
    columns={"stars_x": "review_stars", "stars_y": "business_stars"}
)

print(f"Saving combined CSV to: {output_csv_path}")
merged_df.to_csv(output_csv_path, index=False)
print("Done! CSV ready for RAG chatbot.")

## Generate Dense Embeddings and Build an Index

In [None]:
# Read the review table from CSV into `df` and preview the first rows
print("Loading dataset...")
csv_path = "C:/Users/User/Desktop/yelp_rag_ready.csv"
df = pd.read_csv(csv_path)

print(f"Loaded rows: {len(df)}")
df.head()

Loading dataset...
Loaded rows: 6990280


Unnamed: 0,business_id,review_stars,text,useful,funny,cool,name,address,city,state,review_count,business_stars,categories
0,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",0,0,0,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,169,3.0,"Restaurants, Breakfast & Brunch, Food, Juice B..."
1,7ATYjTIgM3jUlt4UM3IypQ,5,I've taken a lot of spin classes over the year...,1,0,1,Body Cycle Spinning Studio,"1923 Chestnut St, 2nd Fl",Philadelphia,PA,144,5.0,"Active Life, Cycling Classes, Trainers, Gyms, ..."
2,YjUWPpI6HXG530lwP-fb2A,3,Family diner. Had the buffet. Eclectic assortm...,0,0,0,Kettle Restaurant,748 W Starr Pass Blvd,Tucson,AZ,47,3.5,"Restaurants, Breakfast & Brunch"
3,kxX2SOes4o-D3ZQBkiMRfA,5,"Wow! Yummy, different, delicious. Our favo...",1,0,1,Zaika,2481 Grant Ave,Philadelphia,PA,181,4.0,"Halal, Pakistani, Restaurants, Indian"
4,e4Vwtrqf-wpJfwesgvdgxQ,4,Cute interior and owner (?) gave us tour of up...,1,0,1,Melt,2549 Banks St,New Orleans,LA,32,4.0,"Sandwiches, Beer, Wine & Spirits, Bars, Food, ..."


In [None]:
# Explore which cities the dataset covers, so that a subset with a review
# count suitable for this project's scope can be chosen.
unique_cities = pd.unique(df["city"].dropna())   # distinct, non-missing city names
city_counts = df["city"].value_counts()          # reviews per city, largest first

print(f"Total rows loaded: {len(df)}")
print("="*60)

print("🔹 Unique City Names (first 50 shown):")
print(f"Total number of unique cities: {len(unique_cities)}")
print(unique_cities[:50])   # preview only
print("="*60)

print("🔹 Review Counts per City (Top 20):")
print(city_counts.head(20))
print("="*60)

Total rows loaded: 6990280
🔹 Unique City Names (first 50 shown):
Total number of unique cities: 1416
['North Wales' 'Philadelphia' 'Tucson' 'New Orleans' 'Quakertown'
 'Santa Barbara' 'Tampa' 'Indianapolis' 'St Louis' 'Saint Louis' 'Carmel'
 'Carpinteria' 'Speedway' 'Phoenixville' 'Broomall' 'Nashville'
 'Eddystone' 'Goleta' 'Norristown' 'Reno' 'Lutz' 'Port Richey' 'Edmonton'
 'Penns Grove' 'Granite City' 'Meridian' 'Clearwater' 'Florissant'
 'Kenner' 'Willow Grove' 'Land O Lakes' 'Boise' 'Bensalem' 'Manchester'
 'West Chester' 'Fishers' 'Safety Harbor' 'Metairie' 'Sparks'
 'Saint Petersburg' 'Deptford' 'Malvern' 'Garden City' 'Mt. Laurel'
 'Newtown' 'Brookhaven' 'Riverview' 'Hermitage' 'Bryn Mawr' 'Springfield']
🔹 Review Counts per City (Top 20):
city
Philadelphia        967552
New Orleans         635364
Tampa               454889
Nashville           451571
Tucson              404880
Indianapolis        361489
Reno                351573
Santa Barbara       269630
Saint Louis         2

In [None]:
# --- 1. Keep only reviews whose city mentions Wilmington --------------------
# Case-insensitive substring match; rows with a missing city are excluded.
df_wilmington = df.loc[
    df["city"].str.contains("Wilmington", case=False, na=False)
].copy()

# Discard reviews whose text is missing or whitespace-only
df_wilmington = df_wilmington.dropna(subset=["text"])
df_wilmington = df_wilmington.loc[df_wilmington["text"].str.strip().ne("")]

print("Total Wilmington rows (all categories):", len(df_wilmington))

# --- 2. Keep food-and-drink businesses only ---------------------------------
# NOTE(review): the pattern is an unanchored, case-insensitive substring match,
# so e.g. "Bar" would also match a category such as "Barbers" — confirm that
# this breadth is intended.
restaurant_pattern = r"Restaurant|Food|Diner|Cafe|Pizza|Bar|Brewery|Pub|Bakery"

df_wilmington_restaurants = df_wilmington.loc[
    df_wilmington["categories"].str.contains(restaurant_pattern, case=False, na=False)
].copy()

print("Wilmington restaurant rows:", len(df_wilmington_restaurants))

# --- 3. Embed every remaining review ----------------------------------------
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
documents = df_wilmington_restaurants["text"].to_list()
embeddings_wilmington = embedding_model.encode(
    documents, batch_size=64, show_progress_bar=True
)

# --- 4. Store one vector (as a plain list) per row and persist --------------
df_wilmington_restaurants["embedding"] = embeddings_wilmington.tolist()
df_wilmington_restaurants.to_pickle("yelp_wilmington_restaurants_with_embeddings.pkl")

print("Saved Wilmington RESTAURANT subset with new embeddings.")


Total Wilmington rows (all categories): 44978
Wilmington restaurant rows: 34791


Batches:   0%|          | 0/544 [00:00<?, ?it/s]

Saved Wilmington RESTAURANT subset with new embeddings.


In [None]:
# Reload the Wilmington restaurant pickle written by the previous cell and
# preview its first rows (the `embedding` column holds one vector per review).
df_w = pd.read_pickle("yelp_wilmington_restaurants_with_embeddings.pkl")
df_w.head()

Unnamed: 0,business_id,review_stars,text,useful,funny,cool,name,address,city,state,review_count,business_stars,categories,embedding
364,W6S482dRQ_9nCcLjfv8VNg,1,Food was great. But very disappointed with the...,0,0,0,Bawarchi Indian Restaurant,2909 Concord Pike,Wilmington,DE,101,3.5,"Indian, Chinese, Restaurants","[0.04765835404396057, 0.08650010079145432, 0.0..."
436,OlOVRlcEstxKK_6hMjy63A,3,"One of the few diners in North Wilmington, and...",0,0,0,Hollywood Grill,1811 Concord Pike,Wilmington,DE,190,3.5,"Restaurants, Diners","[0.10098085552453995, -0.06393218785524368, 0...."
1103,5IFqqWJTaPNoY2ZgRlX73w,5,My boyfriend and I have lived in trolley squar...,0,0,0,Cafe Verdi Restaurant,"12A Trolley Sq, Ste A",Wilmington,DE,44,3.0,"Restaurants, Italian, Pizza","[-0.04978679120540619, 0.05157990753650665, 0...."
1298,ixsljpxu56VCTtayIELqUg,3,I went back here and either I didn't remember ...,0,0,0,Tokyo Sushi,3 S Orange St,Wilmington,DE,17,3.5,"Sushi Bars, Restaurants","[-0.008040367625653744, 0.03498610854148865, 0..."
1836,ixsljpxu56VCTtayIELqUg,4,This is my favorite in Delaware. The menu is ...,7,0,0,Tokyo Sushi,3 S Orange St,Wilmington,DE,17,3.5,"Sushi Bars, Restaurants","[-0.045511193573474884, 0.011481113731861115, ..."


In [None]:
# Load the CSV created by the group into `our_df` and preview it
group_csv_path = "C:/Users/User/Desktop/IAT360_Dataset_by_group.csv"
print("Loading dataset...")

our_df = pd.read_csv(group_csv_path)

# Report the size of the frame that was just loaded. This used to print
# len(df), i.e. the size of whatever unrelated frame `df` held at the time.
print(f"Loaded rows: {len(our_df)}")
our_df.head()

Loading dataset...
Loaded rows: 60


Unnamed: 0,business_id,review_stars,text,useful,funny,cool,name,address,city,state,review_count,business_stars,categories
0,W6S482dRQ_9nCcLjfv8VNg,2,I have been to this location a number of times...,0,0,0,Bawarchi Indian Restaurant,2909 Concord Pike,Wilmington,DE,2,3.5,"Indian, Chinese, Restaurants"
1,W6S482dRQ_9nCcLjfv8VNg,1,I'm very disappointed with the food I received...,0,0,0,Bawarchi Indian Restaurant,2910 Concord Pike,Wilmington,DE,2,3.5,"Indian, Chinese, Restaurants"
2,25btSWnOGwwB2IQMXF7A-A,5,I love Trader Joe's. They have THE BEST snacks...,0,0,0,Trader Joe's,5605 Concord Pike,Wilmington,DE,2,4.5,"Grocery, Flowers & Gifts, Beer, Wine & Spirits..."
3,25btSWnOGwwB2IQMXF7A-A,5,Nice store with lots of unique finds. Fresh pr...,0,0,0,Trader Joe's,5605 Concord Pike,Wilmington,DE,2,4.5,"Grocery, Flowers & Gifts, Beer, Wine & Spirits..."
4,5IFqqWJTaPNoY2ZgRlX73w,5,The latte art is always impressive and adds a ...,0,0,0,Cafe Verdi Restaurant,"12A Trolley Sq, Ste A",Wilmington,DE,2,3.0,"Restaurants, Italian, Pizza"


In [None]:
# Embed the reviews of the group-created CSV and save the result as a pickle.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Every review is cast to str before encoding
our_documents = list(our_df["text"].astype(str))
our_embeddings = embedding_model.encode(
    our_documents, batch_size=64, show_progress_bar=True
)

# One embedding (as a plain list) per row
our_df["embedding"] = our_embeddings.tolist()

output_path = "IAT360_Dataset_by_group_with_embeddings.pkl"
our_df.to_pickle(output_path)
print(f"Saved dataset with embeddings to: {output_path}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Saved dataset with embeddings to: IAT360_Dataset_by_group_with_embeddings.pkl


In [None]:
# Reload the group dataset pickle (with embeddings) and preview it.
# Note: this rebinds `df_w`, which an earlier cell used for the Wilmington
# restaurant subset.
df_w = pd.read_pickle("IAT360_Dataset_by_group_with_embeddings.pkl")
df_w.head()

Unnamed: 0,business_id,review_stars,text,useful,funny,cool,name,address,city,state,review_count,business_stars,categories,embedding
0,W6S482dRQ_9nCcLjfv8VNg,2,I have been to this location a number of times...,0,0,0,Bawarchi Indian Restaurant,2909 Concord Pike,Wilmington,DE,2,3.5,"Indian, Chinese, Restaurants","[-0.0034387933555990458, -0.011476054787635803..."
1,W6S482dRQ_9nCcLjfv8VNg,1,I'm very disappointed with the food I received...,0,0,0,Bawarchi Indian Restaurant,2910 Concord Pike,Wilmington,DE,2,3.5,"Indian, Chinese, Restaurants","[0.013557997532188892, 0.010199910029768944, 0..."
2,25btSWnOGwwB2IQMXF7A-A,5,I love Trader Joe's. They have THE BEST snacks...,0,0,0,Trader Joe's,5605 Concord Pike,Wilmington,DE,2,4.5,"Grocery, Flowers & Gifts, Beer, Wine & Spirits...","[-0.06484764814376831, 0.031708601862192154, 0..."
3,25btSWnOGwwB2IQMXF7A-A,5,Nice store with lots of unique finds. Fresh pr...,0,0,0,Trader Joe's,5605 Concord Pike,Wilmington,DE,2,4.5,"Grocery, Flowers & Gifts, Beer, Wine & Spirits...","[-0.06595531105995178, 0.057717423886060715, 0..."
4,5IFqqWJTaPNoY2ZgRlX73w,5,The latte art is always impressive and adds a ...,0,0,0,Cafe Verdi Restaurant,"12A Trolley Sq, Ste A",Wilmington,DE,2,3.0,"Restaurants, Italian, Pizza","[-0.04427032545208931, -0.01786855049431324, -..."


In [None]:
# Load the Wilmington restaurant subset (with embeddings) used for retrieval.
# The relative path is the one the embedding cell of this notebook writes to;
# the previous user-specific absolute path exists on one machine only.
embedding_path = "yelp_wilmington_restaurants_with_embeddings.pkl"
df_w = pd.read_pickle(embedding_path)
df_w.head()

Unnamed: 0,business_id,review_stars,text,useful,funny,cool,name,address,city,state,review_count,business_stars,categories,embedding
364,W6S482dRQ_9nCcLjfv8VNg,1,Food was great. But very disappointed with the...,0,0,0,Bawarchi Indian Restaurant,2909 Concord Pike,Wilmington,DE,101,3.5,"Indian, Chinese, Restaurants","[0.04765835404396057, 0.08650010079145432, 0.0..."
436,OlOVRlcEstxKK_6hMjy63A,3,"One of the few diners in North Wilmington, and...",0,0,0,Hollywood Grill,1811 Concord Pike,Wilmington,DE,190,3.5,"Restaurants, Diners","[0.10098085552453995, -0.06393218785524368, 0...."
1103,5IFqqWJTaPNoY2ZgRlX73w,5,My boyfriend and I have lived in trolley squar...,0,0,0,Cafe Verdi Restaurant,"12A Trolley Sq, Ste A",Wilmington,DE,44,3.0,"Restaurants, Italian, Pizza","[-0.04978679120540619, 0.05157990753650665, 0...."
1298,ixsljpxu56VCTtayIELqUg,3,I went back here and either I didn't remember ...,0,0,0,Tokyo Sushi,3 S Orange St,Wilmington,DE,17,3.5,"Sushi Bars, Restaurants","[-0.008040367625653744, 0.03498610854148865, 0..."
1836,ixsljpxu56VCTtayIELqUg,4,This is my favorite in Delaware. The menu is ...,7,0,0,Tokyo Sushi,3 S Orange St,Wilmington,DE,17,3.5,"Sushi Bars, Restaurants","[-0.045511193573474884, 0.011481113731861115, ..."


In [None]:
# Retrieval frame: a copy of df_w with two alias columns appended.
#   Document  – the review text
#   Embedding – the embedding vector of that review
df = df_w.assign(Document=df_w["text"], Embedding=df_w["embedding"])

# Quick check of the columns that retrieval results will show
df[["name", "city", "categories", "business_stars", "Document"]].head()

Unnamed: 0,name,city,categories,business_stars,Document
364,Bawarchi Indian Restaurant,Wilmington,"Indian, Chinese, Restaurants",3.5,Food was great. But very disappointed with the...
436,Hollywood Grill,Wilmington,"Restaurants, Diners",3.5,"One of the few diners in North Wilmington, and..."
1103,Cafe Verdi Restaurant,Wilmington,"Restaurants, Italian, Pizza",3.0,My boyfriend and I have lived in trolley squar...
1298,Tokyo Sushi,Wilmington,"Sushi Bars, Restaurants",3.5,I went back here and either I didn't remember ...
1836,Tokyo Sushi,Wilmington,"Sushi Bars, Restaurants",3.5,This is my favorite in Delaware. The menu is ...


In [None]:
# Sentence-embedding model used to encode user queries at retrieval time
# (same model name as the one used to embed the stored reviews).
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Text-generation pipeline called by generate_with_rag() and by the final
# RAG / no-RAG comparison. `llm` was referenced but never defined in this
# notebook, so a fresh "Restart & Run All" stopped with a NameError.
# NOTE(review): the saved output reports eos_token_id 50256, which suggests a
# GPT-2-family model; "gpt2" is an assumption — confirm the intended model.
llm = pipeline("text-generation", model="gpt2")

In [None]:
def retrieve_with_pandas(query: str, top_k: int = 5) -> pd.DataFrame:
    """Return the restaurants whose best-matching review is closest to ``query``.

    Reads the notebook-level ``df`` (review text in ``Document``, its embedding
    vector in ``Embedding``) and ``embedding_model``.

    Args:
        query: Free-text user request.
        top_k: Maximum number of distinct restaurant names to return.

    Returns:
        DataFrame with the columns Similarity, name, address, city,
        business_stars, categories and Document, sorted by descending
        Similarity, with at most one row per restaurant name. Each row is the
        single highest-scoring review of that restaurant.
    """
    cols_to_show = [
        "Similarity",
        "name",
        "address",
        "city",
        "business_stars",
        "categories",
        "Document",
    ]
    meta_cols = cols_to_show[1:]

    # Rows without a name cannot be attributed to a restaurant (the previous
    # groupby("name") dropped them as well).
    reviews = df.dropna(subset=["name"])
    if reviews.empty:
        return pd.DataFrame(columns=cols_to_show)

    query_embedding = np.asarray(embedding_model.encode([query])[0], dtype=float)

    # Cosine similarity of the query against every review in one matrix product
    # instead of a Python-level apply() per row.
    review_matrix = np.vstack(reviews["Embedding"].to_numpy())
    denominator = (
        np.linalg.norm(query_embedding) * np.linalg.norm(review_matrix, axis=1) + 1e-10
    )
    similarities = (review_matrix @ query_embedding) / denominator

    # Keep the best review per restaurant. drop_duplicates keeps whole rows,
    # whereas groupby(...).first() takes the first non-null value of each column
    # independently and could therefore combine fields from different reviews.
    top_restaurants = (
        reviews[meta_cols]
        .assign(Similarity=similarities)
        .sort_values(by="Similarity", ascending=False, kind="stable")
        .drop_duplicates(subset="name", keep="first")
        .head(top_k)
        .reset_index(drop=True)
    )
    return top_restaurants[cols_to_show]

In [None]:
def print_retrieved_restaurants(results: pd.DataFrame):
    """Print a numbered, human-readable summary of retrieved restaurants.

    Each row of ``results`` becomes four lines (headline with similarity and
    stars, address, categories, review snippet) followed by a blank line.
    The first row is tagged as the top recommendation. ``results`` must have
    the columns ``Similarity`` and ``Document``; the other fields fall back to
    placeholder text when their column is absent.
    """
    for rank, (_, row) in enumerate(results.iterrows(), start=1):
        similarity = round(row["Similarity"], 3)

        name = row.get("name", "Unknown name")
        address = row.get("address", "Unknown address")
        city = row.get("city", "Unknown city")
        stars = row.get("business_stars", "N/A")
        categories = row.get("categories", "Unknown categories")

        # First 220 characters of the review, flattened onto a single line
        snippet = str(row["Document"])[:220].replace("\n", " ")

        # Only the best match carries the tag
        tag = " (TOP RECOMMENDATION)" if rank == 1 else ""

        print(
            f"{rank}. {name}{tag} (similarity={similarity}, stars={stars})",
            f"   {address}, {city}",
            f"   Categories: {categories}",
            f"   Review snippet: {snippet}...\n",
            sep="\n",
        )

In [None]:
def generate_with_rag(query: str, top_k: int = 5, max_length: int = 250) -> str:
    """Answer ``query`` with the language model, grounded in retrieved reviews.

    Retrieves the ``top_k`` best-matching restaurants with
    ``retrieve_with_pandas``, formats them as an EVIDENCE section, and asks the
    notebook-level ``llm`` pipeline to complete the prompt.

    Args:
        query: Free-text user request.
        top_k: Number of restaurants to place in the evidence section.
        max_length: Upper bound on the number of newly generated tokens. It is
            forwarded as ``max_new_tokens``: the saved run shows that a
            ``max_length`` argument is overridden by the pipeline's own
            ``max_new_tokens`` setting, so the caller's value had no effect.

    Returns:
        The ``generated_text`` of the first returned sequence.
        NOTE(review): the saved no-RAG output starts with the query itself,
        which suggests the pipeline echoes the prompt in ``generated_text`` —
        confirm whether the prompt should be stripped for display.
    """
    retrieved = retrieve_with_pandas(query, top_k=top_k)

    # One evidence entry per retrieved restaurant
    context_entries = [
        (
            f"Restaurant: {row['name']}\n"
            f"Address: {row.get('address', 'Unknown')}, {row.get('city', '')}\n"
            f"Stars: {row.get('business_stars', 'N/A')}\n"
            f"Categories: {row.get('categories', '')}\n"
            f"Relevant Review: {row['Document']}\n"
        )
        for _, row in retrieved.iterrows()
    ]
    context_text = "\n---\n".join(context_entries)

    prompt = (
        "You are a restaurant recommendation assistant. "
        "Use ONLY the following evidence from Yelp reviews to recommend restaurants. "
        "List restaurants by name and give short explanations using the evidence.\n\n"
        f"EVIDENCE:\n{context_text}\n\n"
        f"USER QUERY: {query}\n\n"
        "ASSISTANT ANSWER:"
    )

    # max_new_tokens limits the generated continuation only, so the cap stays
    # meaningful however long the evidence makes the prompt.
    outputs = llm(prompt, max_new_tokens=max_length, num_return_sequences=1)
    return outputs[0]["generated_text"]

# Step 3: Testing

In [None]:
# Retrieval-only check: show which restaurants the retriever returns for one
# sample question. `query` stays defined for later cells.
query = "Recommend a good sushi restaurant in Wilmington with high ratings."
results = retrieve_with_pandas(query, top_k=5)

print("USER QUERY:", query, sep="\n")
print("\nTOP MATCHED RESTAURANTS:\n")
print_retrieved_restaurants(results)

USER QUERY:
Recommend a good sushi restaurant in Wilmington with high ratings.

TOP MATCHED RESTAURANTS:

1. Le Shio (TOP RECOMMENDATION) (similarity=0.863, stars=3.5)
   2303 Concord Pike, Wilmington
   Categories: Restaurants, Asian Fusion
   Review snippet: So far the best sushi in Wilmington I have tried. The relation between quality and price is great....

2. Mikimotos Japanese Restaurant & Sushi Bar (similarity=0.852, stars=3.5)
   1212 N Washington St, Wilmington
   Categories: Nightlife, Japanese, Sushi Bars, Restaurants, Bars
   Review snippet: One of my favorite sushi places in the wilmington area.   Sushi was extremely fresh and services was excellent. I highly recommend the chef's choice as it gives you a great variety of sashmi and sushi (69$ for 2). I was ...

3. Tokyo Sushi (similarity=0.849, stars=3.5)
   3 S Orange St, Wilmington
   Categories: Sushi Bars, Restaurants
   Review snippet: This is my favorite go-to sushi place in the Wilmington area. Good , delicious , cl

# Step 4: Testing with self-created dataset

In [None]:
# Load our self-created dataset with embeddings.
# The relative path is the file the group-dataset embedding cell of this
# notebook saves; the previous user-specific absolute path exists on one
# machine only.
group_dataset_path = "IAT360_Dataset_by_group_with_embeddings.pkl"

df_group = pd.read_pickle(group_dataset_path)
df_group.head()

Unnamed: 0,business_id,review_stars,text,useful,funny,cool,name,address,city,state,review_count,business_stars,categories,embedding
0,W6S482dRQ_9nCcLjfv8VNg,2,I have been to this location a number of times...,0,0,0,Bawarchi Indian Restaurant,2909 Concord Pike,Wilmington,DE,2,3.5,"Indian, Chinese, Restaurants","[-0.0034387933555990458, -0.011476054787635803..."
1,W6S482dRQ_9nCcLjfv8VNg,1,I'm very disappointed with the food I received...,0,0,0,Bawarchi Indian Restaurant,2910 Concord Pike,Wilmington,DE,2,3.5,"Indian, Chinese, Restaurants","[0.013557997532188892, 0.010199910029768944, 0..."
2,25btSWnOGwwB2IQMXF7A-A,5,I love Trader Joe's. They have THE BEST snacks...,0,0,0,Trader Joe's,5605 Concord Pike,Wilmington,DE,2,4.5,"Grocery, Flowers & Gifts, Beer, Wine & Spirits...","[-0.06484764814376831, 0.031708601862192154, 0..."
3,25btSWnOGwwB2IQMXF7A-A,5,Nice store with lots of unique finds. Fresh pr...,0,0,0,Trader Joe's,5605 Concord Pike,Wilmington,DE,2,4.5,"Grocery, Flowers & Gifts, Beer, Wine & Spirits...","[-0.06595531105995178, 0.057717423886060715, 0..."
4,5IFqqWJTaPNoY2ZgRlX73w,5,The latte art is always impressive and adds a ...,0,0,0,Cafe Verdi Restaurant,"12A Trolley Sq, Ste A",Wilmington,DE,2,3.0,"Restaurants, Italian, Pizza","[-0.04427032545208931, -0.01786855049431324, -..."


In [None]:
# Query encoder for the group-dataset retrieval below; same model name as the
# one used to embed the stored reviews.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Small constant that keeps the cosine-similarity denominator away from zero
_EPS = 1e-10
# Stack the per-row embedding vectors of the group dataset into one 2-D array
# so that all similarities can be computed with a single matrix product.
emb_matrix_group = np.vstack(df_group["embedding"].to_numpy())

def embed_query(query: str) -> np.ndarray:
    """Encode a single query string with the notebook-level ``embedding_model``.

    The model is called with a one-element batch; the embedding of that one
    element is returned.
    """
    batch = embedding_model.encode([query])
    return batch[0]

def retrieve_from_group_df(query: str, top_k: int = 5) -> pd.DataFrame:
    """Return the ``top_k`` rows of ``df_group`` most similar to ``query``.

    Similarity is the cosine between the query embedding and each row of the
    notebook-level ``emb_matrix_group``. Rows are not de-duplicated, so the
    same restaurant can appear more than once.

    Returns:
        A copy of the selected ``df_group`` rows, best match first, with an
        added ``similarity`` column.
    """
    query_vec = embed_query(query)

    # cosine = (row · q) / (|row| * |q|); _EPS guards the denominator
    row_norms = np.linalg.norm(emb_matrix_group, axis=1)
    query_norm = np.linalg.norm(query_vec) + _EPS
    similarities = (emb_matrix_group @ query_vec) / (row_norms * query_norm + _EPS)

    # Positions of the highest scores, best first
    ranking = np.argsort(similarities)[::-1]
    top_idx = ranking[:top_k]

    return df_group.iloc[top_idx].assign(similarity=similarities[top_idx])

In [None]:
# Self-created test set for our group dataset.
# Each case pairs a user query with lowercase fragments of the restaurant names
# that count as a correct retrieval (a case passes if any fragment matches).
test_cases = [
    # 1. Sushi: Tokyo Sushi or Sakura Japanese Restaurant
    dict(
        query="Recommend a good sushi restaurant in Wilmington with high ratings.",
        expected_name_keywords=["tokyo", "sakura"],
    ),
    # 2. Indian: Bawarchi Indian Restaurant
    dict(
        query="I want authentic Indian food in Wilmington.",
        expected_name_keywords=["bawarchi"],
    ),
    # 3. Mexican: Taqueria Los Primos or QDOBA Mexican Eats
    dict(
        query="Suggest a casual Mexican restaurant with tasty food for dinner.",
        expected_name_keywords=["primos", "qdoba"],
    ),
    # 4. Cozy Italian date night: Ristorante Attilio, Piccolina Toscana, Mona Lisa, Cafe Napoli
    dict(
        query="Recommend a cozy Italian restaurant for a date night in Wilmington.",
        expected_name_keywords=["attilio", "toscan", "mona", "napoli"],
    ),
    # 5. Pizza: Ciao's Trolley Pizza & Grill, Season's Pizza, Grotto Pizza, Pizza Hut
    dict(
        query="Where can I get good pizza in Wilmington?",
        expected_name_keywords=["ciao", "season", "grotto", "pizza hut"],
    ),
    # 6. Brunch / breakfast: Eeffoc Cafe, Maryland Avenue Sub Shop, Russell's Quality Food
    dict(
        query="Recommend a place for a good breakfast or brunch with hearty portions.",
        expected_name_keywords=["eeffoc", "maryland", "russell"],
    ),
    # 7. Cheap and quick: Subway, Little Caesars Pizza, QDOBA Mexican Eats
    dict(
        query="Find a cheap and quick place to grab a meal in Wilmington.",
        expected_name_keywords=["subway", "little caesars", "qdoba"],
    ),
    # 8. Wings / sports bar: Buffalo Wild Wings, Chili's
    dict(
        query="I want a place with chicken wings to watch a game.",
        expected_name_keywords=["buffalo wild", "chili"],
    ),
    # 9. Steakhouse: Walter's Steakhouse
    dict(
        query="Recommend a nice steakhouse in Wilmington for a special occasion.",
        expected_name_keywords=["walter"],
    ),
    # 10. Asian fusion / Vietnamese: Bubble Shack, 8th & Union Kitchen
    dict(
        query="Suggest an Asian fusion or Vietnamese restaurant in Wilmington.",
        expected_name_keywords=["bubble", "union"],
    ),
    # 11. Grocery / food store: Trader Joe's
    dict(
        query="Is there a grocery store with good food options in Wilmington?",
        expected_name_keywords=["trader joe"],
    ),
    # 12. American diner / comfort: Mary's Kountry Kitchen, Red Robin
    dict(
        query="Recommend a casual American restaurant or diner with comfort food.",
        expected_name_keywords=["mary", "red robin"],
    ),
]

# One row per test case, for display and for evaluate_testset()
test_df = pd.DataFrame(test_cases)
test_df

Unnamed: 0,query,expected_name_keywords
0,Recommend a good sushi restaurant in Wilmingto...,"[tokyo, sakura]"
1,I want authentic Indian food in Wilmington.,[bawarchi]
2,Suggest a casual Mexican restaurant with tasty...,"[primos, qdoba]"
3,Recommend a cozy Italian restaurant for a date...,"[attilio, toscan, mona, napoli]"
4,Where can I get good pizza in Wilmington?,"[ciao, season, grotto, pizza hut]"
5,Recommend a place for a good breakfast or brun...,"[eeffoc, maryland, russell]"
6,Find a cheap and quick place to grab a meal in...,"[subway, little caesars, qdoba]"
7,I want a place with chicken wings to watch a g...,"[buffalo wild, chili]"
8,Recommend a nice steakhouse in Wilmington for ...,[walter]
9,Suggest an Asian fusion or Vietnamese restaura...,"[bubble, union]"


In [None]:
def evaluate_testset(df_group, test_df, top_k: int = 5, retriever=None):
    """Compute the hit-rate@k of a retriever on a keyword-labelled test set.

    A test case counts as a hit when at least one expected keyword occurs
    (case-insensitively, as a substring) in at least one retrieved name.

    Args:
        df_group: Kept for call compatibility; it is not read here. Retrieval
            goes through ``retriever``.
        test_df: DataFrame with the columns ``query`` and
            ``expected_name_keywords`` (a string or an iterable of strings).
        top_k: Number of results requested from the retriever per query.
        retriever: Callable ``retriever(query, top_k=...)`` returning a
            DataFrame with a ``name`` column. Defaults to the notebook-level
            ``retrieve_from_group_df``; passing another callable allows a
            different retriever to be scored on the same test set.

    Returns:
        ``(hit_rate, detailed_df)``: the fraction of test cases with a hit
        (0.0 for an empty test set) and one row per test case with the query,
        the lowercased expected keywords, the retrieved names and the hit flag.
    """
    if retriever is None:
        # Looked up at call time so that the default keeps using the notebook's retriever
        retriever = retrieve_from_group_df

    hit_column = f"hit@{top_k}"
    detailed_rows = []

    for _, case in test_df.iterrows():
        query = case["query"]
        expected_raw = case["expected_name_keywords"]

        # A bare string is a single keyword, not a sequence of characters
        if isinstance(expected_raw, str):
            expected_keywords = [expected_raw.lower()]
        else:
            expected_keywords = [kw.lower() for kw in expected_raw]

        retrieved_names = retriever(query, top_k=top_k)["name"].tolist()
        lowered_names = [str(name).lower() for name in retrieved_names]

        hit = any(kw in name for kw in expected_keywords for name in lowered_names)

        detailed_rows.append({
            "query": query,
            "expected_keywords": expected_keywords,
            "retrieved_names": retrieved_names,
            hit_column: hit,
        })

    total = len(detailed_rows)
    hits = sum(int(row[hit_column]) for row in detailed_rows)
    hit_rate = hits / total if total > 0 else 0.0
    return hit_rate, pd.DataFrame(detailed_rows)

In [None]:
# Score the group-dataset retriever on the hand-written test set:
# hit_rate is the fraction of test queries with at least one expected
# restaurant among the top_k retrieved rows.
top_k = 5
hit_rate, detailed_results = evaluate_testset(df_group, test_df, top_k=top_k)

print(f"Hit-rate@{top_k}: {hit_rate:.2f}  (i.e., {hit_rate*100:.1f}%)")
# Per-query breakdown: expected keywords, retrieved names and the hit flag
detailed_results

Hit-rate@5: 0.75  (i.e., 75.0%)


Unnamed: 0,query,expected_keywords,retrieved_names,hit@5
0,Recommend a good sushi restaurant in Wilmingto...,"[tokyo, sakura]","[Sakura Japanese Restaurant, Tokyo Sushi, Saku...",True
1,I want authentic Indian food in Wilmington.,[bawarchi],"[Mary's Kountry Kitchen, Russell's Quality Foo...",False
2,Suggest a casual Mexican restaurant with tasty...,"[primos, qdoba]","[QDOBA Mexican Eats, Cafe Napoli Restaurant & ...",True
3,Recommend a cozy Italian restaurant for a date...,"[attilio, toscan, mona, napoli]","[Mona Lisa, Mary's Kountry Kitchen, Cafe Napol...",True
4,Where can I get good pizza in Wilmington?,"[ciao, season, grotto, pizza hut]","[Mary's Kountry Kitchen, Grotto Pizza, Pizza H...",True
5,Recommend a place for a good breakfast or brun...,"[eeffoc, maryland, russell]","[Mona Lisa, Season's Pizza, QDOBA Mexican Eats...",False
6,Find a cheap and quick place to grab a meal in...,"[subway, little caesars, qdoba]","[Mary's Kountry Kitchen, Russell's Quality Foo...",True
7,I want a place with chicken wings to watch a g...,"[buffalo wild, chili]","[Buffalo Wild Wings, Buffalo Wild Wings, Piree...",True
8,Recommend a nice steakhouse in Wilmington for ...,[walter],"[Mary's Kountry Kitchen, Walter's Steakhouse, ...",True
9,Suggest an Asian fusion or Vietnamese restaura...,"[bubble, union]","[Mary's Kountry Kitchen, Mona Lisa, Russell's ...",True


# Step 5: Compare the Results

In [None]:
# Compare the language model's answer to the same `query` (set in the
# retrieval test cell above) with and without retrieved evidence.

# Without RAG: the bare query goes straight to the model.
# max_new_tokens is used instead of max_length: the saved warnings show that
# a max_length argument is overridden by the pipeline's own max_new_tokens
# setting, so the requested limit had no effect.
no_rag_response = llm(query, max_new_tokens=220, num_return_sequences=1)[0]["generated_text"]

# With RAG: the prompt carries evidence retrieved from the Yelp reviews
rag_response = generate_with_rag(query, top_k=5, max_length=260)

print("\n[ANSWER WITHOUT RAG:]\n")
print(no_rag_response)

print("\n\n[ANSWER WITH RAG:]\n")
print(rag_response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=220) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=260) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



[ANSWER WITHOUT RAG:]

Recommend a good sushi restaurant in Wilmington with high ratings. A great place to live and work. This place has been on my list of my favorites for over a year now so my favorite one is the Oahu location. The sushi is not bad, but still not great. If you want to go to a good sushi place in Wilmington, you should check out Oahu. They have a lot of options and it's hard to find good sushi in Wilmington if you don't order from there. However, if you want to go to a good sushi restaurant and want to stay in Wilmington for a long time, this place is worth going to.

I love this place. It's a great place to live and work. On a weekday night I went for a nice quiet and casual sushi night with friends. It was so good I decided to order the Oahu line. The food was delicious, the service was exceptional and I love the local food. The sushi was fresh, the service was good, and the staff was friendly. The place is a bit busy, but I can tell you this place is worth every d