In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# ─── your other imports ───
# from flask import Flask, render_template, request
# import numpy as np, re, etc.

# ─── 1) Load business metadata ────────────────────────────────────────────────
# business.json is parsed as line-delimited JSON (one record per line).
biz_df = pd.read_json("business.json", lines=True)

# Distinct, non-null city names in ascending order.
cities = sorted(biz_df["city"].dropna().drop_duplicates())

# ─── 2) Dynamically mine top dish-phrases from review.json ────────────────────
# (the star cutoff here and min_df further down are the knobs to adjust)
reviews_all = pd.read_json("review.json", lines=True)

# Restrict the corpus to reviews rated 4 stars or higher, so the mined
# phrases lean toward things reviewers liked; rows without text are dropped.
pos = reviews_all.loc[reviews_all["stars"].ge(4), "text"].dropna()

# Count unigrams and bigrams across the positive reviews. English stop words
# are removed, and a term is kept only if it occurs in at least 100 documents.
cv = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=100)
X = cv.fit_transform(pos)

# Vocabulary terms, positioned to match the columns of X.
phrases = cv.get_feature_names_out()
# Column sums of the document-term matrix: total occurrences of each term.
freq = X.sum(axis=0).A1

# Walk the ascending argsort backwards to get the 500 highest-count columns
# (fewer if the vocabulary is smaller than 500).
top_idx = freq.argsort()[:-501:-1]
dishes = list(phrases[top_idx])

# `dishes` now holds the most frequent phrases, most frequent first, e.g.
# ["chicken", "naan", "tandoori chicken", ...].
# NOTE(review): nothing above filters for food terms, so generic words may
# appear in the list as well; confirm before treating every entry as a dish.
# You can use it in your upload.html datalist just as before.


In [4]:
# Persist `dishes` (expected to be a list of strings) to seed_dishes.txt,
# one entry per line, encoded as UTF-8.
with open("seed_dishes.txt", "w", encoding="utf-8") as out:
    out.writelines(dish + "\n" for dish in dishes)


In [6]:
import pandas as pd
import json

# Load reviews from the line-delimited JSON file, one JSON object per line
# (you can filter by city later using business metadata).
# The encoding is passed explicitly so the read does not fall back to the
# platform's locale default (assumes the file is UTF-8, as JSON normally is).
# Blank lines are skipped because json.loads would raise on them.
with open("review.json", "r", encoding="utf-8") as f:
    reviews = [json.loads(line) for line in f if line.strip()]

df_reviews = pd.DataFrame(reviews)
# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df_reviews = df_reviews[["business_id", "stars", "text"]].copy()


In [7]:
# Flag every review that has 4 or more stars as positive.
df_reviews["pos_review"] = df_reviews["stars"].ge(4)

# One row per business: mean star rating, number of reviews, and number of
# positive reviews (summing the boolean flag counts the True values).
agg = (
    df_reviews
    .groupby("business_id")
    .agg(
        avg_rating=("stars", "mean"),
        review_count=("stars", "count"),
        pos_review_count=("pos_review", "sum"),
    )
    .reset_index()
)


In [8]:
# Load business records from the line-delimited JSON file. The encoding is
# passed explicitly so the read does not depend on the platform's locale
# default (assumes the file is UTF-8, as JSON normally is). Blank lines are
# skipped because json.loads would raise on them.
with open("business.json", "r", encoding="utf-8") as f:
    businesses = [json.loads(line) for line in f if line.strip()]

df_business = pd.DataFrame(businesses)
# Only the identifying and descriptive columns are needed for the join.
df_business = df_business[["business_id", "name", "city", "categories"]]

# Attach business details to the per-business review statistics. This is an
# inner join (the merge default, spelled out here), so businesses missing
# from either frame are dropped.
df_merged = agg.merge(df_business, on="business_id", how="inner")


In [9]:
import numpy as np
# Ranking score: average rating scaled by log(1 + number of positive reviews),
# so review volume raises the score with diminishing returns.
df_merged["score"] = np.log1p(df_merged["pos_review_count"]).mul(df_merged["avg_rating"])

# The 20 highest-scoring businesses, best first.
df_top = df_merged.sort_values("score", ascending=False).iloc[:20]


In [10]:
# Preview the first five rows of the ranked businesses.
df_top.head(5)

Unnamed: 0,business_id,avg_rating,review_count,pos_review_count,name,city,categories,score
2503,2e2e7WgqU1BnpxmQL5jbfw,4.335274,3263,2817,Earl of Sandwich,Las Vegas,"[Sandwiches, Restaurants]",34.438477
3773,4bEjOyTaDG24SY5TxsaUNQ,4.14046,3695,3008,Mon Ami Gabi,Las Vegas,"[Breakfast & Brunch, Steakhouses, French, Rest...",33.162448
10559,Es300Ys1XXPYg8aI7BKVYQ,4.241188,1986,1625,XS Nightclub,Las Vegas,"[Nightlife, Dance Clubs]",31.35883
3138,3f-RP2-EE94eifGnepUBpg,4.415631,1126,948,Cirque du Soleil - The Beatles LOVE,Las Vegas,"[Arts & Entertainment, Performing Arts]",30.270953
32749,lliksv-tglfUz1T3B3vgvA,4.204614,1647,1325,Lotus of Siam,Las Vegas,"[Thai, Restaurants]",30.230851


In [12]:
import json

input_path = "review.json"
output_path = "review_filtered.json"

# Stream the reviews line by line, so the whole file is never held in memory,
# and copy those with at least 3.5 stars to the output file.
# Both files are opened with an explicit encoding so the result does not
# depend on the platform's locale default (assumes the input is UTF-8).
with open(input_path, "r", encoding="utf-8") as fin, \
        open(output_path, "w", encoding="utf-8") as fout:
    for line in fin:
        # Blank lines (e.g. a trailing newline) would make json.loads raise.
        if not line.strip():
            continue
        review = json.loads(line)
        # A record without a "stars" key is treated as 0 stars and dropped.
        if review.get("stars", 0) >= 3.5:
            # json.dumps escapes non-ASCII characters by default, so the
            # output is plain ASCII, one JSON object per line.
            fout.write(json.dumps(review) + "\n")
