# 1. Discover all restaurants in Christchurch

## 1.1 Test with small subset 

In [1]:
import os
import requests
import pandas as pd

In [2]:
SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY") 

In [3]:
endpoint = "https://serpapi.com/search.json"
params = {
    "engine": "google_maps",
    "q": "restaurants in Christchurch, New Zealand",
    "hl": "en",
    "api_key": SERPAPI_API_KEY,
    "num": 4,
}

In [4]:
response = requests.get(endpoint, params=params, timeout=30)
print("Status:", response.status_code)

Status: 200


In [5]:
data = response.json()

In [6]:
print("Top-level keys:", list(data.keys()))

Top-level keys: ['search_metadata', 'search_parameters', 'search_information', 'local_results']


In [12]:
results = data.get("local_results") or data.get("places") or data.get("results") or []
print(f"Fetched {len(results)} restaurants")

Fetched 20 restaurants


In [17]:
# Normalize to DataFrame
rows = []
for r in results[:20]:  # Explicit limit of 4, just in case
    gps = r.get("gps_coordinates", {})
    rows.append({
        "title": r.get("title"),
        "address": r.get("address"),
        "rating": r.get("rating"),
        "reviews_count": r.get("reviews_count") or r.get("reviews"),
        "type": r.get("type") or r.get("category"),
        "lat": gps.get("latitude"),
        "lon": gps.get("longitude"),
        "place_id": r.get("place_id"),
        "link": r.get("link") or r.get("place_link"),
    })

In [19]:
df = pd.DataFrame(rows)
df.head(20)

Unnamed: 0,title,address,rating,reviews_count,type,lat,lon,place_id,link
0,Fiddlesticks Restaurant and Bar,"Corner of Worcester Boulevard &, Montreal Stre...",4.5,2151,Restaurant,-43.53113,172.630861,ChIJyWuAlj6KMW0RJoEgBUNTpUY,
1,Little High Eatery,"255 Saint Asaph Street, Christchurch Central C...",4.6,4772,Restaurant,-43.535783,172.64093,ChIJ715FmhiKMW0R3l3ead1fVoc,
2,Casa Publica,"180 Armagh Street, Christchurch Central City, ...",4.5,1918,Restaurant,-43.52887,172.638831,ChIJ-VFSwCSKMW0Ravlz2-ykUCk,
3,King of Snake,"Level 1/79 Cashel Street, Christchurch Central...",4.7,1982,Restaurant,-43.532998,172.634066,ChIJ9Sy6qDeKMW0R9VU19Pt-Tms,
4,Twenty Seven Steps,"Christchurch Central City, Christchurch 8011, ...",4.7,1117,Modern European restaurant,-43.529415,172.638797,ChIJOWVyvSSKMW0R9VyXIwcit6A,
5,5th Street,"5 Elgin Street, Sydenham, Christchurch 8023, N...",4.8,1103,Restaurant,-43.545705,172.633856,ChIJtWlYrdWLMW0RtM_yFEBNmyI,
6,The Monday Room,"161 High Street, Christchurch Central City, Ch...",4.7,577,Restaurant,-43.535844,172.641503,ChIJNeR2qx6KMW0RpEdCD-4OEiQ,
7,Bloody Mary's,"30 Latimer Square, Christchurch Central City, ...",4.4,1709,Bar & grill,-43.530631,172.643532,ChIJs1hbUiGKMW0RpEbiAchchGY,
8,Dux Dine,"28 Riccarton Road, Riccarton, Christchurch 801...",4.3,1495,Restaurant,-43.528824,172.608232,ChIJUb-b0F6KMW0RfpU-5ucKrz0,
9,Inati restaurant,"48 Hereford Street, Christchurch Central City,...",4.6,411,Restaurant,-43.53215,172.632474,ChIJeS560z2KMW0Rkq0hCGpDF54,


## 1.2 READ THE CHC RESTAURANT PLACE ID DATASET 

In [21]:
chc_restaurant = pd.read_csv("data/google-data/chc_google_places.csv")

chc_restaurant.head()

Unnamed: 0,place_id,data_id,title,address,lat,lon,type,rating,reviews_count,url,search_query,start_offset,serpapi_search_id,unique_key
0,ChIJ715FmhiKMW0R3l3ead1fVoc,0x6d318a189a455eef:0x87565fdd69de5dde,Little High Eatery,"255 Saint Asaph Street, Christchurch Central C...",-43.535783,172.64093,Restaurant,4.6,4772.0,,"restaurants in Christchurch, New Zealand",0,691504b4f5c2f87c4863c1f7,ChIJ715FmhiKMW0R3l3ead1fVoc
1,ChIJS9eZVjyKMW0R-LmTqSCw23U,0x6d318a3c5699d74b:0x75dbb020a993b9f8,The Rockpool Bar,"85 Hereford Street, Christchurch Central City,...",-43.531927,172.63459,Bar & grill,4.2,2757.0,,"bars in Christchurch, New Zealand",0,6915061fcc56d3342f0925f9,ChIJS9eZVjyKMW0R-LmTqSCw23U
2,ChIJ78Zg4HOLMW0Rbh5zuS_Mdc8,0x6d318b73e060c6ef:0xcf75cc2fb9731e6e,Smokey T's,"314 Cashel Street, Christchurch Central City, ...",-43.533386,172.648111,Barbecue restaurant,4.8,2423.0,,"restaurants in Christchurch, New Zealand",60,691504c127133264efad5ce1,ChIJ78Zg4HOLMW0Rbh5zuS_Mdc8
3,ChIJBV_ck_SKMW0RgIWgFz4arWs,0x6d318af493dc5f05:0x6bad1a3e17a08580,Drexel's Breakfast Restaurant,"Rotherham Street, Riccarton, Christchurch 8041...",-43.531336,172.600797,Breakfast restaurant,4.5,2415.0,,"restaurants in Christchurch, New Zealand",40,691504bc05bb59bb59ebd5b1,ChIJBV_ck_SKMW0RgIWgFz4arWs
4,ChIJ9XtNIxqKMW0Rm9tQ--IE0bc,0x6d318a1a234d7bf5:0xb7d104e2fb50db9b,Denny‚Äôs Christchurch,"382 Moorhouse Avenue, Christchurch Central Cit...",-43.540269,172.639198,Restaurant,3.7,2328.0,,"restaurants in Christchurch, New Zealand",100,691504c89c231269bd88432f,ChIJ9XtNIxqKMW0Rm9tQ--IE0bc


In [22]:
chc_restaurant.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551 entries, 0 to 550
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   place_id           551 non-null    object 
 1   data_id            551 non-null    object 
 2   title              551 non-null    object 
 3   address            551 non-null    object 
 4   lat                551 non-null    float64
 5   lon                551 non-null    float64
 6   type               551 non-null    object 
 7   rating             549 non-null    float64
 8   reviews_count      549 non-null    float64
 9   url                0 non-null      float64
 10  search_query       551 non-null    object 
 11  start_offset       551 non-null    int64  
 12  serpapi_search_id  551 non-null    object 
 13  unique_key         551 non-null    object 
dtypes: float64(5), int64(1), object(8)
memory usage: 60.4+ KB


In [54]:
chc_restaurant.isnull().sum()

place_id               0
data_id                0
title                  0
address                0
lat                    0
lon                    0
type                   0
rating                 2
reviews_count          2
url                  551
search_query           0
start_offset           0
serpapi_search_id      0
unique_key             0
dtype: int64

In [57]:
chc_restaurant[chc_restaurant["reviews_count"].isnull()]

Unnamed: 0,place_id,data_id,title,address,lat,lon,type,rating,reviews_count,url,search_query,start_offset,serpapi_search_id,unique_key
549,ChIJF1NhCJCLMW0RvyL-l-jBhJY,0x6d318b9008615317:0x9684c1e897fe22bf,stpierre stores in christchurch,"Papanui, Christchurch 8053, New Zealand",-43.490655,172.603093,Japanese restaurant,,,,"japanese restaurant in Christchurch, New Zealand",80,691506d313f4b687fe830e7b,ChIJF1NhCJCLMW0RvyL-l-jBhJY
550,ChIJi8Uaw7-LMW0R9t5QmJr_GGU,0x6d318bbfc31ac58b:0x6518ff9a9850def6,Zayed‚Äôs,"88 Cashel Street, Christchurch Central City, C...",-43.533252,172.634303,Indian restaurant,,,,"indian restaurant in Christchurch, New Zealand",60,691506e7556ce3627c5e66e1,ChIJi8Uaw7-LMW0R9t5QmJr_GGU


In [58]:
bad_place_ids = [
    "ChIJF1NhCJCLMW0RvyL-l-jBhJY",
    "ChIJi8Uaw7-LMW0R9t5QmJr_GGU"
]

# Filter them out
chc_restaurant_clean = chc_restaurant[~chc_restaurant["place_id"].isin(bad_place_ids)].copy()

len_before = len(chc_restaurant)
len_after = len(chc_restaurant_clean)

print(f"Removed {len_before - len_after} restaurants.")
print(f"Remaining restaurants: {len_after}")

Removed 2 restaurants.
Remaining restaurants: 549


In [59]:
chc_restaurant = chc_restaurant_clean

In [65]:
chc_restaurant.isnull().sum()

place_id               0
data_id                0
title                  0
address                0
lat                    0
lon                    0
type                   0
rating                 0
reviews_count          0
url                  549
search_query           0
start_offset           0
serpapi_search_id      0
unique_key             0
dtype: int64

In [66]:
chc_restaurant.to_csv("data/google-data/chc_google_places_v1.csv", index=False)

In [64]:
chc_restaurant["reviews_count"].describe()

count     549.000000
mean      471.593807
std       514.763436
min         1.000000
25%       141.000000
50%       295.000000
75%       606.000000
max      4772.000000
Name: reviews_count, dtype: float64

In [23]:
# How many unique restaurants in each search query
summary = (
    chc_restaurant
    .groupby("search_query")
    .agg(unique_restaurants=("unique_key", "nunique"))
    .sort_values("unique_restaurants", ascending=False)
)

print(summary)

                                                    unique_restaurants
search_query                                                          
bars in Christchurch, New Zealand                                  102
japanese restaurant in Christchurch, New Zealand                    86
chinese restaurant in Christchurch, New Zealand                     84
indian restaurant in Christchurch, New Zealand                      79
thai restaurant in Christchurch, New Zealand                        72
food in Christchurch, New Zealand                                   60
restaurants in Christchurch, New Zealand                            44
vietnamese restaurant in Christchurch, New Zealand                  24


* each restaurant may appear under multiple queries, we cannot simply sum the reviews_count column directly
  

In [24]:
# Sum the total reviews count
total_reviews = (
    chc_restaurant
    .groupby("unique_key")["reviews_count"]
    .max()      # or first(), reviews_count is the same for duplicate rows
    .sum()
)

print("Total review count:", int(total_reviews))

Total review count: 258905


In [27]:
unique_restaurants = (
    chc_restaurant
    .groupby("unique_key")
    .agg({
        "title": "first",
        "reviews_count": "max"
    })
)

unique_restaurants.head(20)

Unnamed: 0_level_0,title,reviews_count
unique_key,Unnamed: 1_level_1,Unnamed: 2_level_1
ChIJ---ig4OJMW0RicyQ-MSYXWc,Sun Ning Takeaways,277.0
ChIJ--CTb9CJMW0RRB_3kl9puIE,Ferry Indians Restaurant,373.0
ChIJ-3bod9qLMW0R4okkg02foAU,Tanoshi Christchurch,749.0
ChIJ-RznvSSKMW0R1zzUrxn2Ix8,The Nook Thai Eatery,262.0
ChIJ-VFSwCSKMW0Ravlz2-ykUCk,Casa Publica,1918.0
ChIJ-W_zzA2LMW0RrQJtBIQEgJU,Sushi Court,18.0
ChIJ-fT1P8SPMW0RJrXe3Cn6eQ0,Samairaz Indian Restaurant,711.0
ChIJ-xKFcWmLMW0RM_kBgFMJQQQ,Tavern Harewood,1689.0
ChIJ-zvZccf0MW0R0IJMWE_QzbA,Little India Spitfire,494.0
ChIJ0-A9udeLMW0R8IyGHtEZNIY,Bollyfood,172.0


- 551 unique restaurants accross CHC has been scrapped
- There are: 258905 reviews in total

#### Identify The Cutoff To Reduce Quota 

In [40]:
# total reviewa by restaurants
total_reviews_by_restaurants = chc_restaurant[['unique_key', 'reviews_count']]


total_reviews_by_restaurants['estimated_reviews_last_1_year'] = total_reviews_by_restaurants['reviews_count'] * 0.15
total_reviews_by_restaurants['estimated_reviews_last_2_years'] = total_reviews_by_restaurants['reviews_count'] * 0.30
total_reviews_by_restaurants['estimated_reviews_last_3_years'] = total_reviews_by_restaurants['reviews_count'] * 0.45


estimated_cost_1_year = total_reviews_by_restaurants['estimated_reviews_last_1_year'].sum() / 10
estimated_cost_2_years = total_reviews_by_restaurants['estimated_reviews_last_2_years'].sum() / 10
estimated_cost_3_years = total_reviews_by_restaurants['estimated_reviews_last_3_years'].sum() / 10

print("Requests for 1-year cutoff:", estimated_cost_1_year)
print("Requests for 2-year cutoff:", estimated_cost_2_years)
print("Requests for 3-year cutoff:", estimated_cost_3_years)

Requests for 1-year cutoff: 3883.5749999999994
Requests for 2-year cutoff: 7767.149999999999
Requests for 3-year cutoff: 11650.725


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_reviews_by_restaurants['estimated_reviews_last_1_year'] = total_reviews_by_restaurants['reviews_count'] * 0.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_reviews_by_restaurants['estimated_reviews_last_2_years'] = total_reviews_by_restaurants['reviews_count'] * 0.30
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ret

In [31]:
# Delete the search query "bars in Christchurch, New Zealand" as it obtain a highest restaurants
# Count before removal
print("Total restaurants before:", len(chc_restaurant))

# Define the search query to remove
REMOVE_QUERY = "bars in Christchurch, New Zealand"

chc_restaurants_without_bar = chc_restaurant[chc_restaurant["search_query"] != REMOVE_QUERY]

# Count after remove
print("Total restaurants after:", len(chc_restaurants_without_bar))

Total restaurants before: 551
Total restaurants after: 449


In [32]:
# How many unique restaurants in each search query
summary = (
    chc_restaurants_without_bar
    .groupby("search_query")
    .agg(unique_restaurants=("unique_key", "nunique"))
    .sort_values("unique_restaurants", ascending=False)
)

print(summary)

                                                    unique_restaurants
search_query                                                          
japanese restaurant in Christchurch, New Zealand                    86
chinese restaurant in Christchurch, New Zealand                     84
indian restaurant in Christchurch, New Zealand                      79
thai restaurant in Christchurch, New Zealand                        72
food in Christchurch, New Zealand                                   60
restaurants in Christchurch, New Zealand                            44
vietnamese restaurant in Christchurch, New Zealand                  24


In [34]:
chc_restaurants_without_bar['reviews_count'].sum()

np.float64(198044.0)

In [35]:
# Define the search query to remove
REMOVE_QUERY = "restaurants in Christchurch, New Zealand"

chc_restaurants_without_restaurants = chc_restaurants_without_bar[chc_restaurants_without_bar["search_query"] != REMOVE_QUERY]

# Count after remove
print("Total restaurants after:", len(chc_restaurants_without_restaurants))

Total restaurants after: 405


In [36]:
chc_restaurants_without_restaurants['reviews_count'].sum()

np.float64(153918.0)

# 2. Scrape Google Reviews Data 

After estimate the number of requests for each of 551 restaurants and there reviews counts is extremely large.

So I decide to scrape only 10 latest reviews (1st page) of each restaurants only.

So in total there will be 551 requests. 

## 2.1 Test with only one specific restaurant first 

In [42]:
# --- Select only "BB Vietnamese Street Food" ---
target_name = "BB Vietnamese Street Food"

row = chc_restaurant[chc_restaurant["title"] == target_name]

if row.empty:
    raise ValueError(f"Restaurant '{target_name}' not found in chc_restaurant dataset")

place_id = row.iloc[0]["place_id"]
data_id = row.iloc[0]["data_id"]

print("Selected restaurant:")
row[["title", "place_id", "data_id", "reviews_count"]]

Selected restaurant:


Unnamed: 0,title,place_id,data_id,reviews_count
409,BB Vietnamese Street Food,ChIJzZunKzCLMW0RIrdqXcaoYLo,0x6d318b302ba79bcd:0xba60a8c65d6ab722,142.0


In [43]:
# SerpAPI Key
SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")
if not SERPAPI_API_KEY:
    raise RuntimeError("Please set SERPAPI_API_KEY environment variable")


# --- SerpAPI endpoint ---
SERPAPI_ENDPOINT = "https://serpapi.com/search.json"

# --- Build request ---
params = {
    "engine": "google_maps_reviews",
    "api_key": SERPAPI_API_KEY,
    "place_id": place_id,     # either place_id or data_id works
    "hl": "en",
    "sort_by": "newestFirst", # important: newest 10 reviews only!
    # WARNING: num cannot be used on the first page per SerpAPI rules,
    # so we will always get 8-10 reviews automatically.
}

print("\nRequesting newest reviews‚Ä¶")

r = requests.get(SERPAPI_ENDPOINT, params=params)
print("HTTP Status:", r.status_code)

data = r.json()


Requesting newest reviews‚Ä¶
HTTP Status: 200


In [44]:
# --- Extract reviews ---
reviews = data.get("reviews", [])

print(f"\nüîç Retrieved {len(reviews)} newest reviews for: {target_name}\n")

# Pretty print first 1‚Äì2 reviews
for i, rv in enumerate(reviews[:2]):
    print(f"--- Review {i+1} ---")
    print("Rating:", rv.get("rating"))
    print("Date:", rv.get("date"))
    print("Text:", rv.get("snippet"))
    print()


üîç Retrieved 8 newest reviews for: BB Vietnamese Street Food

--- Review 1 ---
Rating: 5.0
Date: a week ago
Text: Really good beef stew,friendly staff, best service in Christchuch i have ever tried

--- Review 2 ---
Rating: 4.0
Date: 2 weeks ago
Text: Taste of food was pretty good. The shaking beefs flavour was pretty on par with a good one however the beef wasn't as tender as I have experienced elsewhere but none the less not bad at all. Bahn mi was very fresh and delicious and would highly recommend. A word of advice tho...make sure to either order in advance or avoid during lunch as the place gets pretty swamped making wait times super long as it is only a small kitchen.



In [46]:
# --- Convert to DataFrame ---
df_reviews = pd.DataFrame(reviews)

print("\nDataFrame columns:", df_reviews.columns.tolist())
df_reviews.head(20)


DataFrame columns: ['link', 'rating', 'date', 'iso_date', 'iso_date_of_last_edit', 'images', 'source', 'review_id', 'user', 'snippet', 'extracted_snippet', 'details', 'response']


Unnamed: 0,link,rating,date,iso_date,iso_date_of_last_edit,images,source,review_id,user,snippet,extracted_snippet,details,response
0,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-02T01:59:48Z,2025-11-02T01:59:48Z,[https://lh3.googleusercontent.com/geougc-cs/A...,Google,Ci9DQUlRQUNvZENodHljRjlvT2t0c2EyTlNOV2cwYWtkS1...,"{'name': 'My Le', 'link': 'https://www.google....","Really good beef stew,friendly staff, best ser...","{'original': 'Really good beef stew,friendly s...","{'meal_type': 'Lunch', 'price_per_person': 'NZ...",
1,https://www.google.com/maps/reviews/data=!4m8!...,4.0,2 weeks ago,2025-10-29T07:04:03Z,2025-10-29T07:08:00Z,,Google,Ci9DQUlRQUNvZENodHljRjlvT2xkMFprazBVV28zVlVoQ0...,"{'name': 'loading user', 'link': 'https://www....",Taste of food was pretty good. The shaking bee...,{'original': 'Taste of food was pretty good. T...,"{'price_per_person': 'NZ$20‚Äì30', 'food': 4, 's...","{'date': '2 weeks ago', 'iso_date': '2025-10-2..."
2,https://www.google.com/maps/reviews/data=!4m8!...,5.0,2 weeks ago,2025-10-25T23:07:53Z,2025-10-25T23:07:53Z,[https://lh3.googleusercontent.com/geougc-cs/A...,Google,Ci9DQUlRQUNvZENodHljRjlvT25sRWFISnpXR2RuUWpsbl...,"{'name': 'ÈÇ¢Ê†ÄÂ™õ', 'link': 'https://www.google.co...",The most delicious Vietnamese restaurant in Ch...,{'original': 'The most delicious Vietnamese re...,"{'meal_type': 'Lunch', 'price_per_person': 'NZ...","{'date': '2 weeks ago', 'iso_date': '2025-10-2..."
3,https://www.google.com/maps/reviews/data=!4m8!...,5.0,3 weeks ago,2025-10-16T00:41:51Z,2025-10-16T00:41:51Z,[https://lh3.googleusercontent.com/geougc-cs/A...,Google,Ci9DQUlRQUNvZENodHljRjlvT21neWQwdFpWMGx4V1U5SE...,"{'name': 'Usoalii Tafua', 'link': 'https://www...",Best food Best service üíØüèÜ,{'original': 'Best food Best service üíØüèÜ'},"{'meal_type': 'Dinner', 'price_per_person': 'N...","{'date': '2 weeks ago', 'iso_date': '2025-10-2..."
4,https://www.google.com/maps/reviews/data=!4m8!...,5.0,4 weeks ago,2025-10-14T00:53:36Z,2025-10-14T00:53:36Z,[https://lh3.googleusercontent.com/geougc-cs/A...,Google,Ci9DQUlRQUNvZENodHljRjlvT2tocVYyeDBkV3BHTkU0dF...,"{'name': '‡∏û‡∏ä‡∏£ ‡∏≠‡∏∏‡∏ï‡∏ï‡∏°‡πÇ‡∏†‡∏Ñ‡∏¥‡∏ô', 'link': 'https://ww...",Good,{'original': 'Good'},,"{'date': '2 weeks ago', 'iso_date': '2025-10-2..."
5,https://www.google.com/maps/reviews/data=!4m8!...,5.0,4 weeks ago,2025-10-13T08:00:54Z,2025-10-13T08:00:54Z,,Google,Ci9DQUlRQUNvZENodHljRjlvT21OM1MwcDZSV1UzWXpoTU...,"{'name': 'kevin wu', 'link': 'https://www.goog...",Great food and good service. The food has grea...,{'original': 'Great food and good service. The...,"{'service': 5, 'price_per_person': 'NZ$20‚Äì30',...","{'date': '2 weeks ago', 'iso_date': '2025-10-2..."
6,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a month ago,2025-10-08T08:15:53Z,2025-10-08T08:15:53Z,,Google,Ci9DQUlRQUNvZENodHljRjlvT2xkclp6ZG9iRmREUTFZMm...,"{'name': 'Angie Tso', 'link': 'https://www.goo...",Such a cute outdoor eatery with some of the BE...,{'original': 'Such a cute outdoor eatery with ...,"{'meal_type': 'Lunch', 'price_per_person': 'NZ...","{'date': 'a month ago', 'iso_date': '2025-10-0..."
7,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a month ago,2025-09-20T23:48:23Z,2025-09-20T23:48:23Z,,Google,Ci9DQUlRQUNvZENodHljRjlvT20wd1dYbHRaRTVwWTBoWG...,"{'name': 'Thu Hue', 'link': 'https://www.googl...",,,"{'service': 5, 'meal_type': 'Lunch', 'price_pe...","{'date': 'a month ago', 'iso_date': '2025-09-2..."


## 2.1 Test with Pagenition 

In [47]:
# Pick one restaurant by name
target_name = "BB Vietnamese Street Food"

restaurant_row = chc_restaurant[chc_restaurant["title"] == target_name].iloc[0]
place_id = restaurant_row["place_id"]

place_id

'ChIJzZunKzCLMW0RIrdqXcaoYLo'

In [48]:
def scrape_reviews_page(place_id, next_page_token=None):
    params = {
        "engine": "google_maps_reviews",
        "api_key": SERPAPI_API_KEY,
        "hl": "en",
        "place_id": place_id,
        "sort_by": "newestFirst",
    }

    # Only add next_page_token for page 2
    if next_page_token:
        params["next_page_token"] = next_page_token

    r = requests.get(SERPAPI_ENDPOINT, params=params, timeout=60)
    if r.status_code != 200:
        print(r.text)
        raise RuntimeError(f"SerpAPI error {r.status_code}")

    data = r.json()
    return data

In [49]:
# --- Page 1 ---
page1 = scrape_reviews_page(place_id)
page1_reviews = page1.get("reviews", [])

print("Page 1 reviews:", len(page1_reviews))

# Get token for page 2
next_token = page1.get("serpapi_pagination", {}).get("next_page_token")
print("Next page token:", next_token)

# --- Page 2 ---
if next_token:
    page2 = scrape_reviews_page(place_id, next_page_token=next_token)
    page2_reviews = page2.get("reviews", [])
    print("Page 2 reviews:", len(page2_reviews))
else:
    page2_reviews = []
    print("No second page available.")

Page 1 reviews: 8
Next page token: CAESY0NBRVFDQnBFUTJwRlNVRlNTWEJEWjI5QlVEY3lSMkZYWlV0RlZsOWZSV2hFYzNWTVJUZFBTSFppUWtwR2VWSndZMEZCUVVGQlIyZHVPUzE0WjBOYWNGbzBiMUpWV1VGRFNVRQ==
Page 2 reviews: 10


In [51]:
all_reviews = page1_reviews + page2_reviews

df_reviews = pd.json_normalize(all_reviews)
df_reviews.head()

Unnamed: 0,link,rating,date,iso_date,iso_date_of_last_edit,images,source,review_id,snippet,user.name,...,details.noise_level,details.wait_time,details.group_size,details.seating_type,details.parking_space,response.date,response.iso_date,response.iso_date_of_last_edit,response.snippet,response.extracted_snippet.original
0,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-02T01:59:48Z,2025-11-02T01:59:48Z,[https://lh3.googleusercontent.com/geougc-cs/A...,Google,Ci9DQUlRQUNvZENodHljRjlvT2t0c2EyTlNOV2cwYWtkS1...,"Really good beef stew,friendly staff, best ser...",My Le,...,"Quiet, easy to talk",No wait,,,,,,,,
1,https://www.google.com/maps/reviews/data=!4m8!...,4.0,2 weeks ago,2025-10-29T07:04:03Z,2025-10-29T07:08:00Z,,Google,Ci9DQUlRQUNvZENodHljRjlvT2xkMFprazBVV28zVlVoQ0...,Taste of food was pretty good. The shaking bee...,loading user,...,,,Not sure,"Outdoor patio / terrace, Counter seating",Not sure,2 weeks ago,2025-10-29T09:11:53Z,2025-10-29T09:11:53Z,Thank you so much for your kind review and ver...,Thank you so much for your kind review and ver...
2,https://www.google.com/maps/reviews/data=!4m8!...,5.0,2 weeks ago,2025-10-25T23:07:53Z,2025-10-25T23:07:53Z,[https://lh3.googleusercontent.com/geougc-cs/A...,Google,Ci9DQUlRQUNvZENodHljRjlvT25sRWFISnpXR2RuUWpsbl...,The most delicious Vietnamese restaurant in Ch...,ÈÇ¢Ê†ÄÂ™õ,...,Moderate noise,No wait,,,,2 weeks ago,2025-10-26T04:35:57Z,2025-10-26T04:35:57Z,"Thank you so much for your lovely review, we r...","Thank you so much for your lovely review, we r..."
3,https://www.google.com/maps/reviews/data=!4m8!...,5.0,3 weeks ago,2025-10-16T00:41:51Z,2025-10-16T00:41:51Z,[https://lh3.googleusercontent.com/geougc-cs/A...,Google,Ci9DQUlRQUNvZENodHljRjlvT21neWQwdFpWMGx4V1U5SE...,Best food Best service üíØüèÜ,Usoalii Tafua,...,,,Suitable for all group sizes,,,2 weeks ago,2025-10-26T04:36:29Z,2025-10-26T04:36:29Z,Thank you so much for rating us ü•∞,Thank you so much for rating us ü•∞
4,https://www.google.com/maps/reviews/data=!4m8!...,5.0,4 weeks ago,2025-10-14T00:53:36Z,2025-10-14T00:53:36Z,[https://lh3.googleusercontent.com/geougc-cs/A...,Google,Ci9DQUlRQUNvZENodHljRjlvT2tocVYyeDBkV3BHTkU0dF...,Good,‡∏û‡∏ä‡∏£ ‡∏≠‡∏∏‡∏ï‡∏ï‡∏°‡πÇ‡∏†‡∏Ñ‡∏¥‡∏ô,...,,,,,,2 weeks ago,2025-10-26T04:36:43Z,2025-10-26T04:36:43Z,Thank you so much ü•∞,Thank you so much ü•∞


In [53]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 32 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   link                                 18 non-null     object 
 1   rating                               18 non-null     float64
 2   date                                 18 non-null     object 
 3   iso_date                             18 non-null     object 
 4   iso_date_of_last_edit                18 non-null     object 
 5   images                               9 non-null      object 
 6   source                               18 non-null     object 
 7   review_id                            18 non-null     object 
 8   snippet                              17 non-null     object 
 9   user.name                            18 non-null     object 
 10  user.link                            18 non-null     object 
 11  user.contributor_id               

## 2.2 READ THE CHC GOOGLE REVIEWS DATA

In [71]:
chc_google_reviews = pd.read_csv('data/google-data/google-reviews/chc_reviews.csv')

chc_google_reviews.head()

Unnamed: 0,link,rating,date,iso_date,iso_date_of_last_edit,source,review_id,snippet,page_number,place_id,...,details.recommended_dishes,translated_details.kid_friendliness,details.special_offers,details.food_drinks,details.noteworthy_details,translated_details.vegetarian_options,response.extracted_snippet.translated,details.recommendation_for_vegetarians,details.vegetarian_offerings,translated_details.dietary_restrictions
0,https://www.google.com/maps/reviews/data=!4m8!...,3.0,2 days ago,2025-11-11T00:25:07Z,2025-11-11T00:25:07Z,Google,Ci9DQUlRQUNvZENodHljRjlvT2xWMFpGYzRTemhuU0dKSl...,"Thaifood hit n miss, bland mince meat rice, un...",1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
1,https://www.google.com/maps/reviews/data=!4m8!...,5.0,2 days ago,2025-11-10T10:15:50Z,2025-11-10T10:15:50Z,Google,Ci9DQUlRQUNvZENodHljRjlvT2w4NWQzbE1WRzFJY2w5bl...,Another one of those great finds....that you d...,1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
2,https://www.google.com/maps/reviews/data=!4m8!...,5.0,6 days ago,2025-11-06T18:56:05Z,2025-11-06T18:56:05Z,Google,Ci9DQUlRQUNvZENodHljRjlvT2xaSExTMHdUREJrZDFJd0...,Super food and atmosphere,1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
3,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-05T07:42:20Z,2025-11-05T07:42:20Z,Google,Ci9DQUlRQUNvZENodHljRjlvT25KVmFXSm9kMkZ4Y1hkbE...,"Food stalls.\nGot many things: from burger, bb...",1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
4,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-05T07:14:08Z,2025-11-05T07:16:10Z,Google,Ci9DQUlRQUNvZENodHljRjlvT205UFlVWm9hbHAyTkhGd2...,"Kao soi is delicious, very nice service kob kh...",1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,


In [72]:
chc_google_reviews['place_id'].nunique()

549

### Handle data with no text reviews 

In [86]:
chc_google_reviews['place_id'].count()

np.int64(4374)

In [83]:
chc_google_reviews['snippet'].isna().sum()

np.int64(1095)

In [84]:
# show the data of missing reviews
missing_rows = chc_google_reviews[chc_google_reviews["snippet"].isna()]
missing_rows.head()

Unnamed: 0,link,rating,date,iso_date,iso_date_of_last_edit,source,review_id,snippet,page_number,place_id,...,details.recommended_dishes,translated_details.kid_friendliness,details.special_offers,details.food_drinks,details.noteworthy_details,translated_details.vegetarian_options,response.extracted_snippet.translated,details.recommendation_for_vegetarians,details.vegetarian_offerings,translated_details.dietary_restrictions
5,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-05T00:19:29Z,2025-11-05T00:19:29Z,Google,Ci9DQUlRQUNvZENodHljRjlvT2pCVVZGVkxTRmx1UzJOUF...,,1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
7,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-02T06:37:42Z,2025-11-02T06:37:42Z,Google,Ci9DQUlRQUNvZENodHljRjlvT25GVFpFNWpXR0Z1VGsxRm...,,1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
10,https://www.google.com/maps/reviews/data=!4m8!...,3.0,4 days ago,2025-11-09T02:30:27Z,2025-11-09T02:30:27Z,Google,Ci9DQUlRQUNvZENodHljRjlvT21aSlJURm1aVlEzVWtSQl...,,1,ChIJS9eZVjyKMW0R-LmTqSCw23U,...,,,,,,,,,,
14,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-04T01:11:15Z,2025-11-04T01:11:15Z,Google,Ci9DQUlRQUNvZENodHljRjlvT21OSFNFVm1WMVZrUm0xT2...,,1,ChIJS9eZVjyKMW0R-LmTqSCw23U,...,,,,,,,,,,
15,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-04T00:05:23Z,2025-11-04T00:05:23Z,Google,Ci9DQUlRQUNvZENodHljRjlvT25GRFJuUkROMVUxZUVsem...,,1,ChIJS9eZVjyKMW0R-LmTqSCw23U,...,,,,,,,,,,


- There are total: 4374 reviews scraped
- There are: 1095 reviews is empty
- Delete the row with empty reviews 

In [87]:
df_clean = chc_google_reviews.dropna(subset=["snippet"])

In [89]:
df_clean['place_id'].count()

np.int64(3279)

In [91]:
df_clean.to_csv('data/google-data/google-reviews/final/chc-google-reviews.csv')

In [97]:
df_clean.to_parquet('data/google-data/google-reviews/final/chc-google-reviews.parquet')

## 2.3 Conver Json Structure

In [92]:
df_clean.head()

Unnamed: 0,link,rating,date,iso_date,iso_date_of_last_edit,source,review_id,snippet,page_number,place_id,...,details.recommended_dishes,translated_details.kid_friendliness,details.special_offers,details.food_drinks,details.noteworthy_details,translated_details.vegetarian_options,response.extracted_snippet.translated,details.recommendation_for_vegetarians,details.vegetarian_offerings,translated_details.dietary_restrictions
0,https://www.google.com/maps/reviews/data=!4m8!...,3.0,2 days ago,2025-11-11T00:25:07Z,2025-11-11T00:25:07Z,Google,Ci9DQUlRQUNvZENodHljRjlvT2xWMFpGYzRTemhuU0dKSl...,"Thaifood hit n miss, bland mince meat rice, un...",1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
1,https://www.google.com/maps/reviews/data=!4m8!...,5.0,2 days ago,2025-11-10T10:15:50Z,2025-11-10T10:15:50Z,Google,Ci9DQUlRQUNvZENodHljRjlvT2w4NWQzbE1WRzFJY2w5bl...,Another one of those great finds....that you d...,1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
2,https://www.google.com/maps/reviews/data=!4m8!...,5.0,6 days ago,2025-11-06T18:56:05Z,2025-11-06T18:56:05Z,Google,Ci9DQUlRQUNvZENodHljRjlvT2xaSExTMHdUREJrZDFJd0...,Super food and atmosphere,1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
3,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-05T07:42:20Z,2025-11-05T07:42:20Z,Google,Ci9DQUlRQUNvZENodHljRjlvT25KVmFXSm9kMkZ4Y1hkbE...,"Food stalls.\nGot many things: from burger, bb...",1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,
4,https://www.google.com/maps/reviews/data=!4m8!...,5.0,a week ago,2025-11-05T07:14:08Z,2025-11-05T07:16:10Z,Google,Ci9DQUlRQUNvZENodHljRjlvT205UFlVWm9hbHAyTkhGd2...,"Kao soi is delicious, very nice service kob kh...",1,ChIJ715FmhiKMW0R3l3ead1fVoc,...,,,,,,,,,,


In [93]:
restaurants = pd.read_csv("data/google-data/chc_google_places_v1.csv")
restaurants = restaurants[["place_id", "title"]].rename(columns={"title": "restaurant"})

df_clean = df_clean.merge(restaurants, on="place_id", how="left")

In [94]:
def format_google_review(row):
    return {
        "user": row.get("user", {}).get("name") if isinstance(row.get("user"), dict) else None,
        "rating": row.get("rating"),
        "date": row.get("iso_date"),
        "text": row.get("snippet"),
        "original_link": row.get("link"),   # useful to keep
    }

In [95]:
grouped = []

for place_id, group in df_clean.groupby("place_id"):
    restaurant_name = group["restaurant"].iloc[0]

    reviews_list = [
        format_google_review(row)
        for _, row in group.iterrows()
    ]

    grouped.append({
        "restaurant": restaurant_name,
        "place_id": place_id,
        "reviews": reviews_list
    })

In [96]:
import json

output_path = "data/google-data/google-reviews/final/chc_google_reviews.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(grouped, f, ensure_ascii=False, indent=4)

output_path

'data/google-data/google-reviews/final/chc_google_reviews.json'