In [191]:
import requests
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# from fuzzywuzzy import fuzz
from tqdm import tqdm  # Progress bar for bulk processing
import time
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import json
from datetime import datetime
import seaborn as sns
from gensim.models import Word2Vec
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
from collections import Counter


In [192]:
# Replace with your actual Google Maps API Key
API_KEY = 'AIzaSyAKWuKVqN2_RXeKp0nTxolUV1YU41ObKOA'


In [193]:
# API Endpoints
TEXT_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"

In [194]:
# Google Places API Configuration
SEARCH_TEXT_URL = "https://places.googleapis.com/v1/places:searchText"
PLACE_DETAILS_URL = "https://places.googleapis.com/v1/{place_id}"

SEARCH_FIELDS = [
    "id",
    "displayName",
    # "nationalPhoneNumber",
    # "websiteUri",
    "formattedAddress",
    "priceLevel",
    "priceRange",
    "rating",
    "regularOpeningHours",
    "userRatingCount",
    "photos",
    "types",
    "takeout",
    "delivery",
    "dineIn",
    "reservable",
    "servesBreakfast",
    "servesLunch",
    "servesDinner",
    "servesBrunch",
    "servesBeer",
    "servesWine",
    "servesCocktails",
    "servesDessert",
    "servesCoffee",
    "outdoorSeating",
    "liveMusic",
    "allowsDogs",
    "paymentOptions",
    "editorialSummary",
    "evChargeOptions",
    "fuelOptions",
    "goodForChildren",
    "goodForGroups",
    "goodForWatchingSports",
    "menuForChildren",
    "parkingOptions",
]

# Valid fields for details endpoint
# DETAILS_FIELDS = [
#     "servingStatus",
#     "wheelchairAccessibleEntrance",
#     "wheelchairAccessibleRestroom",
#     "wheelchairAccessibleSeating",
#     "wheelchairAccessibleParking"
# ]

HEADERS = {
    "Content-Type": "application/json",
    "X-Goog-Api-Key": API_KEY,
    "X-Goog-FieldMask": f"places.{',places.'.join(SEARCH_FIELDS)}"
}

# def get_place_details(place_id):
#     """Get additional details using Place ID"""
#     response = requests.get(
#         PLACE_DETAILS_URL.format(place_id=place_id),
#         headers={
#             "X-Goog-Api-Key": API_KEY,
#             "X-Goog-FieldMask": ",".join(DETAILS_FIELDS)
#         }
#     )
#     return response.json()

def geocode_address(address):
    """Fallback geocoding using OpenStreetMap"""
    geolocator = Nominatim(user_agent="geoapiExercises")
    try:
        location = geolocator.geocode(address, timeout=10)
        return (location.latitude, location.longitude) if location else (None, None)
    except GeocoderTimedOut:
        return (None, None)

def search_google_place(row):
    """Enhanced search with multiple matching strategies"""
    name = row['name']
    lat = row['latitude']
    lng = row['longitude']
    address = row['address']
    
    # Try 1: Exact match with coordinates
    payload = {
        "textQuery": name,
        "locationBias": {
            "circle": {
                "center": {"latitude": lat, "longitude": lng},
                "radius": 500
            }
        }
    }
    
    try:
        response = requests.post(SEARCH_TEXT_URL, headers=HEADERS, json=payload)
        data = response.json()
        if data.get('places'):
            # Get additional details
            place = data['places'][0]
            # details = get_place_details(place['id'])
            # place.update(details)
            return place
    except Exception as e:
        print(f"Search error: {str(e)}")
        return None

    # Try 2: Fuzzy match with address components
    try:
        payload["locationRestriction"] = {
            "circle": {
                "center": {"latitude": lat, "longitude": lng},
                "radius": 5000
            }
        }
        response = requests.post(SEARCH_TEXT_URL, headers=HEADERS, json=payload)
        data = response.json()
        if data.get('places'):
            place = data['places'][0]
            # details = get_place_details(place['id'])
            # place.update(details)
            return place
    except:
        pass

    # Try 3: Reverse geocoding
    try:
        new_lat, new_lng = geocode_address(address)
        if new_lat and new_lng:
            payload["locationBias"]["circle"]["center"] = {"latitude": new_lat, "longitude": new_lng}
            response = requests.post(SEARCH_TEXT_URL, headers=HEADERS, json=payload)
            data = response.json()
            if data.get('places'):
                place = data['places'][0]
                # details = get_place_details(place['id'])
                # place.update(details)
                return place
    except:
        pass

    return None
    
def process_yelp_row(row):
    """Process each Yelp business with enhanced matching"""
    business_id = row['business_id']
    name = row['name']
    
    print(f"Processing: {name} (ID: {business_id})")
    place = search_google_place(row)
    
    if not place:
        print(f"⚠️ No match found for {name}")
        return {}

    payment_options = place.get('paymentOptions', {}).get('acceptedPaymentMethods', [])
    
    return {
        "business_id": business_id,
        "displayName": place.get("displayName", "N/A")['text'],
        # "phone": place.get("nationalPhoneNumber", "N/A"),
        # "website": place.get("websiteUri", "N/A"),
        "address": place.get("formattedAddress", "N/A"),
        "price_level": place.get("priceLevel", 0) if "priceLevel" in place else "N/A",
        "price_range": place.get("priceRange", "N/A"),
        "rating": place.get("rating", "N/A"),
        "regular_opening_hours": place.get("regularOpeningHours", {}),
        "user_rating_count": place.get("userRatingCount", "N/A"),
        "number_of_photos": len(place.get("photos", [])),
        "types": ", ".join(place.get("types", [])),
        "takeout": "Yes" if place.get("takeout", False) else "No",
        "delivery": "Yes" if place.get("delivery", False) else "No",
        "dine_in": "Yes" if place.get("dineIn", False) else "No",
        "reservable": "Yes" if place.get("reservable", False) else "No",
        "serves_breakfast": "Yes" if place.get("servesBreakfast", False) else "No",
        "serves_lunch": "Yes" if place.get("servesLunch", False) else "No",
        "serves_dinner": "Yes" if place.get("servesDinner", False) else "No",
        "serves_brunch": "Yes" if place.get("servesBrunch", False) else "No",
        "serves_beer": "Yes" if place.get("servesBeer", False) else "No",
        "serves_wine": "Yes" if place.get("servesWine", False) else "No",
        "serves_cocktails": "Yes" if place.get("servesCocktails", False) else "No",
        "serves_dessert": "Yes" if place.get("servesDessert", False) else "No",
        "serves_coffee": "Yes" if place.get("servesCoffee", False) else "No",
        "outdoor_seating": "Yes" if place.get("outdoorSeating", False) else "No",
        "live_music": "Yes" if place.get("liveMusic", False) else "No",
        "allows_dogs": "Yes" if place.get("allowsDogs", False) else "No",
        "payment_options": ", ".join(payment_options) if payment_options else "N/A",
        "editorial_summary": place.get("editorialSummary", "N/A"),
        "ev_charge_options": place.get("evChargeOptions", "N/A"),
        "fuel_options": place.get("fuelOptions", "N/A"),
        "good_for_children": "Yes" if place.get("goodForChildren", False) else "No",
        "good_for_groups": "Yes" if place.get("goodForGroups", False) else "No",
        "good_for_watching_sports": "Yes" if place.get("goodForWatchingSports", False) else "No",
        "menu_for_children": "Yes" if place.get("menuForChildren", False) else "No",
        "parking_options": place.get("parkingOptions", "N/A"),
        # "Serving Status": place.get("servingStatus", "N/A"),
        # "Wheelchair Accessible Entrance": "Yes" if place.get("wheelchairAccessibleEntrance", False) else "No",
        # "Wheelchair Accessible Restroom": "Yes" if place.get("wheelchairAccessibleRestroom", False) else "No",
        # "Wheelchair Accessible Seating": "Yes" if place.get("wheelchairAccessibleSeating", False) else "No",
        # "Wheelchair Accessible Parking": "Yes" if place.get("wheelchairAccessibleParking", False) else "No",
    }

def enrich_yelp_data(yelp_json_path, output_file="enriched_restaurants.csv"):
    # check if file is json
    if yelp_json_path.endswith('.json'):
        yelp_df = pd.read_json(yelp_json_path, lines = True)
    elif yelp_json_path.endswith('.csv'):
        yelp_df = pd.read_csv(yelp_json_path)
    else:
        print("Invalid file format. Please provide a JSON or CSV file")
        return
    # yelp_df = yelp_df.head(10)  # Limit to first 10 for testing
    results = []
    total = len(yelp_df)
    
    for i, row in yelp_df.iterrows():
        print(f"Processing {i+1}/{total}: ", end="")
        result = process_yelp_row(row)
        if result:
            results.append(result)
        # time.sleep(1.1)  # Maintain API rate limits

    if results:
        pd.DataFrame(results).to_csv(output_file, index=False)
        print(f"\n✅ Enriched data saved to {output_file} ({len(results)}/{total} matched)")
    else:
        print("No data to save")



In [195]:
# enrich_yelp_data("restaurants.csv") uncomment to rerun the 8k requests (takes 1 hour)


In [None]:
# # fill in address where address_x is null
# restaurants_merged["address_x"] = restaurants_merged["address_x"].fillna(restaurants_merged["address_y"])
# # remove address_y column
# restaurants_merged = restaurants_merged.drop(columns=["address_y"])
# # rename address_x to address
# restaurants_merged = restaurants_merged.rename(columns={"address_x": "address"})
# # get address description
# print(restaurants_merged['address'].describe())

count             8057
unique            7461
top       51 N 12th St
freq                26
Name: address, dtype: object


In [206]:
# adjust columns
# removed columns: payment_options, ev_charge_options, fuel_options, state, is_open
# to merge columns (handled by code below): 
# RestaurantsTakeOut and takeout
# RestaurantsDelivery and delivery
print(restaurants_merged.columns)
restaurants_merged = restaurants_merged[['business_id', 'name', 'categories', 'review_count', 'user_rating_count', 'stars', 'latitude', 'longitude', 'postal_code', 
                                         'city', 'address', 'RestaurantsTakeOut', 'hours', 'BusinessAcceptsCreditCards', 'RestaurantsDelivery', 
                                         'RestaurantsPriceRange2', 'RestaurantsReservations', 'HasTV', 'OutdoorSeating', 'RestaurantsGoodForGroups', 'WiFi', 
                                         'GoodForKids', 'BikeParking', 'RestaurantsAttire', 'Caters', 'BusinessParking', 'NoiseLevel', 'Ambience', 'GoodForMeal', 
                                         'RestaurantsTableService', 'WheelchairAccessible', 'HappyHour', 'Alcohol', 'DogsAllowed', 'BusinessAcceptsBitcoin', 'BYOB', 
                                         'Corkage', 'DriveThru', 'BestNights', 'CoatCheck', 'ByAppointmentOnly', 'GoodForDancing', 'Smoking', 'BYOBCorkage', 'Music', 
                                         'AgesAllowed', 'RestaurantsCounterService', 'Open24Hours', 'AcceptsInsurance', 'DietaryRestrictions', 'dist_highway', 
                                         'adjusted_gross_income', 'rural_urban_continuum_code_2023', 'unemployment_rate_2023', 'displayName', 'price_level', 
                                         'price_range', 'rating', 'regular_opening_hours', 'number_of_photos', 'types', 'takeout', 'delivery', 
                                         'dine_in', 'reservable', 'serves_breakfast', 'serves_lunch', 'serves_dinner', 'serves_brunch', 'serves_beer', 'serves_wine', 
                                         'serves_cocktails', 'serves_dessert', 'serves_coffee', 'outdoor_seating', 'live_music', 'allows_dogs', 'editorial_summary', 
                                         'good_for_children', 'good_for_groups', 'good_for_watching_sports', 'menu_for_children', 'parking_options']]
print(restaurants_merged.columns)

Index(['business_id', 'longitude', 'name', 'categories', 'review_count', 'stars', 'is_open', 'latitude', 'postal_code', 'state', 'city', 'address', 'RestaurantsTakeOut', 'hours',
       'BusinessAcceptsCreditCards', 'RestaurantsDelivery', 'RestaurantsPriceRange2', 'RestaurantsReservations', 'HasTV', 'OutdoorSeating', 'RestaurantsGoodForGroups', 'WiFi', 'GoodForKids',
       'BikeParking', 'RestaurantsAttire', 'Caters', 'BusinessParking', 'NoiseLevel', 'Ambience', 'GoodForMeal', 'RestaurantsTableService', 'WheelchairAccessible', 'HappyHour', 'Alcohol',
       'DogsAllowed', 'BusinessAcceptsBitcoin', 'BYOB', 'Corkage', 'DriveThru', 'BestNights', 'CoatCheck', 'ByAppointmentOnly', 'GoodForDancing', 'Smoking', 'BYOBCorkage', 'Music', 'AgesAllowed',
       'RestaurantsCounterService', 'Open24Hours', 'AcceptsInsurance', 'DietaryRestrictions', 'dist_highway', 'adjusted_gross_income', 'rural_urban_continuum_code_2023', 'unemployment_rate_2023',
       'displayName', 'price_level', 'price_range'

In [207]:
# check if all rows in categories column are string
print(restaurants_merged['categories'].apply(lambda x: isinstance(x, str)).all())
# check if all rows in types column are string
print(restaurants_merged['types'].apply(lambda x: isinstance(x, str)).all())
# find all types that are not string
print(restaurants_merged[~restaurants_merged['types'].apply(lambda x: isinstance(x, str))]['types'].unique())
# types are all strings except for some nan values


True
False
[nan]


In [208]:
# convert categories column to list of words
restaurants_merged['categories'] = restaurants_merged['categories'].apply(lambda x: x.split(", "))
restaurants_merged['types'] = restaurants_merged['types'].apply(lambda x: x.split(", ") if isinstance(x, str) else [])

In [209]:
# combine categories and types columns
restaurants_merged['categories'] = restaurants_merged['categories'] + restaurants_merged['types']
# restaurants_merged['categories'] = restaurants_merged['categories'].apply(lambda x: list(set(x)))

In [210]:
print(restaurants_merged['categories'].info())

# remove types column
restaurants_merged = restaurants_merged.drop(columns=["types"])

<class 'pandas.core.series.Series'>
Int64Index: 8069 entries, 0 to 8068
Series name: categories
Non-Null Count  Dtype 
--------------  ----- 
8069 non-null   object
dtypes: object(1)
memory usage: 126.1+ KB
None


In [211]:
print(restaurants_merged['categories'].describe())

# get top 5 categories
categories = restaurants_merged['categories'].explode()
print(categories.value_counts().head(5))

count                     8069
unique                    7027
top       [Restaurants, Pizza]
freq                        34
Name: categories, dtype: object
Restaurants          8069
establishment        6971
point_of_interest    6965
food                 6674
restaurant           6078
Name: categories, dtype: int64


In [212]:
print(restaurants_merged['review_count'].dtype)
print(restaurants_merged['review_count'].describe())
# find nulls
print(restaurants_merged['review_count'].isnull().sum())

int64
count    8069.000000
mean       99.986120
std       206.327774
min         5.000000
25%        16.000000
50%        41.000000
75%       106.000000
max      5721.000000
Name: review_count, dtype: float64
0


In [213]:
print(restaurants_merged['user_rating_count'].dtype)
print(restaurants_merged['user_rating_count'].describe())
# find nulls
print(restaurants_merged['user_rating_count'].isnull().sum())

float64
count     6949.000000
mean       643.744711
std       1333.073084
min          1.000000
25%        170.000000
50%        336.000000
75%        689.000000
max      43377.000000
Name: user_rating_count, dtype: float64
1120


In [214]:
print(restaurants_merged['RestaurantsTakeOut'].isnull().sum())
matching_columns = {"RestaurantsTakeOut": "takeout", "RestaurantsDelivery": "delivery", 'RestaurantsReservations': 'reservable',
                    'Music': 'live_music', 'DogsAllowed': 'allows_dogs', 'BusinessParking': 'parking_options', 'GoodForKids': 'good_for_children', 
                    'RestaurantsGoodForGroups': 'good_for_groups', 'OutdoorSeating': 'outdoor_seating'}
for yelp_col, google_col in matching_columns.items():
    # replace nulls in yelp column with values from google column
    restaurants_merged[yelp_col] = restaurants_merged[yelp_col].fillna(restaurants_merged[google_col])
    # remove google column
    restaurants_merged = restaurants_merged.drop(columns=[google_col])
    print(f'Describing {yelp_col}')
    print(restaurants_merged[yelp_col].describe())
    print(restaurants_merged[yelp_col].isnull().sum())

598
Describing RestaurantsTakeOut
count     7989
unique       4
top       True
freq      7230
Name: RestaurantsTakeOut, dtype: object
80
Describing RestaurantsDelivery
count     7942
unique       4
top       True
freq      5485
Name: RestaurantsDelivery, dtype: object
127
Describing RestaurantsReservations
count      7838
unique        4
top       False
freq       4236
Name: RestaurantsReservations, dtype: object
231
Describing Music
count     7041
unique      15
top         No
freq      6368
Name: Music, dtype: object
1028
Describing DogsAllowed
count     7246
unique       4
top         No
freq      5323
Name: DogsAllowed, dtype: object
823
Describing BusinessParking
count     7177
unique      96
top        lot
freq      2551
Name: BusinessParking, dtype: object
892
Describing GoodForKids
count     7781
unique       4
top       True
freq      4949
Name: GoodForKids, dtype: object
288
Describing RestaurantsGoodForGroups
count     7788
unique       4
top       True
freq      4870
Name: 

In [215]:
# print to csv
restaurants_merged.to_csv('restaurants_merged.csv', index=False)

In [216]:
# # Load the business data and filter for Pennsylvania restaurants
# business_data = []

# for filename in business_file_id:
#     with open(filename, 'r', encoding='utf-8') as f:
#         for line in f:
#             business_data.append(json.loads(line.strip()))

# # Create a DataFrame from the review_data
# business_data_raw = pd.DataFrame(business_data)
# # print("Dataframe Shape:", business_data_raw.shape)
# # business_data_raw.head(3)
# restaurants = business_data_raw[business_data_raw['categories'].str.contains('Restaurant.*') == True].reset_index()
# restaurants_PA = restaurants[restaurants['state'] == 'PA'].reset_index()
# # Ensure attributes is always a dictionary
# restaurants_PA["attributes"] = restaurants_PA["attributes"].apply(lambda x: x if isinstance(x, dict) else {})
# # Normalize attributes column
# attributes_df = pd.json_normalize(restaurants_PA["attributes"])

# # Concatenate the main dataframe with the expanded attributes
# restaurants_PA = restaurants_PA.drop(columns=["attributes"]).join(attributes_df)

# for att in ["Alcohol", "WiFi", "RestaurantsAttire", "NoiseLevel", "BYOBCorkage", "AgesAllowed", "Smoking"]:
#     restaurants_PA[att] = restaurants_PA[att].apply(
#         lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# restaurants_PA['RestaurantsPriceRange2'] = pd.to_numeric(
#     restaurants_PA['RestaurantsPriceRange2'], errors='coerce')

# for att in ["Ambience", "BusinessParking", "GoodForMeal", "Music", "BestNights", "DietaryRestrictions"]:
#     restaurants_PA[att] = restaurants_PA[att].apply(
#         lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
#     restaurants_PA[att] = restaurants_PA[att].apply(
#         lambda x: ", ".join([k for k, v in x.items() if v]
#                             ) if isinstance(x, dict) else ""
#     )

# def clean_values(x):
#     """Cleans and standardizes values across columns."""
#     if isinstance(x, str):
#         x_lower = x.strip().lower()
#         if x_lower in ["true"]:
#             return True
#         elif x_lower in ["false"]:
#             return False
#         elif x_lower in ["none", "nan", "null", ""]:
#             return np.nan  # Convert both "None" and "nan" strings to pd.NA
#         return x  # Keep other strings unchanged
#     elif pd.isna(x) or x is None:
#         return np.nan  # Convert NaN and None to pd.NA
#     return x  # Keep other values unchanged

# # Apply cleaning function to all columns
# restaurants = restaurants.apply(lambda col: col.map(clean_values))

# # Drop columns
# restaurants = restaurants.drop(columns=['index'])

# pd.set_option("display.max_columns", None)
# pd.set_option("display.width", 200)
# restaurants.head()

In [217]:
# # overlapping columns:
# # address: formattedAddress
# # RestaurantsTakeOut: takeout
# # hours: regularOpeningHours (requires a bit of work to combine both, tbd)
# # BusinessAcceptsCreditCards: paymentOptions (requires a bit of work to combine both, tbd)
# # RestaurantsDelivery: delivery
# # RestaurantsPriceRange2: priceLevel (1,2,4 for PRICE_LEVEL_INEXPENSIVE, PRICE_LEVEL_MODERATE, PRICE_LEVEL_EXPENSIVE)
# # RestaurantsReservations: reservable
# # HasTV: N/A
# # OutdoorSeating: outdoorSeating
# # RestaurantsGoodForGroups: goodForGroups
# # Wifi: N/A
# # GoodForKids: goodForChildren
# # RestaurantsAttire: N/A
# # Caters: N/A
# # BusinessParking: parkingOptions
# # NoiseLevel: N/A
# # Ambience: N/A
# # GoodForMeal: N/A
# # RestaurantsTableService: N/A
# # WheelchairAccessible: N/A
# # HappyHour: N/A
# # Alcohol: N/A
# # DogsAllowed: allowsDogs
# # BusinessAcceptsBitcoin: N/A
# # BYOB: N/A
# # Corkage: N/A
# # DriveThru: N/A
# # BestNights: N/A
# # CoatCheck: N/A
# # ByAppointmentOnly: N/A
# # GoodForDancing: N/A
# # Smoking: N/A
# # BYOBCorkage: N/A
# # Music: liveMusic
# # AgesAllowed: N/A
# # RestaurantsCounterService: N/A
# # Open24Hours: N/A
# # AcceptsInsurance: N/A
# # DietaryRestrictions: N/A
# # new column: "priceRange"
# # new column: "rating" (avg_rating)
# # new column: "userRatingCount",
# # new column: "photos",
#     "types",
#     "takeout",
#     "delivery",
#     "dineIn",
#     "reservable",
#     "servesBreakfast",
#     "servesLunch",
#     "servesDinner",
#     "servesBrunch",
#     "servesBeer",
#     "servesWine",
#     "servesCocktails",
#     "servesDessert",
#     "servesCoffee",
#     "outdoorSeating",
#     "liveMusic",
#     "allowsDogs",
#     "paymentOptions",
#     "editorialSummary",
#     "evChargeOptions",
#     "fuelOptions",
#     "goodForChildren",
#     "goodForGroups",
#     "goodForWatchingSports",
#     "menuForChildren",
#     "parkingOptions",


# # combine the 2 csv files now