In [31]:
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pickle

In [8]:
restaurant_data = pd.read_json("../dataset/restaurants_list.json")
restaurant_data.head(5)

Unnamed: 0,objectID,name,address,area,city,country,image_url,mobile_reserve_url,payment_options,phone,postal_code,price,reserve_url,state,_geoloc
0,101422,Town,348 Main Street,Denver / Colorado,Carbondale,US,https://www.opentable.com/img/restimages/10142...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Discover, MasterCard, Visa]",9709636328,81623,2,http://www.opentable.com/single.aspx?rid=101422,CO,"{'lat': 39.400235, 'lng': -107.210373}"
1,113494,Plates Kitchen,301 Glenwood Ave,Raleigh / Durham / Chapel Hill,Raleigh,US,https://www.opentable.com/img/restimages/11349...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Diners Club, MasterCard, Visa]",9198280018x,27603,2,http://www.opentable.com/single.aspx?rid=113494,NC,"{'lat': 35.784585, 'lng': -78.647982}"
2,152470,Pax Americana,4319 Montrose Blvd.,Houston,Houston,US,https://www.opentable.com/img/restimages/15247...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Discover, MasterCard, Visa]",7132390228,77006,2,http://www.opentable.com/single.aspx?rid=152470,TX,"{'lat': 29.733565, 'lng': -95.390868}"
3,145693,Vinotopia Restaurant and Bar,5724 West 136th Terrace,Kansas City,Overland Park,US,https://www.opentable.com/img/restimages/14569...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Discover, MasterCard, Visa]",9134029300,66223,2,http://www.opentable.com/single.aspx?rid=145693,KS,"{'lat': 38.882144, 'lng': -94.698596}"
4,22588,Biagio's Osteria,88 Ryder's Landing,New York / Tri-State Area,Stratford,US,https://www.opentable.com/img/restimages/22588...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Discover, MasterCard, Visa]",2033759071x,6614,2,http://www.opentable.com/single.aspx?rid=22588,CT,"{'lat': 41.2434, 'lng': -73.10084}"


In [27]:
def get_image_url_from_website(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, allow_redirects=True)
        response.raise_for_status()  # Check if the request was successful
        
        soup = BeautifulSoup(response.text, 'html.parser')

        img = soup.find('img', {'data-test': 'restaurant-profile-photo'})
        if img and 'src' in img.attrs:
            return img['src']
        else:
            return None
    except requests.RequestException as e:
        return None
    except ValueError as e:
        return None

In [29]:
restaurant_data['new_image_url'] = restaurant_data['reserve_url'].apply(lambda url: get_image_url_from_website(url))

In [63]:
with open('new_image_urls.pkl', 'wb') as f:
    pickle.dump(restaurant_data['new_image_url'], f)

In [34]:
restaurant_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   objectID            5000 non-null   int64 
 1   name                5000 non-null   object
 2   address             5000 non-null   object
 3   area                5000 non-null   object
 4   city                5000 non-null   object
 5   country             5000 non-null   object
 6   image_url           5000 non-null   object
 7   mobile_reserve_url  5000 non-null   object
 8   payment_options     5000 non-null   object
 9   phone               5000 non-null   object
 10  postal_code         5000 non-null   object
 11  price               5000 non-null   int64 
 12  reserve_url         5000 non-null   object
 13  state               5000 non-null   object
 14  _geoloc             5000 non-null   object
 15  new_image_url       1925 non-null   object
dtypes: int64(2), object(14)


In [37]:
restaurant_data.new_image_url.fillna(restaurant_data.image_url, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  restaurant_data.new_image_url.fillna(restaurant_data.image_url, inplace=True)


In [38]:
restaurant_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   objectID            5000 non-null   int64 
 1   name                5000 non-null   object
 2   address             5000 non-null   object
 3   area                5000 non-null   object
 4   city                5000 non-null   object
 5   country             5000 non-null   object
 6   image_url           5000 non-null   object
 7   mobile_reserve_url  5000 non-null   object
 8   payment_options     5000 non-null   object
 9   phone               5000 non-null   object
 10  postal_code         5000 non-null   object
 11  price               5000 non-null   int64 
 12  reserve_url         5000 non-null   object
 13  state               5000 non-null   object
 14  _geoloc             5000 non-null   object
 15  new_image_url       5000 non-null   object
dtypes: int64(2), object(14)


In [39]:
restaurant_info = pd.read_csv("../dataset/restaurants_info.csv", header=0, delimiter=";")
restaurant_info.head(5)


Unnamed: 0,objectID,food_type,stars_count,reviews_count,neighborhood,phone_number,price_range,dining_style
0,116272,Steak,4.2,204,Pepper Pike,(216) 378-8988,$31 to $50,Fine Dining
1,138901,Sushi,4.3,75,Medina,(330) 661-0606,$30 and under,Casual Dining
2,96892,Italian,4.3,161,Bath,(330) 666-9990,$30 and under,Casual Dining
3,108532,American,4.5,159,Boardman,(330) 965-5899,$30 and under,Casual Dining
4,25840,Sushi,4.5,332,Shaker Heights,(216) 767-1111,$30 and under,Casual Dining


In [40]:
restaurant_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   objectID       5000 non-null   int64  
 1   food_type      5000 non-null   object 
 2   stars_count    5000 non-null   float64
 3   reviews_count  5000 non-null   int64  
 4   neighborhood   5000 non-null   object 
 5   phone_number   5000 non-null   object 
 6   price_range    5000 non-null   object 
 7   dining_style   5000 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 312.6+ KB


In [41]:
data = pd.merge(restaurant_data, restaurant_info, on='objectID')
data.head(5)

Unnamed: 0,objectID,name,address,area,city,country,image_url,mobile_reserve_url,payment_options,phone,...,state,_geoloc,new_image_url,food_type,stars_count,reviews_count,neighborhood,phone_number,price_range,dining_style
0,101422,Town,348 Main Street,Denver / Colorado,Carbondale,US,https://www.opentable.com/img/restimages/10142...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Discover, MasterCard, Visa]",9709636328,...,CO,"{'lat': 39.400235, 'lng': -107.210373}",https://www.opentable.com/img/restimages/10142...,Contemporary American,4.6,180,Carbondale,(970) 963-6328,$30 and under,Casual Dining
1,113494,Plates Kitchen,301 Glenwood Ave,Raleigh / Durham / Chapel Hill,Raleigh,US,https://www.opentable.com/img/restimages/11349...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Diners Club, MasterCard, Visa]",9198280018x,...,NC,"{'lat': 35.784585, 'lng': -78.647982}",https://resizer.otstatic.com/v2/photos/xlarge/...,International,4.3,304,Raleigh,(919) 828-0018,$30 and under,Casual Elegant
2,152470,Pax Americana,4319 Montrose Blvd.,Houston,Houston,US,https://www.opentable.com/img/restimages/15247...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Discover, MasterCard, Visa]",7132390228,...,TX,"{'lat': 29.733565, 'lng': -95.390868}",https://www.opentable.com/img/restimages/15247...,American,4.5,224,Midtown / Montrose,(713) 239-0228,$30 and under,Casual Elegant
3,145693,Vinotopia Restaurant and Bar,5724 West 136th Terrace,Kansas City,Overland Park,US,https://www.opentable.com/img/restimages/14569...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Discover, MasterCard, Visa]",9134029300,...,KS,"{'lat': 38.882144, 'lng': -94.698596}",https://www.opentable.com/img/restimages/14569...,Contemporary American,3.4,72,Overland Park,(913) 402-9300,$50 and over,Casual Dining
4,22588,Biagio's Osteria,88 Ryder's Landing,New York / Tri-State Area,Stratford,US,https://www.opentable.com/img/restimages/22588...,http://mobile.opentable.com/opentable/?restId=...,"[AMEX, Discover, MasterCard, Visa]",2033759071x,...,CT,"{'lat': 41.2434, 'lng': -73.10084}",https://www.opentable.com/img/restimages/22588...,Italian,4.4,596,Stratford,(203) 375-9071,$30 and under,Casual Elegant


In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   objectID            5000 non-null   int64  
 1   name                5000 non-null   object 
 2   address             5000 non-null   object 
 3   area                5000 non-null   object 
 4   city                5000 non-null   object 
 5   country             5000 non-null   object 
 6   image_url           5000 non-null   object 
 7   mobile_reserve_url  5000 non-null   object 
 8   payment_options     5000 non-null   object 
 9   phone               5000 non-null   object 
 10  postal_code         5000 non-null   object 
 11  price               5000 non-null   int64  
 12  reserve_url         5000 non-null   object 
 13  state               5000 non-null   object 
 14  _geoloc             5000 non-null   object 
 15  new_image_url       5000 non-null   object 
 16  food_t

In [43]:
payment_types = set()
for row in data['payment_options']:
    payment_types.update({item for item in row})

print(payment_types)

{'Diners Club', 'JCB', 'AMEX', 'Carte Blanche', 'Cash Only', 'Pay with OpenTable', 'Visa', 'MasterCard', 'Discover'}


In [55]:
accepted_payment_types = {'Diners Club': 
                          {'allowed': True,
                           'name': 'Discover'},
                           'JCB': 
                           {'allowed': False,
                           'name': ''},
                            'AMEX': 
                           {'allowed': True,
                           'name': 'AMEX'},
                             'Carte Blanche':
                             {'allowed': True,
                           'name': 'Discover'},
                             'Cash Only': 
                           {'allowed': False,
                           'name': ''},
                            'Pay with OpenTable': 
                           {'allowed': False,
                           'name': ''},
                             'Visa': 
                             {'allowed': True,
                           'name': 'Visa'},
                           'MasterCard': 
                             {'allowed': True,
                           'name': 'MasterCard'}, 
                           'Discover': 
                             {'allowed': True,
                           'name': 'Discover'},}

def fix_payment_options(option_list):
    out = []
    for option in option_list:
        if accepted_payment_types[option]['allowed']:
            out.append(accepted_payment_types[option]['name'])
    
    return list(set(out))

In [57]:
data['payment_options'] = data['payment_options'].apply(lambda x: fix_payment_options(x))

In [62]:
json_str = data.to_json(orient="records")
parsed = json.loads(json_str)

with open("merged_restaurant_info.json", 'w') as json_file:
    json_file.write(json.dumps(parsed, indent=4 ))