In [11]:
import csv
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# HTTP headers passed to requests.get for every page fetch. The User-Agent is a
# desktop Chrome string (presumably so the site serves its regular HTML -- not verified here).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/121.0.0.0 Safari/537.36'
    ),
}

In [3]:
# Site root; the scraping loop prepends it to each relative page path.
base_url = 'https://www.yellowpages.com'

# Relative path (with query string) of the first results page to fetch.
# The loop overwrites this with the "next" link of every page it visits.
current_page = (
    '/search'
    '?search_terms=restaurants'
    '&geo_location_terms=New+York+City%2C+NY'
)

In [4]:
# counter: 1-based number of the page being fetched (used only for progress output).
# restaurant_data: accumulates one list of field values per scraped listing.
counter, restaurant_data = 1, []

In [5]:
# Translate the dollar-sign price symbol shown on a listing ('$' .. '$$$$$')
# into a descriptive label; the label at position i belongs to i + 1 dollar signs.
_price_labels = ['Affordable', 'Moderate', 'Upscale', 'Fine Dining', 'Luxury']
price_range_mapping = {
    '$' * dollar_count: label
    for dollar_count, label in enumerate(_price_labels, start=1)
}

In [6]:
# Seconds to wait on the server before abandoning a request
# (requests applies no timeout unless one is passed explicitly).
REQUEST_TIMEOUT_SECONDS = 30

# Maps the word found in the second CSS class of the 'result-rating' div to a star value.
# Built once here instead of once per listing.
rating_mapping = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5}


def _cast_or_none(value, cast):
    """Return cast(value), or None when value is missing or cannot be converted."""
    try:
        return cast(value)
    except (TypeError, ValueError):
        return None


def _parse_listing(element):
    """Extract one row of restaurant fields from a single 'div.info' element.

    Every field is reset for each listing, so a missing sub-element yields None
    (or 'Unknown' / 0 for price range / order-online) instead of silently
    reusing the value scraped from the previous listing.

    Returns:
        A list in the order [restaurant_id, business_name, categories, ta_rating,
        ta_rating_count, yp_rating, yp_rating_count, phone_number, street_address,
        locality, zip_code, price_range, years, website_url, menu_url, snippet,
        order_online], or None when the element has no 'h2.n' heading or no
        'a.business-name' link and therefore cannot be identified.
    """
    # Extract YellowPages ID & Business Name
    heading = element.find('h2', class_='n')
    name_link = element.find('a', class_='business-name')
    if heading is None or name_link is None:
        return None
    # The ID is whatever precedes the first '.' in the heading text.
    restaurant_id = heading.text.strip().split('.')[0]
    name_span = name_link.find('span')
    business_name = (name_span if name_span is not None else name_link).text.strip()

    # Extract categories
    category_element = element.find('div', class_='categories')
    if category_element:
        categories = [link.text.strip() for link in category_element.find_all('a')]
    else:
        categories = None

    # Extract TripAdvisor Ratings (JSON stored in the 'data-tripadvisor' attribute)
    ta_rating = None
    ta_rating_count = None
    ratings_element = element.find('div', class_='ratings')
    ta_data = ratings_element.get('data-tripadvisor') if ratings_element else None
    if ta_data:
        try:
            ta_json = json.loads(ta_data)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            ta_json = None
        if isinstance(ta_json, dict):
            ta_rating = _cast_or_none(ta_json.get('rating'), float)
            ta_rating_count = _cast_or_none(ta_json.get('count'), int)

    # Extract YellowPages Ratings
    yp_rating = None
    yp_rating_count = None
    yp_rating_element = element.find('a', class_='hasExtraRating')
    if yp_rating_element:
        rating_div = yp_rating_element.find('div', class_='result-rating')
        rating_classes = (rating_div.get('class') or []) if rating_div else []
        if len(rating_classes) > 1:
            yp_rating = rating_mapping.get(rating_classes[1].lower())
        count_span = yp_rating_element.find('span', class_='count')
        if count_span:
            yp_rating_count = _cast_or_none(count_span.text.strip('()'), int)

    # Extract Phone Number
    phone_number_element = element.find('div', class_='phones')
    phone_number = phone_number_element.text.strip() if phone_number_element else None

    # Extract Street Address & Locality
    street_address = None
    locality = None
    zip_code = None
    address_element = element.find('div', class_='adr')
    if address_element:
        street_element = address_element.find('div', class_='street-address')
        if street_element:
            # Collapse runs of spaces inside the street text.
            street_address = re.sub(' +', ' ', street_element.text.strip())
        locality_element = address_element.find('div', class_='locality')
        if locality_element:
            locality = locality_element.text.strip()
            # First standalone 5-digit number in the locality text is taken as the zip code.
            zip_code_match = re.search(r'\b\d{5}\b', locality)
            if zip_code_match:
                zip_code = zip_code_match.group(0)

    # Extract Price Range
    price_element = element.find('div', class_='price-range')
    if price_element:
        price_range = price_range_mapping.get(price_element.text.strip(), 'Unknown')
    else:
        price_range = 'Unknown'

    # Extract Years In Business
    years = None
    years_in_business_element = element.find('div', class_='years-in-business')
    if years_in_business_element:
        years_count_element = years_in_business_element.find('div', class_='count')
        years_match = re.search(r'\d+', years_count_element.text) if years_count_element else None
        if years_match:
            years = int(years_match.group(0))

    # Extract Website URL
    website_element = element.find('a', class_='track-visit-website')
    website_url = website_element.get('href') if website_element else None

    # Extract Menu URL (the href is prefixed with the site host)
    menu_element = element.find('a', class_='menu')
    menu_href = menu_element.get('href') if menu_element else None
    menu_url = 'http://yellowpages.com' + menu_href if menu_href else None

    # Extract Secondary Info
    snippet = None
    snippet_element = element.find('div', class_='snippet')
    if snippet_element:
        snippet_body = snippet_element.find('p', class_='body')
        if snippet_body:
            snippet = snippet_body.text.strip()

    # Extract Order Online Status (1 when an 'order-online' link is present, else 0)
    ctas_element = element.find('div', class_='listing-ctas')
    order_online = 1 if ctas_element and ctas_element.find('a', class_='order-online') else 0

    return [restaurant_id, business_name, categories, ta_rating, ta_rating_count,
            yp_rating, yp_rating_count, phone_number, street_address, locality,
            zip_code, price_range, years, website_url, menu_url, snippet, order_online]


# Walk the result pages, following each page's "next" link until there is none.
while current_page:
    print('Fetching page: ', counter)
    try:
        response = requests.get(base_url + current_page, headers=headers,
                                timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # current_page is left untouched, so re-running this cell retries the same page.
        print('Request failed, stopping: ', error)
        break
    if response.status_code != 200:
        # Without a fresh page there is no "next" link to follow; stop instead of
        # re-reading the previous page's link (which would refetch this URL forever).
        print('Unexpected status code, stopping: ', response.status_code)
        break

    soup = BeautifulSoup(response.text, 'html.parser')
    for element in soup.find_all('div', class_='info'):
        row = _parse_listing(element)
        if row is not None:
            restaurant_data.append(row)

    next_page = soup.find('a', class_='next ajax-page')
    if next_page:
        current_page = next_page.get('href')
        counter = counter + 1
    else:
        current_page = None
            

Fetching page:  1
Fetching page:  2
Fetching page:  3
Fetching page:  4
Fetching page:  5
Fetching page:  6
Fetching page:  7
Fetching page:  8
Fetching page:  9
Fetching page:  10
Fetching page:  11
Fetching page:  12
Fetching page:  13
Fetching page:  14
Fetching page:  15
Fetching page:  16
Fetching page:  17
Fetching page:  18
Fetching page:  19
Fetching page:  20
Fetching page:  21
Fetching page:  22
Fetching page:  23
Fetching page:  24
Fetching page:  25
Fetching page:  26
Fetching page:  27
Fetching page:  28
Fetching page:  29
Fetching page:  30
Fetching page:  31
Fetching page:  32
Fetching page:  33
Fetching page:  34
Fetching page:  35
Fetching page:  36
Fetching page:  37
Fetching page:  38
Fetching page:  39
Fetching page:  40
Fetching page:  41
Fetching page:  42
Fetching page:  43
Fetching page:  44
Fetching page:  45
Fetching page:  46
Fetching page:  47
Fetching page:  48
Fetching page:  49
Fetching page:  50
Fetching page:  51
Fetching page:  52
Fetching page:  53
Fe

In [7]:
# Column headers for the scraped table, listed in the same order as the values
# appended to each row of restaurant_data.
column_names = [
    'Restaurant ID',
    'Name',
    'Type of Restaurant',
    'Trip Advisor Rating',
    'TA Review Count',
    'Yellow Pages Rating',
    'YP Review Count',
    'Phone Number',
    'Street Address',
    'Locality',
    'Zipcode',
    'Dollar Costs',
    'Years in Business',
    'Website',
    'Menu URL',
    'Secondary Info',
    'Order Online Status',
]

In [8]:
# Assemble every scraped row into one table, labelling the positional fields
# with the headers from column_names.
df = pd.DataFrame(
    data=restaurant_data,
    columns=column_names,
)

In [17]:
# Numeric columns in which a missing value should be stored as None rather than NaN.
nullable_numeric_columns = [
    'Yellow Pages Rating',
    'YP Review Count',
    'Trip Advisor Rating',
    'TA Review Count',
    'Years in Business',
]

for column in nullable_numeric_columns:
    # Series.replace(np.nan, None) is version-dependent: older pandas releases treat
    # value=None as "no value given" and forward-fill from the previous row instead.
    # Casting to object and masking the missing entries gives None on every version.
    df[column] = df[column].astype(object).where(df[column].notna(), None)

df

Unnamed: 0,Restaurant ID,Name,Type of Restaurant,Trip Advisor Rating,TA Review Count,Yellow Pages Rating,YP Review Count,Phone Number,Street Address,Locality,Zipcode,Dollar Costs,Years in Business,Website,Menu URL,Secondary Info,Order Online Status
0,1,EJ's Luncheonette,"[Restaurants, American Restaurants, Caterers]",4.0,268.0,,,(212) 472-0600,1271 3rd Ave Close To The 72nd Street Crosstow...,"New York City, NY 10021",10021,Moderate,34.0,http://www.ejsluncheonette.com,http://yellowpages.com/new-york-city-ny/mip/ej...,From Business: EJ's Luncheonette is a NYC icon...,1
1,2,Route 66 Cafe,"[Restaurants, American Restaurants, Bars]",4.0,396.0,5.0,1.0,(212) 977-7600,858 9th Ave Between 55th And 56th Streets,"New York City, NY 10019",10019,Moderate,28.0,http://www.route66nyc.com,http://yellowpages.com/new-york-city-ny/mip/ro...,Great American style food but not a low end bb...,1
2,3,The Sea Fire Grill,"[Restaurants, Seafood Restaurants, Bars]",4.5,1154.0,4.0,1.0,(212) 935-3785,158 E 48th St Between 3rd And Lexington Avenue,"New York City, NY 10017",10017,Fine Dining,12.0,http://www.theseafiregrill.com,http://yellowpages.com/new-york-city-ny/mip/th...,"My husband and I went to Manhattan, N.Y. to ce...",0
3,4,Bobo,"[Restaurants, Continental Restaurants, Bars]",4.0,207.0,4.0,1.0,(212) 488-2626,181 W 10th St At Seventh Avenue,"New York City, NY 10014",10014,Fine Dining,17.0,http://www.bobonyc.com,http://yellowpages.com/new-york-city-ny/mip/bo...,We ordered Duck and Scallopsboth great!dont le...,1
4,5,Gayle's Broadway Rose,"[Restaurants, American Restaurants, Take Out R...",4.5,76.0,,,(347) 933-2673,228 W 47th St At Friedmans At The Edison Hotel,"New York City, NY 10036",10036,Unknown,,http://www.gaylesbroadwayrose.com,,From Business: Featuring the most talented asp...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Co.,"[Restaurants, Pizza, American Restaurants]",,,,,(212) 243-1105,230 9th Ave,"New York, NY 10001",10001,Moderate,15.0,https://www.sullivanstreetpizza.com,http://yellowpages.com/new-york-ny/mip/co-4710...,,0
2996,2997,Sushi You,"[Restaurants, Sushi Bars, Japanese Restaurants]",4.5,56.0,,,(212) 752-2987,246 E 51st St,"New York, NY 10022",10022,Moderate,25.0,http://www.sushi-you.com,http://yellowpages.com/new-york-ny/mip/sushi-y...,,0
2997,2998,Town House Inn Restaurant,"[Restaurants, American Restaurants, Coffee Shops]",4.5,72.0,,,(212) 599-6254,696 2nd Ave,"New York, NY 10016",10016,Affordable,29.0,,http://yellowpages.com/new-york-ny/mip/town-ho...,,1
2998,2999,Kiku Sushi,"[Restaurants, Sushi Bars, Japanese Restaurants]",,,,,(212) 627-3660,235 9th Ave,"New York, NY 10001",10001,Moderate,17.0,http://www.kikusushijapanese.com,http://yellowpages.com/new-york-ny/mip/kiku-su...,,1


In [None]:
# Write the table to CSV without the index column. to_csv does not create
# missing directories, so make sure the target folder exists first.
output_path = './data/clean/yellowpages_restaurants.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)