In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

scrapping city list

In [2]:
def extract_cities(page) -> BeautifulSoup:
    
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    url = f'https://www.tripadvisor.com.mx/Restaurants-g150768-oa{page}-Mexico.html#LOCATION_LIST'
    r = requests.get(url, headers=headers)

    soup = BeautifulSoup(r.content, 'html.parser')

    return soup

def transform_cities(soup) -> list:
    
    cities_ = []
    items = soup.find_all('div', class_='geo_wrap')

    for item in items:
        city =  item.find('a').text
        href =  item.find('a')['href']

        city = {'city': city, 'href': href}
        cities_.append(city)

    return cities_

def transform_cities2(soup) -> list:
    
    cities_ = []
    items = soup.find_all('ul', {'class': 'geoList'})

    for ultag in items:
        for litag in ultag.find_all('li'):
            city = litag.text
            href = litag.find('a')['href']
            
            city = {'city':city,'href':href}
            cities_.append(city)
    return cities_

In [3]:
max_page = 1

cities = transform_cities(extract_cities(0))
for i in range(20,(max_page*20),20):
    cities_ = transform_cities2(extract_cities(i))
    cities += cities_

In [4]:
cities = pd.DataFrame(cities)
cities['city'] = cities['city'].str.lstrip('Restaurantes en')
len(cities)

20

In [5]:
cities.head()

Unnamed: 0,city,href
0,Ciudad de México,/Restaurants-g150800-Mexico_City_Central_Mexic...
1,Guadalajara,/Restaurants-g150798-Guadalajara_Guadalajara_M...
2,Cancún,/Restaurants-g150807-Cancun_Yucatan_Peninsula....
3,Playa del Carmen,/Restaurants-g150812-Playa_del_Carmen_Yucatan_...
4,Monterrey,/Restaurants-g150782-Monterrey_Northern_Mexico...


scrapping restaurant list

In [6]:
def extract_restaurant_list(href_city) -> BeautifulSoup:
    
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    url = f'https://www.tripadvisor.com.mx/{href_city}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    return soup

In [7]:
def transform_restaurant_list(soup) -> list:
    
    restaurant_ = []
    items = soup.find_all('div', class_='RfBGI')
    
    for item in items:
        restaurant =  item.find('a', class_='Lwqic Cj b').text
        href =  item.find('a')['href']

        restaurant = {'restaurant': restaurant, 'href': href}
        restaurant_.append(restaurant)

    return restaurant_

scrapping restaurant list from first city

In [8]:
restaurant_list = transform_restaurant_list(extract_restaurant_list(cities.href[0]))

restaurant_list = pd.DataFrame(restaurant_list)

restaurant_list['sponsored'] = ''

for i in range(len(restaurant_list)):
    if '.' in restaurant_list['restaurant'][i]:
        restaurant_list['restaurant'][i] = restaurant_list['restaurant'][i].split('.')[1]
        restaurant_list['sponsored'][i] = restaurant_list['restaurant'][i].split('.')[0]

restaurant_list['sponsored'] = ~restaurant_list.sponsored.astype('bool')

len(restaurant_list)

37

In [9]:
restaurant_list.head()

Unnamed: 0,restaurant,href,sponsored
0,La Vicenta Tezontle,/Restaurant_Review-g150800-d12310802-Reviews-L...,True
1,Balta,/Restaurant_Review-g150800-d19338104-Reviews-B...,False
2,La Mansion Marriott Reforma,/Restaurant_Review-g150800-d2394477-Reviews-La...,False
3,Condimento Restaurant,/Restaurant_Review-g150800-d1926008-Reviews-Co...,False
4,La Distral,/Restaurant_Review-g150800-d12104177-Reviews-L...,False


list without sponsored restaurants

In [10]:
restaurant_list = restaurant_list[restaurant_list['sponsored'] == False]
restaurant_list.reset_index(inplace=True)
restaurant_list.drop(columns=['index'], inplace=True)

scrapping restaurants info

In [15]:
def extract_restaurant_info(page) -> BeautifulSoup:

    url = f'https://www.tripadvisor.com.mx{page}'

    browser = webdriver.Chrome(executable_path=r"/Users/rosaarzabala/Documents/Projects/TripAdvisorScrapper/chromedriver")
    browser.get(url)
    html = browser.page_source

    soup = BeautifulSoup(html, 'html.parser')

    return soup

restaurants_info_list = []

def transform_restaurant_info(soup,restaurant): #,city,state) -> list:

    div = soup.find('ul', class_='breadcrumbs')
    
    i = 0
    for item in div:
        try:
            if i == 0:
                state = item.find('a').text
            if i == 1:
                region = item.find('a').text
            if i == 2:
                city = item.find('a').text
                break
            i += 1
        except:
            pass

    div = soup.find('div', class_='YDAvY R2 F1 e k')
    for item in div:
        try:
            points = item.find('span', class_='ZDEqb').text
            # print('points:',points)
        except:
            pass

        try:
            reviews = item.find('a', class_='IcelI').text
            reviews = reviews.rstrip('opiniones')
            # print('reviews:',reviews)
        except:
            pass

    div = soup.find_all('div', class_='vQlTa H3')
    for item in div:
        try:
            ranking = item.find('span', class_='DsyBj cNFrA').text
            ranking = ranking.split(' ')[0]
            # print('ranking:',ranking)
            break
        except:
            ranking = ''

    div = soup.find('div', class_='kDZhm IdiaP Me')
    for item in div:
        try:
            address = item.find('span', class_='yEWoV').text
            # print('address:',address)
        except:
            address = ''

    div = soup.find('div', class_='IdiaP Me sNsFa')
    for item in div:
        try:
            web_site = item.find('a', class_='YnKZo Ci Wc _S C FPPgD')['href']
            # print('web_site:',web_site)
        except:
            web_site = ''

    div = soup.find_all('div', class_='kDZhm IdiaP Me')
    for item in div:
        try:
            coordinates = item.find('a', class_='YnKZo Ci Wc _S C FPPgD')['href']
            coordinates = coordinates.split('@')[-1]
            latitude = coordinates.split(',')[0]
            longitude = coordinates.split(',')[1]
            # print('latitude:',latitude)
            # print('longite:',longitude)
        except:
            pass

    restaurants_info = {'restaurant':restaurant,
                        'city': city,
                        'state': state,
                        'region': region,
                        'points': points,
                        'reviews': reviews,
                        'ranking': ranking,
                        'address': address,
                        'web_site': web_site,
                        'latitude': latitude,
                        'longitude': longitude}

    restaurants_info_list.append(restaurants_info)

data from the first five restaurants

In [16]:
restaurants_info_list = []

for i in range(1):
    href,restaurant = restaurant_list['href'][i],restaurant_list['restaurant'][i]
    transform_restaurant_info(extract_restaurant_info(href), restaurant)

df_restaurants = pd.DataFrame(restaurants_info_list)
df_restaurants

  browser = webdriver.Chrome(executable_path=r"/Users/rosaarzabala/Documents/Projects/TripAdvisorScrapper/chromedriver")


Unnamed: 0,restaurant,city,state,region,points,reviews,ranking,address,web_site,latitude,longitude
0,Balta,Ciudad de México,México,México Central y Costa del Golfo,5.0,470,#1,Avenida Paseo de La Reforma 297 Colonia Cuauht...,http://www.sofitel-mexico-city.com/restaurants...,19.428432,-99.16592


testing

In [13]:
cities = pd.read_csv('data/city_list.csv')
cities = cities[['city', 'href']]
cities.head(10)

Unnamed: 0,city,href
0,Ciudad de México,/Restaurants-g150800-Mexico_City_Central_Mexic...
1,Guadalajara,/Restaurants-g150798-Guadalajara_Guadalajara_M...
2,Cancún,/Restaurants-g150807-Cancun_Yucatan_Peninsula....
3,Playa del Carmen,/Restaurants-g150812-Playa_del_Carmen_Yucatan_...
4,Monterrey,/Restaurants-g150782-Monterrey_Northern_Mexico...
5,Puerto Vallarta,/Restaurants-g150793-Puerto_Vallarta.html
6,Santiago de Querétaro,/Restaurants-g479232-Queretaro_City_Central_Me...
7,Mérida,/Restaurants-g150811-Merida_Yucatan_Peninsula....
8,Puebla,/Restaurants-g152773-Puebla_Central_Mexico_and...
9,Zapopan,/Restaurants-g1006488-Zapopan_Guadalajara_Metr...


In [21]:
restaurant = pd.read_csv('data/restaurants.csv')
restaurant

Unnamed: 0.1,Unnamed: 0,restaurant,city,state,region,points,reviews,ranking,address,web_site,latitude,longitude
0,0,Balta,Ciudad de México,México,México Central y Costa del Golfo,5.0,471,#1,Avenida Paseo de La Reforma 297 Colonia Cuauht...,http://www.sofitel-mexico-city.com/restaurants...,19.428432,-99.16592
1,1,La Mansion Marriott Reforma,Ciudad de México,México,México Central y Costa del Golfo,5.0,991,#2,"Paseo de la Reforma 276 Col. Juarez, Mexico Ci...",http://www.facebook.com/mansionmarriottreforma,19.428345,-99.16426
2,2,Condimento Restaurant,Ciudad de México,México,México Central y Costa del Golfo,5.0,713,#3,Avenida Paseo de La Reforma 276 Mexico City Ma...,http://www.marriott.com/hotels/hotel-informati...,19.427828,-99.164024
3,3,Restaurante SSAM,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,1035,#1,"Morelos 2122, Guadalajara 44600 México",http://www.facebook.com/RESTAURANTESSAMGDL/,20.675816,-103.37889
4,4,Octo Pescadería,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,397,#2,Lerdo de Tejada 2420 Entre Francisco Javier Ga...,https://www.facebook.com/Octo-Pescader%C3%ADa-...,20.670938,-103.37811
5,5,Porfirio's Guadalajara,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,1464,#3,"Calle Sao Paulo 2334 A, Col. Providencia, Punt...",http://porfirios.com.mx/,20.70163,-103.37652
