In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

scrapping city list

In [2]:
def extract_cities(page) -> BeautifulSoup:
    
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    url = f'https://www.tripadvisor.com.mx/Restaurants-g150768-oa{page}-Mexico.html#LOCATION_LIST'
    r = requests.get(url, headers=headers)

    soup = BeautifulSoup(r.content, 'html.parser')

    return soup

def transform_cities(soup) -> list:
    
    cities_ = []
    items = soup.find_all('div', class_='geo_wrap')

    for item in items:
        city =  item.find('a').text
        href =  item.find('a')['href']

        city = {'city': city, 'href': href}
        cities_.append(city)

    return cities_

def transform_cities2(soup) -> list:
    
    cities_ = []
    items = soup.find_all('ul', {'class': 'geoList'})

    for ultag in items:
        for litag in ultag.find_all('li'):
            city = litag.text
            href = litag.find('a')['href']
            
            city = {'city':city,'href':href}
            cities_.append(city)
    return cities_

In [3]:
max_page = 1

cities = transform_cities(extract_cities(0))
for i in range(20,(max_page*20),20):
    cities_ = transform_cities2(extract_cities(i))
    cities += cities_

In [4]:
cities = pd.DataFrame(cities)
cities['city'] = cities['city'].str.lstrip('Restaurantes en')
len(cities)

20

In [5]:
cities.head()

Unnamed: 0,city,href
0,Ciudad de México,/Restaurants-g150800-Mexico_City_Central_Mexic...
1,Guadalajara,/Restaurants-g150798-Guadalajara_Guadalajara_M...
2,Cancún,/Restaurants-g150807-Cancun_Yucatan_Peninsula....
3,Playa del Carmen,/Restaurants-g150812-Playa_del_Carmen_Yucatan_...
4,Monterrey,/Restaurants-g150782-Monterrey_Northern_Mexico...


scrapping restaurant list

In [6]:
def extract_restaurant_list(href_city) -> BeautifulSoup:
    
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}
    url = f'https://www.tripadvisor.com.mx/{href_city}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    return soup

In [7]:
def transform_restaurant_list(soup) -> list:
    
    restaurant_ = []
    items = soup.find_all('div', class_='RfBGI')
    
    for item in items:
        restaurant =  item.find('a', class_='Lwqic Cj b').text
        href =  item.find('a')['href']

        restaurant = {'restaurant': restaurant, 'href': href}
        restaurant_.append(restaurant)

    return restaurant_

scrapping restaurant list from first city

In [8]:
restaurant_list = transform_restaurant_list(extract_restaurant_list(cities.href[1]))

restaurant_list = pd.DataFrame(restaurant_list)

restaurant_list['sponsored'] = ''

for i in range(len(restaurant_list)):
    if '.' in restaurant_list['restaurant'][i]:
        restaurant_list['restaurant'][i] = restaurant_list['restaurant'][i].split('.')[1]
        restaurant_list['sponsored'][i] = restaurant_list['restaurant'][i].split('.')[0]

restaurant_list['sponsored'] = ~restaurant_list.sponsored.astype('bool')

len(restaurant_list)

33

In [9]:
restaurant_list.head()

Unnamed: 0,restaurant,href,sponsored
0,Bruna,/Restaurant_Review-g150798-d10027426-Reviews-B...,True
1,Restaurante SSAM,/Restaurant_Review-g150798-d12502535-Reviews-R...,False
2,Octo Pescadería,/Restaurant_Review-g150798-d23759227-Reviews-O...,False
3,Porfirio's Guadalajara,/Restaurant_Review-g150798-d16640367-Reviews-P...,False
4,Gyropolus,/Restaurant_Review-g150798-d23313188-Reviews-G...,False


list without sponsored restaurants

In [10]:
restaurant_list = restaurant_list[restaurant_list['sponsored'] == False]
restaurant_list.reset_index(inplace=True)
restaurant_list.drop(columns=['index'], inplace=True)

scrapping restaurants info

In [11]:
def extract_restaurant_info(page) -> BeautifulSoup:

    url = f'https://www.tripadvisor.com.mx{page}'

    browser = webdriver.Chrome(executable_path=r"/Users/rosaarzabala/Documents/Projects/TripAdvisorScrapper/chromedriver")
    browser.get(url)
    html = browser.page_source

    soup = BeautifulSoup(html, 'html.parser')

    return soup

restaurants_info_list = []

def transform_restaurant_info(soup,restaurant): #,city,state) -> list:

    div = soup.find('ul', class_='breadcrumbs')
    
    i = 0
    for item in div:
        try:
            if i == 0:
                state = item.find('a').text
            if i == 1:
                region = item.find('a').text
            if i == 2:
                city = item.find('a').text
                break
            i += 1
        except:
            pass

    div = soup.find('div', class_='YDAvY R2 F1 e k')
    for item in div:
        try:
            points = item.find('span', class_='ZDEqb').text
            # print('points:',points)
        except:
            pass

        try:
            reviews = item.find('a', class_='IcelI').text
            reviews = reviews.rstrip('opiniones')
            # print('reviews:',reviews)
        except:
            pass

    div = soup.find_all('div', class_='vQlTa H3')
    for item in div:
        try:
            ranking = item.find('span', class_='DsyBj cNFrA').text
            ranking = ranking.split(' ')[0]
            # print('ranking:',ranking)
            break
        except:
            ranking = ''

    div = soup.find('div', class_='kDZhm IdiaP Me')
    for item in div:
        try:
            address = item.find('span', class_='yEWoV').text
            # print('address:',address)
        except:
            address = ''

    div = soup.find('div', class_='IdiaP Me sNsFa')
    for item in div:
        try:
            web_site = item.find('a', class_='YnKZo Ci Wc _S C FPPgD')['href']
            # print('web_site:',web_site)
        except:
            web_site = ''

    div = soup.find_all('div', class_='kDZhm IdiaP Me')
    for item in div:
        try:
            coordinates = item.find('a', class_='YnKZo Ci Wc _S C FPPgD')['href']
            coordinates = coordinates.split('@')[-1]
            latitude = coordinates.split(',')[0]
            longitude = coordinates.split(',')[1]
            # print('latitude:',latitude)
            # print('longite:',longitude)
        except:
            pass

    restaurants_info = {'restaurant':restaurant,
                        'city': city,
                        'state': state,
                        'region': region,
                        'points': points,
                        'reviews': reviews,
                        'ranking': ranking,
                        'address': address,
                        'web_site': web_site,
                        'latitude': latitude,
                        'longitude': longitude}

    restaurants_info_list.append(restaurants_info)

data from the first five restaurants

In [12]:
restaurants_info_list = []

for i in range(10):
    href,restaurant = restaurant_list['href'][i],restaurant_list['restaurant'][i]
    transform_restaurant_info(extract_restaurant_info(href), restaurant)

df_restaurants = pd.DataFrame(restaurants_info_list)
df_restaurants

  browser = webdriver.Chrome(executable_path=r"/Users/rosaarzabala/Documents/Projects/TripAdvisorScrapper/chromedriver")


Unnamed: 0,restaurant,city,state,region,points,reviews,ranking,address,web_site,latitude,longitude
0,Restaurante SSAM,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,1035,#1,"Morelos 2122, Guadalajara 44600 México",http://www.facebook.com/RESTAURANTESSAMGDL/,20.675816,-103.37889
1,Octo Pescadería,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,397,#2,Lerdo de Tejada 2420 Entre Francisco Javier Ga...,https://www.facebook.com/Octo-Pescader%C3%ADa-...,20.670938,-103.37811
2,Porfirio's Guadalajara,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,1464,#3,"Calle Sao Paulo 2334 A, Col. Providencia, Punt...",http://porfirios.com.mx/,20.70163,-103.37652
3,Gyropolus,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,74,#4,"Calle Manuel López Cotilla 1370 Local 05, Casa...",https://www.gyropolus.com/,20.673956,-103.36651
4,Bruna,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,3032,#5,"Calle Lerdo de Tejada 2418 Colonia Lafayette, ...",http://www.facebook.com/brunagdl/,20.670849,-103.37795
5,D'Franck,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,247,#6,"Av. Adolfo Lopez Mateos Norte 2405, Italia Pro...",http://www.dfranck.com.mx,20.693823,-103.37552
6,Sagrantino,Zona Metropolitana de Guadalajara,México,Jalisco,4.5,1582,#7,Diagonal Golfo de Cortes 4152 Fraccionamiento ...,http://www.sagrantino.mx,20.68131,-103.39548
7,Hato Ramen,Zona Metropolitana de Guadalajara,México,Jalisco,4.5,453,#8,"Americana 44160, Guadalajara, Jal. MX Calle Ef...",http://hato.mx,20.669739,-103.36984
8,Cuerno Andares,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,295,#9,"Blvrd. Puerta De Hierro 4965, Puerta De Hierro...",http://cuerno.mx/,20.711157,-103.41264
9,El Italiano,Zona Metropolitana de Guadalajara,México,Jalisco,5.0,2192,#9,"Golfo de Cortes No.4134, Fracc. Monraz C.P., G...",http://www.elitaliano.mx,20.680569,-103.39462


In [13]:
df_restaurants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   restaurant  10 non-null     object
 1   city        10 non-null     object
 2   state       10 non-null     object
 3   region      10 non-null     object
 4   points      10 non-null     object
 5   reviews     10 non-null     object
 6   ranking     10 non-null     object
 7   address     10 non-null     object
 8   web_site    10 non-null     object
 9   latitude    10 non-null     object
 10  longitude   10 non-null     object
dtypes: object(11)
memory usage: 1008.0+ bytes


testing script

In [14]:
cities = pd.read_csv('data/city_list.csv')
cities = cities[['city', 'href']]
print(len(cities))
cities.head()

1062


Unnamed: 0,city,href
0,Ciudad de México,/Restaurants-g150800-Mexico_City_Central_Mexic...
1,Guadalajara,/Restaurants-g150798-Guadalajara_Guadalajara_M...
2,Cancún,/Restaurants-g150807-Cancun_Yucatan_Peninsula....
3,Playa del Carmen,/Restaurants-g150812-Playa_del_Carmen_Yucatan_...
4,Monterrey,/Restaurants-g150782-Monterrey_Northern_Mexico...


In [15]:
restaurant = pd.read_csv('data/restaurants.csv')
restaurant

Unnamed: 0.1,Unnamed: 0,restaurant,city,state,region,points,reviews,ranking,address,web_site,latitude,longitude
0,0,Balta,Ciudad de México,México,México Central y Costa del Golfo,5.0,471,#1,Avenida Paseo de La Reforma 297 Colonia Cuauht...,http://www.sofitel-mexico-city.com/restaurants...,19.428432,-99.16592
1,1,La Mansion Marriott Reforma,Ciudad de México,México,México Central y Costa del Golfo,5.0,991,#2,"Paseo de la Reforma 276 Col. Juarez, Mexico Ci...",http://www.facebook.com/mansionmarriottreforma,19.428345,-99.16426
2,2,Condimento Restaurant,Ciudad de México,México,México Central y Costa del Golfo,5.0,713,#3,Avenida Paseo de La Reforma 276 Mexico City Ma...,http://www.marriott.com/hotels/hotel-informati...,19.427828,-99.164024
3,3,La Distral,Ciudad de México,México,México Central y Costa del Golfo,5.0,377,#4,Avenida Paseo de La Reforma 80 En el Hotel Fie...,http://www.fiestamericana.com/hoteles-y-resort...,19.433002,-99.15458
4,4,Sonora Grill Coapa,Ciudad de México,México,México Central y Costa del Golfo,5.0,734,#5,"Calzada Acoxpa 610, Colonia Villa Coapa, Tlalp...",http://sonoragrill.com.mx,19.294357,-99.12684
5,5,Restaurante Condimento Emporio Reforma,Ciudad de México,México,México Central y Costa del Golfo,5.0,502,#6,"Av. Paseo De La Reforma 124, Col. Juárez Piso ...",http://hotelesemporio.com/hoteles/emporio-ciud...,19.431667,-99.157
6,6,Parole Polanco,Ciudad de México,México,México Central y Costa del Golfo,5.0,1445,#7,"Av. Emilio Castelar 163, Colonia Polanco III, ...",http://parole.mx/en/location-cdmx-restaurant,19.430412,-99.1994
7,7,Taquería y Restaurante Takotl,Ciudad de México,México,México Central y Costa del Golfo,5.0,148,#8,"Avenida Insurgentes sur 275 Roma Nte., Cuauhté...",http://linkreview.biz/mx/takotl,19.416248,-99.16546
8,8,Zefiro,Ciudad de México,México,México Central y Costa del Golfo,5.0,817,#10,"San Jerónimo 24 Centro Historico, Ciudad de Mé...",http://www.elclaustro.edu.mx/zefiro/,19.427332,-99.1382
9,9,La Vicenta,Ciudad de México,México,México Central y Costa del Golfo,5.0,664,#10,"Calzada Vallejo 1090 Parque Vía Vallejo, nivel...",http://lavicenta.com.mx/,19.4866,-99.152115
