In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
def get_soup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30). Without a timeout a single stalled connection would
        block the scraping loops indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    # Browser-like User-Agent sent with every request
    # (presumably so the site serves its regular desktop page -- confirm).
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.content, 'lxml')

In [3]:
# Collect the name and the link of every cuisine category listed on the
# Yelp restaurants page for Paris.
soup = get_soup('https://www.yelp.fr/c/paris/restaurants')
category_items = soup.select('.arrange.arrange--12.arrange--wrap.arrange--6-units a')
n_items = len(category_items)

# Visible text of each anchor is the category name.
category_names = [anchor.get_text() for anchor in category_items]
# The href of each anchor is site-relative, so prefix it with the host.
link_to_category = ['https://www.yelp.fr' + anchor.get('href') for anchor in category_items]


In [209]:
# One row per cuisine category: its name and the URL of its category page.
data = pd.DataFrame({
    'category_names': category_names,
    'link_to_category': link_to_category,
})
                         

In [210]:
# Preview the first rows of the category table.
data.head()

Unnamed: 0,category_names,link_to_category
0,Afghan,https://www.yelp.fr/c/paris/afghan
1,Cuisine africaine,https://www.yelp.fr/c/paris/africain
2,Alsacien,https://www.yelp.fr/c/paris/alsacien
3,Américain traditionnel,https://www.yelp.fr/c/paris/am%C3%A9ricain
4,Cuisine des Pouilles,https://www.yelp.fr/c/paris/Cuisinedespouilles


In [165]:
# The category pages in link_to_category are not convenient to loop on, but
# each of them normally carries a button (CSS class 'button-more') in the
# middle of the page that points to the listing page we actually want.
# We store that listing URL for every category.

link_to_list = []
no_link = []  # never filled in this cell; kept so later references to it still resolve
for i, url in enumerate(link_to_category):
    soup = get_soup(url)
    # Test for the button explicitly instead of relying on a bare `except:`,
    # which would also hide network errors and swallow KeyboardInterrupt.
    more_buttons = soup.select('.button-more')
    href = more_buttons[0].get('href') if more_buttons else None

    if href:
        link = 'https://www.yelp.fr' + href
        link_to_list.append(link)
        print(i, link)
    else:
        # No button found: Yelp has already redirected us to the right page,
        # so we keep the final URL of the redirection (response.url).
        response = requests.get(url, timeout=30)
        link = response.url
        link_to_list.append(link)
        print(i, 'redirection', link)



0 https://www.yelp.fr/search?cflt=afghani&find_loc=Paris%2C+France
1 https://www.yelp.fr/search?cflt=african&find_loc=Paris%2C+France
2 https://www.yelp.fr/search?cflt=alsatian&find_loc=Paris%2C+France
3 https://www.yelp.fr/search?cflt=tradamerican&find_loc=Paris%2C+France
4 redirection https://www.yelp.fr/search?cflt=apulian&find_loc=Paris
5 https://www.yelp.fr/search?cflt=arabian&find_loc=Paris%2C+France
6 https://www.yelp.fr/search?cflt=argentine&find_loc=Paris%2C+France
7 https://www.yelp.fr/search?cflt=armenian&find_loc=Paris%2C+France
8 https://www.yelp.fr/search?cflt=asianfusion&find_loc=Paris%2C+France
9 redirection https://www.yelp.fr/search?cflt=australian&find_loc=Paris
10 redirection https://www.yelp.fr/search?cflt=austrian&find_loc=Paris
11 https://www.yelp.fr/search?cflt=auvergnat&find_loc=Paris%2C+France
12 redirection https://www.yelp.fr/search?cflt=bangladeshi&find_loc=Paris
13 https://www.yelp.fr/search?cflt=bbq&find_loc=Paris%2C+France
14 https://www.yelp.fr/search?c

In [214]:
# Store the listing URL collected for each category as a new column.
data['link_to_restaurants_list'] = link_to_list

In [220]:
# Check for listing pages without any restaurant: if they are not removed they
# will crash the final big scraping loop.
# The flags are collected in a list and written to the frame in one
# assignment. The previous chained form `data['to_keep'][i] = ...` triggers
# SettingWithCopyWarning and does not update `data` under pandas Copy-on-Write.
to_keep = []
for i, url in enumerate(data['link_to_restaurants_list']):
    soup = get_soup(url)
    # If the text of the first element matching this selector starts with
    # 'Pas' (first word of the sentence 'Pas de résultats pour...'), the page
    # has no restaurant and is flagged 0; otherwise it is flagged 1.
    heading = soup.select('.raw__09f24__3Obuy')[0].get_text().strip()
    flag = 0 if heading[0:3] == 'Pas' else 1
    to_keep.append(flag)
    print(i, url, flag)

data['to_keep'] = to_keep

0 https://www.yelp.fr/search?cflt=afghani&find_loc=Paris%2C+France 1
1 https://www.yelp.fr/search?cflt=african&find_loc=Paris%2C+France 1
2 https://www.yelp.fr/search?cflt=alsatian&find_loc=Paris%2C+France 1
3 https://www.yelp.fr/search?cflt=tradamerican&find_loc=Paris%2C+France 1
4 https://www.yelp.fr/search?cflt=apulian&find_loc=Paris 1
5 https://www.yelp.fr/search?cflt=arabian&find_loc=Paris%2C+France 1
6 https://www.yelp.fr/search?cflt=argentine&find_loc=Paris%2C+France 1
7 https://www.yelp.fr/search?cflt=armenian&find_loc=Paris%2C+France 1
8 https://www.yelp.fr/search?cflt=asianfusion&find_loc=Paris%2C+France 1
9 https://www.yelp.fr/search?cflt=australian&find_loc=Paris 1
10 https://www.yelp.fr/search?cflt=austrian&find_loc=Paris 1
11 https://www.yelp.fr/search?cflt=auvergnat&find_loc=Paris%2C+France 1
12 https://www.yelp.fr/search?cflt=bangladeshi&find_loc=Paris 1
13 https://www.yelp.fr/search?cflt=bbq&find_loc=Paris%2C+France 1
14 https://www.yelp.fr/search?cflt=basque&find_loc=

In [223]:
# Retain only the categories flagged 1, i.e. pages listing at least one restaurant.
data = data[data['to_keep'].eq(1)]

In [225]:
# Preview the table after the empty categories have been filtered out.
data.head()

Unnamed: 0,category_names,link_to_category,link_to_restaurants_list,to_keep
0,Afghan,https://www.yelp.fr/c/paris/afghan,https://www.yelp.fr/search?cflt=afghani&find_l...,1
1,Cuisine africaine,https://www.yelp.fr/c/paris/africain,https://www.yelp.fr/search?cflt=african&find_l...,1
2,Alsacien,https://www.yelp.fr/c/paris/alsacien,https://www.yelp.fr/search?cflt=alsatian&find_...,1
3,Américain traditionnel,https://www.yelp.fr/c/paris/am%C3%A9ricain,https://www.yelp.fr/search?cflt=tradamerican&f...,1
4,Cuisine des Pouilles,https://www.yelp.fr/c/paris/Cuisinedespouilles,https://www.yelp.fr/search?cflt=apulian&find_l...,1


In [226]:
# Remove the 'Cuisine des Pouilles' category (row 4 of the table shown above):
# it has only 1 restaurant, with no review.
# The row is selected by name rather than with drop(4): once reset_index has
# run, label 4 belongs to another category, so re-running drop(4) would
# silently delete that one too. Filtering by name is safe to re-run.
is_pouilles = data['category_names'].str.strip() == 'Cuisine des Pouilles'
data = data[~is_pouilles].reset_index(drop=True)



In [132]:
# Prepare one empty DataFrame per restaurant category; each slot is meant to
# receive the complete list of restaurants of its category.
df_list = [pd.DataFrame() for _ in range(data.shape[0])]

In [229]:

# Loop on every category of `data`, walk through all of its result pages and
# store the name and link of each restaurant in df_list[j].
for j in range(len(data)):
    # Skip categories whose slot is already filled. After an interruption the
    # cell can simply be re-run and resumes where it stopped, instead of
    # needing a hand-edited start index that breaks a fresh "Run All".
    if not df_list[j].empty:
        continue

    # Fresh accumulators for each category.
    link_to_restaurant = []
    restaurant_name = []
    category = data['category_names'][j]
    url = data['link_to_restaurants_list'][j] + '&ns=1&start='
    soup = get_soup(url + '0')
    # The number of pages is read at the very bottom of the page: the selector
    # matches a label such as '1 sur 24' and we keep the last token, '24'.
    num_of_pages = soup.select(
        'div.border-color--default__09f24__R1nRO.text-align--center__09f24__31irQ '
        'span.text__09f24__2tZKC.text-color--black-extra-light__09f24__38DtK.text-align--left__09f24__3Drs0'
    )[0].get_text().split(' ')[-1]

    for k in range(int(num_of_pages)):
        # The 'start' query parameter is the offset of the first result of the
        # page; it advances by 10 from one page to the next.
        soup = get_soup(url + str(10*k))
        # Anchors carrying the name and the link of each restaurant.
        restaurants_info = soup.select('h4 span a.link__09f24__1kwXV ')

        for anchor in restaurants_info:
            link_to_restaurant.append('https://www.yelp.fr' + anchor.get('href'))
            restaurant_name.append(anchor.get('name'))

        print('loop', j, 'page', k, category, 'OK')

    # Build the DataFrame once per category, after every page has been read.
    # This avoids rebuilding it for each restaurant, and only fully scraped
    # categories are stored, so the skip check above never mistakes a partial
    # result for a finished one.
    if restaurant_name:
        df_list[j] = pd.DataFrame(dict(restaurant_name=restaurant_name,
                                       link_to_restaurant=link_to_restaurant,
                                       category=category))


p 54 page 1 Hawaïen & Polynésien OK
loop 54 page 2 Hawaïen & Polynésien OK
loop 55 page 0 Himalayen/Népalais OK
loop 55 page 1 Himalayen/Népalais OK
loop 55 page 2 Himalayen/Népalais OK
loop 56 page 0 Hondurien OK
loop 57 page 0 Hot Dog OK
loop 57 page 1 Hot Dog OK
loop 57 page 2 Hot Dog OK
loop 57 page 3 Hot Dog OK
loop 57 page 4 Hot Dog OK
loop 58 page 0 Fondue chinoise OK
loop 58 page 1 Fondue chinoise OK
loop 59 page 0 Hongrois OK
loop 60 page 0 Indien OK
loop 60 page 1 Indien OK
loop 60 page 2 Indien OK
loop 60 page 3 Indien OK
loop 60 page 4 Indien OK
loop 60 page 5 Indien OK
loop 60 page 6 Indien OK
loop 60 page 7 Indien OK
loop 60 page 8 Indien OK
loop 60 page 9 Indien OK
loop 60 page 10 Indien OK
loop 60 page 11 Indien OK
loop 60 page 12 Indien OK
loop 60 page 13 Indien OK
loop 60 page 14 Indien OK
loop 60 page 15 Indien OK
loop 60 page 16 Indien OK
loop 60 page 17 Indien OK
loop 60 page 18 Indien OK
loop 60 page 19 Indien OK
loop 60 page 20 Indien OK
loop 60 page 21 Indien OK

In [242]:
# Number of per-category DataFrames held in df_list.
len(df_list)

129

In [260]:
# Stack every per-category DataFrame into a single table, renumbering the
# rows with a fresh 0..n-1 index.
full_dataset = pd.concat(df_list, ignore_index=True)

In [261]:
# Save the combined restaurant table as a CSV file (relative path).
# NOTE(review): with to_csv's defaults the row index is written as an extra
# unnamed first column -- confirm this is wanted by whatever reads the file.
full_dataset.to_csv('restaurants_dataset.csv')