In [129]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [132]:
def get_soup(url):    
    
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    return soup

In [133]:
#We get the links of every category of cuisine available in Yelp, in the following webpage
category_names = []
link_to_category = []
soup = get_soup('https://www.yelp.fr/c/paris/restaurants')
category_items = soup.select('.arrange.arrange--12.arrange--wrap.arrange--6-units a')
n_items = len(category_items)

for i in range(n_items):
    #get category name
    category = category_items[i].get_text()
    #get link
    href = category_items[i].get('href')
    link = 'https://www.yelp.fr' + href
    
    category_names.append(category)
    link_to_category.append(link)


In [134]:
data = pd.DataFrame( dict(category_names = category_names,
                         link_to_category = link_to_category))
                         

In [135]:
data.head()

Unnamed: 0,category_names,link_to_category
0,Afghan,https://www.yelp.fr/c/paris/afghan
1,Cuisine africaine,https://www.yelp.fr/c/paris/africain
2,Alsacien,https://www.yelp.fr/c/paris/alsacien
3,Américain traditionnel,https://www.yelp.fr/c/paris/am%C3%A9ricain
4,Cuisine des Pouilles,https://www.yelp.fr/c/paris/Cuisinedespouilles


In [136]:
#Previous links are not very useful to loop on, but we will use them to get better links that we can work with
#On each of the previous link there is a button in the middle of the page that redirects us on the pages we want

link_to_list = []
no_link = []
for i in range(len(link_to_category)):
    url = link_to_category[i]
    try:
        soup = get_soup(url)
        href = soup.select('.button-more')[0].get('href')
        link = 'https://www.yelp.fr' + href
        link_to_list.append(link)
        print(i, link)
    except:
        #if it doesn't find the button then it's because Yelp has already redirected you to the good webpage so we have to get the new link of the redirection (response.url)
        response = requests.get(url)
        link = response.url
        link_to_list.append(link)
        print(i, 'redirection', link)



0 https://www.yelp.fr/search?cflt=afghani&find_loc=Paris%2C+France
1 https://www.yelp.fr/search?cflt=african&find_loc=Paris%2C+France
2 https://www.yelp.fr/search?cflt=alsatian&find_loc=Paris%2C+France
3 https://www.yelp.fr/search?cflt=tradamerican&find_loc=Paris%2C+France
4 redirection https://www.yelp.fr/search?cflt=apulian&find_loc=Paris
5 https://www.yelp.fr/search?cflt=arabian&find_loc=Paris%2C+France
6 https://www.yelp.fr/search?cflt=argentine&find_loc=Paris%2C+France
7 https://www.yelp.fr/search?cflt=armenian&find_loc=Paris%2C+France
8 https://www.yelp.fr/search?cflt=asianfusion&find_loc=Paris%2C+France
9 redirection https://www.yelp.fr/search?cflt=australian&find_loc=Paris
10 redirection https://www.yelp.fr/search?cflt=austrian&find_loc=Paris
11 https://www.yelp.fr/search?cflt=auvergnat&find_loc=Paris%2C+France
12 redirection https://www.yelp.fr/search?cflt=bangladeshi&find_loc=Paris
13 https://www.yelp.fr/search?cflt=bbq&find_loc=Paris%2C+France
14 https://www.yelp.fr/search?c

In [137]:
data['link_to_restaurants_list'] = link_to_list

In [138]:
data.head()

Unnamed: 0,category_names,link_to_category,link_to_restaurants_list
0,Afghan,https://www.yelp.fr/c/paris/afghan,https://www.yelp.fr/search?cflt=afghani&find_l...
1,Cuisine africaine,https://www.yelp.fr/c/paris/africain,https://www.yelp.fr/search?cflt=african&find_l...
2,Alsacien,https://www.yelp.fr/c/paris/alsacien,https://www.yelp.fr/search?cflt=alsatian&find_...
3,Américain traditionnel,https://www.yelp.fr/c/paris/am%C3%A9ricain,https://www.yelp.fr/search?cflt=tradamerican&f...
4,Cuisine des Pouilles,https://www.yelp.fr/c/paris/Cuisinedespouilles,https://www.yelp.fr/search?cflt=apulian&find_l...


In [139]:
#checking for pages without restaurant because if we don't remove them it will crash our code for the final big loop
#we add a new column 'to_keep' to ou dataframe we fill it with x then we will update these value with the following code
data['to_keep'] = 'x'
for i in range(len(data)):
    url = data['link_to_restaurants_list'][i]
    soup = get_soup(url)
    #if in our selector the first word is 'Pas' (first word of the full sentence 'Pas de résultats pour...') then we won't keep this page
    if soup.select('.raw__09f24__3Obuy')[0].get_text().strip()[0:3] == 'Pas':
        data['to_keep'][i] = 0
        print(i, url, data['to_keep'][i])
    else:
        data['to_keep'][i] = 1
        print(i, url, data['to_keep'][i])

0 https://www.yelp.fr/search?cflt=afghani&find_loc=Paris%2C+France 1
1 https://www.yelp.fr/search?cflt=african&find_loc=Paris%2C+France 1
2 https://www.yelp.fr/search?cflt=alsatian&find_loc=Paris%2C+France 1
3 https://www.yelp.fr/search?cflt=tradamerican&find_loc=Paris%2C+France 1
4 https://www.yelp.fr/search?cflt=apulian&find_loc=Paris 1
5 https://www.yelp.fr/search?cflt=arabian&find_loc=Paris%2C+France 1
6 https://www.yelp.fr/search?cflt=argentine&find_loc=Paris%2C+France 1
7 https://www.yelp.fr/search?cflt=armenian&find_loc=Paris%2C+France 1
8 https://www.yelp.fr/search?cflt=asianfusion&find_loc=Paris%2C+France 1
9 https://www.yelp.fr/search?cflt=australian&find_loc=Paris 1
10 https://www.yelp.fr/search?cflt=austrian&find_loc=Paris 1
11 https://www.yelp.fr/search?cflt=auvergnat&find_loc=Paris%2C+France 1
12 https://www.yelp.fr/search?cflt=bangladeshi&find_loc=Paris 1
13 https://www.yelp.fr/search?cflt=bbq&find_loc=Paris%2C+France 1
14 https://www.yelp.fr/search?cflt=basque&find_loc=

In [140]:
#we keep only the pages with at least 1 restaurant
data = data.loc[data['to_keep']==1, :]

In [141]:
data.head()

Unnamed: 0,category_names,link_to_category,link_to_restaurants_list,to_keep
0,Afghan,https://www.yelp.fr/c/paris/afghan,https://www.yelp.fr/search?cflt=afghani&find_l...,1
1,Cuisine africaine,https://www.yelp.fr/c/paris/africain,https://www.yelp.fr/search?cflt=african&find_l...,1
2,Alsacien,https://www.yelp.fr/c/paris/alsacien,https://www.yelp.fr/search?cflt=alsatian&find_...,1
3,Américain traditionnel,https://www.yelp.fr/c/paris/am%C3%A9ricain,https://www.yelp.fr/search?cflt=tradamerican&f...,1
4,Cuisine des Pouilles,https://www.yelp.fr/c/paris/Cuisinedespouilles,https://www.yelp.fr/search?cflt=apulian&find_l...,1


In [142]:
#we drop row 4 as there is only 1 restaurant with no review for the 'Cuisine des Pouilles' Category
data = data.drop(4).reset_index(drop = True)



In [143]:
data.to_csv('links_to_categories.csv')

In [144]:
data.head()

Unnamed: 0,category_names,link_to_category,link_to_restaurants_list,to_keep
0,Afghan,https://www.yelp.fr/c/paris/afghan,https://www.yelp.fr/search?cflt=afghani&find_l...,1
1,Cuisine africaine,https://www.yelp.fr/c/paris/africain,https://www.yelp.fr/search?cflt=african&find_l...,1
2,Alsacien,https://www.yelp.fr/c/paris/alsacien,https://www.yelp.fr/search?cflt=alsatian&find_...,1
3,Américain traditionnel,https://www.yelp.fr/c/paris/am%C3%A9ricain,https://www.yelp.fr/search?cflt=tradamerican&f...,1
4,Cuisine du Maghreb,https://www.yelp.fr/c/paris/maghreb,https://www.yelp.fr/search?cflt=arabian&find_l...,1


In [145]:
#Creating N dataframes for each category of restaurant, each dataframe will contain the complete list of restaurants of the category
df_list = [pd.DataFrame() for i in range(len(data))]

In [147]:
data['link_to_restaurants_list'][0]

'https://www.yelp.fr/search?cflt=afghani&find_loc=Paris%2C+France'

In [195]:

#We loop on every category in the dataset 'data' so that we can scrap every restaurant that belongs to this category 
for j in range(121, len(data)):  
    #We need to empty these 3 lists after each loop
    link_to_restaurant = []
    list_restaurant_name = []
    list_review_count = []
    category = data['category_names'][j]
    url = data['link_to_restaurants_list'][j] 
    url = url + '&ns=1&start='
    soup = get_soup(url + '0')
    #we get the number of pages at the very bottom of each webpage, the selector selects for instance '1 sur 24' then we take the last element '24'
    num_of_pages = soup.select('div.border-color--default__09f24__R1nRO.text-align--center__09f24__31irQ \
    span.text__09f24__2tZKC.text-color--black-extra-light__09f24__38DtK.text-align--left__09f24__3Drs0')[0].get_text().split(' ')[-1]

    for k in range(int(num_of_pages)):
        #For each page we request url and create a soup (we put 10*k to get the correct starting element which depends on the number of pages)
        soup = get_soup(url + str(10*k))
        #Getting the branch that contains the name and the link of each restaurant
        restaurants_info = soup.select(".mainAttributes__09f24__26-vh")
        items_per_page = len(restaurants_info)

        #we store all items of the page in the 3 lists
        for i in range(items_per_page):
            link = 'https://www.yelp.fr' + restaurants_info[i].select('h4 span a.link__09f24__1kwXV')[0].get('href')
            name = restaurants_info[i].select('h4 span a.link__09f24__1kwXV')[0].get('name')
            try:
                #if we get an error with this line it means that there is no review so we need to raise an exception
                review_count = restaurants_info[i].select('.reviewCount__09f24__EUXPN')[0].get_text()
            except:
                #if exception raised then there is no review
                review_count = 0 

            link_to_restaurant.append(link)
            list_restaurant_name.append(name)
            list_review_count.append(review_count)
            df_restaurants = pd.DataFrame( dict(restaurant_name = list_restaurant_name,
                                                link_to_restaurant = link_to_restaurant,
                                                category = category,
                                                review_count = list_review_count ))
            df_list[j] = df_restaurants

        print('loop', j, 'page', k, category, 'OK')


loop 121 page 0 Spécialités toscanes OK
loop 122 page 0 Ukrainien OK
loop 123 page 0 Végétalien OK
loop 123 page 1 Végétalien OK
loop 123 page 2 Végétalien OK
loop 123 page 3 Végétalien OK
loop 123 page 4 Végétalien OK
loop 123 page 5 Végétalien OK
loop 123 page 6 Végétalien OK
loop 123 page 7 Végétalien OK
loop 123 page 8 Végétalien OK
loop 124 page 0 Végétarien OK
loop 124 page 1 Végétarien OK
loop 124 page 2 Végétarien OK
loop 124 page 3 Végétarien OK
loop 124 page 4 Végétarien OK
loop 124 page 5 Végétarien OK
loop 124 page 6 Végétarien OK
loop 124 page 7 Végétarien OK
loop 124 page 8 Végétarien OK
loop 124 page 9 Végétarien OK
loop 124 page 10 Végétarien OK
loop 124 page 11 Végétarien OK
loop 124 page 12 Végétarien OK
loop 124 page 13 Végétarien OK
loop 125 page 0 Vénézuélien OK
loop 126 page 0 Vietnamien OK
loop 126 page 1 Vietnamien OK
loop 126 page 2 Vietnamien OK
loop 126 page 3 Vietnamien OK
loop 126 page 4 Vietnamien OK
loop 126 page 5 Vietnamien OK
loop 126 page 6 Vietnamien

In [199]:
#now we concatenate all the sub dateframes
full_dataset = pd.concat(df_list)
full_dataset = full_dataset.reset_index(drop = True)

In [220]:
full_dataset['review_count'] = full_dataset['review_count'].astype('int')

In [247]:
full_dataset.to_csv('restaurants_datasetv2.csv')

In [232]:
full_dataset.head()

Unnamed: 0.1,Unnamed: 0,restaurant_name,link_to_restaurant,category,review_count
0,0,L’Afghanistan,https://www.yelp.fr/biz/l-afghanistan-paris,Afghan,46
1,1,Buzkashi,https://www.yelp.fr/biz/buzkashi-paris-3,Afghan,20
2,2,Afghani,https://www.yelp.fr/biz/afghani-paris-2,Afghan,45
3,3,La Table Afghane,https://www.yelp.fr/biz/la-table-afghane-paris,Afghan,4
4,4,Kootchi,https://www.yelp.fr/biz/kootchi-paris-2,Afghan,14


In [237]:
#we drop duplicates with respect to the link which is unique for a restaurant
full_dataset = full_dataset.drop_duplicates(subset = 'link_to_restaurant')