In [43]:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time
import csv
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from tqdm import tqdm

### Let's scrape afisha.ru: we need to collect information about restaurants in Moscow.

#### First, we collect the restaurant links by going through all the listing pages with Selenium.

In [50]:
base_url = "https://www.afisha.ru"
url = f"{base_url}/msk/restaurants/restaurant_list/"

# CSS class names taken from the site's markup at scraping time.
NEXT_PAGE_CLASS = 'Pagination_pagination-next__rtqsZ'
POPUP_CLOSE_CLASS = 'popmechanic-close'


def _go_to_next_page(browser):
    """Click the pagination "next" button and give the new page time to load."""
    browser.find_element(By.CLASS_NAME, NEXT_PAGE_CLASS).click()
    time.sleep(5)


options = webdriver.ChromeOptions()
# Headless mode is left off for this cell; uncomment to run without a window.
# options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

all_links = set()  # a set, so that a link seen on several pages is stored once

try:
    driver.get(url)
    time.sleep(7)

    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Keep only anchors that carry an href starting with /msk/restaurant.
        for a in soup.find_all("a", href=True):
            if a["href"].startswith("/msk/restaurant"):
                all_links.add(a["href"])

        try:
            _go_to_next_page(driver)
        except Exception:
            # The click can fail because a banner covers the page layout:
            # close the banner and retry once before giving up.
            try:
                driver.find_element(By.CLASS_NAME, POPUP_CLOSE_CLASS).click()
                time.sleep(2)
                _go_to_next_page(driver)
            except Exception as inner_e:
                print("There are no more pages:", inner_e)
                break
finally:
    # Always release the browser, even if scraping fails midway.
    driver.quit()

#### Collected links are combined into a DataFrame to be shared with other group members for gathering information about restaurants.

In [None]:
# Single source of truth for the output name, so the message cannot drift
# away from the file that is actually written.
LINKS_CSV = 'resto_links.csv'

# Add the base url to every relative link; sort so the saved file is
# reproducible (set iteration order is arbitrary).
full_links = sorted(base_url + link for link in all_links)
all_links_list = full_links

df = pd.DataFrame(all_links_list, columns=['Link'])
df.to_csv(LINKS_CSV, index=False)

print(f"We saved {len(all_links_list)} links in '{LINKS_CSV}'")

We saved 11957 links in 'rest_links.csv'


#### To collect information about restaurants in Moscow, we selected the following characteristics:

* title: name of the restaurant
* rating: the establishment's rating on the website
* address: restaurant address
* metro: the nearest metro station to the restaurant
* avg_check: average check at the establishment
* breakfast: availability of a breakfast menu
* business_lunch: availability of a business lunch menu
* delivery: availability of delivery services
* parking: availability of parking
* catering: availability of catering services
* banquets: whether banquets are held
* telephone: restaurant phone number
* site: restaurant website
* restaurant_type: type of establishment or cuisine
* open_hours: opening hours
* positive_reviews: number of positive reviews on the website
* negative_reviews: number of negative reviews on the website


#### We extract the required tags and their content from each page and then form a DataFrame

In [27]:
# define function for collecting information about restaurants reviews
def get_reviews_count(button):
    """
    Read the review counter shown inside a filter button.

        :param button: element exposing ``find(tag, class_=...)``

        :return: int, the counter value, or 0 when the button has no
                 content wrapper or no counter badge
    """
    label = button.find('div', class_='Button_button__content___D2b_')
    if not label:
        return 0

    badge = label.find('span', class_='DefaultTag_button__counter__64UpQ')
    return int(badge.text) if badge else 0

In [None]:

LINKS_FILE = 'for_vera_2.csv'  # input: one restaurant URL per line
DATA_FILE = 'data.csv'         # output: one ';'-separated row per restaurant

# NOTE: 'deleviry' is kept misspelled because the cleaning cell
# (transform_information_columns) addresses the column by this exact name.
CSV_HEADER = ["title", "rating", "address", "metro", "avg_check", "breakfast", "business_lunch", "deleviry", "parking", "catering", "banquets", "telephone", "site", "restaurant_type", "open_hours", "positive_reviews", "negative_reviews"]


def _parse_restaurant_page(soup):
    """
    Extract one CSV row (same order as CSV_HEADER) from a parsed restaurant page.

    Raises AttributeError when the title or address element is missing and
    ValueError when the info table has fewer than nine values; the caller
    logs the error and skips the page.
    """
    name = soup.find('span', class_="Title_header__SIloF").text

    rank = soup.find('div', class_="RestaurantCover_rating-wrapper__CTNts")
    estimate = rank.text if rank is not None else 0

    adress = soup.find('div', class_="SectionTitle_wrapper__nAAJ0 RestaurantExtraInfo_address__aJsK2").text

    metro_ = [
        span.text
        for block in soup.find_all('ul', class_="RestaurantExtraInfo_metro-list__KTBX3")
        for span in block.find_all('span', class_="Text_text__e9ILn")
    ]

    # The info table is read as alternating label/value spans, so the values
    # are the odd-indexed texts. They are kept as a list (not joined and
    # re-split on ',') so a comma inside a value cannot shift the columns.
    table_texts = [
        span.text
        for block in soup.find_all('div', class_="RestaurantExtraInfo_table__l34_J")
        for span in block.find_all('span', class_="Text_text__e9ILn")
    ]
    values = table_texts[1::2]
    if len(values) < 9:
        raise ValueError(f"expected 9 info values, found {len(values)}")
    bill, brekfast, business, delivery, parking, keit, feast, phone_num, site_ = values[:9]

    type_all = [
        tag.text
        for block in soup.find_all('div', class_="RestaurantExtraInfo_tag__BqQ7e")
        for tag in block.find_all('div', class_="Button_button__content___D2b_")
    ]

    # Opening hours are taken from the last text span of the cover block.
    # The variable is local to this page, so a page without that block can
    # never inherit the hours of the previously parsed restaurant.
    hours_span = None
    for block in soup.find_all('div', class_="RestaurantCover_content-wrapper__72Dox"):
        for span in block.find_all('span', class_="Text_text__e9ILn"):
            hours_span = span
    # NOTE(review): str.strip() removes a *set of characters* from both ends,
    # not a prefix; kept as in the original so the stored values do not change.
    open_ = hours_span.text.strip('Открыто c') if hours_span is not None else ''

    positive_reviews = 0
    negative_reviews = 0
    filters_div = soup.find('div', class_='FiltersReview_filters__7E8qs')
    if filters_div:
        for button in filters_div.find_all('button', class_='Button_button__j_Rc9'):
            content_div = button.find('div', class_='Button_button__content___D2b_')
            if content_div:
                text = content_div.text
                if 'Положительные' in text:
                    positive_reviews = get_reviews_count(button)
                elif 'Отрицательные' in text:
                    negative_reviews = get_reviews_count(button)

    return [name, estimate, adress, metro_, bill, brekfast, business, delivery, parking, keit, feast, phone_num, site_, type_all, open_, positive_reviews, negative_reviews]


options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

try:
    driver.implicitly_wait(10)  # a session-wide setting: configure it once

    # Strip line endings and skip blank lines / a 'Link' header row so only
    # real URLs are handed to the browser.
    with open(LINKS_FILE, 'r', encoding='utf-8-sig') as f:
        all_links = [line.strip() for line in f if line.strip() and line.strip() != 'Link']

    # The header belongs to the data file (not to the links file) and is
    # written only when the data file is new, so an interrupted run can be
    # resumed by appending.
    write_header = not os.path.exists(DATA_FILE) or os.path.getsize(DATA_FILE) == 0

    with open(DATA_FILE, 'a', encoding='utf-8-sig', newline='') as d:
        file_writer = csv.writer(d, delimiter=";")
        if write_header:
            file_writer.writerow(CSV_HEADER)

        # parsing data with progress bar and remaining time
        for link in tqdm(all_links, desc="Links processing", unit="link", dynamic_ncols=True):
            try:
                driver.get(link)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                file_writer.writerow(_parse_restaurant_page(soup))
                d.flush()  # keep rows already scraped if the run is interrupted
            except Exception as error:
                # Best effort: report the page (no silent skips) and continue.
                tqdm.write(f"Link processing error {link}: {error}")
finally:
    driver.quit()

#### Creating DataFrame
We load the CSV file that contains the whole data set.

In [None]:
# The data set is ';'-separated, like the CSV written by the scraping cell.
df = pd.read_csv('big_data.csv', sep=';')

#### Fixing NaN values and duplicates

In [74]:
def clean_data(
        data_frame: pd.DataFrame
) -> pd.DataFrame:
    """
    Return a cleaned copy of the restaurant table.

    Duplicates and incomplete rows are removed. The input frame is not
    modified, so the cell can be re-run safely; surviving rows keep their
    original index labels.

        :param data_frame: pd.DataFrame with at least 'title' and 'address' columns

        :return: pd.DataFrame (a new frame)
    """

    # 1. Remove full duplicates, then duplicates by title + address
    #    (a title alone does not identify a restaurant: there are many chains).
    deduplicated = (
        data_frame
        .drop_duplicates()
        .drop_duplicates(subset=['title', 'address'])
    )

    # 2. Remove rows with any missing value (NaN).
    complete = deduplicated.dropna(how='any')

    # 3. Remove rows whose address is empty or whitespace only.
    has_address = complete['address'].astype(str).str.strip() != ''

    # .copy() detaches the result from the intermediate selection, so later
    # column assignments do not trigger SettingWithCopyWarning.
    return complete[has_address].copy()
# Run the cleaning step and rebind df to the cleaned frame.
df = clean_data(df)

PermissionError: [Errno 1] Operation not permitted

#### Fixing rating column

In [70]:
def fix_rating_column(
        data_frame: pd.DataFrame
) -> pd.DataFrame:
    '''
        Return a copy of the frame with a numeric (float) rating column.

        Cells that hold a review count instead of a rating (pattern
        "n отзывов") are treated as "no rating", i.e. 0. Every 0 rating is
        then replaced by the mean of the non-zero ratings, rounded to two
        decimals. If there is no non-zero rating, the zeros are kept.

        The input frame is not modified and the function can be applied to
        its own output (the rating column may already be numeric).

        :param data_frame: pd.DataFrame with a 'rating' column

        :return: pd.DataFrame (a new frame)
    '''
    result = data_frame.copy()

    # Work on a text view so .str is valid even when the column is numeric.
    rating_text = result['rating'].astype(str)
    is_review_count = rating_text.str.contains('отз', na=False)

    rating = pd.to_numeric(rating_text.mask(is_review_count, '0')).astype(float)

    mean_rating = rating[rating != 0.0].mean()
    if pd.notna(mean_rating):
        rating = rating.mask(rating == 0.0, round(mean_rating, 2))

    result['rating'] = rating
    return result


# Convert the rating column to numbers and rebind df to the result.
df = fix_rating_column(df)

PermissionError: [Errno 1] Operation not permitted

In [61]:
def transform_information_columns(
        data_frame: pd.DataFrame,
        center_stations=None
) -> pd.DataFrame:
    '''
        Return a copy of the frame with the information columns encoded as numbers.

        * breakfast, business_lunch, deleviry, parking, catering, banquets:
          1 if the cell equals 'Есть', else 0.
        * site: 1 if the value starts with 'http://' or 'https://', else 0.
        * telephone: 1 if the value starts with '+7', else 0.
        * avg_check: price bucket 1-4 chosen by the value's prefix, 0 if unknown.
        * is_city_center (new column): 1 if the first station listed in
          'metro' is one of the city-centre stations, else 0. Station names
          are compared as whole names, ignoring case.

        The input frame is not modified.
        NOTE: the encoding is one-way; applying the function to its own
        output turns the flag columns into zeros.

        :param data_frame: pd.DataFrame
        :param center_stations: optional iterable of station names that count
                                as city centre; defaults to the built-in list

        :return: pd.DataFrame (a new frame)
    '''
    if center_stations is None:
        center_stations = _CITY_CENTER_STATIONS
    center_lookup = {str(name).casefold() for name in center_stations}

    result = data_frame.copy()

    for column in ('breakfast', 'business_lunch', 'deleviry', 'parking', 'catering', 'banquets'):
        result[column] = (result[column] == 'Есть').astype(int)

    result['site'] = [
        int(str(value).startswith(('http://', 'https://')))
        for value in result['site']
    ]
    result['telephone'] = [
        int(str(value).startswith('+7'))
        for value in result['telephone']
    ]
    result['avg_check'] = [_avg_check_level(str(value)) for value in result['avg_check']]
    result['is_city_center'] = [
        int(_first_metro_station(value).casefold() in center_lookup)
        for value in result['metro']
    ]

    return result


# Stations treated as city centre (inside the Garden Ring, per the function docstring).
_CITY_CENTER_STATIONS = (
    'Цветной бульвар', 'Тверская', 'Чеховская', 'Боровицкая', 'Полянка',
    'Добрынинская', 'Серпуховская', 'Маяковская', 'Театральная',
    'Новокузнецкая', 'Павелецкая', 'Пушкинская', 'Кузнецкий мост',
    'Китай-город', 'Таганская', 'Марксистская', 'Смоленская', 'Арбатская',
    'Александровский сад', 'Площадь Революции', 'Чкаловская', 'Курская',
    'Парк культуры', 'Кропоткинская', 'Библиотека им. Ленина', 'Охотный Ряд',
    'Лубянка', 'Красные ворота', 'Чистые пруды', 'Октябрьская',
    'Третьяковская', 'Тургеневская', 'Сухаревская', 'Проспект Мира',
    'Трубная', 'Сретенский бульвар',
)


def _avg_check_level(text):
    """Map an average-check label to its bucket (1-4) by prefix; 0 when no prefix matches."""
    # Order matters: '70' must be tested before '170', as in the original chain.
    for prefix, level in (('До 7', 1), ('70', 2), ('170', 3), ('Бо', 4)):
        if text.startswith(prefix):
            return level
    return 0


def _first_metro_station(raw_metro):
    """Return the first station name from a stringified list such as "['A', 'B']" ('' if none)."""
    first_item = str(raw_metro).strip().strip('[]').split(',')[0]
    return first_item.strip(' \'"')

# Encode the information columns as numbers and rebind df to the result.
df = transform_information_columns(df)

#### Separating open_hours column

In [62]:
def separate_opened_dates(
        data_frame: pd.DataFrame
) -> pd.DataFrame:
    '''
        Split the open_hours column ("hh:mm до hh:mm") into two time columns.

        opened_from_dttm and opened_to_dttm hold datetime.time values and are
        appended to the frame; open_hours is removed. Cells that cannot be
        read as "hh:mm до hh:mm" (no ' до ', invalid time, non-text value)
        are replaced by the usual open-close time '10:00 до 22:00'.

        The input frame is not modified.

        :param data_frame: pd.DataFrame with an 'open_hours' column

        :return: pd.DataFrame (a new frame)
    '''
    schedule = [_parse_open_hours(cell) for cell in data_frame['open_hours']]

    # Lists are assigned by position, so duplicate index labels cannot
    # cause one row's hours to be written into another row.
    result = data_frame.drop(columns=['open_hours'])
    result['opened_from_dttm'] = [opened for opened, _ in schedule]
    result['opened_to_dttm'] = [closed for _, closed in schedule]
    return result


def _parse_open_hours(cell):
    """Return (open_time, close_time) for one open_hours cell, using 10:00-22:00 when it is unreadable."""
    parts = str(cell).split(' до ')
    if len(parts) >= 2:
        try:
            return (
                datetime.strptime(parts[0].strip(), '%H:%M').time(),
                datetime.strptime(parts[1].strip(), '%H:%M').time(),
            )
        except ValueError:
            pass  # fall through to the default working hours
    return (
        datetime.strptime('10:00', '%H:%M').time(),
        datetime.strptime('22:00', '%H:%M').time(),
    )
    

# Split the opening hours into two time columns and rebind df to the result.
df = separate_opened_dates(df)

#### Sorting review and finding good restaurants

In [63]:
def sorting_review(
        data_frame: pd.DataFrame
) -> pd.DataFrame:
    '''
        Flag the good restaurants in a new top_restaurant column.

        A row gets 1 when it has at least as many positive reviews as
        negative ones and its rating is at least 8; otherwise it gets 0.
        The column is added to the frame that was passed in, and that same
        frame is returned.

        :param data_frame: pd.DataFrame

        :return: pd.DataFrame
    '''
    review_triples = zip(
        data_frame['positive_reviews'].values,
        data_frame['negative_reviews'].values,
        data_frame['rating'].values,
    )

    data_frame['top_restaurant'] = [
        1 if (positive >= negative) and score >= 8 else 0
        for positive, negative, score in review_triples
    ]
    return data_frame

# Add the top_restaurant flag and rebind df to the returned frame.
df = sorting_review(df)