In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import copy
from booking_reader import BookingReader

In [2]:
def parse_hotel_refs(url: str):
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    hotels = soup.findAll("div", class_="sr-hotel__title-wrap")
    
    hotel_refs = []
    
    for hotel in hotels:
        try:
            hotel_soup = BeautifulSoup(str(hotel), 'html.parser')
            href = hotel_soup.find("a", class_="hotel_name_link url").get('href')

            hotel_ref = href.split("/")[3][:-1]
            hotel_refs.append(hotel_ref)
        except:
            print("Error")
            
    return hotel_refs

In [3]:
def parse_all_hotels_refs(urls):
    references = []
    for url in urls:
        references += parse_hotel_refs(url)
    
    return references

In [4]:
def get_all_possible_urls(url1: str, url2: str = None, num_pages: int = None):
    if url2 is None or num_pages is None:
        return [url1]
    
    urls = [url1, url2]
    
    for page in range(3, num_pages + 1):
        url = url2[:-2] + str((page - 1) * 25)
        urls.append(url)
    
    return urls

In [5]:
def create_review_url(hotel_ref: str, num: int):
    ref = hotel_ref.replace('.uk.html', '.ru.html')
    url = f"https://www.booking.com/reviews/ua/hotel/{ref}?label=gen173nr-1DCA0o6QFCCmhvc3RlbC1zdW5IKVgEaOkBiAEBmAEpuAEHyAEN2AED6AEB-AECiAIBqAIDuAKQgdPrBcACAQ;sid=45a571d0756ef620ea794c3a3ec26fbb;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page={num};r_lang=ru;rows=75&"
    return url

In [6]:
def read_reviews_for_ref(hotel_ref: str):
    all_reviews = []
    
    page: int = 1
    frames = [] 
    while True:
        review_url = create_review_url(hotel_ref, page)
        try:
            df = BookingReader.parse_reviews_from_url(review_url)
            if len(df) == 0:
                break
            df['hotel'] = hotel_ref
            frames.append(df)
            page += 1
            
        except:
            break
            
    if len(frames) == 0:
        return None
    
    return pd.concat(frames)  

In [7]:
def read_reviews_for_refences(refs):
    frames = []
    for ref in refs:
        df = read_reviews_for_ref(ref)
        frames.append(df)
        
    return pd.concat(frames)

### Parse data

In [10]:
input_data = [
#     { 
#     'location': 'kyiv',
#     'url1' : "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1044367&class_interval=1&dest_id=-1044367&dest_type=city&dr_ps=IDR&from_idr=1&group_adults=2&group_children=0&ilp=1&label_click=undef&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=1da163700aff009d&ssb=empty&top_ufis=1&nflt=ht_id%3D204%3Bht_id%3D208%3B&rsf=", 
#     'url2' : "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1044367&class_interval=1&dest_id=-1044367&dest_type=city&dr_ps=IDR&dtdisc=0&from_idr=1&group_adults=2&group_children=0&ilp=1&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=8f6d63732a67016d&ss_all=0&ssb=empty&sshis=0&top_ufis=1&rows=25&offset=25",
#     'num_pages' : 12
# },
# {
#     'location' : 'lviv',
#     'url1' : 'https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&class_interval=1&dest_id=-1045268&dest_type=city&from_sf=1&group_adults=2&group_children=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&src=index&srpvid=a2b1635941690167&ss=Львів&ssb=empty&top_ufis=1&rows=25',
#     'url2' : 'https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&class_interval=1&dest_id=-1045268&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&src=index&srpvid=08d163cdf20e0099&ss=Львів&ss_all=0&ssb=empty&sshis=0&top_ufis=1&rows=25&offset=25',
#     'num_pages': 7
# },
    
# {
#     'location': 'odesa',
#     'url1': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1049092&class_interval=1&dest_id=-1049092&dest_type=city&dr_ps=IDR&from_idr=1&group_adults=2&group_children=0&ilp=1&label_click=undef&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=edc46463ed680070&ssb=empty&top_ufis=1&nflt=ht_id%3D204%3Bht_id%3D208%3B&rsf=",
#     'url2': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1049092&class_interval=1&dest_id=-1049092&dest_type=city&dr_ps=IDR&dtdisc=0&from_idr=1&group_adults=2&group_children=0&ilp=1&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=e3db6468626700ee&ss_all=0&ssb=empty&sshis=0&top_ufis=1&rows=25&offset=25",
#     'num_pages' : 11
# },
    
# {
#     'location' : 'kharkiv',
#     'url1': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&ac_click_type=b&ac_position=0&city=-1049092&class_interval=1&clear_ht_id=1&clear_ht_id=1&dest_id=-1041320&dest_type=city&from_sf=1&group_adults=2&group_children=0&iata=HRK&label_click=undef&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&shw_aparth=1&slp_r_match=0&src=searchresults&srpvid=cfec64b8cdfe0151&ss=Харків%2C%20Харківська%20область%2C%20Україна&ss_raw=Харків&ssb=empty&ssne=Одеса&ssne_untouched=Одеса&top_ufis=1&nflt=ht_id%3D204%3Bht_id%3D208%3B&rsf=",
#     'url2': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&ac_click_type=b&ac_position=0&city=-1049092&class_interval=1&clear_ht_id=1&clear_ht_id=1&dest_id=-1041320&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&iata=HRK&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&shw_aparth=1&slp_r_match=0&src=searchresults&srpvid=cfec64bc09540029&ss=Харків%2C%20Харківська%20область%2C%20Україна&ss_all=0&ss_raw=Харків&ssb=empty&sshis=0&ssne=Одеса&ssne_untouched=Одеса&top_ufis=1&rows=25&offset=25",
#     'num_pages': 4
        
# },
{
    'location': 'dnipro',
    'url1': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&ac_click_type=b&ac_position=0&city=-1041320&class_interval=1&dest_id=-1037865&dest_id=-1037865&dest_type=city&dest_type=city&from_sf=1&group_adults=2&group_children=0&iata=DNK&iata=DNK&label_click=undef&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&search_selected=1&shw_aparth=1&slp_r_match=0&src=searchresults&srpvid=344e64fffa500062&ss=Дніпро%2C%20Дніпропетровська%20область%2C%20Україна&ss_raw=Дніпро&ssb=empty&ssne=Харків&ssne_untouched=Харків&top_ufis=1&nflt=ht_id%3D204%3Bht_id%3D208%3B&rsf=",
    'url2': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&ac_click_type=b&ac_position=0&city=-1041320&class_interval=1&dest_id=-1037865&dest_id=-1037865&dest_type=city&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&iata=DNK&iata=DNK&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&search_selected=1&shw_aparth=1&slp_r_match=0&src=searchresults&srpvid=84686503af9c0087&ss=Дніпро%2C%20Дніпропетровська%20область%2C%20Україна&ss_all=0&ss_raw=Дніпро&ssb=empty&sshis=0&ssne=Харків&ssne_untouched=Харків&top_ufis=1&rows=25&offset=25",
    'num_pages': 3
},
{
    'location': 'uzhgorod',
    'url1' : 'https://www.booking.com/searchresults.uk.html?label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1057311&class_interval=1&dest_id=-1057311&dest_type=city&dr_ps=IDR&dtdisc=0&from_idr=1&group_adults=2&group_children=0&ilp=1&inac=0&index_postcard=0&label_click=undef&no_rooms=1&postcard=0&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=e2116544da140052&ss_all=0&ssb=empty&sshis=0&top_ufis=1&nflt=ht_id%3D204%3B&percent_htype_hotel=1&rsf=',
    'url2': 'https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1057311&class_interval=1&dest_id=-1057311&dest_type=city&dr_ps=IDR&from_idr=1&group_adults=2&group_children=0&ilp=1&label_click=undef&nflt=ht_id%3D204%3B&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=e2116547c2a700f1&ssb=empty&top_ufis=1&rows=25&offset=25',
    'num_pages': 2     
},
{
    'location': 'ivano-frankivsk',
    'url1': "https://www.booking.com/searchresults.uk.html?label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1040327&class_interval=1&dest_id=-1040327&dest_type=city&dr_ps=IDR&dtdisc=0&from_idr=1&group_adults=2&group_children=0&ilp=1&inac=0&index_postcard=0&label_click=undef&no_rooms=1&postcard=0&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=e3db65808d2200b9&ss_all=0&ssb=empty&sshis=0&top_ufis=1&nflt=ht_id%3D204%3B&percent_htype_hotel=1&rsf=",
    'url2': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1040327&class_interval=1&dest_id=-1040327&dest_type=city&dr_ps=IDR&from_idr=1&group_adults=2&group_children=0&ilp=1&label_click=undef&nflt=ht_id%3D204%3B&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=668d6587665b003a&ssb=empty&top_ufis=1&rows=25&offset=25",
    'num_pages': 2
}]

In [11]:
for item in input_data:
    location = item['location']
    url1 = item['url1']
    url2 = item['url2']
    num_pages = item['num_pages']
    
    urls = get_all_possible_urls(url1, url2, num_pages)
    hotel_refs = parse_all_hotels_refs(urls)    
    print(f"There were parsed {len(hotel_refs)} in {location}")
    
    df = read_reviews_for_refences(hotel_refs)
    
    print(f"There were parsed {len(df)} reviews in {location}")
    
    df.to_csv(f"../dataset/{location}-reviews-ru.csv", index=False)
    
    

There were parsed 65 in dnipro
'NoneType' object has no attribute 'text'
https://www.booking.com/reviews/ua/hotel/spa-tsunami.ru.html?label=gen173nr-1DCA0o6QFCCmhvc3RlbC1zdW5IKVgEaOkBiAEBmAEpuAEHyAEN2AED6AEB-AECiAIBqAIDuAKQgdPrBcACAQ;sid=45a571d0756ef620ea794c3a3ec26fbb;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page=2;r_lang=ru;rows=75&
'NoneType' object has no attribute 'text'
https://www.booking.com/reviews/ua/hotel/menorah.ru.html?label=gen173nr-1DCA0o6QFCCmhvc3RlbC1zdW5IKVgEaOkBiAEBmAEpuAEHyAEN2AED6AEB-AECiAIBqAIDuAKQgdPrBcACAQ;sid=45a571d0756ef620ea794c3a3ec26fbb;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page=1;r_lang=ru;rows=75&
'NoneType' object has no attribute 'text'
https://www.booking.com/reviews/ua/hotel/abri.ru.html?label=gen173nr-1DCA0o6QFCCmhvc3RlbC1zdW5IKVgEaOkBiAEBmAEpuAEHyAEN2AED6AEB-AECiAIBqAIDuAKQgdPrBcACAQ;sid=45a571d0756ef620ea794c3a3ec26fbb;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page=2;r_lang=ru;rows=

### Check dataset

In [12]:
kyiv_reviews_df = pd.DataFrame.from_csv('../dataset/kyiv-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [13]:
len(kyiv_reviews_df)

38024

In [14]:
lviv_reviews_df = pd.DataFrame.from_csv('../dataset/lviv-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [15]:
len(lviv_reviews_df)

23551

In [16]:
odesa_reviews_df = pd.DataFrame.from_csv('../dataset/odesa-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [17]:
kharkiv_reviews_df = pd.DataFrame.from_csv('../dataset/kharkiv-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [18]:
dnipro_reviews_df = pd.DataFrame.from_csv('../dataset/dnipro-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [19]:
uzhgorod_reviews_df = pd.DataFrame.from_csv('../dataset/uzhgorod-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [20]:
ivano_frankivsk_df = pd.DataFrame.from_csv('../dataset/ivano-frankivsk-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [21]:
len(kyiv_reviews_df) + len(lviv_reviews_df) + len(odesa_reviews_df) + len(kharkiv_reviews_df) + len(dnipro_reviews_df) + len(uzhgorod_reviews_df) + len(ivano_frankivsk_df)

105737