In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import copy
from booking_reader import BookingReader

In [3]:
def parse_hotel_refs(url: str):
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    hotels = soup.findAll("div", class_="sr-hotel__title-wrap")
    
    hotel_refs = []
    
    for hotel in hotels:
        try:
            hotel_soup = BeautifulSoup(str(hotel), 'html.parser')
            href = hotel_soup.find("a", class_="hotel_name_link url").get('href')

            hotel_ref = href.split("/")[3][:-1]
            hotel_refs.append(hotel_ref)
        except:
            print("Error")
            
    return hotel_refs

In [4]:
def parse_all_hotels_refs(urls):
    references = []
    for url in urls:
        references += parse_hotel_refs(url)
    
    return references

In [5]:
def get_all_possible_urls(url1: str, url2: str = None, num_pages: int = None):
    if url2 is None or num_pages is None:
        return [url1]
    
    urls = [url1, url2]
    
    for page in range(3, num_pages + 1):
        url = url2[:-2] + str((page - 1) * 25)
        urls.append(url)
    
    return urls

In [17]:
def create_review_url(hotel_ref: str, num: int):
#     ref = hotel_ref.replace('.uk.html', '.ru.html')
    ref = hotel_ref
    url = f"https://www.booking.com/reviews/ua/hotel/{ref}?label=gen173nr-1DCA0o6QFCCmhvc3RlbC1zdW5IKVgEaOkBiAEBmAEpuAEHyAEN2AED6AEB-AECiAIBqAIDuAKQgdPrBcACAQ;sid=45a571d0756ef620ea794c3a3ec26fbb;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page={num};r_lang=uk;rows=75&"
    return url

In [6]:
def read_reviews_for_ref(hotel_ref: str):
    all_reviews = []
    
    page: int = 1
    frames = [] 
    while True:
        review_url = create_review_url(hotel_ref, page)
        try:
            df = BookingReader.parse_reviews_from_url(review_url)
            if len(df) == 0:
                break
            df['hotel'] = hotel_ref
            frames.append(df)
            page += 1
            
        except:
            break
            
    if len(frames) == 0:
        return None
    
    return pd.concat(frames)  

In [7]:
def read_reviews_for_refences(refs):
    frames = []
    for ref in refs:
        df = read_reviews_for_ref(ref)
        frames.append(df)
        
    return pd.concat(frames)

### Parse data

In [11]:
input_data = [
{ 
    'location': 'kyiv',
    'url1' : "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1044367&class_interval=1&dest_id=-1044367&dest_type=city&dr_ps=IDR&from_idr=1&group_adults=2&group_children=0&ilp=1&label_click=undef&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=1da163700aff009d&ssb=empty&top_ufis=1&nflt=ht_id%3D204%3Bht_id%3D208%3B&rsf=", 
    'url2' : "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1044367&class_interval=1&dest_id=-1044367&dest_type=city&dr_ps=IDR&dtdisc=0&from_idr=1&group_adults=2&group_children=0&ilp=1&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=8f6d63732a67016d&ss_all=0&ssb=empty&sshis=0&top_ufis=1&rows=25&offset=25",
    'num_pages' : 12
},
{
    'location' : 'lviv',
    'url1' : 'https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&class_interval=1&dest_id=-1045268&dest_type=city&from_sf=1&group_adults=2&group_children=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&src=index&srpvid=a2b1635941690167&ss=Львів&ssb=empty&top_ufis=1&rows=25',
    'url2' : 'https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&class_interval=1&dest_id=-1045268&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&src=index&srpvid=08d163cdf20e0099&ss=Львів&ss_all=0&ssb=empty&sshis=0&top_ufis=1&rows=25&offset=25',
    'num_pages': 7
},
    
{
    'location': 'odesa',
    'url1': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1049092&class_interval=1&dest_id=-1049092&dest_type=city&dr_ps=IDR&from_idr=1&group_adults=2&group_children=0&ilp=1&label_click=undef&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=edc46463ed680070&ssb=empty&top_ufis=1&nflt=ht_id%3D204%3Bht_id%3D208%3B&rsf=",
    'url2': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1049092&class_interval=1&dest_id=-1049092&dest_type=city&dr_ps=IDR&dtdisc=0&from_idr=1&group_adults=2&group_children=0&ilp=1&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=e3db6468626700ee&ss_all=0&ssb=empty&sshis=0&top_ufis=1&rows=25&offset=25",
    'num_pages' : 11
},
    
# {
#     'location' : 'kharkiv',
#     'url1': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&ac_click_type=b&ac_position=0&city=-1049092&class_interval=1&clear_ht_id=1&clear_ht_id=1&dest_id=-1041320&dest_type=city&from_sf=1&group_adults=2&group_children=0&iata=HRK&label_click=undef&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&shw_aparth=1&slp_r_match=0&src=searchresults&srpvid=cfec64b8cdfe0151&ss=Харків%2C%20Харківська%20область%2C%20Україна&ss_raw=Харків&ssb=empty&ssne=Одеса&ssne_untouched=Одеса&top_ufis=1&nflt=ht_id%3D204%3Bht_id%3D208%3B&rsf=",
#     'url2': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&ac_click_type=b&ac_position=0&city=-1049092&class_interval=1&clear_ht_id=1&clear_ht_id=1&dest_id=-1041320&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&iata=HRK&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&shw_aparth=1&slp_r_match=0&src=searchresults&srpvid=cfec64bc09540029&ss=Харків%2C%20Харківська%20область%2C%20Україна&ss_all=0&ss_raw=Харків&ssb=empty&sshis=0&ssne=Одеса&ssne_untouched=Одеса&top_ufis=1&rows=25&offset=25",
#     'num_pages': 4
        
# },
# {
#     'location': 'dnipro',
#     'url1': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&ac_click_type=b&ac_position=0&city=-1041320&class_interval=1&dest_id=-1037865&dest_id=-1037865&dest_type=city&dest_type=city&from_sf=1&group_adults=2&group_children=0&iata=DNK&iata=DNK&label_click=undef&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&search_selected=1&shw_aparth=1&slp_r_match=0&src=searchresults&srpvid=344e64fffa500062&ss=Дніпро%2C%20Дніпропетровська%20область%2C%20Україна&ss_raw=Дніпро&ssb=empty&ssne=Харків&ssne_untouched=Харків&top_ufis=1&nflt=ht_id%3D204%3Bht_id%3D208%3B&rsf=",
#     'url2': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&ac_click_type=b&ac_position=0&city=-1041320&class_interval=1&dest_id=-1037865&dest_id=-1037865&dest_type=city&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&iata=DNK&iata=DNK&inac=0&index_postcard=0&label_click=undef&nflt=ht_id%3D204%3Bht_id%3D208%3B&no_rooms=1&percent_htype_hotel=1&postcard=0&raw_dest_type=city&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&search_selected=1&shw_aparth=1&slp_r_match=0&src=searchresults&srpvid=84686503af9c0087&ss=Дніпро%2C%20Дніпропетровська%20область%2C%20Україна&ss_all=0&ss_raw=Дніпро&ssb=empty&sshis=0&ssne=Харків&ssne_untouched=Харків&top_ufis=1&rows=25&offset=25",
#     'num_pages': 3
# },
# {
#     'location': 'uzhgorod',
#     'url1' : 'https://www.booking.com/searchresults.uk.html?label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1057311&class_interval=1&dest_id=-1057311&dest_type=city&dr_ps=IDR&dtdisc=0&from_idr=1&group_adults=2&group_children=0&ilp=1&inac=0&index_postcard=0&label_click=undef&no_rooms=1&postcard=0&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=e2116544da140052&ss_all=0&ssb=empty&sshis=0&top_ufis=1&nflt=ht_id%3D204%3B&percent_htype_hotel=1&rsf=',
#     'url2': 'https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1057311&class_interval=1&dest_id=-1057311&dest_type=city&dr_ps=IDR&from_idr=1&group_adults=2&group_children=0&ilp=1&label_click=undef&nflt=ht_id%3D204%3B&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=e2116547c2a700f1&ssb=empty&top_ufis=1&rows=25&offset=25',
#     'num_pages': 2     
# },
# {
#     'location': 'ivano-frankivsk',
#     'url1': "https://www.booking.com/searchresults.uk.html?label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1040327&class_interval=1&dest_id=-1040327&dest_type=city&dr_ps=IDR&dtdisc=0&from_idr=1&group_adults=2&group_children=0&ilp=1&inac=0&index_postcard=0&label_click=undef&no_rooms=1&postcard=0&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=e3db65808d2200b9&ss_all=0&ssb=empty&sshis=0&top_ufis=1&nflt=ht_id%3D204%3B&percent_htype_hotel=1&rsf=",
#     'url2': "https://www.booking.com/searchresults.uk.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaOkBiAEBmAEpuAEHyAEN2AEB6AEB-AELiAIBqAIDuAKb7uzvBcACAQ&sid=b695f9c28245d03eeefe8ce9bcf78790&tmpl=searchresults&city=-1040327&class_interval=1&dest_id=-1040327&dest_type=city&dr_ps=IDR&from_idr=1&group_adults=2&group_children=0&ilp=1&label_click=undef&nflt=ht_id%3D204%3B&no_rooms=1&percent_htype_hotel=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&srpvid=668d6587665b003a&ssb=empty&top_ufis=1&rows=25&offset=25",
#     'num_pages': 2
# }
    ]

In [11]:
for item in input_data:
    location = item['location']
    url1 = item['url1']
    url2 = item['url2']
    num_pages = item['num_pages']
    
    urls = get_all_possible_urls(url1, url2, num_pages)
    hotel_refs = parse_all_hotels_refs(urls)    
    print(f"There were parsed {len(hotel_refs)} in {location}")
    
    df = read_reviews_for_refences(hotel_refs)
    
    print(f"There were parsed {len(df)} reviews in {location}")
    
    df.to_csv(f"../dataset/{location}-reviews-ru.csv", index=False)
    
    

There were parsed 65 in dnipro
'NoneType' object has no attribute 'text'
https://www.booking.com/reviews/ua/hotel/spa-tsunami.ru.html?label=gen173nr-1DCA0o6QFCCmhvc3RlbC1zdW5IKVgEaOkBiAEBmAEpuAEHyAEN2AED6AEB-AECiAIBqAIDuAKQgdPrBcACAQ;sid=45a571d0756ef620ea794c3a3ec26fbb;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page=2;r_lang=ru;rows=75&
'NoneType' object has no attribute 'text'
https://www.booking.com/reviews/ua/hotel/menorah.ru.html?label=gen173nr-1DCA0o6QFCCmhvc3RlbC1zdW5IKVgEaOkBiAEBmAEpuAEHyAEN2AED6AEB-AECiAIBqAIDuAKQgdPrBcACAQ;sid=45a571d0756ef620ea794c3a3ec26fbb;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page=1;r_lang=ru;rows=75&
'NoneType' object has no attribute 'text'
https://www.booking.com/reviews/ua/hotel/abri.ru.html?label=gen173nr-1DCA0o6QFCCmhvc3RlbC1zdW5IKVgEaOkBiAEBmAEpuAEHyAEN2AED6AEB-AECiAIBqAIDuAKQgdPrBcACAQ;sid=45a571d0756ef620ea794c3a3ec26fbb;customer_type=total;hp_nav=0;old_page=0;order=featuredreviews;page=2;r_lang=ru;rows=

In [53]:
def get_hotels_refs():
    all_hotel_refs = []
    for item in input_data:
        try:
            location = item['location']
            url1 = item['url1']
            url2 = item['url2']
            num_pages = item['num_pages']

            urls = get_all_possible_urls(url1, url2, num_pages)
            all_hotel_refs += parse_all_hotels_refs(urls)    
        except Exception as e:
            print(e)
            continue
    
    return all_hotel_refs

In [54]:
all_hotel_refs = get_hotels_refs()

In [56]:
all_hotel_refs

['ukraine.uk.html',
 'verhovina.uk.html',
 'fire-inn.uk.html',
 'smart-house-kyiv1234567891011.uk.html',
 'express-hotel-kiev.uk.html',
 'bakkara.uk.html',
 'zhuliani-siti-zhuliany-city-kyiv.uk.html',
 'tourist-complex-kyiv-365.uk.html',
 'd-d3-4nd-dd-d-d1-4d-d-do-dndud-n.uk.html',
 'bee-station.uk.html',
 'ibis-kiev-railway-station.uk.html',
 'aleksandria.uk.html',
 'suit-mini.uk.html',
 'nika-kiev.uk.html',
 'hotel-florida.uk.html',
 'rus.uk.html',
 'evropeyskiy.uk.html',
 'ibis-kiev-shevchenko-boulevard.uk.html',
 'kozatskiy.uk.html',
 'hotel-nivki.uk.html',
 'lybid.uk.html',
 'tourist-hotel.uk.html',
 'best-aparts.uk.html',
 'chyhorinskyi.uk.html',
 'mini-otiel-kyiv.uk.html',
 'mini-otiel-kyiv.uk.html',
 'hotel-bratislava.uk.html',
 'voshod-kiev.uk.html',
 'zhuliani-siti-zhuliany-city-kyiv.uk.html',
 'gostevoi-dom-na-ivana-svetlichnogo-6.uk.html',
 'royal-city.uk.html',
 'arena-summit-apart.uk.html',
 'mini-hotel-lukyanovski.uk.html',
 'd-d3-4nd-dd-d-d1-4d-d-do-dndud-n.uk.html',
 '

In [55]:
len(all_hotel_refs)

747

In [62]:
def acquire_review_score(hotel_ref):
    try:
        
        review_url = create_review_url(hotel_ref, 1)
        result = requests.get(review_url)
        soup = BeautifulSoup(result.text, 'html.parser')
        review_score = soup.find("ul", class_="review_score_breakdown_list")

        result = {
            'clean' : review_score.get('data-hotel_clean'),
            'comfort' : review_score.get('data-hotel_comfort'),
            'location' : review_score.get('data-hotel_location'),
            'services' : review_score.get('data-hotel_services'),
            'staff' : review_score.get('data-hotel_staff'),
            'value' : review_score.get('data-hotel_value'),
            'wifi' : review_score.get('data-hotel_wifi')
        }
    except:
        return None
    
    return result

In [58]:
acquire_review_score(all_hotel_refs[0])

{'clean': '8,1',
 'comfort': '7,7',
 'location': '9,6',
 'services': '7,3',
 'staff': '7,9',
 'value': '7,7',
 'wifi': '8,2'}

In [73]:
def create_dataframe_for_review_score(all_hotel_refs):
    cleans, comforts, locations, services, staffs, values, wifis = [], [], [], [], [], [], []
    refs = []
    for hotel_ref in all_hotel_refs:
        res = acquire_review_score(hotel_ref)
        if res is None:
            continue
        refs.append(hotel_ref)
        cleans.append(res['clean'])
        locations.append(res['location'])
        comforts.append(res['comfort'])
        services.append(res['services'])
        staffs.append(res['staff'])
        values.append(res['value'])
        wifis.append(res['wifi'])
            
    return pd.DataFrame.from_dict({
        'hotel': refs,
        'clean' : cleans,
        'comfort' : locations,
        'location' : comforts,
        'services' : services,
        'staff' : staffs,
        'value' : values,
        'wifi' : wifis

    })

In [76]:
review_score_df = create_dataframe_for_review_score(all_hotel_refs)

In [77]:
review_score_df

Unnamed: 0,hotel,clean,comfort,location,services,staff,value,wifi
0,ukraine.uk.html,81,96,77,73,79,77,82
1,verhovina.uk.html,79,74,78,77,81,84,81
2,fire-inn.uk.html,8,89,75,76,82,78,72
3,smart-house-kyiv1234567891011.uk.html,89,8,87,87,87,87,
4,express-hotel-kiev.uk.html,75,87,72,7,8,73,7
...,...,...,...,...,...,...,...,...
695,zory-the-guesthouse-odesa1.uk.html,85,85,88,83,88,8,
696,tihaya-gavan.uk.html,86,84,85,83,91,8,75
697,economy-kombi.uk.html,66,81,67,66,77,8,68
698,sun-marine.uk.html,74,88,67,7,81,73,63


In [93]:
for i in range(0, len(review_score_df)):
    clean = review_score_df['clean'].values[i]
    if clean is not None:
        review_score_df['clean'].values[i] = float(clean.replace(',', '.'))
        
    comfort = review_score_df['comfort'].values[i]
    if comfort is not None:
        review_score_df['comfort'].values[i] = float(comfort.replace(',', '.'))

    location = review_score_df['location'].values[i]
    if location is not None:
        review_score_df['location'].values[i] = float(location.replace(',', '.'))

    services = review_score_df['services'].values[i]
    if services is not None:
        review_score_df['services'].values[i] = float(services.replace(',', '.'))

    staff = review_score_df['staff'].values[i]
    if staff is not None:
        review_score_df['staff'].values[i] = float(staff.replace(',', '.'))
        
    value = review_score_df['value'].values[i]
    if value is not None:
        review_score_df['value'].values[i] = float(value.replace(',', '.'))
        
    wifi = review_score_df['wifi'].values[i]
    if wifi is not None:
        review_score_df['wifi'].values[i] = float(wifi.replace(',', '.'))    

In [95]:
!ls ../data

bigram-pmi-negative-scores.csv	 trigram-pmi-positive-scores.csv
bigram-pmi-positive-scores.csv	 ukrainian-stopwords.txt
pos-sentence-ngram.csv		 unigram-pmi-negative-scores.csv
review_score.csv		 unigram-pmi-positive-scores.csv
trigram-pmi-negative-scores.csv


In [96]:
review_score_df.to_csv('../data/review_score.csv')

In [100]:
review_score_df.loc[review_score_df['clean'] < 6]

Unnamed: 0,hotel,clean,comfort,location,services,staff,value,wifi
44,pokrovsky.uk.html,5.1,7.3,4.8,4.7,6.2,5.4,
125,andreevsky-guest-house-kyiv.uk.html,4.3,7.7,4.2,4.3,5.5,4.7,5.5
209,v-s-apart-central-plaza.uk.html,4.2,8.4,4.3,4.2,5.2,4.6,
230,feofaniia-kiyiv.uk.html,5.8,7.3,5.5,5.5,7.9,6.3,7.9
246,plutarh-mini-gr-12.uk.html,4.8,6.5,4.8,4.6,6.0,5.2,4.8
247,gostinitsa-vg.uk.html,5.5,3.0,3.3,4.3,5.0,4.3,
256,hostel-dvorets-ukrainy.uk.html,4.2,7.5,4.1,4.3,5.7,4.8,6.6
258,privat.uk.html,5.5,7.1,5.4,4.8,6.5,5.4,
437,cassiopeia.uk.html,5.2,4.9,5.5,4.6,5.7,4.7,
555,zirka-odessa.uk.html,5.7,7.8,5.0,5.3,6.5,6.0,6.3


In [None]:
for hotel_ref in all_hotel_refs:
    res = acquire_review_score(hotel_ref)
    locations
    

### Check dataset

In [12]:
kyiv_reviews_df = pd.DataFrame.from_csv('../dataset/kyiv-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [13]:
len(kyiv_reviews_df)

38024

In [14]:
lviv_reviews_df = pd.DataFrame.from_csv('../dataset/lviv-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [15]:
len(lviv_reviews_df)

23551

In [16]:
odesa_reviews_df = pd.DataFrame.from_csv('../dataset/odesa-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [17]:
kharkiv_reviews_df = pd.DataFrame.from_csv('../dataset/kharkiv-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [18]:
dnipro_reviews_df = pd.DataFrame.from_csv('../dataset/dnipro-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [19]:
uzhgorod_reviews_df = pd.DataFrame.from_csv('../dataset/uzhgorod-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [20]:
ivano_frankivsk_df = pd.DataFrame.from_csv('../dataset/ivano-frankivsk-reviews-ru.csv')

  """Entry point for launching an IPython kernel.


In [21]:
len(kyiv_reviews_df) + len(lviv_reviews_df) + len(odesa_reviews_df) + len(kharkiv_reviews_df) + len(dnipro_reviews_df) + len(uzhgorod_reviews_df) + len(ivano_frankivsk_df)

105737