In [8]:
import json
import os
from collections import Counter

import stanza

In [2]:
def load_json(file):
    """Read a JSON file and return its parsed content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

In [5]:
# Build the Stanza pipeline for Ukrainian ('uk'); it is passed as `nlp` to
# the noun-extraction helpers used later in this notebook.
nlp = stanza.Pipeline('uk')

2020-06-04 12:56:09 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| mwt       | iu      |
| pos       | iu      |
| lemma     | iu      |
| depparse  | iu      |

2020-06-04 12:56:09 INFO: Use device: gpu
2020-06-04 12:56:09 INFO: Loading: tokenize
2020-06-04 12:56:11 INFO: Loading: mwt
2020-06-04 12:56:11 INFO: Loading: pos
2020-06-04 12:56:12 INFO: Loading: lemma
2020-06-04 12:56:12 INFO: Loading: depparse
2020-06-04 12:56:14 INFO: Done loading processors!


In [22]:
# Smoke test: parse a short phrase and inspect the per-token dicts of its
# first sentence.
doc = nlp("–ü—Ä–∏–≤—ñ—Ç–Ω–∏–π –ø–µ—Ä—Å–æ–Ω–∞–ª")
doc.sentences[0].to_dict()

[{'id': '1',
  'text': '–ü—Ä–∏–≤—ñ—Ç–Ω–∏–π',
  'lemma': '–ø—Ä–∏–≤—ñ—Ç–Ω–∏–π',
  'upos': 'ADJ',
  'xpos': 'Ao-msnf',
  'feats': 'Case=Nom|Gender=Masc|Number=Sing',
  'head': 2,
  'deprel': 'amod',
  'misc': 'start_char=0|end_char=9'},
 {'id': '2',
  'text': '–ø–µ—Ä—Å–æ–Ω–∞–ª',
  'lemma': '–ø–µ—Ä—Å–æ–Ω–∞–ª',
  'upos': 'NOUN',
  'xpos': 'Ncmsny',
  'feats': 'Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing',
  'head': 0,
  'deprel': 'root',
  'misc': 'start_char=10|end_char=18'}]

In [40]:
def get_nouns(nlp, text: str):
    """Collect the nouns found in ``text``.

    ``text`` is run through ``nlp``; every token dict whose ``upos`` equals
    ``'NOUN'`` contributes a ``[text, lemma]`` pair. Token dicts without a
    ``upos`` key are ignored. Pairs are returned in document order.
    """
    parsed = nlp(text)
    return [
        [word['text'], word['lemma']]
        for sentence in parsed.sentences
        for word in sentence.to_dict()
        if word.get('upos') == 'NOUN'
    ]

In [13]:
# Directories that are scanned for review `.json` files.
path_dir1 = "google/lviv"
path_dir2 = "google/kyiv"

In [14]:
# Collect the paths of all `.json` files in both directories.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the order of `files` (and of everything derived from it) reproducible
# between runs and machines.
files = []
for path_dir in [path_dir1, path_dir2]:
    for name in sorted(os.listdir(path_dir)):
        if name.endswith('.json'):
            files.append(f"{path_dir}/{name}")

In [15]:
# Number of JSON file paths collected.
len(files)

526

In [16]:
# Map every collected file path to the parsed JSON content of that file.
hotel_review_dict = {path: load_json(path) for path in files}

In [64]:
# Group review texts by topic.
# Only reviews whose 'should_be_more_details' flag is exactly False are used,
# and topics whose text is None are skipped.
# count_all_reviews counts every review seen; count_reviews counts the ones used.
topic_text_dict = {}
count_all_reviews = 0
count_reviews = 0
for hotel_data in hotel_review_dict.values():
    for entry in hotel_data['reviews']:
        count_all_reviews += 1
        if entry['should_be_more_details'] is not False:
            continue
        count_reviews += 1
        for topic, text in entry['topics'].items():
            if text is None:
                continue
            topic_text_dict.setdefault(topic, []).append(text)



In [65]:
# Print how many texts were collected for each topic.
for topic_name, texts in topic_text_dict.items():
    print(topic_name, len(texts))

Bathroom and toiletries 987
Cleanliness 4226
Sleep 1300
Room amenities 1009
Parking 519
Property 6900
Public transit 583
Nightlife 251
Kitchen 903
Location 3393
Service 4758
Room entertainment 76
Atmosphere 3426
Wi-Fi 144
Accessibility 112
Breakfast 1419
Restaurant 946
Food and Beverage 1864
Air conditioning 142
Wellness 229
Spa 180
Gym 45
Fitness 180
Business 1037
Family friendly 115
Couple friendly 180
Nature and outdoor activities 190
Safety 55
Pool 63
Hot tub 11
Pets 14
Bar or lounge 21
Beach 1


In [66]:
# Save the topic -> texts mapping as JSON.
with open('google/topic_texts.json', 'w') as out_file:
    out_file.write(json.dumps(topic_text_dict))

In [19]:
# Inspect the texts collected for the 'Location' topic.
# NOTE(review): this displays the entire list; consider slicing (e.g. [:10])
# to keep the cell output short.
topic_text_dict['Location']

[' –ß—É–¥–æ–≤–∏–π —Ö–æ—Å—Ç–µ–ª –Ω–µ–¥–∞–ª–µ–∫–æ –≤—ñ–¥ —Ü–µ–Ω—Ç—Ä—É –º—ñ—Å—Ç–∞.',
 ' –î–æ—Ä–µ—á—ñ –≤ —Ü–µ–Ω—Ç—Ä—ñ –º—ñ—Å—Ç–∞ . –ê —â–µ –º–æ–∂–Ω–∞ –ø—ñ—Ç–∏ –≤ –¢–¶ , —è–∫–∏–π –ø—Ä–∏–±–ª–∏–∑–Ω–æ 500 –º –≤—ñ–¥ —Ö–æ—Å—Ç–µ–ª—É .',
 '–ü–ª—é—Å–∞–º–∏ —î —Ü—ñ–Ω–∞ —ñ —Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è.',
 '–§–∞–π–Ω—ñ –Ω–æ–º–µ—Ä–∏ –ø–æ –¥–æ—Å—Ç—É–ø–Ω—ñ–π —Ü—ñ–Ω—ñ, –ø–æ—Ä—É—á –∑ —Ü–µ–Ω—Ç—Ä–æ–º –º—ñ—Å—Ç–∞.',
 '–ù–µ–¥–∞–ª–µ–∫–æ –≤—ñ–¥ —Ü–µ–Ω—Ç—Ä—É. ',
 ' –î–æ –û–ø–µ—Ä–Ω–æ–≥–æ 10 —Ö–≤–∏–ª–∏–Ω —Ö–æ–¥—É.',
 '–ß—É–¥–æ–≤–∏–π —Ö–æ—Å—Ç–µ–ª, –Ω–µ–¥–∞–ª–µ–∫–æ –≤—ñ–¥ —Ü–µ–Ω—Ç—Ä—É –º—ñ—Å—Ç–∞!',
 ' –í 5 —Ö–≤–∏–ª–∏–Ω–∞—Ö  —Ö–æ–¥—å–±–∏ –≤—ñ–¥ —Ü–µ–Ω—Ç—Ä—É.',
 '–î—É–∂–µ –∑–∞—Ç–∏—à–Ω–æ —ñ –±–ª–∏–∑—å–∫–æ –¥–æ —Ü–µ–Ω—Ç—Ä—Éüòé',
 '–•–æ—Å—Ç–µ–ª —ñ–∑ —Å–µ—Ä–µ–¥–Ω—ñ–º–∏ —Ü—ñ–Ω–∞–º–∏ –Ω–µ–ø–æ–¥–∞–ª—ñ–∫ –≤—ñ–¥ —Ü–µ–Ω—Ç—Ä—É —Å—Ç–∞—Ä–æ–≥–æ –º—ñ—Å—Ç–∞ –õ—å–≤—ñ–≤)',
 '–ü–æ–º—ñ—Ä–Ω—ñ —Ü—ñ–Ω–∏, –ø–æ—Ä—É—á —Ü–µ–Ω—Ç—Ä—É –º—ñ—Å—Ç–∞ —ñ –≥–∞—Ä–Ω–∏ –ø–µ—Ä—Å–æ–Ω–∞–ª!',
 '–ë–ª–∏—Å–∫ –¥–æ —Ü–µ–Ω—Ç—Ä—É –º—ñ—Å—Ç–∞',
 ' –î—É–∂–µ –±–ª–∏–∑—å–∫–æ –¥–æ 

In [41]:
# Try get_nouns on the first 'Location' text.
get_nouns(nlp, topic_text_dict['Location'][0])

[['—Ö–æ—Å—Ç–µ–ª', '—Ö–æ—Å—Ç–µ–ª'], ['—Ü–µ–Ω—Ç—Ä—É', '—Ü–µ–Ω—Ç—Ä'], ['–º—ñ—Å—Ç–∞', '–º—ñ—Å—Ç–æ']]

In [42]:
def get_topic_nouns(nlp, topic_texts):
    """Gather the noun pairs of all texts of a topic into one flat list.

    Each text in ``topic_texts`` is passed to ``get_nouns`` together with
    ``nlp``; the resulting ``[text, lemma]`` pairs are concatenated in input
    order.
    """
    return [pair for entry in topic_texts for pair in get_nouns(nlp, entry)]

In [61]:
def get_topic_lemma_count_dict(topic_lemmas):
    """Count how often each lemma occurs, most frequent first.

    Args:
        topic_lemmas: List (or other iterable) of hashable lemmas.

    Returns:
        A dict mapping each lemma to its number of occurrences, ordered by
        descending count. Lemmas with equal counts keep the order in which
        they were first seen.
    """
    # Counter.most_common() sorts by count in descending order with a stable
    # sort, which replaces the hand-written tally and sort.
    return dict(Counter(topic_lemmas).most_common())

In [43]:
# Extract the noun pairs from every 'Location' text.
location_nouns = get_topic_nouns(nlp, topic_text_dict['Location'])

In [44]:
# Inspect the extracted 'Location' noun pairs.
# NOTE(review): this displays the entire list; consider slicing to keep the
# cell output short.
location_nouns

[['—Ö–æ—Å—Ç–µ–ª', '—Ö–æ—Å—Ç–µ–ª'],
 ['—Ü–µ–Ω—Ç—Ä—É', '—Ü–µ–Ω—Ç—Ä'],
 ['–º—ñ—Å—Ç–∞', '–º—ñ—Å—Ç–æ'],
 ['—Ü–µ–Ω—Ç—Ä—ñ', '—Ü–µ–Ω—Ç—Ä'],
 ['–º—ñ—Å—Ç–∞', '–º—ñ—Å—Ç–æ'],
 ['–¢–¶', '–¢–¶'],
 ['–º', '–º'],
 ['—Ö–æ—Å—Ç–µ–ª—É', '—Ö–æ—Å—Ç–µ–ª'],
 ['–ü–ª—é—Å–∞–º–∏', '–ø–ª—é—Å'],
 ['—Ü—ñ–Ω–∞', '—Ü—ñ–Ω–∞'],
 ['—Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è', '—Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è'],
 ['–Ω–æ–º–µ—Ä–∏', '–Ω–æ–º–µ—Ä'],
 ['—Ü—ñ–Ω—ñ', '—Ü—ñ–Ω–∞'],
 ['—Ü–µ–Ω—Ç—Ä–æ–º', '—Ü–µ–Ω—Ç—Ä'],
 ['–º—ñ—Å—Ç–∞', '–º—ñ—Å—Ç–æ'],
 ['—Ü–µ–Ω—Ç—Ä—É', '—Ü–µ–Ω—Ç—Ä'],
 ['—Ö–≤–∏–ª–∏–Ω', '—Ö–≤–∏–ª–∏–Ω–∞'],
 ['—Ö–æ–¥—É', '—Ö—ñ–¥'],
 ['—Ö–æ—Å—Ç–µ–ª', '—Ö–æ—Å—Ç–µ–ª'],
 ['—Ü–µ–Ω—Ç—Ä—É', '—Ü–µ–Ω—Ç—Ä'],
 ['–º—ñ—Å—Ç–∞', '–º—ñ—Å—Ç–æ'],
 ['—Ö–≤–∏–ª–∏–Ω–∞—Ö', '—Ö–≤–∏–ª–∏–Ω–∞'],
 ['—Ö–æ–¥—å–±–∏', '—Ö–æ–¥—å–±–∞'],
 ['—Ü–µ–Ω—Ç—Ä—É', '—Ü–µ–Ω—Ç—Ä'],
 ['—Ü–µ–Ω—Ç—Ä—É', '—Ü–µ–Ω—Ç—Ä'],
 ['–•–æ—Å—Ç–µ–ª', '—Ö–æ—Å—Ç–µ–ª'],
 ['—Ü—ñ–Ω–∞–º–∏', '—Ü—ñ–Ω–∞'],
 ['—Ü–µ–Ω—Ç—Ä—É', '—Ü–µ–Ω—Ç—Ä'],
 ['–º—ñ—Å—Ç–∞', '–º—ñ—Å—Ç–æ'],
 ['—Ü—ñ–Ω–∏', '—Ü—ñ–Ω–∞'],
 ['—Ü–µ–Ω—Ç—Ä—É', '—Ü–µ–Ω—Ç—

In [49]:
# Distinct lemmas (second element of each noun pair) for 'Location'.
location_lemmas_set = {noun[1] for noun in location_nouns}

In [50]:
# Lemma of every 'Location' noun pair, duplicates kept, in original order.
location_lemmas = [lemma for _form, lemma in location_nouns]

In [55]:
# Count the occurrences of each 'Location' lemma.
location_lemma_count = get_topic_lemma_count_dict(location_lemmas)

In [53]:
# Show the lemma counts for 'Location'.
location_lemma_count

{'—Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è': 1560,
 '—Ü–µ–Ω—Ç—Ä': 794,
 '–º—ñ—Å—Ü–µ': 659,
 '–≥–æ—Ç–µ–ª—å': 563,
 '–º—ñ—Å—Ç–æ': 395,
 '–ø–µ—Ä—Å–æ–Ω–∞–ª': 344,
 '–Ω–æ–º–µ—Ä': 236,
 '–≤–æ–∫–∑–∞–ª': 236,
 '–º–µ—Ç—Ä–æ': 227,
 '—Ü—ñ–Ω–∞': 220,
 '—Ö–æ—Å—Ç–µ–ª': 124,
 '–ª–æ–∫–∞—Ü—ñ—è': 118,
 '—Å–Ω—ñ–¥–∞–Ω–æ–∫': 105,
 '—Å–µ—Ä–≤—ñ—Å': 97,
 '—Ö–≤–∏–ª–∏–Ω–∞': 90,
 '–æ–±—Å–ª—É–≥–æ–≤—É–≤–∞–Ω–Ω—è': 78,
 '–∫—ñ–º–Ω–∞—Ç–∞': 57,
 '–∫—É—Ö–Ω—è': 54,
 '—Å—Ç–∞–Ω—Ü—ñ—è': 50,
 '–º–∞–≥–∞–∑–∏–Ω': 44,
 '—è–∫—ñ—Å—Ç—å': 43,
 '—Å—É–ø–µ—Ä': 42,
 '–º—ñ—Å—Ü–µ—Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è': 42,
 '—Ö–≤.': 40,
 '–∞–µ—Ä–æ–ø–æ—Ä—Ç': 39,
 '—Ä–µ—Å—Ç–æ—Ä–∞–Ω': 35,
 '–ø–ª–æ—â–∞': 35,
 '—Ö–æ–¥—å–±–∞': 34,
 '—Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç': 34,
 '—É–º–æ–≤–∞': 34,
 '—Ä–∏–Ω–æ–∫': 32,
 '–∞—Ç–º–æ—Å—Ñ–µ—Ä–∞': 31,
 '–º—ñ—Å—Ü–µ–∑–Ω–∞—Ö–æ–¥–∂–µ–Ω–Ω—è': 31,
 '–ø–ª—é—Å': 30,
 '—ñ–Ω—Ç–µ—Ä‚Äô—î—Ä': 30,
 '–ø–∞—Ä–∫': 29,
 '—Ä–∞–π–æ–Ω': 27,
 '—á–∏—Å—Ç–æ—Ç–∞': 27,
 '–ø–∞—Ä–∫–æ–≤–∫–∞': 25,
 '–ø—Ä–æ–∂–∏–≤–∞–Ω–Ω—è': 24,
 '—Ä—ñ–≤–µ–Ω—å': 24,
 '–Ω–æ–º–µ—Ä–∞': 24,
 '–∑—É–ø–∏–Ω–∫–∞':

In [60]:
# Extract the noun pairs from every 'Service' text.
service_nouns = get_topic_nouns(nlp, topic_text_dict['Service'])

In [62]:
# Count the occurrences of each 'Service' lemma (second element of each pair).
service_lemma_count = get_topic_lemma_count_dict(
    [lemma for _form, lemma in service_nouns]
)

In [63]:
# Show the lemma counts for 'Service'.
service_lemma_count

{'–ø–µ—Ä—Å–æ–Ω–∞–ª': 2979,
 '–æ–±—Å–ª—É–≥–æ–≤—É–≤–∞–Ω–Ω—è': 703,
 '—Å–µ—Ä–≤—ñ—Å': 619,
 '–≥–æ—Ç–µ–ª—å': 595,
 '–Ω–æ–º–µ—Ä': 455,
 '–∫—É—Ö–Ω—è': 297,
 '–º—ñ—Å—Ü–µ': 288,
 '—Ä–æ–∑—Ç–∞—à—É–≤–∞–Ω–Ω—è': 288,
 '—Ü—ñ–Ω–∞': 277,
 '—Ä—ñ–≤–µ–Ω—å': 173,
 '—Ö–æ—Å—Ç–µ–ª': 145,
 '—Ü–µ–Ω—Ç—Ä': 124,
 '–∞—Ç–º–æ—Å—Ñ–µ—Ä–∞': 109,
 '–∫—ñ–º–Ω–∞—Ç–∞': 106,
 '—Ä–µ—Å—Ç–æ—Ä–∞–Ω': 98,
 '—ñ–Ω—Ç–µ—Ä‚Äô—î—Ä': 85,
 '–≥–æ—Å–ø–æ–¥–∞—Ä': 83,
 '–ø–æ—Å–ª—É–≥–∞': 72,
 '—Ä–µ—Ü–µ–ø—Ü—ñ—è': 69,
 '—è–∫—ñ—Å—Ç—å': 68,
 '—É–º–æ–≤–∞': 68,
 '—Å—É–ø–µ—Ä': 67,
 '—á–∏—Å—Ç–æ—Ç–∞': 66,
 '–Ω–æ–º–µ—Ä–∞': 64,
 '–º—ñ—Å—Ç–æ': 59,
 '—ó–∂–∞': 58,
 '–ø—Ä–∞—Ü—ñ–≤–Ω–∏–∫': 57,
 '–ª—é–¥–∏–Ω–∞': 46,
 '–∞–¥–º—ñ–Ω—ñ—Å—Ç—Ä–∞—Ç–æ—Ä': 44,
 '–≤–∏–¥': 42,
 '–≤–ª–∞—Å–Ω–∏–∫': 41,
 '—Å–Ω—ñ–¥–∞–Ω–æ–∫': 41,
 '—Å–ø—ñ–≤—Ä–æ–±—ñ—Ç–Ω–∏–∫': 40,
 '–ª—ñ–∂–∫–æ': 39,
 '–∑–∞–∫–ª–∞–¥': 37,
 '–≤–∏—Å–æ—Ç–∞': 36,
 '–≥–æ—Å—Ç–∏–Ω–Ω—ñ—Å—Ç—å': 35,
 '–∫–ª—ñ—î–Ω—Ç': 34,
 '—Ä–µ–º–æ–Ω—Ç': 31,
 '–∫–æ–º—Ñ–æ—Ä—Ç': 28,
 '–¥—ñ–≤—á–∏–Ω–∞': 28,
 '–≥—ñ—Å—Ç—å': 28,
 '–ø—Ä–æ–∂–∏–≤–∞–Ω–Ω—è':