In [1]:
import pandas as pd
import numpy as np
from typing import List
import warnings
import random
import json

In [2]:
warnings.filterwarnings('ignore')

In [3]:
def save_data_to_json(data, file):
    """Serialize ``data`` as JSON into the file at path ``file``.

    The target file is opened in write mode, so any existing content is replaced.
    """
    with open(file, mode='w') as json_file:
        json.dump(obj=data, fp=json_file)

In [4]:
def load_json(file):
    """Load and return the JSON document stored at path ``file``.

    The file is decoded as UTF-8 explicitly instead of relying on the platform's
    default encoding, so dictionaries keyed by Ukrainian lemmas load the same way
    on every machine.
    """
    with open(file, encoding='utf-8') as json_file:
        return json.load(json_file)


In [5]:
def get_mixed_review_texts(df):
    """Build one text per review by concatenating its title, positive and negative parts.

    Args:
        df: DataFrame with 'title', 'pos_text' and 'neg_text' columns.

    Returns:
        A list with one string per row of ``df``, in row order. A field is skipped
        when it is empty, equals the 'Nan' placeholder, or is not a string at all
        (e.g. the float NaN pandas produces for an empty CSV cell; the previous
        ``len()`` check raised TypeError on such values).
    """
    def has_text(value):
        # Only real, non-placeholder strings contribute to the review text.
        return isinstance(value, str) and len(value) > 0 and value != 'Nan'

    review_texts = []
    for title, pos_text, neg_text in zip(df['title'], df['pos_text'], df['neg_text']):
        review_text = title if has_text(title) else ""

        # The positive/negative parts always carry a leading space separator,
        # even when the title is absent (kept from the original behaviour).
        if has_text(pos_text):
            review_text += f" {pos_text}"

        if has_text(neg_text):
            review_text += f" {neg_text}"

        review_texts.append(review_text)

    return review_texts

## Dataset

See https://github.com/DmytroBabenko/insights-from-user-reviews/blob/master/dataset/data-analysis.ipynb for more details about the dataset.

For baseline, we are going to use Ukrainian-language user reviews parsed from *booking.com.*

In [6]:
booking_test_df = pd.read_csv("dataset/booking/booking-test.csv")

In [7]:
hotel_topic_marks_test = pd.read_csv("dataset/booking/hotel-topic-marks-test.csv")

In [8]:
test_hotels = np.unique(hotel_topic_marks_test['hotel'].values)

In [9]:
test_hotels[50:70]

array(['felisa.uk.html', 'fire-inn.uk.html', 'flora-apartments.uk.html',
       'flying-dutchman.uk.html', 'fontush-boutique.uk.html',
       'franz.uk.html', 'frapolli.uk.html', 'freedom-kiev.uk.html',
       'fregat.uk.html', 'gagarinn.uk.html', 'garmonia.uk.html',
       'gaudi-stylish.uk.html', 'genuez.uk.html', 'gintama.uk.html',
       'good-night.uk.html', 'gostinitsa-semeinogo-tipa.uk.html',
       'gostinitsa-vg.uk.html', 'gostiniy-dvor-odessa.uk.html',
       'gotiel-quot-art-gotiel-quot.uk.html', 'gotiel-znannia.uk.html'],
      dtype=object)

In [10]:
len(test_hotels)

188

## Collocations

The collocation extraction algorithm for the baseline is implemented in *collocations.collocation_extractor.py*. Here, we use this implementation.

In [11]:
from topics import TopicType
from collocations.collocation_extractor import StanfordNLPCollocationExtractor

In [12]:
[e.value for e in TopicType]

[('clean',),
 ('comfort',),
 ('location',),
 ('services',),
 ('staff',),
 ('value',),
 ('wi-fi',),
 'others']

In [13]:
collocation_extractor = StanfordNLPCollocationExtractor()

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/dbabenko/stanfordnlp_resources/uk_iu_models/uk_iu_tokenizer.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/dbabenko/stanfordnlp_resources/uk_iu_models/uk_iu_tagger.pt', 'pretrain_path': '/home/dbabenko/stanfordnlp_resources/uk_iu_models/uk_iu.pretrain.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/dbabenko/stanfordnlp_resources/uk_iu_models/uk_iu_lemmatizer.pt', 'lang': 'uk', 'shorthand': 'uk_iu', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/dbabenko/stanfordnlp_resources/uk_iu_models/uk_iu_parser.pt', 'pretrain_path': '/home/dbabenko/stanfordnlp_resources/uk_iu_models/uk_iu

Example of collocation extraction for one hotel

In [14]:
example_hotel = test_hotels[57]
example_hotel_df = booking_test_df.loc[booking_test_df['hotel'] == example_hotel]
example_hotel_df.head()

Unnamed: 0,title,pos_text,neg_text,ratingValue,bestRating,hotel,rating
1367,"Чистота в номері, гаряча вона постійно, бо є е...","Чистота в номері, гаряча вона постійно, бо є е...",Я отримав номер на 8 поверсі з гарним краєвидо...,8.3,10.0,freedom-kiev.uk.html,4
1368,"Брудне, некомфортно місце, лише у разі якщо не...","Заселили після 12 години ночі, хоча вказано, з...","Брудно . В коридорі дірки у стінах, аж лячно б...",3.3,10.0,freedom-kiev.uk.html,2
1369,"В моєму номері був бойлер, що дало доступ до г...","В моєму номері був бойлер, що дало доступ до г...","Дуже холодно, не захоплюємо приміщення",4.2,10.0,freedom-kiev.uk.html,2
1370,"Душ у номері, є холодильник, є зайчик і вентил...","Душ у номері, є холодильник, є зайчик і вентил...","Знаходиться в такому місці, що добратись можна...",5.0,10.0,freedom-kiev.uk.html,2
1371,"Ванна кімната чиста, є шампунь, гель, упакован...","Ванна кімната чиста, є шампунь, гель, упакован...","Якщо врахувати ціну за номер, то немає претенз...",8.3,10.0,freedom-kiev.uk.html,4


In [15]:
example_hotel_revies = get_mixed_review_texts(example_hotel_df)
example_hotel_revies[:5]

['Чистота в номері, гаряча вона постійно, бо є електричний... Чистота в номері, гаряча вона постійно, бо є електричний бойлер, а також радіатор та тепла ковдра, що об 11 ночі знайшов мені адміністратор, бо було трохи нижче нуля градусів. Взагалі, коли треба на день два приїхати в Київ у справах, то в цілому можна перетерпіти. Я отримав номер на 8 поверсі з гарним краєвидом, але біля ліфту, та через стіни було добре чути сусідів, щоразу інших, які сварили ся до першої години ночі, а стосовно ліфту, всі ходили повз мене весь вечер, всю ніч і ранок і кожен крок було дуже добре чути, так було всі три дні і голосно розмовляли по телефону і сварилися і таке інше.',
 'Брудне, некомфортно місце, лише у разі якщо нема інших варіантів Заселили після 12 години ночі, хоча вказано, зо заселення до опівночі. Та дозволили поставити безкоштовно автомобіль на території. Брудно . В коридорі дірки у стінах, аж лячно було ходити . В номері лише туалет, душ на поверсі . матрац на ліжку дуже поганий, пружин

In [16]:
example_collocations = collocation_extractor.extract_from_list_of_text(example_hotel_revies)

In [17]:
example_collocations

[({'word': 'чистота', 'upos': 'NOUN', 'lemma': 'чистота'},
  {'word': 'в', 'upos': 'ADP', 'lemma': 'в'},
  {'word': 'номері', 'upos': 'NOUN', 'lemma': 'номер'}),
 ({'word': 'гаряча', 'upos': 'ADJ', 'lemma': 'гарячий'},
  {'word': 'вона', 'upos': 'PRON', 'lemma': 'вона'},
  {'word': 'постійно', 'upos': 'ADV', 'lemma': 'постійно'}),
 ({'word': 'чистота', 'upos': 'NOUN', 'lemma': 'чистота'},
  {'word': 'в', 'upos': 'ADP', 'lemma': 'в'},
  {'word': 'номері', 'upos': 'NOUN', 'lemma': 'номер'}),
 ({'word': 'гаряча', 'upos': 'ADJ', 'lemma': 'гарячий'},
  {'word': 'вона', 'upos': 'PRON', 'lemma': 'вона'},
  {'word': 'постійно', 'upos': 'ADV', 'lemma': 'постійно'}),
 ({'word': 'електричний', 'upos': 'ADJ', 'lemma': 'електричний'},
  {'word': 'бойлер', 'upos': 'NOUN', 'lemma': 'бойлер'}),
 ({'word': 'тепла', 'upos': 'ADJ', 'lemma': 'теплий'},
  {'word': 'ковдра', 'upos': 'NOUN', 'lemma': 'ковдр'}),
 ({'word': 'радіатор', 'upos': 'NOUN', 'lemma': 'радіатор'},
  {'word': 'та', 'upos': 'CCONJ', 'le

Extract collocations for all test hotels

In [18]:
def extract_collocations_for_hotels(hotels, df):
    """Extract collocations from the reviews of every hotel in ``hotels``.

    Args:
        hotels: iterable of hotel identifiers, matched against ``df['hotel']``.
        df: reviews DataFrame with a 'hotel' column plus the text columns read by
            ``get_mixed_review_texts``.

    Returns:
        Dict mapping each hotel to the value returned by
        ``collocation_extractor.extract_from_list_of_text`` for that hotel's reviews.
    """
    hotel_collocations_dict = dict()

    for hotel in hotels:
        # Select the rows of the hotel currently being processed. The previous
        # version filtered on the notebook-global `example_hotel`, so every hotel
        # was assigned the example hotel's collocations.
        hotel_df = df.loc[df['hotel'] == hotel]
        hotel_reviews = get_mixed_review_texts(hotel_df)
        hotel_collocations_dict[hotel] = collocation_extractor.extract_from_list_of_text(hotel_reviews)

    return hotel_collocations_dict

In [19]:
%time hotel_collocations_dict = extract_collocations_for_hotels(test_hotels, booking_test_df)

CPU times: user 1h 40min 43s, sys: 14 s, total: 1h 40min 57s
Wall time: 1h 40min 59s


In [29]:
save_data_to_json(hotel_collocations_dict, "data/hotel-collocations-test.json")

## Cluster test collocations

In [20]:
from clustering.cluster_collocations import ClusterCollocations
from gensim.models import KeyedVectors

Collocation clustering is implemented in *clustering.cluster_collocations*. First, we convert our collocations (using only the nouns) into vectors via word2vec; then we run the K-means algorithm to cluster these vectors.

In [21]:
K_CLUSTERS = len(TopicType)

In [22]:
embeddings_file = "./embeddings/ubercorpus.lowercased.tokenized.word2vec.300d"

In [23]:
word2vec = KeyedVectors.load_word2vec_format(embeddings_file)

In [24]:
cluster_collocations = ClusterCollocations(word2vec)

Let's cluster example collocations

In [25]:
example_hotel_clustered = cluster_collocations.cluster_collocations(example_collocations, K_CLUSTERS)

In [26]:
example_hotel_clustered

{4: {'collocations': [({'word': 'чистота', 'upos': 'NOUN', 'lemma': 'чистота'},
    {'word': 'в', 'upos': 'ADP', 'lemma': 'в'},
    {'word': 'номері', 'upos': 'NOUN', 'lemma': 'номер'}),
   ({'word': 'чистота', 'upos': 'NOUN', 'lemma': 'чистота'},
    {'word': 'в', 'upos': 'ADP', 'lemma': 'в'},
    {'word': 'номері', 'upos': 'NOUN', 'lemma': 'номер'}),
   ({'word': 'нуля', 'upos': 'NOUN', 'lemma': 'нуля'},
    {'word': 'градусів', 'upos': 'NOUN', 'lemma': 'градус'}),
   ({'word': 'день', 'upos': 'NOUN', 'lemma': 'день'},
    {'word': 'два', 'upos': 'NUM', 'lemma': 'два'},
    {'word': 'приїхати', 'upos': 'VERB', 'lemma': 'приїхати'}),
   ({'word': 'отримав', 'upos': 'VERB', 'lemma': 'отримати'},
    {'word': 'номер', 'upos': 'NOUN', 'lemma': 'номер'}),
   ({'word': 'розмовляли', 'upos': 'VERB', 'lemma': 'розмовляти'},
    {'word': 'по', 'upos': 'ADP', 'lemma': 'по'},
    {'word': 'телефону', 'upos': 'NOUN', 'lemma': 'телефон'}),
   ({'word': 'разі', 'upos': 'NOUN', 'lemma': 'раз'},
   

Now we can print a **pre-summary** of the example hotel (the centroid collocation of each cluster)

In [27]:
# One phrase per cluster: the words of the cluster's centroid collocation joined by spaces.
example_summary = [
    ' '.join(item['word'] for item in cluster['centroid'])
    for cluster in example_hotel_clustered.values()
]
example_summary

['чистота в номері',
 'загальний сам готель',
 'матрац на ліжку',
 'персонал не потурбувався',
 'дозволив залишити',
 'ванна кімната',
 'врахувати ціну',
 'проблем дозволив залишити']

Let's cluster all collocations for each hotel

In [28]:
# Cluster the collocations of every hotel into K_CLUSTERS groups.
hotel_clustered_dict = {
    hotel: cluster_collocations.cluster_collocations(hotel_collocations, K_CLUSTERS)
    for hotel, hotel_collocations in hotel_collocations_dict.items()
}

KeyboardInterrupt: 

### Detect hotel topic type 

Now that we have separate clusters, let's detect which topic each cluster is related to. <br>
Initially, we use a simple rule-based classifier as the baseline.

In [None]:
from clustering.cluster_topic_detector import RuleBasedTopicDetector

In [None]:
topic_detector = RuleBasedTopicDetector()

Let's detect topics for clusters of example hotel

In [None]:
def detect_topic_for_hotel_cluster(hotel_clustered):
    """Regroup a hotel's clusters by the topic detected for each cluster.

    Args:
        hotel_clustered: dict mapping a cluster key to a cluster dict that has at
            least a 'collocations' list.

    Returns:
        Dict mapping each detected topic to a cluster dict. The first cluster seen
        for a topic provides the base entry (all of its keys are kept); the
        collocations of later clusters with the same topic are appended to it.

    The input is never modified: the previous version stored the input cluster
    dicts directly in the result and extended their lists in place, which corrupted
    ``hotel_clustered`` and duplicated collocations when the cell was re-run.
    """
    hotel_topic_clustered = dict()
    for cluster in hotel_clustered.values():
        collocations = cluster['collocations']
        topic = topic_detector.detect_cluster_topic_type(collocations)

        if topic not in hotel_topic_clustered:
            # Shallow-copy the cluster and its list so later merges stay local.
            merged_cluster = dict(cluster)
            merged_cluster['collocations'] = list(collocations)
            hotel_topic_clustered[topic] = merged_cluster
        else:
            hotel_topic_clustered[topic]['collocations'].extend(collocations)

    return hotel_topic_clustered

In [None]:
example_hotel_topic_clustered = detect_topic_for_hotel_cluster(example_hotel_clustered)

In [None]:
example_hotel_topic_clustered.keys()

As we can see, the number of detected topics is less than the number of clusters. This happens because some clusters share the same topic: the clustering algorithm does not work very well yet.

Let's detect topics for clusters of each hotel

In [None]:
# Regroup the clusters of every hotel by detected topic.
hotel_topic_clustered_dict = {
    hotel: detect_topic_for_hotel_cluster(hotel_clustered)
    for hotel, hotel_clustered in hotel_clustered_dict.items()
}

### Detect sentiment of clusters

In [None]:
from enum import IntEnum
class SentimentType(IntEnum):
    """Sentiment label of a topic cluster, ordered from negative to positive."""

    # Plain integer values: the former trailing commas turned the values into
    # one-element tuples, which only worked because IntEnum unpacks them into int().
    NEGATIVE = 0
    NEUTRAL = 1
    POSITIVE = 2

In [None]:
import itertools  

#### Load pmi tonal dictionary for adjectives

Since we already have PMI dictionaries for adjectives, acquired from positive and negative reviews, let's use this information to detect the sentiment of a cluster.

To see how the dictionary was built, please see the `build-sentiment-dictionary.ipynb` notebook in this repository.

In [None]:
pos_adj_pmi_dict = load_json('data/pos-adj-pmi.json')
neg_adj_pmi_dict = load_json('data/neg-adj-pmi.json')

Top positive adjective lemmas

In [None]:
dict(itertools.islice(pos_adj_pmi_dict.items(), 10))  

In [None]:
pos_adj_pmi_dict['привітний']

Top negative adjective lemmas

In [None]:
dict(itertools.islice(neg_adj_pmi_dict.items(), 10))  

In [None]:
neg_adj_pmi_dict['шумний']

In [None]:
pmi_threshold = 0.1

#### Detect sentiment

In [None]:
def detect_sentiment_of_cluster(cluster, pos_adj_pmi_dict, neg_adj_pmi_dict, pmi_threshold, dominance_ratio=0.75):
    """Classify a cluster's sentiment from the adjectives of its collocations.

    Args:
        cluster: iterable of collocations; each collocation is an iterable of token
            dicts with 'upos' and 'lemma' keys.
        pos_adj_pmi_dict: maps adjective lemma -> PMI score in positive reviews.
        neg_adj_pmi_dict: maps adjective lemma -> PMI score in negative reviews.
        pmi_threshold: minimal PMI for an adjective occurrence to be counted.
        dominance_ratio: one polarity wins when the other polarity's count is at
            most this fraction of it (default 0.75, the previously hard-coded value).

    Returns:
        SentimentType.POSITIVE, SentimentType.NEGATIVE or SentimentType.NEUTRAL.
        A cluster with no qualifying adjectives is NEUTRAL; a cluster with
        qualifying adjectives of a single polarity gets that polarity. The previous
        version raised ZeroDivisionError whenever either count was zero.
    """
    adjective_lemmas = [
        token['lemma']
        for collocation in cluster
        for token in collocation
        if token['upos'] == 'ADJ'
    ]

    def count_strong(pmi_dict):
        # Number of adjective occurrences whose PMI reaches the threshold.
        return sum(
            1 for lemma in adjective_lemmas
            if lemma in pmi_dict and pmi_dict[lemma] >= pmi_threshold
        )

    pos_count = count_strong(pos_adj_pmi_dict)
    neg_count = count_strong(neg_adj_pmi_dict)

    # Handle empty counts up front so the ratios below never divide by zero.
    if pos_count == 0 or neg_count == 0:
        if pos_count > 0:
            return SentimentType.POSITIVE
        if neg_count > 0:
            return SentimentType.NEGATIVE
        return SentimentType.NEUTRAL

    if neg_count / pos_count <= dominance_ratio:
        return SentimentType.POSITIVE

    if pos_count / neg_count <= dominance_ratio:
        return SentimentType.NEGATIVE

    return SentimentType.NEUTRAL
    

In [None]:
def detect_sentiments_of_clusters_for_hotel(hotel_topic_clustered, pos_adj_pmi_dict, neg_adj_pmi_dict, pmi_threshold):
    """Annotate every topic cluster of a hotel with its detected sentiment.

    Each cluster dict in ``hotel_topic_clustered`` gets a 'sentiment' entry computed
    by ``detect_sentiment_of_cluster`` from its 'collocations'. The dict is updated
    in place and also returned.
    """
    for cluster_info in hotel_topic_clustered.values():
        cluster_info['sentiment'] = detect_sentiment_of_cluster(
            cluster_info['collocations'], pos_adj_pmi_dict, neg_adj_pmi_dict, pmi_threshold
        )

    return hotel_topic_clustered

Detect sentiment for clusters of example hotel

In [None]:
example_hotel_topic_clustered = detect_sentiments_of_clusters_for_hotel(example_hotel_topic_clustered, pos_adj_pmi_dict, neg_adj_pmi_dict, pmi_threshold)

In [None]:
# Print each detected topic of the example hotel with its sentiment. A plain loop
# is used instead of a set comprehension, which built a throwaway {None} set and
# displayed it as the cell's output.
for topic, cluster_info in example_hotel_topic_clustered.items():
    print(topic, " : ", cluster_info['sentiment'])

### Detect sentiment for each test hotel

In [None]:
# Annotate (in place) the topic clusters of every test hotel with a sentiment.
for hotel_topic_clustered in hotel_topic_clustered_dict.values():
    detect_sentiments_of_clusters_for_hotel(hotel_topic_clustered, pos_adj_pmi_dict, neg_adj_pmi_dict, pmi_threshold)

## Metric

In [None]:
def mark2sentiment(mark_value):
    """Convert a numeric topic mark into a SentimentType.

    -1 maps to NEUTRAL; otherwise marks above 8 are POSITIVE, marks above 7
    (up to and including 8) are NEUTRAL, and everything else is NEGATIVE.
    """
    if mark_value == -1:
        sentiment = SentimentType.NEUTRAL
    elif mark_value > 8:
        sentiment = SentimentType.POSITIVE
    elif mark_value > 7:
        sentiment = SentimentType.NEUTRAL
    else:
        sentiment = SentimentType.NEGATIVE

    return sentiment

In [None]:
hotel_topic_marks_test.head()

Since we have a mark for each topic of each hotel, let's convert these marks into sentiment labels that can be compared with the predicted ones.

In [None]:
# Ground-truth sentiment per hotel and topic, derived from the topic marks table.
hotel_topic_sentiments = dict()
for index, row in hotel_topic_marks_test.iterrows():
    topic_marks = (
        (TopicType.CLEAN, row['hotel_clean']),
        (TopicType.COMFORT, row['hotel_comfort']),
        (TopicType.LOCATION, row['hotel_location']),
        (TopicType.SERVICES, row['hotel_services']),
        (TopicType.STAFF, row['hotel_staff']),
        (TopicType.VALUE, row['hotel_value']),
        (TopicType.WIFI, row['hotel_free_wifi']),
    )
    hotel_topic_sentiments[row['hotel']] = {
        topic: mark2sentiment(mark) for topic, mark in topic_marks
    }

In [None]:
hotel_topic_sentiments

In [None]:
# Align predicted and ground-truth sentiments topic by topic for every hotel that
# has ground-truth marks; a topic without a predicted cluster counts as NEUTRAL.
y_pred, y_test = [], []
for hotel, predicted_topics in hotel_topic_clustered_dict.items():
    true_sentiments = hotel_topic_sentiments.get(hotel)
    if true_sentiments is None:
        continue

    for topic, true_sentiment in true_sentiments.items():
        y_test.append(true_sentiment)

        if topic in predicted_topics:
            y_pred.append(predicted_topics[topic]['sentiment'])
        else:
            y_pred.append(SentimentType.NEUTRAL)

assert len(y_pred) == len(y_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
report = classification_report(y_pred=y_pred, y_true=y_test)
print(report)

## Summary example

One of the main final tasks is to generate a summary about a hotel. Let's show a simple baseline example.

In [None]:
def acquire_hote_summary(hotel_info):
    """Group the centroid phrases of a hotel's topic clusters by sentiment.

    Args:
        hotel_info: dict mapping topic -> cluster dict with a 'sentiment' value and
            a 'centroid' iterable of token dicts that each have a 'word' key.

    Returns:
        Dict mapping each sentiment to the list of centroid phrases (centroid words
        joined by single spaces) of the topics carrying that sentiment.
    """
    summary = dict()
    for cluster_info in hotel_info.values():
        sentiment = cluster_info['sentiment']
        phrase = ' '.join(token['word'] for token in cluster_info['centroid'])
        summary.setdefault(sentiment, []).append(phrase)

    return summary

In [None]:
hotels = list(hotel_topic_clustered_dict.keys())[:2]

In [None]:
hotels[0]

In [None]:
acquire_hote_summary(hotel_topic_clustered_dict[hotels[0]])

In [None]:
hotels[1]

In [None]:
acquire_hote_summary(hotel_topic_clustered_dict[hotels[1]])

## TODO for improvements

1. Implement smarter extraction of collocations. Filter out collocation noise. Make deeper use of the dependency tree. <br> 
2. Improve collocation clustering. For this, we are going to use synonym dictionaries or fine-tune existing embeddings on our domain. <br>
3. Annotate data for detecting the topic (location, staff, etc.) of collocations. Train a classifier for this purpose. <br>
4. Consider more approaches to identifying the sentiment of a cluster (improve the existing dictionary, train a classifier, etc.). <br>
5. Consider more approaches to generating a summary about a specific hotel.