In [1]:
import xml.etree.ElementTree as ET
from razdel import tokenize
import ipymarkup
from collections import defaultdict
from pymorphy2 import MorphAnalyzer
from razdel import tokenize
import pandas as pd

In [2]:
#Функции для получения разметки из xml файлов SentiRuEval

def parse_texts(path):
    root_node = ET.parse(path).getroot()
    texts = dict()
    for rev in root_node.findall('review'):
        texts[rev.get('id')] = rev.find('text').text
    return texts

def parse_aspects(path):
    root_node = ET.parse(path).getroot()
    global_aspects = dict()
    tag_list = ['type', 'to','term','sentiment','mark','from','category']
    for rev in root_node.findall('review'):
        id_ = rev.get('id')
        aspects = dict()
        for i, a in enumerate(rev.findall('aspects/aspect')):
            aspects[i] = dict()
            for tag in tag_list:
                aspects[i][f'{tag}'] = a.get(tag)
        global_aspects[f'{id_}'] = aspects
    return global_aspects

def mark_text(path, aspect_type = 'explicit', binary = True):
    global_aspects = parse_aspects(path)
    texts = parse_texts(path)
    
    categories = set()
    for id_ in global_aspects.values():
        for aspect in id_.values():
            categories.add(aspect['category'])
    
    markup = dict()
    for id_, aspects in global_aspects.items():
        labels = []
        tokens = list(tokenize(texts[id_]))
        for token in tokens:
            label = 0
            for aspect in aspects.values():
                if aspect['type'] == aspect_type and aspect['mark'] == 'Rel':
                    if token.start == int(aspect['from']):
                        label = 1
                    elif token.start > int(aspect['from']) and token.stop <= int(aspect['to']):
                        if binary:
                            label = 1
                        else:
                            label = 2
            labels.append(label)
        markup[id_] = labels
        
    return markup, texts

def markup_text_senti(path):
    global_aspects = parse_aspects(path)
    texts = parse_texts(path)
    markup = defaultdict(dict)
    for id_, aspects in global_aspects.items():
        spans = []
        markup[id_]['text'] = texts[id_]
        for entity in aspects.values():
            spans.append((int(entity['from']), int(entity['to']), entity['polarity']))
            spans.append((int(entity['from']), int(entity['to']), entity['category']))
        markup[id_]['aspects'] = spans
    return markup

In [3]:
#Функции для получения разметки из xml файлов SemEval

def parse_texts_sem(path):
    root_node = ET.parse(path).getroot()
    texts = dict()
    for rev in root_node.findall('Review/sentences/sentence'):
        texts[rev.get('id')] = rev.find('text').text
    return texts

def parse_aspects_sem(path):
    root_node = ET.parse(path).getroot()
    global_aspects = dict()
    tag_list = ['to','from','polarity','category','target']
    for rev in root_node.findall('Review/sentences/sentence'):
        id_ = rev.get('id')
        aspects = dict()
        for i, a in enumerate(rev.findall('Opinions/Opinion')):
            aspects[i] = dict()
            for tag in tag_list:
                aspects[i][f'{tag}'] = a.get(tag)
        global_aspects[f'{id_}'] = aspects
    return global_aspects

def markup_text_sem(path):
    global_aspects = parse_aspects_sem(path)
    texts = parse_texts_sem(path)
    markup = defaultdict(dict)
    for id_, aspects in global_aspects.items():
        spans = []
        markup[id_]['text'] = texts[id_]
        for entity in aspects.values():
            spans.append((int(entity['from']), int(entity['to']), entity['polarity']))
            spans.append((int(entity['from']), int(entity['to']), entity['category']))
        markup[id_]['aspects'] = spans
    return markup

In [4]:
senti_texts_train = parse_texts('data/SentiRuEval_car_markup_train.xml')
senti_texts_test = parse_texts('data/SentiRuEval_car_markup_test.xml')

In [6]:
senti_texts = {**senti_texts_train,**senti_texts_test}

In [9]:
senti_aspects_train = parse_aspects('data/SentiRuEval_car_markup_train.xml')
senti_aspects_test = parse_aspects('data/SentiRuEval_car_markup_test.xml')
senti_aspects = {**senti_aspects_train,**senti_aspects_test}

In [13]:
spans_senti_train = dict()
for id_ in senti_aspects_train.keys():
    entities = []
    for entity in senti_aspects_train[id_].values():
        if entity['type'] == 'explicit':
            entities.append((int(entity['from']), int(entity['to']), entity['term'], entity['sentiment'], entity['category']))
    spans_senti_train[id_] = entities

In [15]:
spans_senti_test = dict()
for id_ in senti_aspects_test.keys():
    entities = []
    for entity in senti_aspects_test[id_].values():
        if entity['type'] == 'explicit':
            entities.append((int(entity['from']), int(entity['to']), entity['term'], entity['sentiment'], entity['category']))
    spans_senti_test[id_] = entities

In [71]:
united_aspects = dict()
for k in senti_texts.keys():
    try:
        united_aspects[k] = spans_senti[k] + spans[k]
    except KeyError:
        continue

In [115]:
# first, filter by indexes
left_in_sentiru = dict()
left_in_sem = dict()

for text_id in senti_texts.keys():
    try:
        sentiru_aspects = spans_senti[text_id]
        semru_aspects = spans[text_id]
    except KeyError:
        continue
    sim_i = []
    sim_j = []
    j_start = 0
    for i, semru_aspect in enumerate(semru_aspects):
        for j, sentiru_aspect in enumerate(sentiru_aspects[j_start:]):
            if semru_aspect[0] == sentiru_aspect[0] and semru_aspect[1] == sentiru_aspect[1]:
                sim_i.append(i)
                sim_j.append(j+j_start)
                j_start = j
                break
            elif semru_aspect[0] < sentiru_aspect[0]:
                break
    if len(sim_j) - len(sentiru_aspects) < 0:
        left_in_sentiru[text_id] = [a for j, a in enumerate(sentiru_aspects) if j not in sim_j]
    if len(sim_i) - len(semru_aspects) < 0:
        left_in_sem[text_id] = [a for i, a in enumerate(semru_aspects) if i not in sim_i]

In [116]:
# second, consider sem inclusions

left_in_sentiru_2 = dict()
left_in_sem_2 = dict()

for text_id in left_in_sem.keys():
    try:
        sentiru_aspects = left_in_sentiru[text_id]
        semru_aspects = left_in_sem[text_id]
    except KeyError:
        continue
    sim_i = []
    sim_j = []
    for i, semru_aspect in enumerate(semru_aspects):
        for j, sentiru_aspect in enumerate(sentiru_aspects):
            if semru_aspect[0] >= sentiru_aspect[0] and semru_aspect[1] <= sentiru_aspect[1]:
                sim_i.append(i)
                sim_j.append(j)
                break
            elif semru_aspect[0] < sentiru_aspect[0]:
                break
    if len(sim_j) - len(sentiru_aspects) < 0:
        left_in_sentiru_2[text_id] = [a for j, a in enumerate(sentiru_aspects) if j not in set(sim_j)]
    if len(sim_i) - len(semru_aspects) < 0:
        left_in_sem_2[text_id] = [a for i, a in enumerate(semru_aspects) if i not in set(sim_i)]

In [117]:
# third, consider senti inclusions

left_in_sentiru_3 = dict()
left_in_sem_3 = dict()

for text_id in left_in_sem_2.keys():
    try:
        sentiru_aspects = left_in_sentiru_2[text_id]
        semru_aspects = left_in_sem_2[text_id]
    except KeyError:
        continue
    sim_i = []
    sim_j = []
    for i, semru_aspect in enumerate(semru_aspects):
        for j, sentiru_aspect in enumerate(sentiru_aspects):
            if semru_aspect[0] <= sentiru_aspect[0] and semru_aspect[1] >= sentiru_aspect[1]:
                sim_i.append(i)
                sim_j.append(j)
                break
            elif semru_aspect[0] < sentiru_aspect[0]:
                break
    if len(sim_j) - len(sentiru_aspects) < 0:
        left_in_sentiru_3[text_id] = [a for j, a in enumerate(sentiru_aspects) if j not in set(sim_j)]
    if len(sim_i) - len(semru_aspects) < 0:
        left_in_sem_3[text_id] = [a for i, a in enumerate(semru_aspects) if i not in set(sim_i)]

In [120]:
# Аспекты из sentiru, не попоавшие в semru
morph = MorphAnalyzer()
abandoned = []
for text_id in left_in_sentiru_3.keys():
    for aspect in left_in_sentiru_3[text_id]:
        tokens = list(tokenize(aspect[2].lower()))
        if len(tokens)>1:
            for t in tokens:
                lemma = morph.parse(t.text)[0].normal_form
                abandoned.append((aspect[0]+t.start, aspect[0]+t.stop, lemma, aspect[3], aspect[4], text_id))
            abandoned.append((aspect[0], aspect[1], aspect[2].lower().strip('"'), aspect[3], aspect[4], text_id))
        else:
            abandoned.append((aspect[0], aspect[1], morph.parse(aspect[2])[0].normal_form.strip('"'), aspect[3], aspect[4], text_id))

In [158]:
abandoned_df = pd.DataFrame(abandoned, columns = ['from', 'to', 'text', 'sentiment', 'category', 'review_id'])

In [159]:
#убираем нейтральные
abandoned_df['polar'] = ((abandoned_df.sentiment != 'neutral') & (abandoned_df.sentiment != 'both'))*1

In [160]:
abandoned_df

Unnamed: 0,from,to,text,sentiment,category,review_id,polar
0,43,52,заведение,neutral,Whole,16137,0
1,446,455,ресторан,neutral,Whole,16137,0
2,642,650,посадить,neutral,Service,16137,0
3,651,654,мы,neutral,Service,16137,0
4,655,657,за,neutral,Service,16137,0
...,...,...,...,...,...,...,...
1269,791,808,салат с говядиной,positive,Food,28258,1
1270,811,819,карпаччо,positive,Food,28258,1
1271,822,826,цена,neutral,Price,28258,0
1272,870,876,визит,positive,Whole,28258,1


In [161]:
abandoned_df.category.unique()

array(['Whole', 'Service', 'Food', 'Price', 'Interior'], dtype=object)

In [162]:
abandoned_df[abandoned_df.category=='Whole'].groupby('text').agg({'review_id':'count',
                                                                 'polar':'sum'}).sort_values('review_id', ascending=False).iloc[:30]

Unnamed: 0_level_0,review_id,polar
text,Unnamed: 1_level_1,Unnamed: 2_level_1
ресторан,82,31
заведение,23,9
"""",22,6
место,21,15
вечер,12,12
провести,6,6
впечатление,6,6
время,6,6
посещение,4,4
кафе,4,2


In [163]:
abandoned_df[abandoned_df.category=='Interior'].groupby('text').agg({'review_id':'count',
                                                                 'polar':'sum'}).sort_values('review_id', ascending=False).iloc[:30]

Unnamed: 0_level_0,review_id,polar
text,Unnamed: 1_level_1,Unnamed: 2_level_1
зал,12,9
столик,6,4
интерьер,3,2
потанцевать,2,2
в,2,2
атмосфера,2,2
оформить,2,2
стена,2,2
музыка,2,1
концерт,2,0


In [164]:
abandoned_df[abandoned_df.category=='Food'].groupby('text').agg({'review_id':'count',
                                                                 'polar':'sum'}).sort_values('review_id', ascending=False).iloc[:30]

Unnamed: 0_level_0,review_id,polar
text,Unnamed: 1_level_1,Unnamed: 2_level_1
блюдо,21,9
кухня,18,11
с,14,6
салат,12,2
пиво,11,0
напиток,10,1
готовить,9,9
еда,8,2
выпить,7,2
горячий,7,1


In [165]:
abandoned_df[abandoned_df.category=='Service'].groupby('text').agg({'review_id':'count',
                                                                 'polar':'sum'}).sort_values('review_id', ascending=False).iloc[:30]

Unnamed: 0_level_0,review_id,polar
text,Unnamed: 1_level_1,Unnamed: 2_level_1
встретить,19,16
заказать,15,5
столик,13,4
принести,12,11
обслуживать,12,12
заказ,11,11
проводить,11,9
обслуживание,9,5
стол,9,7
официант,8,4


In [166]:
abandoned_df[abandoned_df.category=='Price'].groupby('text').agg({'review_id':'count',
                                                                 'polar':'sum'}).sort_values('review_id', ascending=False).iloc[:30]

Unnamed: 0_level_0,review_id,polar
text,Unnamed: 1_level_1,Unnamed: 2_level_1
цена,34,31
счёт,8,5
соотношение,4,4
качество,4,4
скидка,3,3
ценник,3,3
чек,2,1
и,2,2
плата,1,1
предоплата,1,1


In [168]:
#оставляем следующие аспекты
interesting_words = ["цена", "счет", "столик", "заказ", 
                    "стол", "официант", "сервис", "встретить",
                    "обслуживать", "обслужить", "помочь", 
                    "меню", "блюдо", "кухня", "салат", 
                    "еда", "поесть", "готовить", "вкус", 
                    "зал", "интерьер", "атмосфера", "ресторан", 
                    "заведение", "место"]

In [169]:
interesting_df = abandoned_df[abandoned_df['text'].isin(interesting_words)]
interesting_df = interesting_df[interesting_df['polar'] == 1]

In [170]:
interesting_df.sentiment.value_counts()

positive    197
negative     31
Name: sentiment, dtype: int64

In [171]:
interesting_df.category.value_counts()

Service     76
Whole       55
Food        48
Price       31
Interior    18
Name: category, dtype: int64

## Добавляем в разметку

In [255]:
additional_spans = []
for id_ in interesting_df.review_id.unique():
    new_df = interesting_df[interesting_df.review_id == id_]
    a_new_df = abandoned_df[abandoned_df.review_id == id_][abandoned_df[abandoned_df.review_id == id_].polar==1]
    for val in new_df.values:
        start, stop, token, sentiment, cat, rev_id, pol = val
        inter_df = a_new_df[a_new_df['from']<=start]
        inter_df = inter_df[inter_df['to']>=stop]
        if len(inter_df) == 1:
            additional_spans.append((start, stop, token, sentiment, cat, rev_id))
        else:
            additional_spans.append(tuple(inter_df[inter_df.text != token].values[0][:-1]))

In [256]:
additional_spans_list = list(set(additional_spans))
for i in additional_spans_list:
    id_ = i[-1]
    key_indicator = 0
    for sp in spans.keys():
        if sp == id_:
            spans[sp].append((i[0], i[1], i[3]))
            key_indicator += 1
    if key_indicator == 0:
        print(i)

In [257]:
spans

{'15758': [(60, 69, 'neutral'),
  (131, 136, 'positive'),
  (153, 165, 'positive'),
  (177, 180, 'positive'),
  (320, 328, 'neutral'),
  (389, 397, 'positive'),
  (475, 479, 'neutral')],
 '2073': [(97, 106, 'negative'),
  (223, 231, 'neutral'),
  (242, 247, 'neutral'),
  (314, 326, 'negative'),
  (377, 387, 'negative'),
  (899, 911, 'negative'),
  (974, 983, 'negative'),
  (1179, 1187, 'negative')],
 '24713': [(52, 55, 'positive'),
  (116, 120, 'neutral'),
  (166, 173, 'negative'),
  (213, 227, 'negative'),
  (246, 251, 'negative'),
  (253, 259, 'negative'),
  (307, 313, 'negative'),
  (448, 451, 'positive'),
  (525, 532, 'negative'),
  (620, 623, 'negative'),
  (833, 842, 'negative'),
  (502, 509, 'negative')],
 '16630': [(25, 35, 'positive'),
  (56, 64, 'positive'),
  (85, 97, 'positive'),
  (99, 102, 'positive'),
  (244, 253, 'positive'),
  (294, 302, 'positive'),
  (315, 320, 'positive')],
 '27841': [(71, 95, 'positive'),
  (161, 165, 'positive'),
  (175, 182, 'positive'),
  (264, 

Оставляем только полярные аспекты

In [317]:
spans_polar = defaultdict(list)
spans_neu = defaultdict(list)
for k, i in spans.items():
    c = 0
    for j in i:
        if j[2] not in ['neutral', 'conflict']:
            spans_polar[k].append(j)
            c+=1

In [309]:
fin_spans = defaultdict(dict)
for k, i in spans_polar.items():
    fin_spans[k]['text'] = senti_texts[k]
    fin_spans[k]['aspects'] = i

In [310]:
fin_spans

defaultdict(dict,
            {'15758': {'text': 'Был в заведении со своей девушкой в пятницу. Очень неплохо. Ресторану еще много есть над чем поработать, конечно, но тем не менее, место довольно милое, обслуживание приятное и еда довольно вкусна. Заказывали салаты и горячее(стейк из лосося и блюдо под названием "сытая хавронья" из свинины). Принесли быстро. Крепкого алкоголя, кроме пива у них пока нет, видимо с лицензией проблемы, но медовуха оказалась очень даже приятной. Хотелось бы, конечно, большего разнообразия в меню, но так как они недавно открылись, думаю, со временем все у них наладится.',
              'aspects': [(131, 136, 'positive'),
               (153, 165, 'positive'),
               (177, 180, 'positive'),
               (389, 397, 'positive')]},
             '2073': {'text': 'Три дня назад отмечали в этом "ресторане" день рождения. Сегодня решила почитать отзывы по этому заведению, так как вечер был просто испорчен. Складывается такое впечатление, что мы были где-то

In [299]:
print(ipymarkup.show_line_markup(fin_spans['2073']['text'], fin_spans['2073']['aspects']))

None


In [311]:
import json
with open("../data/final_sem_markup_train.json", 'w') as f:
    json.dump(fin_spans, f)