In [None]:
!pip install scikit-learn==0.22.2 

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from bs4 import BeautifulSoup
import plotly.graph_objs as go
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

In [None]:
! pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pymorphy2

In [None]:
!git clone https://github.com/DariaRev/project_autobrea

fatal: destination path 'project_autobrea' already exists and is not an empty directory.


In [None]:
df = pd.read_csv('project_autobrea/data/train_split_aspects.csv', delimiter = '\t', header = None)

In [None]:
df

Unnamed: 0,0,1,2,3,4,5
0,30808,Whole,ресторане,16,25,neutral
1,30808,Interior,первом этаже,43,55,neutral
2,30808,Whole,руководству ресторана,124,145,positive
3,30808,Service,обслуживающему персоналу,147,171,positive
4,30808,Service,сотрудникам,189,200,positive
...,...,...,...,...,...,...
3568,16630,Service,обслуживание,85,97,positive
3569,16630,Food,Еда,99,102,positive
3570,16630,Service,персоналу,244,253,positive
3571,16630,Whole,ресторан,294,302,positive


In [None]:
unique_asps = df[2].unique()

In [None]:
df[1].value_counts()

Food        1439
Service      909
Whole        611
Interior     514
Price        100
Name: 1, dtype: int64

In [None]:
from nltk.tokenize import RegexpTokenizer
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
token = RegexpTokenizer('\w+')

In [None]:
def normalize(text):
    words = [morph.parse(word)[0].normal_form for word in tokenize(text) if word]
    return words

def tokenize(text):
    return token.tokenize(text)

In [None]:
df['lemmas'] = [tuple(normalize(text)) for text in df[2]]

In [None]:
df.head()

Unnamed: 0,0,1,2,3,4,5,lemmas
0,30808,Whole,ресторане,16,25,neutral,"(ресторан,)"
1,30808,Interior,первом этаже,43,55,neutral,"(первый, этаж)"
2,30808,Whole,руководству ресторана,124,145,positive,"(руководство, ресторан)"
3,30808,Service,обслуживающему персоналу,147,171,positive,"(обслуживающий, персонал)"
4,30808,Service,сотрудникам,189,200,positive,"(сотрудник,)"


In [None]:
s = df.value_counts(subset=[1, 2])

In [None]:
train_counts = dict(zip(s.keys(), s.to_list()))

In [None]:
!pip install stanza

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import stanza

In [None]:
stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Downloading default packages for language: ru (Russian) ...
INFO:stanza:File exists: /root/stanza_resources/ru/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


In [None]:
nlp = stanza.Pipeline('ru', processors='tokenize')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Done loading processors!


In [None]:
def get_reviews(filename):
  reviews = {}
  with open(filename) as f:
    for line in f:
      line = line.rstrip('\r\n').split('\t')
      reviews[line[0]] = line[1]
  return reviews

In [None]:
revs = get_reviews('project_autobrea/data/train_split_reviews.txt')

In [None]:
from collections import defaultdict
aspects = defaultdict(list)

In [None]:
with open('project_autobrea/data/train_split_aspects.txt') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    keys = ('category', 'mention', 'start', 'end', 'sentiment')
    # ['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
    # тут можно отдельно запомнить начало и конец каждого упоминания
    aspects[line[0]].append(dict(zip(keys, line[1:])))

In [None]:
def get_bioes(reviews):
    li_all = []
    
    for text_id, text in reviews.items():
        tuple_li = []
        processed = nlp(text)
        for token in processed.iter_tokens():
            add = False
            parse = morph.parse(token.text)[0]
            p = parse.tag.POS
            if p is None:
                p = 'PUNKT'
            for mention in aspects[text_id]:
                if token.start_char == int(mention['start']):
                    aspect = (text_id, token.text,'B-'+mention['category'], p,token.start_char, token.end_char)

                    add = True
                    break
                elif token.start_char > int(mention['start']) and token.end_char < int(mention['end']):
                    add = True
                    aspect = (text_id, token.text,'I-'+mention['category'], p,token.start_char, token.end_char)

                    break
                elif token.start_char > int(mention['start']) and token.end_char == int(mention['end']):
                    add = True
                    aspect = (text_id, token.text,'E-'+mention['category'], p,token.start_char, token.end_char)
    
                    break
            if not add:
                aspect = (text_id, token.text,'O',p, token.start_char, token.end_char)
            tuple_li.append(aspect)  
        li_all.append(tuple_li)
    return li_all

In [None]:
all_cats = get_bioes(revs)

In [None]:
all_cats[0]

[('25709', 'Были', 'O', 'VERB', 0, 4),
 ('25709', '1', 'O', 'PUNKT', 5, 6),
 ('25709', 'февраля', 'O', 'NOUN', 7, 14),
 ('25709', '.', 'O', 'PUNKT', 14, 15),
 ('25709', 'Я', 'O', 'NPRO', 16, 17),
 ('25709', 'второй', 'O', 'ADJF', 18, 24),
 ('25709', 'раз', 'O', 'NOUN', 25, 28),
 ('25709', ',', 'O', 'PUNKT', 28, 29),
 ('25709', 'подруга', 'O', 'NOUN', 30, 37),
 ('25709', '-', 'O', 'PUNKT', 38, 39),
 ('25709', 'первый', 'O', 'ADJF', 40, 46),
 ('25709', '.', 'O', 'PUNKT', 46, 47),
 ('25709', 'Приехали', 'O', 'VERB', 48, 56),
 ('25709', 'не', 'O', 'PRCL', 57, 59),
 ('25709', 'поздно', 'O', 'ADVB', 60, 66),
 ('25709', ',', 'O', 'PUNKT', 66, 67),
 ('25709', 'часов', 'O', 'NOUN', 68, 73),
 ('25709', 'в', 'O', 'PREP', 74, 75),
 ('25709', 'семь', 'O', 'NUMR', 76, 80),
 ('25709', '.', 'O', 'PUNKT', 80, 81),
 ('25709', 'Народу', 'O', 'NOUN', 82, 88),
 ('25709', 'было', 'O', 'VERB', 89, 93),
 ('25709', 'очень', 'O', 'ADVB', 94, 99),
 ('25709', 'мало', 'O', 'NUMR', 100, 104),
 ('25709', '.', 'O', '

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
!wget https://github.com/dice-group/FOX/raw/master/input/Wikiner/aij-wikiner-ru-wp3.bz2

--2022-12-29 07:59:11--  https://github.com/dice-group/FOX/raw/master/input/Wikiner/aij-wikiner-ru-wp3.bz2
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/aij-wikiner-ru-wp3.bz2 [following]
--2022-12-29 07:59:11--  https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/aij-wikiner-ru-wp3.bz2
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7856559 (7.5M) [application/octet-stream]
Saving to: ‘aij-wikiner-ru-wp3.bz2.1’


2022-12-29 07:59:11 (287 MB/s) - ‘aij-wikiner-ru-wp3.bz2.1’ saved [7856559/7856559]



In [None]:
!pip install corus

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def word2features(sent, i):

    # достаёт фичи для i-го токена в предложении
    word = sent[i][1]
    postag = sent[i][3]
    text_id = sent[i][0]
    start = sent[i][4]
    end = sent[i][5]

    features = {
        'text_id': text_id,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'start':start,
        'end': end
    }
    if i > 0:
        word1 = sent[i-1][1]
        postag1 = sent[i-1][3]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][1]
        postag1 = sent[i+1][3]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    # достаёт фичи для всех токенов в предложении
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [tup[2] for tup in sent]

def sent2tokens(sent):
    return [tup[1] for tup in sent]

In [None]:
from sklearn.model_selection import train_test_split

train_, test_ = train_test_split(all_cats, test_size=0.1, random_state=0, shuffle=True)

In [None]:
X_train = [sent2features(s) for s in train_]
y_train = [sent2labels(s) for s in train_]

X_test = [sent2features(s) for s in test_]
y_test = [sent2labels(s) for s in test_]

In [None]:
!pip install sklearn-crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers,CRF

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
predictions = crf.predict(X_test)


From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [None]:
labels = list(crf.classes_)

In [None]:
metrics.flat_f1_score(y_test, predictions, 
                      average='weighted', labels=labels)

0.9283971268554776

In [None]:
print(X_test)

[[{'text_id': '34282', 'word.lower()': 'сегодня', 'word[-3:]': 'дня', 'word[-2:]': 'ня', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'ADVB', 'start': 0, 'end': 7, 'BOS': True, '+1:word.lower()': 'случайно', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'ADVB', '+1:postag[:2]': 'AD'}, {'text_id': '34282', 'word.lower()': 'случайно', 'word[-3:]': 'йно', 'word[-2:]': 'но', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'ADVB', 'start': 8, 'end': 16, '-1:word.lower()': 'сегодня', '-1:word.istitle()': True, '-1:word.isupper()': False, '-1:postag': 'ADVB', '+1:word.lower()': 'попали', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'VERB', '+1:postag[:2]': 'VE'}, {'text_id': '34282', 'word.lower()': 'попали', 'word[-3:]': 'али', 'word[-2:]': 'ли', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'VERB', 'start': 17, 'end': 23, '-1:word.

In [None]:
len(X_test[0])

234

In [None]:
len(predictions[0])

234

In [None]:
li_cats = []
for ind, rev in enumerate(X_test):
    for ind2, asp in enumerate(rev):
        cat = ''
        if "B" in predictions[ind][ind2]:
            word = asp['word.lower()'] 
            text_id = asp['text_id']
            start = asp['start']
            i = ind2 + 1
            cat =  predictions[ind][ind2].split('-')[1]  
            while "I" in predictions[ind][i] or "E" in predictions[ind][i]:
                if i == len(rev) - 1:
                    break
                else:
                    word += ' ' + rev[i]['word.lower()']
                    cat = predictions[ind][i].split('-')[1]
                    i += 1
               
            end = rev[i]['end']

        tup = (text_id, word, cat,start, end)
        if tup[2] != '':
            li_cats.append(tup)

In [None]:
def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df

In [110]:
len(li_cats)

271

In [None]:
li_cats_df= pd.DataFrame(li_cats)

In [None]:
df_cats = swap_columns(li_cats_df, 1, 2)

In [None]:
df_cats

Unnamed: 0,0,2,1,3,4
0,34282,Whole,ресторан,31,40
1,34282,Service,обслуживание,96,118
2,34282,Food,блюда,301,307
3,34282,Food,цезарь,359,367
4,34282,Food,пасту,380,386
...,...,...,...,...,...
266,28612,Food,готовят,152,167
267,28612,Food,порции,178,192
268,28612,Interior,музыка,281,288
269,28612,Interior,обстановка,309,320


In [None]:
df_cats.to_csv('out_aspects.csv',index=False, sep = '\t')

In [None]:
print(predictions)

[['O', 'O', 'O', 'O', 'O', 'B-Whole', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Service', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Food', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Food', 'O', 'O', 'O', 'O', 'O', 'B-Food', 'O', 'B-Food', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Food', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Food', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Food', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Food', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Food', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Servic

In [None]:
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, predictions, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

           O      0.952     0.984     0.968      3286
      B-Food      0.765     0.673     0.716       150
      E-Food      0.667     0.483     0.560        29
      I-Food      0.647     0.524     0.579        21
  B-Interior      0.867     0.448     0.591        58
  E-Interior      0.333     0.062     0.105        16
  I-Interior      0.200     0.143     0.167         7
     B-Price      1.000     0.500     0.667         6
     E-Price      0.000     0.000     0.000         2
     I-Price      0.000     0.000     0.000         1
   B-Service      0.857     0.532     0.656        79
   E-Service      0.429     0.250     0.316        12
   I-Service      0.333     0.250     0.286         4
     B-Whole      0.825     0.855     0.839        55
     E-Whole      0.714     1.000     0.833         5
     I-Whole      0.000     0.000     0.000         0

    accuracy                          0.935      3731
   macro avg      0.537   


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



In [None]:
from pymorphy2 import MorphAnalyzer

In [None]:
m = MorphAnalyzer()
def preprocess_aspects(words, with_cats = False):
    all_aspects = []
    asp_norm = ''
    for word in words:
        ana = m.parse(word)[0]
        if ana.tag.POS:
            aspect = ana.normal_form
        else:
            aspect = ana.normal_form
        if not with_cats:
            if aspect not in all_aspects:
                all_aspects.append(aspect)

    return all_aspects

In [None]:
asp_preproc = set(preprocess_aspects(unique_asps))

In [None]:
asp_preproc

{'первый зать',
 'тыквенный суп',
 'пиво',
 'манго в имбирном соус',
 'заплатить',
 'объесться',
 'соотношение цена/качество',
 'атмосфера заведение',
 'попробовать',
 'масло',
 'чай с чабрец',
 'порекомендовать',
 'оливка',
 'свежие овощ',
 'убирать',
 'состав блюд',
 'дораду с овощь',
 'торт',
 'скидка',
 'колонка',
 'принесла менить',
 'мартини со льд',
 'посетители vip-зал',
 'смотрели за стол',
 'закуски к пиво',
 'салат с креветка',
 'есть',
 'проводила к столик',
 'смотрят за посетитель',
 'пустое помещение',
 'нельсон паб',
 'камин',
 'кафе dolce italy',
 'официатка',
 'здороваться',
 'шеф повар',
 'селёдка',
 'попить',
 'пробовать',
 'вишня',
 'заказ принести',
 'подать',
 'китайский ресторан',
 'администрация',
 'праздник',
 'салату цезарь',
 'da albertone',
 'вид из окно',
 'отдельный зать',
 'зала',
 'разнообразие',
 'аппетит',
 'обедать',
 'салат с креветками и листие',
 'сухарик',
 'посадить',
 'салат из лисичек с кроличьей печение',
 'мамалыга',
 'встречать',
 'перечный 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
len(asp_preproc)

1090

In [None]:
df_recipes = pd.read_csv('drive/MyDrive/povarenok_recipes_2021_06_16.csv')

In [None]:
li_recipes = set(df_recipes.name.unique())

In [None]:
len(li_recipes)

128495

In [None]:
from tqdm.auto import tqdm

In [None]:
!pip install summa

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:

!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.1.0/ru_core_news_sm-3.1.0.tar.gz (15.9 MB)
[K     |████████████████████████████████| 15.9 MB 481 kB/s 


In [None]:
!python -m spacy download ru_core_news_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ru-core-news-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.1.0/ru_core_news_lg-3.1.0-py3-none-any.whl (514.2 MB)
[K     |████████████████████████████████| 514.2 MB 27 kB/s 
Installing collected packages: ru-core-news-lg
Successfully installed ru-core-news-lg-3.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')


In [None]:

import re
import os
from nltk import pos_tag
from tqdm.auto import tqdm

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('russian')
from textblob import TextBlob, Word
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from summa import keywords
import spacy
from spacy.lang.ru.examples import sentences 
nlp = spacy.load('ru_core_news_lg')
from sklearn.metrics import f1_score, precision_score,recall_score
from sklearn.preprocessing import MultiLabelBinarizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
def filter_keywords(keywords):
    patterns = ['NOUN NOUN', 'ADJ NOUN', 'NOUN ADP NOUN']
    pos_tags = []
    for i in tqdm(keywords):
        pos = ''
        filtered = []
        doc = nlp(i)
        lemmas = " ".join([token.lemma_ for token in doc])
        pos = ' '.join([word.pos_ for word in doc])
        pos = pos.split(' ')
        for j in range(len(pos)-1):
            pos_two = pos[j] + ' ' + pos[j+1]
            if pos_two in patterns:
                if len(i.split(' ')) == len(pos):
                    filtered.append(lemmas.split(' ')[j].lower()+' '+lemmas.split(' ')[j+1].lower())
        for j in range(len(pos)-2):
            pos_two = pos[j] + ' ' + pos[j+1]
            if pos_two + ' '+ pos[j+2] in patterns:
                if len(i.split(' ')) == len(pos):
                    filtered.append(lemmas.split(' ')[j].lower()+' '+lemmas.split(' ')[j+1].lower()+' '+ lemmas.split(' ')[j+2].lower())
        if len(filtered) !=  0:   
            pos_tags.append(filtered)
    return pos_tags

In [None]:
li_new = filter_keywords(list(li_recipes))

  0%|          | 0/128495 [00:00<?, ?it/s]

In [None]:
li_new[:30]

[['паровой котлета'],
 ['мясной рулет', 'рулет с начинка'],
 ['конвертик из лаваш', 'лаваш с морепродукт'],
 ['карамелизованным лук'],
 ['кекс с вишня'],
 ['вареник с гриб'],
 ['рогалик с сыр'],
 ['тыквенный омлетиками'],
 ['макаронный запеканка'],
 ['косточка в духовка'],
 ['цуккини с перец', 'перец в сметане'],
 ['борода монах', 'монах на подушка', 'подушка из овощ'],
 ['творожный пирог', 'пирог с персиками'],
 ['овощной запеканка', 'куриный филе', 'вяленый помидор'],
 ['морковный пирог', 'пирог без яйцо'],
 ['рыбный пирог', 'пирог с лосось'],
 ['морковный коврижка'],
 ['запеченные фрукт', 'фрукт с хлопьями'],
 ['томатный суп'],
 ['экономный оладьи', 'крабовый палочка'],
 ['домашний условие'],
 ['шоколадный торт'],
 ['заварной тест'],
 ['любимый котлета'],
 ['кролик в горшочек'],
 ['фул из крыжовник'],
 ['отварный свёкла'],
 ['сырнички с сыр'],
 ['жареный сыр', 'салат с чечевица'],
 ['сладкий хлеб']]

In [None]:
flat_list = [item for sublist in li_new for item in sublist]

In [None]:
flat_list[:30]

['паровой котлета',
 'мясной рулет',
 'рулет с начинка',
 'конвертик из лаваш',
 'лаваш с морепродукт',
 'карамелизованным лук',
 'кекс с вишня',
 'вареник с гриб',
 'рогалик с сыр',
 'тыквенный омлетиками',
 'макаронный запеканка',
 'косточка в духовка',
 'цуккини с перец',
 'перец в сметане',
 'борода монах',
 'монах на подушка',
 'подушка из овощ',
 'творожный пирог',
 'пирог с персиками',
 'овощной запеканка',
 'куриный филе',
 'вяленый помидор',
 'морковный пирог',
 'пирог без яйцо',
 'рыбный пирог',
 'пирог с лосось',
 'морковный коврижка',
 'запеченные фрукт',
 'фрукт с хлопьями',
 'томатный суп']

In [None]:
with open('project_autobrea/data/all_aspects.txt') as f:
    read = [elem.split('\n')[0] for elem in f.readlines()]

In [None]:
flat_list.extend(read)

In [None]:
len(flat_list)

95865

In [None]:
flat_set = set(flat_list)

In [None]:
len(flat_set)

40898

In [None]:
train_reviews = {}
test_reviews = {}
with open('project_autobrea/data/train_split_reviews.txt', encoding = 'utf-8') as f:
    text = f.read()
    for ind, line in enumerate(text.splitlines()):
        num, text = line.split('\t')
        train_reviews[num] = text
with open('project_autobrea/data/dev_reviews.txt', encoding = 'utf-8') as f:
    text = f.read()
    for ind, line in enumerate(text.splitlines()):
        num, text = line.split('\t')
        test_reviews[num] = text       

In [None]:
m = MorphAnalyzer()
from collections import defaultdict
aspect2cat = defaultdict(list)
with open('project_autobrea/data/train_split_aspects.txt', encoding = 'utf-8') as f:
    text = f.readlines()
    for line in text:
        line = line.rstrip('\r\n').split('\t')
        cat = line[1]
        ana = m.parse(line[2])[0]
        lemma = ana.normal_form
        aspect2cat[lemma].append(cat)

In [None]:
from collections import Counter
aspect2cat_fin = {asp: Counter(l).most_common(1)[0][0] for asp, l in aspect2cat.items()}

In [None]:
import gensim.downloader as api

model = api.load("word2vec-ruscorpora-300")

In [None]:
import numpy as np
def define_category(aspect):
  classes = ['Whole', 'Food', 'Interior', 'Service', 'Price']
  base_words = ['ресторан_NOUN', 'еда_NOUN', 'интерьер_NOUN', 'обслуживание_NOUN', 'цена_NOUN']
  base_vectors = []
  for c, word in zip(classes, base_words):
      base_vectors.append(model[word])
  if aspect in model.vocab:
      sim = model.cosine_similarities(model[aspect], base_vectors)
      return classes[np.argmax(sim)]
  else:
      return 'Whole'

In [None]:
from collections import defaultdict
aspects2reviews_pred_2 = defaultdict(list)
from nltk.tokenize import word_tokenize
for num, review in test_reviews.items():
    processed = nlp(review)
    lemma = " ".join([token.lemma_ for token in processed])
    for elem in flat_set:
        if elem in lemma:
            aspects2reviews_pred_2[num].append({'aspect': elem})

In [None]:
aspects2reviews_pred_2

defaultdict(list,
            {'13823': [{'aspect': 'сок'},
              {'aspect': 'с '},
              {'aspect': 'официантка'},
              {'aspect': 'чай'},
              {'aspect': 'дать'},
              {'aspect': 'бизнес ланч'},
              {'aspect': 'встретить'},
              {'aspect': 'ждать'},
              {'aspect': 'еда'},
              {'aspect': 'дать меню'},
              {'aspect': 'цена'},
              {'aspect': 'стол'},
              {'aspect': ' с'},
              {'aspect': 'проводить'},
              {'aspect': 'принять заказ'},
              {'aspect': 'ресторан'},
              {'aspect': 'качество обслуживание'},
              {'aspect': 'вок'},
              {'aspect': 'аппетит'},
              {'aspect': 'ланч'},
              {'aspect': 'менеджер'},
              {'aspect': 'официант'},
              {'aspect': 'проводить к стол'},
              {'aspect': 'место'},
              {'aspect': 'девушка'},
              {'aspect': ' приятный'},
      

In [None]:
with open('aspects_only_coll.txt', 'w') as f:
    for k, w in tqdm(aspects2reviews_pred_2.items()):
        for elem in w:
            for val in elem.values():
                f.write(k)
                f.write('\t')
                f.write(val)
                f.write('\n')

  0%|          | 0/71 [00:00<?, ?it/s]

5

1

3

1

5

1

2

1

5

1

10

1

5

1

3

1

5

1

4

1

5

1

11

1

5

1

9

1

5

1

5

1

5

1

3

1

5

1

9

1

5

1

4

1

5

1

4

1

5

1

2

1

5

1

9

1

5

1

13

1

5

1

8

1

5

1

21

1

5

1

3

1

5

1

7

1

5

1

4

1

5

1

8

1

5

1

8

1

5

1

16

1

5

1

5

1

5

1

7

1

5

1

9

1

5

1

4

1

5

1

8

1

5

1

5

1

5

1

7

1

5

1

11

1

5

1

9

1

5

1

5

1

5

1

2

1

5

1

12

1

5

1

3

1

5

1

1

1

5

1

4

1

4

1

2

1

4

1

4

1

4

1

7

1

4

1

9

1

4

1

4

1

4

1

12

1

4

1

11

1

4

1

8

1

4

1

9

1

4

1

4

1

4

1

9

1

4

1

4

1

4

1

5

1

4

1

9

1

4

1

4

1

4

1

6

1

4

1

2

1

4

1

7

1

4

1

9

1

4

1

6

1

4

1

3

1

4

1

10

1

4

1

5

1

4

1

3

1

4

1

7

1

4

1

7

1

4

1

8

1

4

1

5

1

4

1

7

1

4

1

9

1

4

1

4

1

4

1

9

1

4

1

5

1

4

1

15

1

4

1

12

1

4

1

5

1

4

1

9

1

4

1

5

1

4

1

6

1

4

1

8

1

4

1

5

1

4

1

5

1

4

1

8

1

4

1

13

1

4

1

2

1

4

1

9

1

4

1

6

1

4

1

12

1

4

1

7

1

4

1

1

1

4

1

10

1

4

1

6

1

5

1

3

1

5

1

2

1

5

1

17

1

5

1

8

1

5

1

6

1

5

1

6

1

5

1

8

1

5

1

4

1

5

1

2

1

5

1

4

1

5

1

11

1

5

1

6

1

5

1

5

1

5

1

4

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

7

1

5

1

12

1

5

1

3

1

5

1

9

1

5

1

10

1

5

1

5

1

5

1

5

1

5

1

15

1

5

1

8

1

5

1

7

1

5

1

5

1

5

1

6

1

5

1

13

1

5

1

3

1

5

1

13

1

5

1

1

1

5

1

4

1

5

1

8

1

5

1

11

1

5

1

9

1

5

1

8

1

3

1

2

1

3

1

9

1

3

1

4

1

3

1

10

1

3

1

3

1

3

1

4

1

3

1

9

1

3

1

6

1

3

1

6

1

3

1

2

1

3

1

6

1

3

1

8

1

3

1

5

1

3

1

8

1

3

1

4

1

3

1

14

1

3

1

8

1

3

1

7

1

3

1

5

1

3

1

1

1

3

1

10

1

3

1

8

1

3

1

7

1

5

1

9

1

5

1

7

1

5

1

3

1

5

1

10

1

5

1

9

1

5

1

4

1

5

1

8

1

5

1

5

1

5

1

9

1

5

1

4

1

5

1

16

1

5

1

7

1

5

1

2

1

5

1

13

1

5

1

5

1

5

1

8

1

5

1

2

1

5

1

10

1

5

1

4

1

5

1

8

1

5

1

5

1

5

1

18

1

5

1

5

1

5

1

4

1

5

1

4

1

5

1

9

1

5

1

5

1

5

1

6

1

5

1

4

1

5

1

5

1

5

1

8

1

5

1

12

1

5

1

1

1

5

1

11

1

5

1

6

1

5

1

2

1

5

1

9

1

5

1

4

1

5

1

3

1

5

1

2

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

8

1

5

1

8

1

5

1

17

1

5

1

9

1

5

1

9

1

5

1

4

1

5

1

28

1

5

1

14

1

5

1

5

1

5

1

5

1

5

1

6

1

5

1

2

1

5

1

1

1

5

1

14

1

5

1

5

1

5

1

4

1

4

1

2

1

4

1

4

1

4

1

10

1

4

1

14

1

4

1

4

1

4

1

2

1

4

1

6

1

4

1

9

1

4

1

6

1

4

1

5

1

4

1

8

1

4

1

5

1

4

1

7

1

4

1

4

1

4

1

10

1

4

1

8

1

4

1

5

1

4

1

8

1

4

1

13

1

4

1

10

1

4

1

5

1

4

1

12

1

4

1

8

1

4

1

9

1

4

1

6

1

4

1

5

1

4

1

8

1

4

1

12

1

4

1

12

1

4

1

6

1

4

1

1

1

4

1

4

1

4

1

11

1

4

1

6

1

4

1

7

1

3

1

5

1

3

1

7

1

3

1

7

1

3

1

4

1

3

1

4

1

3

1

7

1

3

1

8

1

3

1

6

1

3

1

4

1

3

1

2

1

3

1

8

1

3

1

8

1

3

1

5

1

3

1

8

1

3

1

9

1

3

1

4

1

3

1

5

1

3

1

7

1

3

1

7

1

3

1

5

1

3

1

5

1

3

1

13

1

3

1

2

1

3

1

1

1

3

1

6

1

3

1

5

1

3

1

4

1

4

1

6

1

4

1

2

1

4

1

5

1

4

1

7

1

4

1

3

1

4

1

10

1

4

1

5

1

4

1

6

1

4

1

6

1

4

1

7

1

4

1

9

1

4

1

4

1

4

1

8

1

4

1

8

1

4

1

2

1

4

1

4

1

4

1

5

1

4

1

8

1

4

1

6

1

4

1

7

1

4

1

27

1

4

1

13

1

4

1

8

1

4

1

5

1

4

1

5

1

4

1

8

1

4

1

5

1

4

1

9

1

4

1

10

1

4

1

16

1

4

1

5

1

4

1

6

1

4

1

6

1

4

1

13

1

4

1

9

1

4

1

21

1

4

1

8

1

4

1

15

1

4

1

17

1

4

1

1

1

4

1

6

1

4

1

11

1

4

1

5

1

4

1

3

1

4

1

4

1

4

1

7

1

5

1

3

1

5

1

2

1

5

1

10

1

5

1

16

1

5

1

7

1

5

1

6

1

5

1

4

1

5

1

16

1

5

1

6

1

5

1

7

1

5

1

7

1

5

1

11

1

5

1

2

1

5

1

5

1

5

1

7

1

5

1

13

1

5

1

7

1

5

1

8

1

5

1

4

1

5

1

5

1

5

1

15

1

5

1

5

1

5

1

13

1

5

1

4

1

5

1

4

1

5

1

7

1

5

1

5

1

5

1

8

1

5

1

2

1

5

1

17

1

5

1

12

1

5

1

1

1

5

1

5

1

5

1

4

1

5

1

11

1

5

1

22

1

5

1

7

1

5

1

6

1

5

1

4

1

5

1

8

1

5

1

17

1

5

1

8

1

5

1

6

1

5

1

2

1

5

1

13

1

5

1

7

1

5

1

8

1

5

1

7

1

5

1

14

1

5

1

5

1

5

1

2

1

5

1

8

1

5

1

3

1

5

1

7

1

5

1

10

1

5

1

5

1

5

1

5

1

5

1

11

1

5

1

4

1

5

1

4

1

5

1

7

1

5

1

9

1

5

1

5

1

5

1

5

1

5

1

13

1

5

1

2

1

5

1

14

1

5

1

9

1

5

1

1

1

5

1

6

1

5

1

5

1

5

1

4

1

5

1

6

1

5

1

5

1

5

1

15

1

5

1

6

1

5

1

2

1

5

1

5

1

5

1

5

1

5

1

4

1

5

1

3

1

5

1

4

1

5

1

4

1

5

1

14

1

5

1

12

1

5

1

5

1

5

1

2

1

5

1

8

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

7

1

5

1

9

1

5

1

26

1

5

1

6

1

5

1

9

1

5

1

8

1

5

1

9

1

5

1

10

1

5

1

5

1

5

1

8

1

5

1

8

1

5

1

12

1

5

1

11

1

5

1

1

1

5

1

17

1

5

1

4

1

4

1

2

1

4

1

5

1

4

1

8

1

4

1

4

1

4

1

3

1

4

1

7

1

4

1

6

1

4

1

2

1

4

1

4

1

4

1

9

1

4

1

6

1

4

1

8

1

4

1

5

1

4

1

9

1

4

1

8

1

4

1

7

1

4

1

10

1

4

1

10

1

4

1

1

1

4

1

3

1

4

1

4

1

4

1

8

1

5

1

8

1

5

1

2

1

5

1

18

1

5

1

7

1

5

1

9

1

5

1

8

1

5

1

3

1

5

1

5

1

5

1

5

1

5

1

12

1

5

1

4

1

5

1

2

1

5

1

6

1

5

1

9

1

5

1

11

1

5

1

6

1

5

1

5

1

5

1

5

1

5

1

6

1

5

1

5

1

5

1

7

1

5

1

8

1

5

1

16

1

5

1

14

1

5

1

5

1

5

1

4

1

5

1

8

1

5

1

5

1

5

1

7

1

5

1

13

1

5

1

11

1

5

1

3

1

5

1

9

1

5

1

5

1

5

1

15

1

5

1

15

1

5

1

8

1

5

1

6

1

5

1

7

1

5

1

6

1

5

1

3

1

5

1

8

1

5

1

12

1

5

1

1

1

5

1

6

1

5

1

5

1

5

1

3

1

5

1

4

1

5

1

7

1

5

1

9

1

5

1

7

1

5

1

3

1

5

1

2

1

5

1

13

1

5

1

10

1

5

1

4

1

5

1

7

1

5

1

8

1

5

1

9

1

5

1

9

1

5

1

4

1

5

1

9

1

5

1

7

1

5

1

7

1

5

1

2

1

5

1

6

1

5

1

3

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

13

1

5

1

8

1

5

1

18

1

5

1

5

1

5

1

4

1

5

1

5

1

5

1

7

1

5

1

6

1

5

1

12

1

5

1

1

1

5

1

6

1

5

1

12

1

5

1

16

1

5

1

11

1

5

1

5

1

4

1

9

1

4

1

2

1

4

1

5

1

4

1

3

1

4

1

4

1

4

1

5

1

4

1

4

1

4

1

6

1

4

1

2

1

4

1

8

1

4

1

2

1

4

1

5

1

4

1

4

1

4

1

7

1

4

1

13

1

4

1

8

1

4

1

5

1

4

1

4

1

4

1

5

1

4

1

7

1

4

1

9

1

4

1

11

1

4

1

10

1

4

1

11

1

4

1

5

1

4

1

5

1

4

1

8

1

4

1

2

1

4

1

4

1

4

1

3

1

4

1

7

1

4

1

1

1

4

1

5

1

4

1

6

1

5

1

4

1

5

1

16

1

5

1

14

1

5

1

4

1

5

1

4

1

5

1

2

1

5

1

6

1

5

1

7

1

5

1

5

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

9

1

5

1

7

1

5

1

9

1

5

1

8

1

5

1

7

1

5

1

1

1

5

1

5

1

5

1

3

1

5

1

4

1

5

1

8

1

4

1

13

1

4

1

8

1

4

1

2

1

4

1

8

1

4

1

8

1

4

1

3

1

4

1

4

1

4

1

3

1

4

1

5

1

4

1

8

1

4

1

4

1

4

1

3

1

4

1

4

1

4

1

11

1

4

1

2

1

4

1

8

1

4

1

9

1

4

1

5

1

4

1

8

1

4

1

2

1

4

1

6

1

4

1

3

1

4

1

11

1

4

1

5

1

4

1

20

1

4

1

5

1

4

1

12

1

4

1

5

1

4

1

15

1

4

1

7

1

4

1

6

1

4

1

7

1

4

1

9

1

4

1

5

1

4

1

6

1

4

1

18

1

4

1

15

1

4

1

2

1

4

1

15

1

4

1

8

1

4

1

4

1

4

1

6

1

4

1

6

1

4

1

1

1

4

1

3

1

4

1

4

1

4

1

6

1

4

1

11

1

4

1

9

1

4

1

8

1

5

1

3

1

5

1

2

1

5

1

13

1

5

1

4

1

5

1

3

1

5

1

4

1

5

1

2

1

5

1

10

1

5

1

6

1

5

1

4

1

5

1

5

1

5

1

6

1

5

1

6

1

5

1

4

1

5

1

9

1

5

1

13

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

12

1

5

1

1

1

5

1

3

1

5

1

8

1

5

1

7

1

5

1

2

1

5

1

18

1

5

1

7

1

5

1

12

1

5

1

4

1

5

1

8

1

5

1

2

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

4

1

5

1

12

1

5

1

5

1

5

1

7

1

5

1

8

1

5

1

9

1

5

1

2

1

5

1

12

1

5

1

1

1

5

1

5

1

5

1

4

1

5

1

6

1

5

1

6

1

5

1

3

1

5

1

7

1

5

1

8

1

5

1

2

1

5

1

5

1

5

1

16

1

5

1

9

1

5

1

4

1

5

1

24

1

5

1

6

1

5

1

4

1

5

1

2

1

5

1

9

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

3

1

5

1

10

1

5

1

7

1

5

1

8

1

5

1

6

1

5

1

5

1

5

1

9

1

5

1

5

1

5

1

11

1

5

1

10

1

5

1

15

1

5

1

5

1

5

1

9

1

5

1

13

1

5

1

9

1

5

1

12

1

5

1

11

1

5

1

1

1

5

1

6

1

5

1

11

1

5

1

2

1

5

1

5

1

5

1

4

1

5

1

10

1

5

1

8

1

5

1

4

1

5

1

9

1

5

1

8

1

5

1

9

1

5

1

17

1

5

1

5

1

5

1

3

1

5

1

3

1

5

1

6

1

5

1

4

1

5

1

2

1

5

1

11

1

5

1

16

1

5

1

10

1

5

1

8

1

5

1

2

1

5

1

14

1

5

1

5

1

5

1

8

1

5

1

17

1

5

1

5

1

5

1

4

1

5

1

5

1

5

1

10

1

5

1

7

1

5

1

9

1

5

1

4

1

5

1

10

1

5

1

5

1

5

1

6

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

1

1

5

1

9

1

5

1

16

1

5

1

9

1

4

1

9

1

4

1

2

1

4

1

8

1

4

1

6

1

4

1

3

1

4

1

8

1

4

1

8

1

4

1

8

1

4

1

7

1

4

1

4

1

4

1

3

1

4

1

5

1

4

1

4

1

4

1

12

1

4

1

2

1

4

1

8

1

4

1

11

1

4

1

8

1

4

1

13

1

4

1

5

1

4

1

8

1

4

1

4

1

4

1

8

1

4

1

20

1

4

1

5

1

4

1

11

1

4

1

3

1

4

1

4

1

4

1

4

1

4

1

10

1

4

1

5

1

4

1

6

1

4

1

7

1

4

1

4

1

4

1

11

1

4

1

8

1

4

1

14

1

4

1

6

1

4

1

7

1

4

1

1

1

4

1

6

1

4

1

5

1

4

1

4

1

4

1

15

1

4

1

6

1

4

1

14

1

4

1

9

1

5

1

3

1

5

1

2

1

5

1

9

1

5

1

20

1

5

1

8

1

5

1

4

1

5

1

4

1

5

1

7

1

5

1

4

1

5

1

2

1

5

1

7

1

5

1

5

1

5

1

8

1

5

1

13

1

5

1

7

1

5

1

10

1

5

1

5

1

5

1

7

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

8

1

5

1

2

1

5

1

12

1

5

1

11

1

5

1

1

1

5

1

11

1

5

1

4

1

5

1

11

1

5

1

9

1

5

1

4

1

5

1

2

1

5

1

4

1

5

1

18

1

5

1

4

1

5

1

9

1

5

1

6

1

5

1

4

1

5

1

15

1

5

1

2

1

5

1

5

1

5

1

3

1

5

1

7

1

5

1

5

1

5

1

9

1

5

1

10

1

5

1

5

1

5

1

5

1

5

1

10

1

5

1

6

1

5

1

9

1

5

1

2

1

5

1

1

1

5

1

11

1

5

1

6

1

4

1

5

1

4

1

7

1

4

1

18

1

4

1

4

1

4

1

5

1

4

1

4

1

4

1

11

1

4

1

21

1

4

1

11

1

4

1

2

1

4

1

21

1

4

1

10

1

4

1

5

1

4

1

8

1

4

1

5

1

4

1

6

1

4

1

3

1

4

1

7

1

4

1

9

1

4

1

8

1

4

1

8

1

4

1

5

1

4

1

13

1

4

1

4

1

4

1

5

1

4

1

14

1

4

1

5

1

4

1

7

1

4

1

7

1

4

1

5

1

4

1

6

1

4

1

5

1

4

1

8

1

4

1

13

1

4

1

8

1

4

1

2

1

4

1

12

1

4

1

1

1

4

1

5

1

4

1

3

1

4

1

4

1

4

1

6

1

4

1

4

1

4

1

4

1

4

1

8

1

5

1

2

1

5

1

9

1

5

1

3

1

5

1

10

1

5

1

5

1

5

1

4

1

5

1

2

1

5

1

2

1

5

1

5

1

5

1

3

1

5

1

7

1

5

1

8

1

5

1

6

1

5

1

13

1

5

1

5

1

5

1

10

1

5

1

5

1

5

1

6

1

5

1

3

1

5

1

12

1

5

1

12

1

5

1

3

1

5

1

1

1

5

1

7

1

5

1

6

1

5

1

7

1

5

1

3

1

5

1

6

1

5

1

2

1

5

1

13

1

5

1

3

1

5

1

8

1

5

1

8

1

5

1

3

1

5

1

15

1

5

1

4

1

5

1

4

1

5

1

2

1

5

1

10

1

5

1

7

1

5

1

14

1

5

1

6

1

5

1

7

1

5

1

5

1

5

1

5

1

5

1

13

1

5

1

7

1

5

1

4

1

5

1

6

1

5

1

7

1

5

1

7

1

5

1

5

1

5

1

3

1

5

1

13

1

5

1

12

1

5

1

2

1

5

1

3

1

5

1

8

1

5

1

8

1

5

1

3

1

5

1

1

1

5

1

5

1

5

1

3

1

5

1

4

1

5

1

5

1

5

1

6

1

5

1

8

1

5

1

8

1

5

1

6

1

5

1

2

1

5

1

3

1

5

1

4

1

5

1

13

1

5

1

7

1

5

1

13

1

5

1

6

1

5

1

13

1

5

1

16

1

5

1

9

1

5

1

4

1

5

1

9

1

5

1

4

1

5

1

22

1

5

1

17

1

5

1

16

1

5

1

4

1

5

1

8

1

5

1

2

1

5

1

7

1

5

1

8

1

5

1

10

1

5

1

2

1

5

1

3

1

5

1

8

1

5

1

5

1

5

1

4

1

5

1

5

1

5

1

10

1

5

1

8

1

5

1

9

1

5

1

5

1

5

1

9

1

5

1

7

1

5

1

9

1

5

1

4

1

5

1

7

1

5

1

4

1

5

1

5

1

5

1

8

1

5

1

6

1

5

1

2

1

5

1

4

1

5

1

7

1

5

1

1

1

5

1

6

1

5

1

11

1

5

1

5

1

5

1

8

1

5

1

4

1

5

1

5

1

5

1

4

1

5

1

2

1

5

1

23

1

5

1

10

1

5

1

9

1

5

1

5

1

5

1

4

1

5

1

2

1

5

1

8

1

5

1

8

1

5

1

5

1

5

1

7

1

5

1

9

1

5

1

5

1

5

1

18

1

5

1

9

1

5

1

5

1

5

1

6

1

5

1

14

1

5

1

8

1

5

1

13

1

5

1

8

1

5

1

12

1

5

1

1

1

5

1

11

1

5

1

5

1

5

1

9

1

5

1

6

1

5

1

2

1

5

1

13

1

5

1

4

1

5

1

14

1

5

1

9

1

5

1

8

1

5

1

9

1

5

1

5

1

5

1

2

1

5

1

6

1

5

1

8

1

5

1

20

1

5

1

8

1

5

1

13

1

5

1

9

1

5

1

5

1

5

1

6

1

5

1

7

1

5

1

5

1

5

1

2

1

5

1

12

1

5

1

7

1

5

1

1

1

5

1

6

1

5

1

3

1

5

1

4

1

5

1

12

1

5

1

7

1

5

1

6

1

5

1

7

1

5

1

2

1

5

1

3

1

5

1

5

1

5

1

3

1

5

1

8

1

5

1

6

1

5

1

4

1

5

1

4

1

5

1

2

1

5

1

10

1

5

1

8

1

5

1

3

1

5

1

4

1

5

1

5

1

5

1

6

1

5

1

3

1

5

1

4

1

5

1

7

1

5

1

5

1

5

1

5

1

5

1

8

1

5

1

7

1

5

1

15

1

5

1

18

1

5

1

15

1

5

1

7

1

5

1

7

1

5

1

5

1

5

1

2

1

5

1

7

1

5

1

10

1

5

1

12

1

5

1

1

1

5

1

3

1

5

1

4

1

5

1

12

1

5

1

9

1

4

1

6

1

4

1

2

1

4

1

9

1

4

1

4

1

4

1

2

1

4

1

6

1

4

1

5

1

4

1

6

1

4

1

8

1

4

1

5

1

4

1

8

1

4

1

5

1

4

1

9

1

4

1

5

1

4

1

8

1

4

1

7

1

4

1

8

1

4

1

12

1

4

1

1

1

4

1

5

1

4

1

3

1

4

1

4

1

5

1

2

1

5

1

4

1

5

1

6

1

5

1

17

1

5

1

7

1

5

1

9

1

5

1

2

1

5

1

11

1

5

1

5

1

5

1

8

1

5

1

8

1

5

1

5

1

5

1

9

1

5

1

5

1

5

1

9

1

5

1

5

1

5

1

13

1

5

1

4

1

5

1

4

1

5

1

10

1

5

1

7

1

5

1

2

1

5

1

11

1

5

1

3

1

5

1

1

1

5

1

4

1

5

1

9

1

5

1

8

1

5

1

6

1

5

1

2

1

5

1

5

1

5

1

7

1

5

1

4

1

5

1

4

1

5

1

8

1

5

1

2

1

5

1

11

1

5

1

6

1

5

1

4

1

5

1

12

1

5

1

5

1

5

1

5

1

5

1

7

1

5

1

7

1

5

1

8

1

5

1

5

1

5

1

6

1

5

1

18

1

5

1

10

1

5

1

5

1

5

1

8

1

5

1

9

1

5

1

4

1

5

1

9

1

5

1

9

1

5

1

9

1

5

1

5

1

5

1

10

1

5

1

12

1

5

1

1

1

5

1

6

1

5

1

5

1

5

1

4

1

5

1

6

1

5

1

9

1

5

1

4

1

5

1

3

1

5

1

2

1

5

1

11

1

5

1

4

1

5

1

9

1

5

1

5

1

5

1

6

1

5

1

2

1

5

1

22

1

5

1

8

1

5

1

6

1

5

1

8

1

5

1

5

1

5

1

7

1

5

1

9

1

5

1

9

1

5

1

7

1

5

1

9

1

5

1

5

1

5

1

13

1

5

1

8

1

5

1

11

1

5

1

12

1

5

1

11

1

5

1

1

1

5

1

3

1

5

1

4

1

5

1

11

1

5

1

18

1

5

1

12

1

5

1

19

1

5

1

7

1

5

1

6

1

5

1

6

1

5

1

2

1

5

1

6

1

5

1

4

1

5

1

7

1

5

1

3

1

5

1

10

1

5

1

9

1

5

1

3

1

5

1

28

1

5

1

19

1

5

1

4

1

5

1

8

1

5

1

3

1

5

1

9

1

5

1

5

1

5

1

4

1

5

1

3

1

5

1

6

1

5

1

2

1

5

1

6

1

5

1

11

1

5

1

5

1

5

1

8

1

5

1

2

1

5

1

3

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

7

1

5

1

4

1

5

1

4

1

5

1

5

1

5

1

9

1

5

1

7

1

5

1

12

1

5

1

4

1

5

1

9

1

5

1

5

1

5

1

13

1

5

1

12

1

5

1

3

1

5

1

9

1

5

1

1

1

5

1

8

1

5

1

5

1

5

1

11

1

5

1

9

1

5

1

7

1

5

1

2

1

5

1

12

1

5

1

3

1

5

1

8

1

5

1

6

1

5

1

4

1

5

1

4

1

5

1

2

1

5

1

2

1

5

1

7

1

5

1

5

1

5

1

4

1

5

1

15

1

5

1

6

1

5

1

14

1

5

1

8

1

5

1

12

1

5

1

1

1

5

1

5

1

4

1

7

1

4

1

2

1

4

1

4

1

4

1

6

1

4

1

10

1

4

1

9

1

4

1

4

1

4

1

2

1

4

1

4

1

4

1

5

1

4

1

8

1

4

1

7

1

4

1

3

1

4

1

8

1

4

1

5

1

4

1

8

1

4

1

9

1

4

1

14

1

4

1

7

1

4

1

2

1

4

1

6

1

4

1

12

1

4

1

10

1

4

1

1

1

4

1

5

1

4

1

4

1

4

1

13

1

4

1

6

1

5

1

9

1

5

1

2

1

5

1

9

1

5

1

6

1

5

1

10

1

5

1

4

1

5

1

3

1

5

1

4

1

5

1

4

1

5

1

5

1

5

1

8

1

5

1

6

1

5

1

7

1

5

1

2

1

5

1

4

1

5

1

10

1

5

1

17

1

5

1

3

1

5

1

7

1

5

1

8

1

5

1

6

1

5

1

5

1

5

1

4

1

5

1

8

1

5

1

5

1

5

1

7

1

5

1

9

1

5

1

4

1

5

1

5

1

5

1

10

1

5

1

7

1

5

1

7

1

5

1

8

1

5

1

5

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

8

1

5

1

2

1

5

1

8

1

5

1

4

1

5

1

12

1

5

1

11

1

5

1

12

1

5

1

7

1

5

1

1

1

5

1

10

1

5

1

9

1

5

1

11

1

5

1

4

1

5

1

5

1

4

1

3

1

4

1

9

1

4

1

2

1

4

1

7

1

4

1

4

1

4

1

2

1

4

1

8

1

4

1

8

1

4

1

6

1

4

1

3

1

4

1

7

1

4

1

8

1

4

1

8

1

4

1

11

1

4

1

9

1

4

1

8

1

4

1

4

1

4

1

12

1

4

1

7

1

4

1

1

1

4

1

11

1

4

1

3

1

4

1

8

1

5

1

9

1

5

1

2

1

5

1

7

1

5

1

4

1

5

1

17

1

5

1

12

1

5

1

4

1

5

1

3

1

5

1

4

1

5

1

4

1

5

1

4

1

5

1

7

1

5

1

2

1

5

1

5

1

5

1

13

1

5

1

7

1

5

1

7

1

5

1

8

1

5

1

7

1

5

1

5

1

5

1

5

1

5

1

5

1

5

1

5

1

5

1

23

1

5

1

6

1

5

1

9

1

5

1

6

1

5

1

8

1

5

1

6

1

5

1

6

1

5

1

12

1

5

1

12

1

5

1

7

1

5

1

1

1

5

1

5

1

5

1

12

1

5

1

3

1

5

1

6

1

5

1

2

1

5

1

4

1

5

1

5

1

5

1

14

1

5

1

8

1

5

1

4

1

5

1

17

1

5

1

4

1

5

1

14

1

5

1

6

1

5

1

4

1

5

1

4

1

5

1

2

1

5

1

6

1

5

1

10

1

5

1

14

1

5

1

5

1

5

1

6

1

5

1

7

1

5

1

5

1

5

1

5

1

5

1

7

1

5

1

6

1

5

1

9

1

5

1

11

1

5

1

5

1

5

1

15

1

5

1

10

1

5

1

6

1

5

1

5

1

5

1

5

1

5

1

5

1

5

1

8

1

5

1

2

1

5

1

1

1

5

1

3

1

5

1

4

1

5

5

1

5

1

3

1

5

1

8

1

5

1

14

1

5

1

7

1

5

1

6

1

5

1

8

1

5

1

12

1

5

1

3

1

5

1

4

1

5

1

1

1

5

1

5

1

5

1

3

1

5

1

4

1

5

1

5

1

5

1

11

1

5

1

8

1

5

1

14

1

5

1

27

1

5

1

6

1

5

1

5

1

5

1

9

1

5

1

8

1

5

1

12

1

5

1

8

1

5

1

13

1

5

1

13

1

5

1

16

1

5

1

4

1

5

1

3

1

5

1

4

1

5

1

2

1

5

1

8

1

5

1

4

1

5

1

5

1

5

1

8

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

8

1

5

1

13

1

5

1

9

1

5

1

10

1

5

1

9

1

5

1

7

1

5

1

6

1

5

1

2

1

5

1

12

1

5

1

1

1

5

1

6

1

5

1

6

1

5

1

4

1

4

1

12

1

4

1

9

1

4

1

6

1

4

1

2

1

4

1

6

1

4

1

7

1

4

1

3

1

4

1

7

1

4

1

14

1

4

1

5

1

4

1

4

1

4

1

7

1

4

1

24

1

4

1

8

1

4

1

14

1

4

1

3

1

4

1

6

1

4

1

2

1

4

1

11

1

4

1

4

1

4

1

5

1

4

1

8

1

4

1

6

1

4

1

10

1

4

1

3

1

4

1

4

1

4

1

8

1

4

1

10

1

4

1

5

1

4

1

5

1

4

1

8

1

4

1

9

1

4

1

4

1

4

1

7

1

4

1

6

1

4

1

11

1

4

1

9

1

4

1

8

1

4

1

4

1

4

1

5

1

4

1

6

1

4

1

7

1

4

1

11

1

4

1

7

1

4

1

3

1

4

1

17

1

4

1

12

1

4

1

3

1

4

1

6

1

4

1

7

1

4

1

12

1

4

1

1

1

4

1

5

1

4

1

4

1

4

1

3

1

4

1

4

1

4

1

14

1

4

1

18

1

4

1

5

1

4

1

6

1

4

1

4

1

4

1

6

1

4

1

17

1

4

1

9

1

4

1

4

1

4

1

7

1

5

1

8

1

5

1

6

1

5

1

2

1

5

1

8

1

5

1

5

1

5

1

2

1

5

1

9

1

5

1

10

1

5

1

4

1

5

1

5

1

5

1

9

1

5

1

8

1

5

1

5

1

5

1

6

1

5

1

4

1

5

1

6

1

5

1

7

1

5

1

2

1

5

1

9

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

7

1

5

1

7

1

5

1

8

1

5

1

5

1

5

1

5

1

5

1

4

1

5

1

5

1

5

1

7

1

5

1

8

1

5

1

7

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

5

1

5

1

8

1

5

1

8

1

5

1

13

1

5

1

6

1

5

1

7

1

5

1

1

1

5

1

11

1

5

1

6

1

5

1

3

1

5

1

2

1

5

1

7

1

5

1

11

1

5

1

9

1

5

1

4

1

5

1

4

1

5

1

4

1

5

1

6

1

5

1

6

1

5

1

9

1

5

1

4

1

5

1

7

1

5

1

2

1

5

1

7

1

5

1

11

1

5

1

5

1

5

1

8

1

5

1

3

1

5

1

7

1

5

1

9

1

5

1

8

1

5

1

6

1

5

1

4

1

5

1

8

1

5

1

13

1

5

1

16

1

5

1

10

1

5

1

5

1

5

1

7

1

5

1

7

1

5

1

9

1

5

1

16

1

5

1

26

1

5

1

8

1

5

1

6

1

5

1

3

1

5

1

34

1

5

1

12

1

5

1

1

1

5

1

6

1

5

1

11

1

5

1

5

1

5

1

6

1

5

1

4

1

5

1

25

1

5

1

7

1

5

1

7

1

5

1

9

1

5

1

16

1

5

1

2

1

5

1

4

1

5

1

3

1

5

1

9

1

5

1

14

1

5

1

4

1

5

1

8

1

5

1

25

1

5

1

14

1

5

1

8

1

5

1

3

1

5

1

15

1

5

1

5

1

5

1

2

1

5

1

7

1

5

1

8

1

5

1

6

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

32

1

5

1

8

1

5

1

4

1

5

1

10

1

5

1

5

1

5

1

6

1

5

1

19

1

5

1

7

1

5

1

29

1

5

1

9

1

5

1

5

1

5

1

11

4

1

4

1

12

1

4

1

1

1

4

1

5

1

4

1

3

1

4

1

4

1

4

1

6

1

4

1

6

1

4

1

9

1

4

1

4

1

5

1

4

1

5

1

3

1

5

1

2

1

5

1

13

1

5

1

11

1

5

1

4

1

5

1

6

1

5

1

10

1

5

1

16

1

5

1

5

1

5

1

3

1

5

1

7

1

5

1

8

1

5

1

3

1

5

1

4

1

5

1

13

1

5

1

6

1

5

1

8

1

5

1

5

1

5

1

2

1

5

1

10

1

5

1

5

1

5

1

8

1

5

1

9

1

5

1

4

1

5

1

7

1

5

1

8

1

5

1

6

1

5

1

8

1

5

1

7

1

5

1

4

1

5

1

5

1

5

1

15

1

5

1

5

1

5

1

5

1

5

1

7

1

5

1

9

1

5

1

5

1

5

1

13

1

5

1

5

1

5

1

8

1

5

1

11

1

5

1

17

1

5

1

16

1

5

1

10

1

5

1

12

1

5

1

1

1

5

1

6

1

5

1

9

1

5

1

6

1

5

1

4

1

5

1

11

1

5

1

6

1

5

1

13

1

5

1

16

1

5

1

2

1

5

1

10

1

5

1

9

1

5

1

10

1

5

1

9

1

5

1

4

1

5

1

3

1

5

1

6

1

5

1

4

1

5

1

2

1

5

1

18

1

5

1

18

1

5

1

8

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

12

1

5

1

5

1

5

1

7

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

7

1

5

1

4

1

5

1

8

1

5

1

5

1

5

1

7

1

5

1

7

1

5

1

9

1

5

1

6

1

5

1

8

1

5

1

5

1

5

1

5

1

5

1

5

1

5

1

8

1

5

1

12

1

5

1

4

1

5

1

1

1

5

1

4

1

5

1

11

1

5

1

17

1

5

1

10

1

5

1

2

1

5

1

4

1

5

1

8

1

5

1

10

1

5

1

6

1

5

1

14

1

5

1

4

1

5

1

8

1

5

1

9

1

5

1

3

1

5

1

4

1

5

1

2

1

5

1

5

1

5

1

8

1

5

1

6

1

5

1

7

1

5

1

4

1

8

1

4

1

9

1

4

1

4

1

4

1

10

1

4

1

4

1

4

1

5

1

4

1

9

1

4

1

11

1

4

1

7

1

4

1

11

1

4

1

5

1

4

1

8

1

4

1

13

1

4

1

3

1

4

1

1

1

4

1

9

1

4

1

3

1

4

1

7

1

4

1

2

1

4

1

16

1

4

1

4

1

4

1

13

1

4

1

13

1

4

1

8

1

4

1

3

1

4

1

5

1

4

1

2

1

4

1

10

1

4

1

13

1

4

1

7

1

4

1

8

1

4

1

5

1

4

1

5

1

4

1

5

1

4

1

13

1

4

1

13

1

4

1

9

1

4

1

5

1

4

1

5

1

4

1

7

1

4

1

6

1

4

1

7

1

4

1

5

1

4

1

5

1

4

1

13

1

4

1

19

1

4

1

7

1

4

1

22

1

4

1

1

1

4

1

5

1

4

1

4

1

4

1

6

1

4

1

7

1

4

1

8

1

5

1

11

1

5

1

2

1

5

1

4

1

5

1

8

1

5

1

10

1

5

1

5

1

5

1

4

1

5

1

3

1

5

1

9

1

5

1

2

1

5

1

6

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

9

1

5

1

4

1

5

1

9

1

5

1

4

1

5

1

5

1

5

1

7

1

5

1

5

1

5

1

8

1

5

1

2

1

5

1

12

1

5

1

1

1

5

1

5

1

5

1

4

1

5

1

6

1

4

1

6

1

4

1

2

1

4

1

5

1

4

1

4

1

4

1

3

1

4

1

4

1

4

1

9

1

4

1

6

1

4

1

2

1

4

1

4

1

4

1

8

1

4

1

2

1

4

1

12

1

4

1

5

1

4

1

5

1

4

1

6

1

4

1

7

1

4

1

10

1

4

1

7

1

4

1

8

1

4

1

8

1

4

1

1

1

4

1

5

1

4

1

4

1

4

1

6

1

5

1

2

1

5

1

13

1

5

1

6

1

5

1

8

1

5

1

6

1

5

1

2

1

5

1

6

1

5

1

5

1

5

1

7

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

9

1

5

1

12

1

5

1

3

4

1

5

1

8

1

5

1

7

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

22

1

5

1

5

1

5

1

7

1

5

1

15

1

5

1

5

1

5

1

9

1

5

1

13

1

5

1

10

1

5

1

7

1

5

1

5

1

5

1

8

1

5

1

2

1

5

1

14

1

5

1

12

1

5

1

5

1

5

1

1

1

5

1

3

1

5

1

4

1

5

1

12

1

5

1

15

1

5

1

7

1

5

1

6

1

5

1

4

1

5

1

2

1

5

1

11

1

5

1

14

1

5

1

4

1

5

1

9

1

5

1

5

1

5

1

6

1

5

1

4

1

5

1

4

1

5

1

2

1

5

1

5

1

5

1

5

1

5

1

8

1

5

1

5

1

5

1

3

1

5

1

9

1

5

1

8

1

5

1

5

1

5

1

5

1

5

1

4

1

5

1

5

1

5

1

13

1

5

1

13

1

5

1

8

1

5

1

2

1

5

1

12

1

5

1

1

1

5

1

14

1

5

1

8

1

5

1

3

1

5

1

2

1

5

1

10

1

5

1

8

1

5

1

4

1

5

1

4

1

5

1

5

1

5

1

13

1

5

1

8

1

5

1

4

1

5

1

3

1

5

1

6

1

5

1

5

1

5

1

2

1

5

1

10

1

5

1

14

1

5

1

8

1

5

1

5

1

5

1

8

1

5

1

38

1

5

1

13

1

5

1

5

1

5

1

7

1

5

1

6

1

5

1

5

1

5

1

7

1

5

1

6

1

5

1

10

1

5

1

3

1

5

1

2

1

5

1

3

1

5

1

12

1

5

1

1

1

5

1

5

1

5

1

4

1

5

1

5

1

5

1

4

1

5

1

9

1

5

1

4

1

5

1

7

1

In [None]:
# sentiment

In [None]:
! wget 'http://www.labinform.ru/pub/rusentilex/rusentilex_2017.txt'

In [None]:
tonal_lexic = []

with open('rusentilex_2017.txt', 'r', encoding='utf-8') as f:
    for lines in f:
        tonal_lexic.append(lines)

tonal_lexic = tonal_lexic[18:]

In [None]:
tonal_dict = {}
for item in tonal_lexic:
    i = item.split(',')
    tonal_dict[i[0]] = i[3]

In [None]:
train_reviews = {}
test_reviews = {}
with open('project_autobrea/data/train_reviews.txt', encoding = 'utf-8') as f:
    text = f.read()
    for ind, line in enumerate(text.splitlines()):
        num, text = line.split('\t')
        train_reviews[num] = text
with open('project_autobrea/data/dev_reviews.txt', encoding = 'utf-8') as f:
    text = f.read()
    for ind, line in enumerate(text.splitlines()):
        num, text = line.split('\t')
        test_reviews[num] = text  

In [None]:
! pip install nltk

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

import pymorphy2

morph = pymorphy2.MorphAnalyzer()


def preprocess(text):
    clean_words = [w.lower() for w in word_tokenize(text)]
    clean_words = [morph.parse(w)[0].normal_form for w in clean_words if w]

    clean_words = str(clean_words)
    clean_words = re.sub('[,\[\]\']', '', clean_words)

    return clean_words

In [None]:
import re

with open('out_aspectss.txt', 'r', encoding='utf-8') as f:
    aspects = []
    for lines in f:
        lines = re.sub('\\n', '', lines)
        aspects.append(lines)

In [None]:
def return_element_index(lst, search):
    idx = []
    for index, element in enumerate(lst):
        if element == search:
            idx.append(index)
    return idx

In [None]:
c = 0
for a in aspects:
    asps = a.split('\t')
    rev_id = asps[0]
    mention = asps[1]

    review = preprocess(train_reviews[rev_id])
  
    sent_token = nltk.tokenize.word_tokenize(review)
    asp_token = nltk.tokenize.word_tokenize(preprocess(mention))

    asp_start = asp_token[0]
    asp_end = asp_token[-1]

    start_idx = return_element_index(sent_token, asp_start)
    end_idx = return_element_index(sent_token, asp_end)

    sosedi = []

    if start_idx != [] or end_idx != []:
        if start_idx[0] - 3 >= 0:
            sosedi.append(sent_token[start_idx[0]-3])
            sosedi.append(sent_token[start_idx[0]-2])
            sosedi.append(sent_token[start_idx[0]-1])
        elif start_idx[0]-3 <0 and start_idx[0]-2 >= 0:
            sosedi.append(sent_token[start_idx[0]-2])
            sosedi.append(sent_token[start_idx[0]-1])
        elif start_idx[0]-2 <0 and start_idx[0]-1 >= 0:
            sosedi.append(sent_token[start_idx[0]-1])

        if end_idx[0] + 3 <= len(sent_token)-1:
            sosedi.append(sent_token[end_idx[0]+1])
            sosedi.append(sent_token[end_idx[0]+2])
            sosedi.append(sent_token[end_idx[0]+3])
        elif end_idx[0]+3 > len(sent_token)-1 and end_idx[0]+2 <= len(sent_token)-1:
            sosedi.append(sent_token[end_idx[0]+1])
            sosedi.append(sent_token[end_idx[0]+2])
        elif end_idx[0]+2 > len(sent_token)-1 and end_idx[0]+1 <= len(sent_token)-1:
            sosedi.append(sent_token[end_idx[0]+1])

        sum_senti = 0
        for sosed in sosedi:
            if sosed in tonal_dict:
                senti = tonal_dict[sosed]
                if senti == ' positive':
                    sum_senti += 1
                if senti == ' negative':
                    sum_senti -= 1

        if sum_senti > 0:
            aspect_sentiment = 'positive'
        elif sum_senti < 0:
            aspect_sentiment = 'negative'
        else:
            aspect_sentiment = 'neutral'
        
        

        asps.append(aspect_sentiment)

        
        if c == 0:
            with open('out_with_sentiment.txt', 'w+', encoding='utf-8') as f:
                pass

        else:
            with open('out_with_sentiment.txt', 'a', encoding='utf-8') as f:
                for i in range(len(asps)):
                    if i < len(asps)-1:
                        f.write(asps[i])
                        f.write('\t')
                    else:
                        f.write(asps[i])
                        f.write('\n')
        