# Imports

In [47]:
import json
import math
import re
import time
from datetime import datetime
from string import punctuation, whitespace

import emoji
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from natasha import (PER, Doc, MorphVocab, NamesExtractor, NewsEmbedding,
                     NewsMorphTagger, NewsNERTagger, NewsSyntaxParser,
                     Segmenter)
from nltk.corpus import stopwords
from selenium import webdriver

In [48]:
stopwords_ru = stopwords.words("russian")
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

# Parsing

In [49]:
with open("./data/last_parsed_html.txt", "r+", encoding="UTF8") as f:
    html = "".join(f.readlines())
    soup = BeautifulSoup(html)

In [50]:
reviews = soup.findAll("li", class_="comments__item feedback j-feedback-slide")

In [51]:
res = []
for review in reviews:
    text = review.find("p", class_="feedback__text")
    title_element = review.find("span", class_=lambda value: value and value.startswith("feedback__rating stars-line"))
    res.append({"full_text": text.text, "rating": title_element["class"][-1][-1]})
soup.decompose()

In [52]:
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', 10)
df = pd.DataFrame.from_dict(res)
df.head(10)

Unnamed: 0,full_text,rating
0,–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫. –ü—Ä–∏—à–ª–æ –≤—Å—ë —Ü–µ–ª–æ–µ. –°–ø–∞—Å–∏–±–æ –∑–∞ ...,5
1,–ö–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã–π –≥–æ—Ä—à–æ–∫,5
2,"–£–¥–æ–±–Ω—ã–µ —Å—Ç–∏–ª—å–Ω—ã–µ –≥–æ—Ä—à–∫–∏ , –ø–æ–∫—É–ø–∞—é –Ω–µ –ø–µ—Ä–≤—ã–π —Ä–∞–∑ .",5
3,–ö–ª–∞—Å—Å–Ω—ã–π –≥–æ—Ä—à–æ–∫ –¥–ª—è —Ü–≤–µ—Ç–æ–≤ –∑–∞–∫–∞–∑—ã–≤–∞—é —É–∂–µ —Ç—Ä–µ—Ç–∏...,5
4,"–°–ø–∞—Å–∏–±–æ,–≥–æ—Ä—à–∫–∏ –∫—Ä–∞—Å–∏–≤—ã–µ,–Ω–æ –Ω–µ–º–Ω–æ–≥–æ –æ–±–∏–¥–Ω–æ,—á—Ç–æ ...",5
5,"–í—Å–µ –∑–∞–º–µ—á–∞—Ç–µ–ª—å–Ω–æ, —Ö–æ—Ä–æ—à–æ —É–ø–∞–∫–æ–≤–∞–Ω –∏ –æ—á–µ–Ω—å –ø—Ä–∏—è...",5
6,"–û—á–µ–Ω—å –¥–æ–≤–æ–ª—å–Ω–∞ –≥–æ—Ä—à–∫–∞–º–∏,–∏–Ω–¥–∏–∫–∞—Ç–æ—Ä —Ä–∞–±–æ—Ç–∞–µ—Ç –æ—Ç–ª...",5
7,"–ì–æ—Ä—à–æ–∫ –±–æ–ª—å—à–æ–π, –≤—Å–µ –ø—Ä–∏—à–ª–æ —Ü–µ–ª–æ–µ!",5
8,–û–ø–∏—Å–∞–Ω–∏–µ –∫—Ä–∞–π–Ω–µ –Ω–µ –≤–µ—Ä–Ω–æ–µ. –ì–æ—Ä—à–æ–∫ –Ω–∞–ø–æ–ª—å–Ω—ã–π. –û...,2
9,üëçüëçüëçüëçüëç,5


In [53]:
df.describe()

Unnamed: 0,full_text,rating
count,698,698
unique,687,5
top,–•–æ—Ä–æ—à–∏–π –≥–æ—Ä—à–æ–∫,5
freq,5,547


In [54]:
df

Unnamed: 0,full_text,rating
0,–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫. –ü—Ä–∏—à–ª–æ –≤—Å—ë —Ü–µ–ª–æ–µ. –°–ø–∞—Å–∏–±–æ –∑–∞ ...,5
1,–ö–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã–π –≥–æ—Ä—à–æ–∫,5
2,"–£–¥–æ–±–Ω—ã–µ —Å—Ç–∏–ª—å–Ω—ã–µ –≥–æ—Ä—à–∫–∏ , –ø–æ–∫—É–ø–∞—é –Ω–µ –ø–µ—Ä–≤—ã–π —Ä–∞–∑ .",5
3,–ö–ª–∞—Å—Å–Ω—ã–π –≥–æ—Ä—à–æ–∫ –¥–ª—è —Ü–≤–µ—Ç–æ–≤ –∑–∞–∫–∞–∑—ã–≤–∞—é —É–∂–µ —Ç—Ä–µ—Ç–∏...,5
4,"–°–ø–∞—Å–∏–±–æ,–≥–æ—Ä—à–∫–∏ –∫—Ä–∞—Å–∏–≤—ã–µ,–Ω–æ –Ω–µ–º–Ω–æ–≥–æ –æ–±–∏–¥–Ω–æ,—á—Ç–æ ...",5
...,...,...
693,–î–æ–±—Ä–æ–≥–æ –¥–Ω—è! –í–º–µ—Å—Ç–æ –≥–æ—Ä—à–∫–∞ –Ω–∞ 9–ª. –ø—Ä–∏—Å–ª–∞–ª–∏ –Ω–∞ ...,5
694,–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫. –£–¥–æ–±–Ω–∞—è —Å–∏—Å—Ç–µ–º–∞ –ø–æ–ª–∏–≤–∞. –í—ã–≥–ª—è...,5
695,"–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫, —Ç–µ–ø–µ—Ä—å —Ö–æ—á—É —Å–æ–±—Ä–∞—Ç—å –≤—Å—é –∫–æ–ª–ª–µ...",5
696,"–•–æ—Ä–æ—à–∏–π –≥–æ—Ä—à–æ–∫, –Ω—É–∂–Ω–æ–≥–æ –º–Ω–µ —Ä–∞–∑–º–µ—Ä–∞. –ü—Ä–∞–≤–∏–ª—å–Ω–∞...",5


# Preprocessing

## Emoji

In [55]:
remove_list = list(emoji.EMOJI_DATA.keys()) + list("0123456789")

df["full_text"] = df["full_text"].map(
    lambda x: "".join(c for c in x if c not in remove_list)
)
# extract_emojis("üëåüíö–≤—Å–µ –ø–æ–Ω—Ä–∞–≤–∏–ª–æ—Å—å , –∑–∞–∫–∞–∂—É –µ—â–µ –æ–¥–∏–Ω")
df = df.drop(df[df["full_text"].map(lambda x: x == "")].index, axis=0)
df.head(10)

Unnamed: 0,full_text,rating
0,–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫. –ü—Ä–∏—à–ª–æ –≤—Å—ë —Ü–µ–ª–æ–µ. –°–ø–∞—Å–∏–±–æ –∑–∞ ...,5
1,–ö–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã–π –≥–æ—Ä—à–æ–∫,5
2,"–£–¥–æ–±–Ω—ã–µ —Å—Ç–∏–ª—å–Ω—ã–µ –≥–æ—Ä—à–∫–∏ , –ø–æ–∫—É–ø–∞—é –Ω–µ –ø–µ—Ä–≤—ã–π —Ä–∞–∑ .",5
3,–ö–ª–∞—Å—Å–Ω—ã–π –≥–æ—Ä—à–æ–∫ –¥–ª—è —Ü–≤–µ—Ç–æ–≤ –∑–∞–∫–∞–∑—ã–≤–∞—é —É–∂–µ —Ç—Ä–µ—Ç–∏...,5
4,"–°–ø–∞—Å–∏–±–æ,–≥–æ—Ä—à–∫–∏ –∫—Ä–∞—Å–∏–≤—ã–µ,–Ω–æ –Ω–µ–º–Ω–æ–≥–æ –æ–±–∏–¥–Ω–æ,—á—Ç–æ ...",5
5,"–í—Å–µ –∑–∞–º–µ—á–∞—Ç–µ–ª—å–Ω–æ, —Ö–æ—Ä–æ—à–æ —É–ø–∞–∫–æ–≤–∞–Ω –∏ –æ—á–µ–Ω—å –ø—Ä–∏—è...",5
6,"–û—á–µ–Ω—å –¥–æ–≤–æ–ª—å–Ω–∞ –≥–æ—Ä—à–∫–∞–º–∏,–∏–Ω–¥–∏–∫–∞—Ç–æ—Ä —Ä–∞–±–æ—Ç–∞–µ—Ç –æ—Ç–ª...",5
7,"–ì–æ—Ä—à–æ–∫ –±–æ–ª—å—à–æ–π, –≤—Å–µ –ø—Ä–∏—à–ª–æ —Ü–µ–ª–æ–µ!",5
8,–û–ø–∏—Å–∞–Ω–∏–µ –∫—Ä–∞–π–Ω–µ –Ω–µ –≤–µ—Ä–Ω–æ–µ. –ì–æ—Ä—à–æ–∫ –Ω–∞–ø–æ–ª—å–Ω—ã–π. –û...,2
10,–û—á–µ–Ω—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–π –≥–æ—Ä—à–æ—á–µ–∫. –£–¥–æ–±–Ω–∞—è –∫–æ–ª–±–∞ –¥–ª—è –∫...,5


# Speller

In [56]:
df['full_text_Count'] = df['full_text'].str.len()
df['full_text_Count'].describe()

count    696.000000
mean     112.466954
std      118.909150
min       10.000000
25%       38.750000
50%       78.000000
75%      140.250000
max      980.000000
Name: full_text_Count, dtype: float64

In [57]:
df.iloc[3]["full_text"]
df[df['full_text_Count'] >= df['full_text_Count'].describe()["75%"]]

Unnamed: 0,full_text,rating,full_text_Count
3,–ö–ª–∞—Å—Å–Ω—ã–π –≥–æ—Ä—à–æ–∫ –¥–ª—è —Ü–≤–µ—Ç–æ–≤ –∑–∞–∫–∞–∑—ã–≤–∞—é —É–∂–µ —Ç—Ä–µ—Ç–∏...,5,220
8,–û–ø–∏—Å–∞–Ω–∏–µ –∫—Ä–∞–π–Ω–µ –Ω–µ –≤–µ—Ä–Ω–æ–µ. –ì–æ—Ä—à–æ–∫ –Ω–∞–ø–æ–ª—å–Ω—ã–π. –û...,2,191
10,–û—á–µ–Ω—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–π –≥–æ—Ä—à–æ—á–µ–∫. –£–¥–æ–±–Ω–∞—è –∫–æ–ª–±–∞ –¥–ª—è –∫...,5,244
11,–ì–æ—Ä—à–æ–∫ –æ—á–µ–Ω—å —Å–∏–º–ø–∞—Ç–∏—á–Ω—ã–π. –ù–∞ –≤–∏–¥ –≥–æ—Ä—à–æ–∫ –±–æ–ª—å—à–æ...,5,377
12,–û—Ç–ª–∏—á–Ω–æ–µ –∫–∞—à–ø–æ!!! –í—Å—ë –≤ –∫–æ–º–ø–ª–µ–∫—Ç–µ!!! –ò–¥–µ–∞–ª—å–Ω–æ ...,5,271
...,...,...,...
678,"–í–æ-–ø–µ—Ä–≤—ã—Ö –≥–æ—Ä—à–æ–∫ —Å–æ—Å—Ç–æ–∏—Ç –∏–∑ –¥–≤—É—Ö, –∫–æ–≥–¥–∞ –±—Ä–∞–ª –¥...",3,296
686,–ì–æ—Ä—à–æ—á–µ–∫ –æ—á–µ–Ω—å –ø–æ–Ω—Ä–∞–≤–∏–ª—Å—è!!! –ö —Å–µ–∑–æ–Ω—É –æ—Ç–ø—É—Å–∫–æ–≤...,5,171
691,"–ò–∑ —Ç—Ä–µ—Ö –≥–æ—Ä—à–∫–æ–≤ –æ–±—ä—ë–º–æ–º –ª–∏—Ç—Ä–æ–≤,–æ–¥–∏–Ω –±—ã–ª —Å –¥–∏—Ñ...",5,220
695,"–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫, —Ç–µ–ø–µ—Ä—å —Ö–æ—á—É —Å–æ–±—Ä–∞—Ç—å –≤—Å—é –∫–æ–ª–ª–µ...",5,149


In [58]:
def speller_api(texts, df, first_index, last_index):
    payload = {'text': texts, 'options': 526} 
    try: 
        r = requests.post('https://speller.yandex.net/services/spellservice.json/checkTexts?', data=payload)
        r.encoding = 'utf-8'
        res = r.json()
        print(res)
        for i in range(len(texts)):
            df_i = i + first_index
            # –î–æ–±–∞–≤–∏—Ç—å —á–∏—Å–ª–æ —Å –∫–æ—Ç–æ—Ä–æ–≥–æ –Ω–∞—á–∏–Ω–∞–µ—Ç—Å—è –æ—Ç—Å—á–µ—Ç
            for mistake_ind in range(len(res[i])):
                suggestion = res[i][mistake_ind]
                if suggestion["s"]:
                    print(df.iloc[df_i]["full_text"])
                    df.iloc[df_i]["full_text"] = (
                        df["full_text"].iloc[df_i][: suggestion["pos"]]
                        + suggestion["s"][0]
                        + df["full_text"].iloc[df_i][suggestion["pos"] + suggestion["len"] :]
                    )
                    print(df.iloc[df_i]["full_text"])
    except Exception as e:
        print(e)
        print(r.text)

In [59]:
first_index = 0
max_sum = 10000
last_index = 0
for last_index, row in df.iterrows():
    if df.iloc[first_index:last_index]["full_text_Count"].sum() >= max_sum:
        texts = df.iloc[first_index:last_index]["full_text"]
        # print(df.iloc[first_index:last_index]["full_text_Count"].sum(), len(texts))
        speller_api(texts, df, first_index, last_index)
        first_index = last_index
texts = df.iloc[first_index:last_index + 1]["full_text"]
speller_api(texts, df, first_index, last_index)

[[], [], [], [], [], [{'code': 1, 'pos': 58, 'row': 0, 'col': 58, 'len': 11, 'word': '–æ–±–Ω–∞—Ä—É–∂–∏–ª –∞', 's': ['–æ–±–Ω–∞—Ä—É–∂–∏–ª–∞']}], [], [], [{'code': 1, 'pos': 16, 'row': 0, 'col': 16, 'len': 9, 'word': '–Ω–µ –≤–µ—Ä–Ω–æ–µ', 's': ['–Ω–µ–≤–µ—Ä–Ω–æ–µ', '–Ω–µ –≤–µ—Ä–Ω–æ–µ']}], [], [{'code': 1, 'pos': 71, 'row': 0, 'col': 71, 'len': 10, 'word': '–Ω–µ –±–æ–ª—å—à–æ–π', 's': ['–Ω–µ–±–æ–ª—å—à–æ–π']}, {'code': 1, 'pos': 199, 'row': 0, 'col': 199, 'len': 6, 'word': '–Ω–µ—á–µ–≥–æ', 's': ['–Ω–∏—á–µ–≥–æ']}], [{'code': 1, 'pos': 140, 'row': 0, 'col': 140, 'len': 5, 'word': '–±–∞–ª–æ–≤', 's': ['–±–∞–ª–ª–æ–≤']}, {'code': 1, 'pos': 236, 'row': 0, 'col': 236, 'len': 7, 'word': '–í–æ–æ–±—â–µ–º', 's': ['–≤ –æ–±—â–µ–º']}], [], [], [], [], [], [], [], [], [{'code': 1, 'pos': 120, 'row': 0, 'col': 120, 'len': 10, 'word': '—Ä–µ–∑–µ—Ä–≤–∞—É—Ä–∞', 's': ['—Ä–µ–∑–µ—Ä–≤—É–∞—Ä–∞']}, {'code': 1, 'pos': 178, 'row': 0, 'col': 178, 'len': 10, 'word': '–Ω–∞ —Å–∫–æ–ª—å–∫–æ', 's': ['–Ω–∞—Å–∫–æ–ª—å–∫–æ', '–Ω–∞ —Å

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[df_i]["full_text"] = (


[[{'code': 1, 'pos': 77, 'row': 1, 'col': 7, 'len': 10, 'word': '—É–ø–∞–∫–æ–≤–∞–Ω—ã–π', 's': ['—É–ø–∞–∫–æ–≤–∞–Ω–Ω—ã–π']}], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [{'code': 1, 'pos': 13, 'row': 0, 'col': 13, 'len': 10, 'word': '–Ω–µ –±–æ–ª—å—à–æ–π', 's': ['–Ω–µ–±–æ–ª—å—à–æ–π', '–Ω–µ –±–æ–ª—å—à–æ–π']}], [{'code': 1, 'pos': 166, 'row': 0, 'col': 166, 'len': 9, 'word': '–ü—Ä–µ–æ–±—Ä–µ–ª–∞', 's': ['–ü—Ä–∏–æ–±—Ä–µ–ª–∞']}], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [{'code': 1, 'pos': 232, 'row': 0, 'col': 232, 'len': 12, 'word': '—Å–∞–º–æ–ø–æ–ª–∏–≤–Ω—ã–π', 's': ['—Å–∞–º–æ–ø–∞–ª–∏–≤–Ω—ã–π']}, {'code': 1, 'pos': 245, 'row': 0, 'col': 245, 'len': 7, 'word': '–≥–æ—Ä–æ—á–µ–∫', 's': ['–≥–æ—Ä—à–æ—á–µ–∫']}], [], [], [], [], [], [], [], [], [], [], [], [], [{'code': 1, 'pos': 296, 'row': 0, 'col': 296, 'len': 6, 'word': '–Ω–µ–¥–µ–ª–∏', 's': ['–Ω–µ–¥–µ–ª—é', '–Ω–µ–¥–µ–ª–∏']}], [{'code': 1, 'pos': 122, 'row': 0, 'col': 122, 'len

In [60]:
payload = {'text': df["full_text"], 'options': 526} 
try: 
    # –ú–∞–∫—Å–∏–º—É–º 10000 —Å–∏–º–≤–æ–ª–æ–≤ –∑–∞ —Ä–∞–∑. –û–±—Ä–∞–±–æ—Ç–∫–∏ –¥–ª—è —ç—Ç–æ–≥–æ –Ω–µ—Ç, –µ—ë, –ø–æ —Ö–æ—Ä–æ—à–µ–º—É, –Ω–∞–¥–æ —Å–¥–µ–ª–∞—Ç—å
    r = requests.post('https://speller.yandex.net/services/spellservice.json/checkTexts?', data=payload)
    r.encoding = 'utf-8'
    res = r.json()
    print(res)
except Exception as e:
    print(e)
    print(r.text)

[[], [], [], [], [], [{'code': 1, 'pos': 58, 'row': 0, 'col': 58, 'len': 11, 'word': '–æ–±–Ω–∞—Ä—É–∂–∏–ª –∞', 's': ['–æ–±–Ω–∞—Ä—É–∂–∏–ª–∞']}], [], [], [{'code': 1, 'pos': 16, 'row': 0, 'col': 16, 'len': 9, 'word': '–Ω–µ –≤–µ—Ä–Ω–æ–µ', 's': ['–Ω–µ–≤–µ—Ä–Ω–æ–µ', '–Ω–µ –≤–µ—Ä–Ω–æ–µ']}], [], [{'code': 1, 'pos': 71, 'row': 0, 'col': 71, 'len': 10, 'word': '–Ω–µ –±–æ–ª—å—à–æ–π', 's': ['–Ω–µ–±–æ–ª—å—à–æ–π']}, {'code': 1, 'pos': 199, 'row': 0, 'col': 199, 'len': 6, 'word': '–Ω–µ—á–µ–≥–æ', 's': ['–Ω–∏—á–µ–≥–æ']}], [{'code': 1, 'pos': 140, 'row': 0, 'col': 140, 'len': 5, 'word': '–±–∞–ª–æ–≤', 's': ['–±–∞–ª–ª–æ–≤']}, {'code': 1, 'pos': 236, 'row': 0, 'col': 236, 'len': 7, 'word': '–í–æ–æ–±—â–µ–º', 's': ['–≤ –æ–±—â–µ–º']}], [], [], [], [], [], [], [], [], [{'code': 1, 'pos': 120, 'row': 0, 'col': 120, 'len': 10, 'word': '—Ä–µ–∑–µ—Ä–≤–∞—É—Ä–∞', 's': ['—Ä–µ–∑–µ—Ä–≤—É–∞—Ä–∞']}, {'code': 1, 'pos': 178, 'row': 0, 'col': 178, 'len': 10, 'word': '–Ω–∞ —Å–∫–æ–ª—å–∫–æ', 's': ['–Ω–∞—Å–∫–æ–ª—å–∫–æ', '–Ω–∞ —Å

In [None]:
# –ù–µ –∫–æ–ø–∏—Ä—É–µ—Ç—Å—è –¥–∞—Ç–∞—Ñ—Ä–µ–π–º, —Ç–æ—á–Ω–µ–µ –Ω–µ –≤—Å—Ç–∞–≤–ª—è—é—Ç—Å—è –∑–Ω–∞—á–µ–Ω–∏—è –≤ "–∫–æ–ø–∏—é"
# df.iloc[0, "full_text"] = "123"
for i in range(len(df["full_text"])):
    for mistake_ind in range(len(res[i])):
        suggestion = res[i][mistake_ind]
        if suggestion["s"]:
            df.iloc[i, 'full_text'] = (
                df["full_text"].iloc[i][: suggestion["pos"]]
                + suggestion["s"][0]
                + df["full_text"].iloc[i][suggestion["pos"] + suggestion["len"] :]
            )

df


In [121]:
df.iloc[50]

full_text          –†–µ–±—è—Ç, –≤—ã —Ç–∞–∫ –∏ –ø–∏—à–∏—Ç–µ, —á—Ç–æ –≥–æ—Ä—à–æ–∫  –ª, –∞ –Ω–µ . ...
rating                                                             1
full_text_Count                                                  108
Name: 51, dtype: object

# Natasha

In [123]:
def get_syntax(df, row, text_cell = "text"):
    res = []
    
    def recursive(df, row, res, text_cell):
        edges = df[df["head_id"]==row["id"]]
        if edges.empty:
            if re.search(fr"[{punctuation}]", row[text_cell]):
                return
            res.append(row[text_cell])
            return
        
        isPrinted = False
        for index, edge in edges.iterrows():
            if edge["text"] == "(":
                k = 0
            if edge["rel"] == "conj":
                continue
            if text_cell == "lemma":
                if edge["text"] in stopwords_ru or re.search(fr"\d|[{punctuation}]", edge["text"]):
                    continue
            if int(edge["id"].split("_")[1]) > int(row["id"].split("_")[1]) and not isPrinted:
                res.append(row[text_cell])
                isPrinted = True
            recursive(df, edge, res, text_cell)
        if not isPrinted:
            res.append(row[text_cell])
            isPrinted = True
    
    recursive(df,row, res, text_cell)
    if text_cell == "text":
        res = re.sub(r'\s([?.!,;:"](?:\s|$))', r'\1', " ".join(res))
    return res

In [124]:
pd.set_option('display.max_rows', 10)

In [125]:
resarr = []
# try:
for df_item in df.to_numpy():
    res = df_item[df.columns.get_indexer(["text"])[0]]
    # res = df_item[0]
    doc = Doc(res)  # Doc(res.text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    data = doc.tokens
    columns = list(doc.tokens[0].as_json.keys())
    df_natasha = pd.DataFrame(data=data, columns=columns)

    df_natasha2 = df_natasha[
        (df_natasha["rel"] == "root") | (df_natasha["rel"] == "conj")
    ]
    # print(df_natasha)
    # print(df_natasha2)
    if df_natasha2.empty:
        df_natasha2 = df_natasha[df_natasha["id"] == df_natasha["head_id"]]
        if df_natasha2.empty:
            # print(df_natasha)
            # –ï—Å–ª–∏ –Ω–∞—Ç–∞—à–∞ –Ω–µ —Å–º–æ–≥–ª–∞ –æ–ø—Ä–µ–¥–µ–ª–∏—Ç—å –ø–æ–¥–ª–µ–∂–∞—â–µ–µ, —Ç–æ –æ–±—ã—á–Ω–æ –æ—Ç–∑—ã–≤ –±–µ—Å—Å–º—ã—Å–ª–µ–Ω–µ–Ω
            continue
        df_natasha.loc[df_natasha2.index, ["head_id"]] = df_natasha2["head_id"].replace(
            to_replace=r"^(\d+_)(\d+)$", value=r"\g<1>" + "0", regex=True
        )
        df_natasha2 = df_natasha.loc[df_natasha2.index]
        # print(df_natasha2)
        df_natasha["syntax"] = df_natasha2.apply(
            lambda row: get_syntax(df_natasha, row), axis=1
        )
        df_natasha["syntax_lemmas"] = df_natasha2.apply(
            lambda row: get_syntax(df_natasha, row, text_cell="lemma"), axis=1
        )
        # print(df_natasha)
        df_natasha2 = df_natasha.loc[df_natasha2.index]
    else:
        df_natasha["syntax"] = df_natasha2.apply(
            lambda row: get_syntax(df_natasha, row), axis=1
        )
        df_natasha["syntax_lemmas"] = df_natasha2.apply(
            lambda row: get_syntax(df_natasha, row, text_cell="lemma"), axis=1
        )
        df_natasha2 = df_natasha[
            (df_natasha["rel"] == "root") | (df_natasha["rel"] == "conj")
        ]

    texts = df_natasha2["syntax"].values
    lemmas = df_natasha2["syntax_lemmas"].values
    for i in range(len(texts)):
        if len(texts[i]) < 2 or len(lemmas[i]) == 0:
            continue
        resarr.append(
            {
                "text": texts[i][0].upper() + texts[i][1:],
                "full_text": res,
                "class": 0,
                "lemmas": lemmas[i],
                "rating": df_item[1],
            }
        )
# except Exception as e:
#     print(e)
df_res = pd.DataFrame(resarr)
df_res


Unnamed: 0,text,full_text,class,lemmas,rating
0,–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫,–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫. –ü—Ä–∏—à–ª–æ –≤—Å—ë —Ü–µ–ª–æ–µ. –°–ø–∞—Å–∏–±–æ –∑–∞ ...,0,"[–æ—Ç–ª–∏—á–Ω—ã–π, –≥–æ—Ä—à–æ–∫]",5
1,–ü—Ä–∏—à–ª–æ –≤—Å—ë —Ü–µ–ª–æ–µ,–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫. –ü—Ä–∏—à–ª–æ –≤—Å—ë —Ü–µ–ª–æ–µ. –°–ø–∞—Å–∏–±–æ –∑–∞ ...,0,"[–ø—Ä–∏–π—Ç–∏, –≤–µ—Å—å, —Ü–µ–ª—ã–π]",5
2,–°–ø–∞—Å–∏–±–æ –∑–∞ –ø–æ–¥–∞—Ä–æ—á–µ–∫-—É–¥–æ–±—Ä–µ–Ω–∏–µ,–û—Ç–ª–∏—á–Ω—ã–π –≥–æ—Ä—à–æ–∫. –ü—Ä–∏—à–ª–æ –≤—Å—ë —Ü–µ–ª–æ–µ. –°–ø–∞—Å–∏–±–æ –∑–∞ ...,0,[—Å–ø–∞—Å–∏–±–æ],5
3,–ö–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã–π –≥–æ—Ä—à–æ–∫,–ö–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã–π –≥–æ—Ä—à–æ–∫,0,"[–∫–∞—á–µ—Å—Ç–≤–µ–Ω–Ω—ã–π, –≥–æ—Ä—à–æ–∫]",5
4,–£–¥–æ–±–Ω—ã–µ —Å—Ç–∏–ª—å–Ω—ã–µ –≥–æ—Ä—à–∫–∏ –ø–æ–∫—É–ø–∞—é –Ω–µ –ø–µ—Ä–≤—ã–π —Ä–∞–∑,"–£–¥–æ–±–Ω—ã–µ —Å—Ç–∏–ª—å–Ω—ã–µ –≥–æ—Ä—à–∫–∏ , –ø–æ–∫—É–ø–∞—é –Ω–µ –ø–µ—Ä–≤—ã–π —Ä–∞–∑ .",0,"[—É–¥–æ–±–Ω—ã–π, —Å—Ç–∏–ª—å–Ω—ã–π, –≥–æ—Ä—à–æ–∫, –ø–æ–∫—É–ø–∞—Ç—å]",5
...,...,...,...,...,...
2602,–ù–æ —Å –∞–≤—Ç–æ–ø–æ–ª–∏–≤–æ–º –∑–∞–∫–∞–∑–∞–ª–∞ –ø–µ—Ä–≤—ã–π —Ä–∞–∑ –∞,"–û—á–µ–Ω—å –ø–æ–Ω—Ä–∞–≤–∏–ª—Å—è –≥–æ—Ä—à–æ–∫, –Ω–æ —Å –∞–≤—Ç–æ–ø–æ–ª–∏–≤–æ–º –∑–∞–∫–∞...",0,"[–∞–≤—Ç–æ–ø–æ–ª–∏–≤, –∑–∞–∫–∞–∑–∞—Ç—å]",5
2603,–ü–æ—ç—Ç–æ–º—É –Ω–µ —É—á–ª–∞ –ª–∏—Ç—Ä–æ–≤ —á—Ç–æ –≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π –Ω–∞ –≥–¥–µ —Ç...,"–û—á–µ–Ω—å –ø–æ–Ω—Ä–∞–≤–∏–ª—Å—è –≥–æ—Ä—à–æ–∫, –Ω–æ —Å –∞–≤—Ç–æ–ø–æ–ª–∏–≤–æ–º –∑–∞–∫–∞...",0,"[–ø–æ—ç—Ç–æ–º—É, —É—á–µ—Å—Ç—å, –ª–∏—Ç—Ä, –≤–Ω—É—Ç—Ä–µ–Ω–Ω–∏–π, —ç—Ç–æ, –º–∞–ª–µ–Ω...",5
2604,–≠—Ç–æ –æ–±—ä—ë–º –≤–Ω–µ—à–Ω–µ–≥–æ –≥–æ—Ä—à–∫–∞,"–û—á–µ–Ω—å –ø–æ–Ω—Ä–∞–≤–∏–ª—Å—è –≥–æ—Ä—à–æ–∫, –Ω–æ —Å –∞–≤—Ç–æ–ø–æ–ª–∏–≤–æ–º –∑–∞–∫–∞...",0,"[—ç—Ç–æ, –æ–±—ä–µ–º, –≤–Ω–µ—à–Ω–∏–π, –≥–æ—Ä—à–æ–∫]",5
2605,–õ–∏—Ç—Ä–∞,"–û—á–µ–Ω—å –ø–æ–Ω—Ä–∞–≤–∏–ª—Å—è –≥–æ—Ä—à–æ–∫, –Ω–æ —Å –∞–≤—Ç–æ–ø–æ–ª–∏–≤–æ–º –∑–∞–∫–∞...",0,[–ª–∏—Ç—Ä],5


# Save

In [126]:
dt = datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')
file_path = f"./data/unprepared_{dt}.xlsx"
df_res.to_excel(file_path, encoding="UTF-8")