In [1]:
# Kinopoisk review listings for film 447301, sorted by date, 75 reviews per
# page: one URL filters by status "good", the other by status "bad".
# NOTE(review): the `rnd` query parameter looks like a cache-buster copied from
# the browser — confirm it is not required.
positive_link = 'https://www.kinopoisk.ru/film/447301/reviews/ord/date/status/good/perpage/75/'
negative_link = 'https://www.kinopoisk.ru/film/447301/reviews/?status=bad&ord=date&rnd=1631876404&perpage=75'

In [2]:
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
import collections
from sklearn.metrics import accuracy_score

# Shared morphological analyzer; created once because it is reused for every
# word that gets lemmatized.
morph = MorphAnalyzer()

# Russian stop-word list from NLTK (the 'stopwords' corpus must be available
# locally); words found in it are discarded before lemmatization.
sw = stopwords.words('russian')

In [3]:
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Send a randomly picked browser User-Agent string with both requests.
ua = UserAgent(verify_ssl=False)
headers = {'User-Agent': ua.random}

pos_resp = requests.get(positive_link, headers=headers, timeout=REQUEST_TIMEOUT)
neg_resp = requests.get(negative_link, headers=headers, timeout=REQUEST_TIMEOUT)

# Fail here on an HTTP error status instead of silently parsing an error page
# into an empty review list further down.
pos_resp.raise_for_status()
neg_resp.raise_for_status()

In [4]:
# Parse both downloaded pages with Python's built-in HTML parser.
positive_soup, negative_soup = (
    BeautifulSoup(response.text, 'html.parser')
    for response in (pos_resp, neg_resp)
)

In [5]:
# Collect the <span itemprop="reviewBody"> elements from each parsed page.
pos_rev, neg_rev = (
    soup.find_all('span', attrs={'itemprop': 'reviewBody'})
    for soup in (positive_soup, negative_soup)
)

In [6]:
# Number of reviews per class used to build the vocabularies; everything after
# that is held out for evaluation.
TRAIN_SIZE = 70

pos_rev_train, pos_rev_test = pos_rev[:TRAIN_SIZE], pos_rev[TRAIN_SIZE:]
neg_rev_train, neg_rev_test = neg_rev[:TRAIN_SIZE], neg_rev[TRAIN_SIZE:]

# Slicing never raises, so a short or empty scrape would otherwise only
# surface later as a meaningless accuracy figure.
assert pos_rev_test and neg_rev_test, (
    "expected more than %d reviews per class, got %d positive / %d negative"
    % (TRAIN_SIZE, len(pos_rev), len(neg_rev))
)

In [7]:
def lemms(texts):
    """Turn a collection of reviews into one flat list of lemmas.

    Args:
        texts: iterable of objects exposing a ``.text`` string attribute
            (in this notebook, the elements returned by ``find_all``).

    Returns:
        list[str]: normal forms of all purely alphabetic, lower-cased tokens
        that are not in the module-level stop-word list ``sw``, in order of
        appearance across all texts (duplicates are kept).
    """
    result = []
    for document in texts:
        for token in word_tokenize(document.text):
            # Drop punctuation, numbers and mixed tokens before lower-casing.
            if not token.isalpha():
                continue
            lowered = token.lower()
            if lowered in sw:
                continue
            # Take the first (top-ranked) parse returned by the analyzer.
            result.append(morph.parse(lowered)[0].normal_form)
    return result

In [8]:
def freq(words_list):
    """Count how many times each word occurs.

    Args:
        words_list: iterable of hashable items (lemmas in this notebook).

    Returns:
        collections.Counter: mapping of item -> number of occurrences.
    """
    # Counter tallies an iterable directly. Feeding it a generator keeps the
    # "count each yielded element once" semantics even if a mapping is passed.
    return collections.Counter(word for word in words_list)

In [9]:
# Lemmatize each training set, then count lemma frequencies per class.
pos_counter, neg_counter = map(freq, map(lemms, (pos_rev_train, neg_rev_train)))

In [12]:
# Keep only class-specific vocabulary: lemmas seen in both training sets are
# left out of new_pos_count and removed from neg_counter.
# NOTE: neg_counter is modified in place, so re-running this cell without first
# rebuilding the counters produces a different new_pos_count.
shared_words = pos_counter.keys() & neg_counter.keys()
new_pos_count = {
    word: count
    for word, count in pos_counter.items()
    if word not in shared_words
}
for word in shared_words:
    del neg_counter[word]

In [13]:
def review(text, pos_dict, neg_dict):
    """Label reviews as 'positive' or 'negative' by vocabulary hits.

    Args:
        text: iterable of review objects, forwarded unchanged to ``lemms``.
        pos_dict: container of lemmas counted as positive evidence.
        neg_dict: container of lemmas counted as negative evidence.

    Returns:
        str: 'positive' when strictly more lemmas hit ``pos_dict`` than
        ``neg_dict``; 'negative' otherwise, ties included.
    """
    lemmas = lemms(text)
    pos_hits = sum(1 for lemma in lemmas if lemma in pos_dict)
    # A lemma found in pos_dict is never counted as negative as well.
    neg_hits = sum(
        1 for lemma in lemmas
        if lemma not in pos_dict and lemma in neg_dict
    )
    return 'positive' if pos_hits > neg_hits else 'negative'

In [28]:
def test(pos_revs, neg_revs):
    """Classify held-out reviews and print the resulting accuracy.

    Args:
        pos_revs: iterable of reviews known to be positive; each one is passed
            to ``review`` wrapped in a one-element list.
        neg_revs: iterable of reviews known to be negative, handled the same.

    Uses the notebook-level ``new_pos_count`` and ``neg_counter`` vocabularies.
    Prints ``Accuracy: <value>`` with four decimals; returns nothing.
    """
    expected = []
    predicted = []
    for label, revs in (('positive', pos_revs), ('negative', neg_revs)):
        for text in revs:
            predicted.append(review([text], new_pos_count, neg_counter))
            expected.append(label)
    # accuracy_score takes the ground-truth labels first, then the predictions.
    print("Accuracy: %.4f" % accuracy_score(expected, predicted))

In [29]:
# Evaluate on the reviews that were held out from vocabulary building.
test(pos_rev_test, neg_rev_test)

Accuracy: 0.9000
