### Description of method
- This notebook calculates normalized frequencies of each word from train sample.
    - Norlamized frequency = `count(word) / count(all_words)`
    - It is calculated separately for positive and negative sentiments.
- Then it evaluates sentiment of test sample texts
    - eval = 

#### Installing requirements

In [65]:
!pip install -r reqirements.txt



### Parsing reviews
We parse hosting reviews from https://hosting101.ru

In [66]:
# crawler is a local module that parses target website
import crawler
import random
import pandas as pd
from tqdm import tqdm

#### Checking if we alreade have parsed data

In [67]:
from pathlib import Path
DATA_CSV = Path('./data/reviews.csv')
SKIP_PARSING = DATA_CSV.is_file()

In [68]:
if SKIP_PARSING:
    print(f'Parsing is skipped since parsed file already exists. Delete {DATA_CSV} if you want to parse new data.')
else:
    hosts = crawler.get_hosts(500)
    print(hosts)

['https://hosting101.ru/amhost.net-shared', 'https://hosting101.ru/fighthost.ru', 'https://hosting101.ru/netangels.ru', 'https://hosting101.ru/anti-ddos.pro', 'https://hosting101.ru/tld24.ru', 'https://hosting101.ru/host-protection.com', 'https://hosting101.ru/docker.ru', 'https://hosting101.ru/hozv.ru-shared', 'https://hosting101.ru/work-hosting.ru', 'https://hosting101.ru/hostia.ru', 'https://hosting101.ru/hostlife.net', 'https://hosting101.ru/hoster.kg', 'https://hosting101.ru/ruhoster.com', 'https://hosting101.ru/trust-host.ru', 'https://hosting101.ru/spaceweb.ru', 'https://hosting101.ru/mgnhost.ru', 'https://hosting101.ru/apdate.by', 'https://hosting101.ru/ruelit.com', 'https://hosting101.ru/platforma.ru', 'https://hosting101.ru/mousedc.ru', 'https://hosting101.ru/zomro.com', 'https://hosting101.ru/1gb.ua', 'https://hosting101.ru/1host.by', 'https://hosting101.ru/teli.ru', 'https://hosting101.ru/thejethost.com', 'https://hosting101.ru/uadomen.com', 'https://hosting101.ru/yours-hos

In [69]:
if SKIP_PARSING:
    print(f'Parsing is skipped since parsed file already exists. Delete {DATA_CSV} if you want to parse new data.')
else:
    reviews = {
        'pos': [],
        'neg': []
    }
    for h in tqdm(hosts, desc='Hosts'):
        reviews['pos'] += crawler.parse_host(h, positive=True, delay=(0.5, 1))
        reviews['neg'] += crawler.parse_host(h, positive=False, delay=(0.5, 1))
    
    print(f'Pre normalization pos: {len(reviews["pos"])}; neg: {len(reviews["neg"])}')
    # equalizing negative and positive review amounts
    if len(reviews['pos']) < len(reviews['neg']):
        reviews['neg'] = random.sample(reviews['neg'], len(reviews['pos']))
    elif len(reviews['neg']) < len(reviews['pos']):
        reviews['pos'] = random.sample(reviews['pos'], len(reviews['neg']))
    print(f'Post normalization pos: {len(reviews["pos"])}; neg: {len(reviews["neg"])}')

Hosts:   0%|          | 0/500 [00:00<?, ?it/s]

Hosts:  16%|█▌        | 81/500 [03:27<17:55,  2.57s/it]


KeyboardInterrupt: 

In [None]:
# saving results to csv
if SKIP_PARSING:
    print(f'Parsing is skipped since parsed file already exists. Delete {DATA_CSV} if you want to parse new data.')
else:
    df = pd.DataFrame(reviews)
    df.to_csv(DATA_CSV, index=False)
    print(f'Saved parsed data to {DATA_CSV}')

Parsing is skipped since parsed file already exists. Delete data/reviews.csv if you want to parse new data.


#### Loading data

In [None]:
# Loading data from csv
df = pd.read_csv(DATA_CSV)
pos, neg = df["pos"].to_list(), df["neg"].to_list()
del df

""" pos = pos[:200]
neg = neg[:200] """

' pos = pos[:200]\nneg = neg[:200] '

#### Splitting into test and reference ("train") groups

In [None]:
# Shuffling lists before splitting into
test_rat = 0.15
random.shuffle(pos)
random.shuffle(neg)
# Splitting samples
split_pos_idx = int(len(pos) * (1 - test_rat))
split_neg_idx = int(len(neg) * (1 - test_rat))
pos_train, pos_test = pos[:split_pos_idx], pos[split_pos_idx:]
neg_train, neg_test = neg[:split_neg_idx], neg[split_neg_idx:]
del pos
del neg
f"Pos: test={len(pos_test)}, train={len(pos_train)}; Neg: test={len(neg_test)}, train={len(neg_train)}"

'Pos: test=330, train=1868; Neg: test=330, train=1868'

### Train group processing

In [None]:
from pymorphy2 import MorphAnalyzer
from tqdm import tqdm
from typing import List, Tuple, Dict
import nltk
from nltk.corpus import stopwords
import random
nltk.download('punkt')
nltk.download('stopwords')

morph = MorphAnalyzer()

[nltk_data] Downloading package punkt to /home/kesha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kesha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Tokenization

stoplist = set(stopwords.words('russian'))
def tokenizing_pipeline(text: str) -> List[str]:
    # Tokenization
    tokens = nltk.tokenize.word_tokenize(text.lower(), language="russian")
    # removing punctuation
    tokens = filter(lambda x: x.isalpha(), tokens)
    # Stemming words
    stems = map(lambda x: morph.parse(x)[0].normal_form, tokens)
    # Removing stopwords
    stems = filter(lambda x: not x in stoplist, stems)
    return list(stems)

pos_train = [ tokenizing_pipeline(x) for x in tqdm(pos_train) ]
neg_train = [ tokenizing_pipeline(x) for x in tqdm(neg_train) ]

' '.join(random.choice(pos_train)[:10]), ' '.join(random.choice(neg_train)[:10])

  0%|          | 6/1868 [00:00<00:36, 50.73it/s]

100%|██████████| 1868/1868 [00:24<00:00, 77.14it/s] 
100%|██████████| 1868/1868 [00:35<00:00, 53.18it/s]


('отличный хостинг грамотный технический поддержка пользоваться год',
 'очень советовать пользоваться услуга мирохост отвратительный качество')

#### Counting frequencies

In [None]:
from collections import Counter, defaultdict
minimum_freq = 5

def count_sample_freq(arr: List[List[str]]) -> defaultdict:
    """Counts normalized word frequency from all texts in given list.
    Normalized means a float number [0, 1] = word_freq / total_words """
    counter = Counter()
    for text in arr:
        for word in text:
            counter[word] += 1
    # Converting to list of tuples
    freq_list = list(counter.items())
    # Filtering out low frequest words
    freq_list = [ (w, f) for w, f in freq_list if f > minimum_freq ]
    # Counting total words
    total_words = 0
    for _, f in freq_list: total_words += f
    # Calculating normalized freq
    freq_list = { w: f / total_words for w, f in freq_list }
    return defaultdict(float, freq_list)

pos_freq = count_sample_freq(pos_train)
neg_freq = count_sample_freq(neg_train)
list(pos_freq.items())[:5], list(neg_freq.items())[:5]

([('всё', 0.02278874220068018),
  ('нравиться', 0.002222638781029912),
  ('поддержка', 0.019227164394933455),
  ('быстрый', 0.004204268537610798),
  ('отличие', 0.0004284604879093806)],
 [('сайт', 0.03602850515140749),
  ('хостинг', 0.02842155858811728),
  ('полный', 0.001609161773003699),
  ('уходить', 0.0007732335792355437),
  ('точнее', 0.00018808384359783495)])

### Testing on test samples

In [None]:
def eval_sentiment(text) -> int:
    """Returns 1 if positive and -1 if negative"""
    tokens = tokenizing_pipeline(text)
    sent_eval = 0
    for tkn in tokens:
        sent_eval += pos_freq[tkn] - neg_freq[tkn]
    return 1 if sent_eval > 0 else -1

correct = incorrect = 0

for txt in tqdm(pos_test):
    sentiment = eval_sentiment(txt)
    correct += 1 * (sentiment == 1)
    incorrect += 1 * (sentiment != 1)
for txt in tqdm(neg_test):
    sentiment = eval_sentiment(txt)
    correct += 1 * (sentiment == -1)
    incorrect += 1 * (sentiment != -1)

print(f"Accuracy: {correct / (correct + incorrect)}")

100%|██████████| 330/330 [00:04<00:00, 71.07it/s]
100%|██████████| 330/330 [00:06<00:00, 48.02it/s]

Accuracy: 0.6924242424242424





In [None]:
print(f"""Accuracy: {correct / (correct + incorrect)}\n
minimum_freq: {minimum_freq}""")

Accuracy: 0.6924242424242424

minimum_freq: 5
