### Description of method #1
- Get sets of words that are unique for positive/negative reviews
- Evaluate sentiment of test samples
    - Get set of words from sample
    - Intersect with positive and negative sets
    - Largest intersection is our evaluation
### Description of method #2
- Calculate normalized frequencies of each word from train sample.
    - `Normalized frequency = count(word) / count(all_words)`
    - It is calculated separately for positive and negative sentiments.
- Then evaluate sentiment of test sample texts
    - `eval = sum(pos_freq[word]) - sum(neg_freq[word])`
    - If `eval > 0` the sample is positive, otherwise it is negative
### Description of method #3
- Combination of methods #1 and #2
- Calculate frequencies
- Remove words that are present in both positive and negative reviews
- Normalize frequencies
- Then evaluate sentiment of test sample texts
    - `eval = sum(pos_freq[word]) - sum(neg_freq[word])`
    - If `eval > 0` the sample is positive, otherwise it is negative

#### Installing requirements

In [None]:
!pip install -r reqirements.txt

### Parsing reviews
We parse hosting reviews from https://hosting101.ru

In [None]:
# crawler is a local module that parses target website
import crawler
import random
import json
from tqdm.notebook import tqdm

#### Checking if we already have parsed data

In [None]:
from pathlib import Path

# Location of the cached crawl results (a JSON file holding the "pos" and "neg" review lists).
DATA_FILE = Path('data') / 'reviews.json'
# True when the cache file is already on disk; the crawling cells then only print a notice.
SKIP_PARSING = DATA_FILE.is_file()

In [None]:
if not SKIP_PARSING:
    # Ask the crawler for the hosts to parse (1000 is presumably an upper bound on
    # their number - verify against crawler.get_hosts) and report how many came back.
    hosts = crawler.get_hosts(1000)
    print(len(hosts))
else:
    print(f'Parsing is skipped since parsed file already exists. Delete {DATA_FILE} if you want to parse new data.')

In [None]:
if SKIP_PARSING:
    print(f'Parsing is skipped since parsed file already exists. Delete {DATA_FILE} if you want to parse new data.')
else:
    # Whatever crawler.parse_host returns for each host is accumulated per sentiment
    # (presumably lists of review texts - verify against the crawler module).
    reviews = {
        'pos': [],
        'neg': []
    }
    # The checkpoint below is written before anything else creates the target
    # directory, so make sure it exists instead of failing on the first host.
    DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
    pb = tqdm(hosts, position=1)
    for i, h in enumerate(pb):
        reviews['pos'] += crawler.parse_host(h, positive=True, delay=(1.5, 3), pb=pb)
        reviews['neg'] += crawler.parse_host(h, positive=False, delay=(1.5, 3), pb=pb)
        # Checkpoint every 10th host so an interrupted crawl does not lose everything.
        # NOTE: a checkpoint left behind by an interrupted run makes SKIP_PARSING
        # true on the next start, i.e. the partial file is then used as-is.
        if i % 10 == 0:
            with open(DATA_FILE, 'w', encoding='utf-8') as f:
                json.dump(reviews, f)

In [None]:
# Saving the complete crawl result as JSON
if SKIP_PARSING:
    print(f'Parsing is skipped since parsed file already exists. Delete {DATA_FILE} if you want to parse new data.')
else:
    # The directory may still be missing (e.g. when no host was processed and
    # therefore no checkpoint was written), so create it before opening the file.
    DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(DATA_FILE, 'w', encoding='utf-8') as f:
        json.dump(reviews, f)
    print(f'Saved parsed data to {DATA_FILE}')

#### Loading data

In [None]:
# Loading the cached reviews from the JSON file.
# The encoding is given explicitly so that reading does not depend on the platform's
# default text encoding (JSON files are UTF-8 by convention; pure-ASCII files,
# which json.dump produces by default, are read identically).
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)
# Review lists keyed by sentiment
pos, neg = data["pos"], data["neg"]

#### Splitting into test and reference ("train") groups

In [None]:
# Fraction of every class that is held out for testing
test_rat = 0.15
# Shuffle both classes in place so that the held-out tail is a random sample
random.shuffle(pos)
random.shuffle(neg)
# Index of the first test element in each shuffled list
split_pos_idx = int((1 - test_rat) * len(pos))
split_neg_idx = int((1 - test_rat) * len(neg))
# Head of each list is the reference ("train") part, the tail is the test part
pos_train = pos[:split_pos_idx]
pos_test = pos[split_pos_idx:]
neg_train = neg[:split_neg_idx]
neg_test = neg[split_neg_idx:]
# The unsplit data is not needed any more
del data, neg, pos
f"Pos: test={len(pos_test)}, train={len(pos_train)}; Neg: test={len(neg_test)}, train={len(neg_train)}"

### Train group processing

In [None]:
import random
from typing import Dict, List, Optional, Tuple

import nltk
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from tqdm import tqdm
# Fetch the NLTK resources used below: 'punkt' (tokenizer data) and the stop-word lists.
nltk.download('punkt')
nltk.download('stopwords')

# Single pymorphy2 analyzer instance, shared by every call of the tokenizing pipeline.
morph = MorphAnalyzer()

In [None]:
# Tokenization

# Russian stop words as a set for fast membership tests
stoplist = set(stopwords.words('russian'))


def tokenizing_pipeline(text: str) -> List[str]:
    """Turn a raw review text into a list of lemmas.

    The text is lower-cased and tokenized with NLTK (Russian rules). Tokens that
    are not purely alphabetic or that are stop words are dropped, the remaining
    tokens are replaced by the normal form of their first pymorphy2 parse, and
    lemmas that turn out to be stop words are dropped as well.
    Token order and duplicates are preserved.
    """
    lemmas = []
    for token in nltk.tokenize.word_tokenize(text.lower(), language="russian"):
        # Discard punctuation, numbers and stop words first so that fewer
        # words have to go through the morphological analyzer.
        if not token.isalpha() or token in stoplist:
            continue
        lemma = morph.parse(token)[0].normal_form
        # An inflected form can become a stop word only after lemmatization.
        if lemma not in stoplist:
            lemmas.append(lemma)
    return lemmas

# Replace every raw training text by its list of lemmas (with a progress bar)
pos_train = list(map(tokenizing_pipeline, tqdm(pos_train)))
neg_train = list(map(tokenizing_pipeline, tqdm(neg_train)))

# Preview: the first ten lemmas of one random positive and one random negative review
' '.join(random.choice(pos_train)[:10]), ' '.join(random.choice(neg_train)[:10])

#### Counting unique words / frequencies

In [None]:
# Method 1
from collections import Counter
# Default threshold: words seen fewer times than this in a class are ignored
minimum_freq = 150


def form_set(arr: List[List[str]], min_freq: Optional[int] = None) -> set:
    """Return the set of words that occur at least ``min_freq`` times in ``arr``.

    Args:
        arr: Tokenized texts; every inner list holds the words of one text.
            Occurrences are counted across all texts, duplicates included.
        min_freq: Minimum total number of occurrences a word needs to be kept.
            When omitted, the module-level ``minimum_freq`` is read at call time.

    Returns:
        The set of sufficiently frequent words (empty for empty input).
    """
    if min_freq is None:
        min_freq = minimum_freq
    counter = Counter()
    for text in arr:
        counter.update(text)
    return {word for word, count in counter.items() if count >= min_freq}

# Vocabulary of each class (only the words that passed the frequency threshold)
pos_set = form_set(pos_train)
neg_set = form_set(neg_train)
# Words that occur in both vocabularies
intersection = pos_set & neg_set
# Class-specific words: the vocabulary of a class minus the shared words
pos_unique = pos_set - intersection
neg_unique = neg_set - intersection
# The full vocabularies are not used after this point
del pos_set, neg_set
len(pos_unique), len(neg_unique)

In [None]:
# method 2
from collections import defaultdict


def count_normalized_freq(arr: List[List[str]], min_freq: Optional[int] = None) -> defaultdict:
    """Compute normalized word frequencies over all texts in ``arr``.

    Words occurring fewer than ``min_freq`` times are dropped first; the
    frequency of every remaining word is its count divided by the total count
    of the *remaining* words, so the returned values sum to 1 (when any word
    is kept).

    Args:
        arr: Tokenized texts; every inner list holds the words of one text.
        min_freq: Minimum number of occurrences a word needs to be kept.
            When omitted, the module-level ``minimum_freq`` is read at call time.

    Returns:
        A ``defaultdict(float)`` mapping word -> normalized frequency; words
        that were not kept read as ``0.0``.
    """
    if min_freq is None:
        min_freq = minimum_freq
    counter = Counter()
    for text in arr:
        counter.update(text)
    # Filtering out low-frequency words
    kept = {word: count for word, count in counter.items() if count >= min_freq}
    # Normalize by the number of retained word occurrences
    total_words = sum(kept.values())
    return defaultdict(float, {word: count / total_words for word, count in kept.items()})

# Normalized frequency tables of both classes (used by method #2)
pos_normalized_freq = count_normalized_freq(pos_train)
neg_normalized_freq = count_normalized_freq(neg_train)
print(f'Total words: pos={len(pos_normalized_freq)}, neg={len(neg_normalized_freq)}')
# Preview of the first three entries of each table
[*pos_normalized_freq.items()][:3], [*neg_normalized_freq.items()][:3]

In [None]:
# Method 3
def count_freq(arr: List[List[str]], min_freq: Optional[int] = None) -> defaultdict:
    """Count how often every word occurs across all texts in ``arr``.

    Args:
        arr: Tokenized texts; every inner list holds the words of one text.
        min_freq: Minimum number of occurrences a word needs to be kept.
            When omitted, the module-level ``minimum_freq`` is read at call time.

    Returns:
        A ``defaultdict(float)`` mapping word -> raw (integer) count for the
        words that reach the threshold; all other words read as ``0.0``.
    """
    if min_freq is None:
        min_freq = minimum_freq
    counter = Counter()
    for text in arr:
        counter.update(text)
    # Filtering out low-frequency words
    return defaultdict(float, {word: count for word, count in counter.items() if count >= min_freq})
# Raw (not yet normalized) word counts of both classes
pos_freq = count_freq(pos_train)
neg_freq = count_freq(neg_train)
print(f'Total words: pos={len(pos_freq)}, neg={len(neg_freq)}')
# Words that passed the threshold in both classes
intersection = pos_freq.keys() & neg_freq.keys()
# Drop the shared words so that only class-specific words remain in each table
for k in intersection:
    pos_freq.pop(k)
    neg_freq.pop(k)
# Calculating normalized freq
def normalize_freq(freq_dict: dict):
    """Rescale the values of ``freq_dict`` in place so that they sum to 1.

    Every value is divided by the sum of all values; the keys are left
    untouched. An empty dict stays empty. Nothing is returned.
    """
    denominator = sum(freq_dict.values())
    freq_dict.update({word: count / denominator for word, count in freq_dict.items()})
# The method #3 tables are the same dict objects as pos_freq / neg_freq,
# normalized in place (no copy is made).
pos_normalized_unique_freq = pos_freq
neg_normalized_unique_freq = neg_freq
normalize_freq(pos_normalized_unique_freq)
normalize_freq(neg_normalized_unique_freq)
print(f'Total unique words: pos={len(pos_freq)}, neg={len(neg_freq)}')

#### Word amount
|minimum_freq|total words|total unique* words|
|-|-|-|
|1|pos=7102, neg=13476|pos=2505, neg=8879|
|2|pos=3388, neg=6630|pos=688, neg=3930|
|3|pos=2426, neg=4959|pos=367, neg=2900|
|5|pos=1658, neg=3506|pos=206, neg=2054|
|10|pos=1018, neg=2233|pos=110, neg=1325|
|50|pos=258, neg=604|pos=43, neg=389|
|100|pos=113, neg=307|pos=25, neg=219|

\* Unique words - words that are not present in the other group, i.e. all words except those in the intersection of pos and neg.

### Testing on test samples

#### Method #1
| minimum_freq | accuracy |
| - |  -   |
| 1 | 0.67 |
| 2 | 0.69 |
| 3 | 0.68 |
| 5 | 0.70 |
| 10 | 0.71 |
| 50 | 0.73 |
| 100 | 0.73 |

In [None]:
def eval_sentiment_1(text) -> int:
    """Classify ``text`` by counting class-specific words (method #1).

    The text is tokenized, reduced to its distinct lemmas and compared with
    ``pos_unique`` and ``neg_unique``. Returns 1 when more distinct
    positive-only than negative-only words are found, otherwise -1
    (a tie, including "no known word at all", counts as negative).
    """
    distinct_tokens = set(tokenizing_pipeline(text))
    pos_hits = len(distinct_tokens & pos_unique)
    neg_hits = len(distinct_tokens & neg_unique)
    if pos_hits - neg_hits > 0:
        return 1
    return -1

correct = incorrect = 0

# Positive test reviews are classified correctly when the result is 1
for txt in tqdm(pos_test):
    sentiment = eval_sentiment_1(txt)
    if sentiment == 1:
        correct += 1
    else:
        incorrect += 1
# Negative test reviews are classified correctly when the result is -1
for txt in tqdm(neg_test):
    sentiment = eval_sentiment_1(txt)
    if sentiment == -1:
        correct += 1
    else:
        incorrect += 1

# Share of correctly classified reviews over both test sets
print(f"Accuracy: {correct / (correct + incorrect):.4f}")

#### Method #2
| minimum_freq | accuracy |
| - |  -   |
| 1 | 0.62 |
| 2 | 0.63 |
| 3 | 0.61 |
| 5 | 0.61 |
| 10 | 0.60 |
| 50 | 0.58 |
| 100 | 0.56 |

In [None]:
def eval_sentiment_2(text) -> int:
    """Classify ``text`` by summed frequency differences (method #2).

    For every lemma of the text (duplicates included) the difference between
    its positive and its negative normalized frequency is added to a score.
    Lemmas missing from a table contribute 0.0 for that table (the tables are
    defaultdicts). Returns 1 for a score above zero, otherwise -1.
    """
    score = 0
    for token in tokenizing_pipeline(text):
        difference = pos_normalized_freq[token] - neg_normalized_freq[token]
        score += difference
    if score > 0:
        return 1
    return -1

correct = incorrect = 0

# Positive test reviews are classified correctly when the result is 1
for txt in tqdm(pos_test):
    sentiment = eval_sentiment_2(txt)
    if sentiment == 1:
        correct += 1
    else:
        incorrect += 1
# Negative test reviews are classified correctly when the result is -1
for txt in tqdm(neg_test):
    sentiment = eval_sentiment_2(txt)
    if sentiment == -1:
        correct += 1
    else:
        incorrect += 1

# Share of correctly classified reviews over both test sets
print(f"Accuracy: {correct / (correct + incorrect):.4f}")

#### Method #3
| minimum_freq | accuracy |
| - |  -   |
| 1 | 0.68 |
| 2 | 0.70 |
| 3 | 0.70 |
| 5 | 0.72 |
| 10 | 0.74 |
| 50 | 0.81 |
| 100 | 0.83 |

In [None]:
def eval_sentiment_3(text) -> int:
    """Classify ``text`` with the class-specific frequency tables (method #3).

    Works like the summed frequency difference of method #2, but reads
    ``pos_normalized_unique_freq`` / ``neg_normalized_unique_freq``, i.e. the
    tables from which words shared by both classes were removed.
    Returns 1 for a score above zero, otherwise -1.
    """
    score = 0
    for token in tokenizing_pipeline(text):
        difference = pos_normalized_unique_freq[token] - neg_normalized_unique_freq[token]
        score += difference
    if score > 0:
        return 1
    return -1

correct = incorrect = 0

# Positive test reviews are classified correctly when the result is 1
for txt in tqdm(pos_test):
    sentiment = eval_sentiment_3(txt)
    if sentiment == 1:
        correct += 1
    else:
        incorrect += 1
# Negative test reviews are classified correctly when the result is -1
for txt in tqdm(neg_test):
    sentiment = eval_sentiment_3(txt)
    if sentiment == -1:
        correct += 1
    else:
        incorrect += 1

# Share of correctly classified reviews over both test sets
print(f"Accuracy: {correct / (correct + incorrect):.4f}")

#### Method #4
| minimum_freq | accuracy |
| - |  -   |
| 1 | 0.77 |
| 2 | 0.80 |
| 3 | 0.79 |
| 5 | 0.80 |
| 10 | 0.77 |
| 50 | 0.74 |
| 100 | 0.74 |

In [None]:
# NOTE: this is the Method #4 classifier; it reuses (and therefore rebinds) the
# name eval_sentiment_3 that the Method #3 cell defines.
def eval_sentiment_3(text) -> int:
    """Classify ``text`` by unique-word counts with a frequency fallback.

    When the text contains at least one word from ``pos_unique`` or
    ``neg_unique`` the score is the number of distinct positive-only words
    minus the number of distinct negative-only words. Otherwise the score is
    the summed difference of the normalized frequencies of all its lemmas.
    Returns 1 for a score above zero, otherwise -1.
    """
    tokens = tokenizing_pipeline(text)
    distinct_tokens = set(tokens)
    pos_hits = len(distinct_tokens & pos_unique)
    neg_hits = len(distinct_tokens & neg_unique)
    if pos_hits or neg_hits:
        score = pos_hits - neg_hits
    else:
        # No class-specific word found: fall back to the frequency tables
        score = 0
        for token in tokens:
            difference = pos_normalized_freq[token] - neg_normalized_freq[token]
            score += difference
    if score > 0:
        return 1
    return -1

correct = incorrect = 0

# Positive test reviews are classified correctly when the result is 1
for txt in tqdm(pos_test):
    sentiment = eval_sentiment_3(txt)
    if sentiment == 1:
        correct += 1
    else:
        incorrect += 1
# Negative test reviews are classified correctly when the result is -1
for txt in tqdm(neg_test):
    sentiment = eval_sentiment_3(txt)
    if sentiment == -1:
        correct += 1
    else:
        incorrect += 1

# Share of correctly classified reviews over both test sets
print(f"Accuracy: {correct / (correct + incorrect):.4f}")

### Best performances
