<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Text_classification/places_rating_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comments rating classification

## Install libraries

In [None]:
%%capture
!pip install nltk gensim

## imports

In [74]:
import pandas as pd
import nltk
import re

from collections import Counter
from itertools import chain
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from typing import List

import torch
from torch.nn.utils.rnn import pack_sequence
from torch.utils.data import Dataset, DataLoader


In [26]:
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases load it
# from 'punkt', recent ones from 'punkt_tab'; NLTK is installed unpinned, so request both.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## load data

In [2]:
%%capture
!wget https://raw.githubusercontent.com/Dimildizio/DS_course/main/Neural_networks/NLP/Text_classification/data/train_reviews.csv
!wget https://github.com/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Text_classification/data/test_reviews.csv

In [9]:
# Read the two downloaded CSV files from the current working directory.
trdf = pd.read_csv('train_reviews.csv')  # labelled reviews; has `rate` and `text` columns
test_df = pd.read_csv('test_reviews.csv')

In [57]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
trdf.sample(5, random_state=42)

Unnamed: 0,rate,text
44158,5,Удивительно отличная подборка алкоголя. Особен...
10099,5,Лучшая пятёрочка в Питере) просто лучшая без л...
3237,5,"Можно припарковаться, сама по себе чистая,перс..."
24812,4,"Пятёрочка сам по себе магазин хороший, цены дл..."
20955,2,"Хотели купить бутылку вина, ценник не соответс..."


## Shift target ratings from 1–5 to 0–4

In [10]:
# Number of reviews per raw rating value (class balance check).
trdf['rate'].value_counts()

5    26069
4     9922
3     6126
1     4138
2     2410
Name: rate, dtype: int64

In [15]:
def norm_target(df, to_train=True):
  """
  Return a copy of `df` whose `rate` column is shifted by one.

  :param df: DataFrame containing a numeric `rate` column
  :param to_train: when True, 1 is subtracted from `rate`; when False, 1 is added
  :return: a new DataFrame; the input frame is not modified
  """
  shift = 1
  if to_train:
    shift = -1
  return df.assign(rate=df['rate'] + shift)

In [16]:
# Subtract 1 from every rating (to_train defaults to True); `trdf` itself is left unchanged.
train_df = norm_target(trdf)

## tokenize

In [48]:
def tokenize_text(text, lang='russian'):
    """
    Tokenize `text` with NLTK and keep only purely alphabetic tokens.

    :param text: raw string to tokenize
    :param lang: language name forwarded to `word_tokenize`
    :return: list of tokens for which `str.isalpha()` is true
    """
    return list(filter(str.isalpha, word_tokenize(text, language=lang)))

In [68]:
# One list of alphabetic tokens per training review, in the row order of `train_df`.
tok_txt = [tokenize_text(t) for t in train_df.text.values]

### create vocab

In [69]:
class Voc:
  """Frequency-ranked token list built from tokenized texts."""

  def __init__(self, txt, vocab_size):
    """
    :param txt: iterable of token sequences (one sequence per text)
    :param vocab_size: keep at most this many of the most frequent tokens
    """
    frequencies = Counter()
    for sentence in txt:
      frequencies.update(sentence)
    # most_common returns (token, count) pairs, most frequent first.
    self.tokens = [entry[0] for entry in frequencies.most_common(vocab_size)]

In [70]:
# `tok_txt` holds the token lists produced by tokenize_text for the training reviews;
# 350000 is the upper bound on the number of tokens kept.
vocabulary = Voc(tok_txt, 350000)
vocabulary.tokens[:20]

['и',
 'не',
 'в',
 'магазин',
 'на',
 'с',
 'что',
 'но',
 'всегда',
 'есть',
 'очень',
 'по',
 'персонал',
 'как',
 'все',
 'ассортимент',
 'Хороший',
 'нет',
 'а',
 'самообслуживания']

### Split

In [71]:
# Features: every column except the target; target: the shifted rating.
X = train_df.drop('rate', axis=1)
y = train_df['rate']

# Stratified 80/20 split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

## Helper classes and functions for the RNN

The helper code below is copied from another source.

In [75]:
class Tokenizer:
    """Regex tokenizer: every match of the word pattern becomes one token."""

    def __init__(self, word_pattern=r"[\w']+"):
        """
        Simple tokenizer that splits the sentence by given regex pattern
        :param word_pattern: regular expression matching a single token; the default
            matches runs of word characters and apostrophes
        """
        # The default is a raw string: "\w" inside a plain literal is an invalid
        # escape sequence and triggers a warning on current Python versions.
        self.word_pattern = re.compile(word_pattern)

    def tokenize(self, text):
        """
        :param text: string to tokenize
        :return: result of `findall` for the compiled pattern on `text`
        """
        return self.word_pattern.findall(text)


class Vocab:
    def __init__(self, tokenized_texts: List[List[str]], max_vocab_size=None):
        """
        Counts tokens over all texts and indexes the most frequent ones.
        Indices 0, 1 and 2 are reserved for <PAD>, <UNK> and <EOS>; the counted
        tokens follow in descending frequency order.
        :param tokenized_texts: token lists used to build the vocabulary
        :param max_vocab_size: number of counted tokens to keep (the three special
            tokens come on top of it); a falsy value keeps every distinct token
        """
        frequencies = Counter()
        for tokens in tokenized_texts:
            frequencies.update(tokens)
        limit = max_vocab_size if max_vocab_size else len(frequencies)
        specials = ["<PAD>", "<UNK>", "<EOS>"]
        self.PAD_IDX, self.UNK_IDX, self.EOS_IDX = 0, 1, 2
        self.itos = specials + [token for token, _ in frequencies.most_common(limit)]
        self.stoi = dict(zip(self.itos, range(len(self.itos))))

    def vectorize(self, text: List[str]):
        """
        Converts tokens to vocabulary indices.
        :param text: sequence of tokens
        :return: list of indices; tokens missing from the vocabulary get UNK_IDX
        """
        lookup, unknown = self.stoi, self.UNK_IDX
        return [lookup[tok] if tok in lookup else unknown for tok in text]

    def __iter__(self):
        # Iterates over tokens in index order.
        return iter(self.itos)

    def __len__(self):
        # Total number of entries, special tokens included.
        return len(self.itos)


class TextDataset(Dataset):
    def __init__(self, tokenized_texts, labels, vocab: Vocab):
        """
        Dataset pairing tokenized texts with their labels
        :param tokenized_texts: token lists of one train/val/test split
        :param labels: label for each text, indexed the same way as the texts
        :param vocab: vocabulary used to turn tokens into indices
        """
        self.texts = tokenized_texts
        self.labels = labels
        self.vocab = vocab

    def __getitem__(self, item):
        # Vectorized tokens followed by the end-of-sequence index, plus the label.
        token_ids = [*self.vocab.vectorize(self.texts[item]), self.vocab.EOS_IDX]
        return token_ids, self.labels[item]

    def __len__(self):
        return len(self.texts)

    def collate_fn(self, batch):
        """
        Builds one batch from (token indices, label) samples
        :param batch: list of samples as returned by __getitem__
        :return: packed sequence of the index tensors and a tensor of labels
        """
        sequences = []
        targets = []
        for sample in batch:
            sequences.append(torch.tensor(sample[0]))
            targets.append(sample[1])
        # enforce_sorted=False: samples do not have to arrive sorted by length.
        packed = pack_sequence(sequences, enforce_sorted=False)
        return packed, torch.tensor(targets)


def custom_train_test_split(data, train_frac=0.85, n_classes=5):
    """
    Splits the data into train and validation parts, stratifying by labels.

    Rows are not shuffled: within each class the first `train_frac` share of the
    rows (in their existing order) goes to train and the rest to validation.
    :param data: DataFrame with a `rate` label column and a `text` column
    :param train_frac: proportion of train examples, must lie in [0, 1]
    :param n_classes: number of classes; only labels 0..n_classes-1 are collected,
        rows carrying any other label end up in neither split
    :return: train texts, train labels, validation texts, validation labels
    :raises ValueError: if train_frac is outside [0, 1]
    """
    # Outside [0, 1] the per-class counts would no longer match the sliced texts,
    # producing label lists of a different length than the text lists.
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be within [0, 1], got {train_frac!r}")

    train_texts, train_labels = [], []
    val_texts, val_labels = [], []
    for label in range(n_classes):
        texts = data[data.rate == label].text.values
        n_train = int(len(texts) * train_frac)
        n_val = len(texts) - n_train
        train_texts.extend(texts[:n_train])
        val_texts.extend(texts[n_train:])
        train_labels += [label] * n_train
        val_labels += [label] * n_val
    return train_texts, train_labels, val_texts, val_labels

In [76]:
tok = Tokenizer()
train_texts, train_labels, val_texts, val_labels = custom_train_test_split(train_df)

# Tokenize each split once and reuse the token lists for both vocab and datasets.
train_tokens = [tok.tokenize(t) for t in train_texts]
val_tokens = [tok.tokenize(t) for t in val_texts]

# The vocabulary is built from the training split only, so words that occur solely
# in validation texts map to <UNK> instead of leaking into the vocabulary.
vocab = Vocab(train_tokens, 30000)

train_dataset = TextDataset(train_tokens, train_labels, vocab)
val_dataset = TextDataset(val_tokens, val_labels, vocab)