<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Text_classification/places_rating_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comments rating classification

## download libs

In [96]:
%%capture
!pip install nltk gensim wandb

## imports

In [97]:
import gensim.downloader as api
import numpy as np
import pandas as pd
import nltk
import re
import wandb

from collections import Counter
from itertools import chain
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from typing import List, Dict

import torch
import torch.nn as nn
import torch.functional as F
from torch.nn.utils.rnn import pack_sequence, PackedSequence
from torch.utils.data import Dataset, DataLoader
from torch.multiprocessing import Pool

#from functools import partial

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
%%capture
gensim_model = api.load("word2vec-ruscorpora-300")



### Set WandB

In [98]:
wandb_config = {
    'achitecture':'biLSTM',
    'dropout':0.18,
    'hid_layers':2,
    'hid_size':64,
    'activation':'relu',
    'optimizer':'AdamW',
    'lr':4e-3,
    'epochs':10,
    'train_batch':128
}

In [None]:
WANDBAPI = False
try:
  with open('wandb_api.txt', 'r') as f:
    WANDBAPI = f.readline()
    project=""
    entity=''
    wandb.login(key=WANDBAPI)

except Exception as e:
  print(e)

In [130]:
def wandb_wrapper(func):
    def wrapped_function(*args, **kwargs):
        if WANDBAPI:
            wandb.init(project=project,entity=entity, config=wandb_config)

        result = func(*args, **kwargs)
        if WANDBAPI:
            wandb.finish()
        return result
    return wrapped_function

## load data

In [112]:
%%capture
!wget https://raw.githubusercontent.com/Dimildizio/DS_course/main/Neural_networks/NLP/Text_classification/data/train_reviews.csv

!wget https://raw.githubusercontent.com/Dimildizio/DS_course/main/Neural_networks/NLP/Text_classification/data/test_reviews.csv


In [113]:
trdf = pd.read_csv('train_reviews.csv')
test_df = pd.read_csv('test_reviews.csv')

In [7]:
trdf.sample(5)

Unnamed: 0,rate,text
19083,2,Очень медленно обслуживают. Работает одна касс...
32953,4,"Хорошо, что в магазинах появились кассы самооб..."
24674,5,Директор Наталья Вячеславовна огонь просто)) д...
4975,5,Всегда вежливые продавцы и хороший товар 🌹
40249,5,"Большой ассортимент, вежливые кассиры"


## set target in range 0:x

In [8]:
trdf['rate'].value_counts()

5    26069
4     9922
3     6126
1     4138
2     2410
Name: rate, dtype: int64

In [9]:
def norm_target(df, to_train=True):
  num = -1 if to_train else 1
  dfr = df.copy()
  dfr['rate'] = dfr['rate'] + num
  return dfr

In [10]:
train_df = norm_target(trdf)

## tokenize

In [11]:
def tokenize_text(text, lang='russian'):
    tokens = word_tokenize(text, language=lang)
    return [token for token in tokens if token.isalpha()]

In [12]:
tok_txt = [tokenize_text(t) for t in train_df.text.values]

### create vocab

In [13]:
class Voc:
  def __init__(self, txt, vocab_size):
    toks = [tok for word in txt for tok in word]
    tok_dict = Counter(toks)
    self.tokens = [tok for tok, num in tok_dict.most_common(vocab_size)]

In [15]:
vocabulary = Voc(tok_txt, 350000)
vocabulary.tokens[:20]

['и',
 'не',
 'в',
 'магазин',
 'на',
 'с',
 'что',
 'но',
 'всегда',
 'есть',
 'очень',
 'по',
 'персонал',
 'как',
 'все',
 'ассортимент',
 'Хороший',
 'нет',
 'а',
 'самообслуживания']

### Split

In [16]:
X = train_df.drop('rate', axis=1)
y = train_df['rate']

X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

## Preset funcs for rnn

copied

In [17]:
class Tokenizer:
    def __init__(self, word_pattern="[\w']+"):
        """
        Simple tokenizer that splits the sentence by given regex pattern
        :param word_pattern: pattern that determines word boundaries
        """
        self.word_pattern = re.compile(word_pattern)

    def tokenize(self, text):
        return self.word_pattern.findall(text)


class Vocab:
    def __init__(self, tokenized_texts: List[List[str]], max_vocab_size=None):
        """
        Builds a vocabulary by concatenating all tokenized texts and counting words.
        Most common words are placed in vocabulary, others are replaced with [UNK] token
        :param tokenized_texts: texts to build a vocab
        :param max_vocab_size: amount of words in vocabulary
        """
        counts = Counter(chain(*tokenized_texts))
        max_vocab_size = max_vocab_size or len(counts)
        common_pairs = counts.most_common(max_vocab_size)
        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.EOS_IDX = 2
        self.itos = ["<PAD>", "<UNK>", "<EOS>"] + [pair[0]
                                                   for pair in common_pairs]
        self.stoi = {token: i for i, token in enumerate(self.itos)}

    def vectorize(self, text: List[str]):
        """
        Maps each token to it's index in the vocabulary
        :param text: sequence of tokens
        :return: vectorized sequence
        """
        return [self.stoi.get(tok, self.UNK_IDX) for tok in text]

    def __iter__(self):
        return iter(self.itos)

    def __len__(self):
        return len(self.itos)


class TextDataset(Dataset):
    def __init__(self, tokenized_texts, labels, vocab: Vocab):
        """
        A Dataset for the task
        :param tokenized_texts: texts from a train/val/test split
        :param labels: corresponding toxicity ratings
        :param vocab: vocabulary with indexed tokens
        """
        self.texts = tokenized_texts
        self.labels = labels
        self.vocab = vocab

    def __getitem__(self, item):
        return (
            self.vocab.vectorize(self.texts[item]) + [self.vocab.EOS_IDX],
            self.labels[item],
        )

    def __len__(self):
        return len(self.texts)

    def collate_fn(self, batch):
        """
        Technical method to form a batch to feed into recurrent network
        """
        tmp = pack_sequence(
            [torch.tensor(pair[0]) for pair in batch], enforce_sorted=False
        ), torch.tensor([pair[1] for pair in batch])
        return tmp


def custom_train_test_split(data, train_frac=0.85):
    """
    Splits the data into train and test parts, stratifying by labels.
    Should it shuffle the data before split?
    :param data: dataset to split
    :param train_frac: proportion of train examples
    :return: texts and labels for each split
    """
    n_toxicity_ratings = 5
    train_labels = []
    val_labels = []
    train_texts = []
    val_texts = []
    for label in range(n_toxicity_ratings):
        texts = data[data.rate == label].text.values
        n_train = int(len(texts) * train_frac)
        n_val = len(texts) - n_train
        train_texts.extend(texts[:n_train])
        val_texts.extend(texts[n_train:])
        train_labels += [label] * n_train
        val_labels += [label] * n_val
    return train_texts, train_labels, val_texts, val_labels

In [18]:
tok = Tokenizer()
vocab = Vocab([tok.tokenize(t) for t in train_df.text.values], 30000)
train_texts, train_labels, val_texts, val_labels = custom_train_test_split(train_df)


train_dataset = TextDataset([tok.tokenize(t) for t in train_texts],
                            train_labels,
                            vocab)
val_dataset = TextDataset([tok.tokenize(t) for t in val_texts],
                          val_labels,
                          vocab)

In [36]:
train_texts[42]

'На днях зашёл купить в этот магазин сигарет , так же хотел купить икры , что стоит в холодильнике ближе к касее "алкоголя и табака", но посмотрев что качество икры не очень , поставил банку обратно . После покупки сигарет на кассе , меня остановил не понятная лично не славянской внешности ... и со словами "эй ты ... где икра , выворачивай карманы" вот вам и качество обслуживания и вежливость сотрудников . )'

In [28]:
TAG_MAPPING = {'A': 'ADJ',
               'ADV': 'ADV',
               'ADVPRO': 'ADV',
               'ANUM': 'ADJ',
               'APRO': 'DET',
               'COM': 'ADJ',
               'CONJ': 'SCONJ',
               'INTJ': 'INTJ',
               'NONLEX': 'X',
               'NUM': 'NUM',
               'PART': 'PART',
               'PR': 'ADP',
               'S': 'NOUN',
               'SPRO': 'PRON',
               'UNKN': 'X',
               'V': 'VERB'}

In [51]:
class EmbeddingProcessor:
  def __init__(self, gensim_model):
      self.model = gensim_model
      self.mean = self.model.vectors.mean(1).mean()
      self.std = self.model.vectors.std(1).mean()
      self.vec_size = self.model.vector_size


  @staticmethod
  def get_universal_tag(word):
      m = Mystem()
      processed = m.analyze(word)[0]
      lemma = processed["analysis"][0]["lex"].lower().strip()
      pos = processed["analysis"][0]["gr"].split(',')[0]
      pos = pos.split('=')[0].strip()
      tagged = lemma + '_' + pos
      return tagged


  def process_word(self, idx, word):
      try:
        tagged = self.add_tag(word)
        vector = self.model.get_vector(tagged)
        return idx, torch.tensor(vector)
      except Exception as e:
        random_vector = torch.randn(self.vec_size) * self.std + self.mean
        return idx, random_vector


  def add_tag(self, word):
      word = self.get_universal_tag(word)
      tag = word.split('_')[1]
      tag = TAG_MAPPING.get(tag, tag)

      word = word.split('_')[0] + '_' + tag
      return word

In [52]:
def multiproc_emb(mymodel, vocab, n_processes=3):
  matrix = torch.zeros(len(vocab), mymodel.vec_size)

  with Pool(processes=n_processes) as p:
    arguments = [(idx, word) for idx, word in enumerate(vocab.itos[:1], 1)]
    embs = p.starmap(mymodel.process_word, arguments)
  for idx, emb in embs:
    matrix[idx] = emb
  return matrix

In [53]:
emb_model = EmbeddingProcessor(gensim_model)
emb_matrix = multiproc_emb(emb_model, vocab)


Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [64]:
torch_emb_matrix = torch.tensor(emb_matrix, dtype=torch.float)

  torch_emb_matrix = torch.tensor(emb_matrix, dtype=torch.float)


In [58]:
print('mean:', emb_model.mean)
print('std:', emb_model.std)
print('вода:', sum(emb_model.model.get_vector('вода_NOUN')))
new_word = emb_model.add_tag('бежать')
print('бежать:', new_word)
print('бежать:', sum(emb_model.model[new_word]))
print('vals:', emb_matrix.values)

mean: -0.0012723453
std: 0.057641353
вода: -1.2167136888456298
бежать: бежать_VERB
бежать: -0.14577608798572328
vals: <built-in method values of Tensor object at 0x7a654c0aa0c0>


## Create LSTM model

In [127]:
#  copied
class RecurrentClassifier(nn.Module):
    def __init__(self, config: Dict, vocab: Vocab, emb_matrix):
        """
        Baseline classifier, hyperparameters are passed in `config`.
        Consists of recurrent part and a classifier (Multilayer Perceptron) part
        Keys are:
            - freeze: whether word embeddings should be frozen
            - cell_type: one of: RNN, GRU, LSTM, which recurrent cell model should use
            - hidden_size: size of hidden state for recurrent cell
            - num_layers: amount of recurrent cells in the model
            - cell_dropout: dropout rate between recurrent cells (not applied if model has only one cell!)
            - bidirectional: boolean, whether to use unidirectional of bidirectional model
            - out_activation: one of: "sigmoid", "tanh", "relu", "elu". Activation in classifier part
            - out_dropout: dropout rate in classifier part
            - out_sizes: List[int], hidden size of each layer in classifier part. Empty list means that final
                layer is attached directly to recurrent part output
        :param config: configuration of model
        :param vocab: vocabulary
        :param emb_matrix: embeddings matrix from `prepare_emb_matrix`
        """
        super().__init__()
        self.config = config
        self.vocab = vocab
        self.emb_matrix = emb_matrix
        self.embeddings = nn.Embedding.from_pretrained(
            emb_matrix, freeze=config["freeze"], padding_idx=vocab.PAD_IDX
        )
        cell_types = {"RNN": nn.RNN, "GRU": nn.GRU, "LSTM": nn.LSTM}
        cell_class = cell_types[config["cell_type"]]
        self.cell = cell_class(input_size=emb_matrix.size(1),
                               batch_first=True,
                               hidden_size=config["hidden_size"],
                               num_layers=config["num_layers"],
                               dropout=config["cell_dropout"],
                               bidirectional= config["bidirectional"],)

        activation_types = {"sigmoid": nn.Sigmoid,
                            "tanh": nn.Tanh,
                            "relu": nn.ReLU,
                            "elu": nn.ELU}

        self.out_activation = activation_types[config["out_activation"]]
        self.out_dropout = nn.Dropout(config["out_dropout"])
        cur_out_size = config["hidden_size"] * config["num_layers"]
        if config["bidirectional"]:
            cur_out_size *= 2
        out_layers = []
        for cur_hidden_size in config["out_sizes"]:
            out_layers.append(nn.Linear(cur_out_size, cur_hidden_size))
            nn.init.kaiming_normal_(out_layers[-1].weight)
            cur_out_size = cur_hidden_size
        out_layers.append(nn.Linear(cur_out_size, 6))
        self.out_proj = nn.Sequential(*out_layers)

    def forward(self, input):
        embedded = self.embeddings(input.data)
        _, last_state = self.cell(PackedSequence(embedded,
                                                 input.batch_sizes,
                                                 sorted_indices=input.sorted_indices,
                                                 unsorted_indices=input.unsorted_indices))

        if isinstance(last_state, tuple):
            last_state = last_state[0]
        last_state = last_state.transpose(0, 1)
        last_state = last_state.reshape(last_state.size(0), -1)
        return self.out_proj(last_state)

In [128]:
config = {'freeze':False,
          'cell_type': 'LSTM',
          "cell_dropout": 0.18,
          'num_layers':3,
          'hidden_size':128,
          'out_activation':"elu",
          'bidirectional':True,
          'out_dropout': 0.18,
          'out_sizes':[200]}

In [129]:
model = RecurrentClassifier(config, vocab, torch_emb_matrix)

## Train model

In [131]:
# copied
class Trainer:
    def __init__(self, config: Dict):
        """
        Fits end evaluates given model with Adam optimizer.
         Hyperparameters are specified in `config`
        Possible keys are:
            - n_epochs: number of epochs to train
            - lr: optimizer learning rate
            - weight_decay: l2 regularization weight
            - device: on which device to perform training ("cpu" or "cuda")
            - verbose: whether to print anything during training
        :param config: configuration for `Trainer`
        """
        self.config = config
        self.n_epochs = config["n_epochs"]
        self.setup_opt_fn = lambda model: torch.optim.AdamW(
            model.parameters(), config["lr"], weight_decay=config["weight_decay"])

        self.model = None
        self.opt = None
        self.history = None
        self.loss_fn = nn.CrossEntropyLoss()
        self.device = config["device"]
        self.verbose = config.get("verbose", True)


    @wandb_wrapper
    def fit(self, model, train_loader, val_loader):
        """
        Fits model on training data, each epoch evaluates on validation data
        :param model: PyTorch model for toxic comments classification (for example, `RecurrentClassifier`)
        :param train_loader: DataLoader for training data
        :param val_loader: DataLoader for validation data
        :return:
        """

        self.model = model.to(self.device)
        self.opt = self.setup_opt_fn(self.model)
        self.history = {"train_loss": [], "val_loss": [], "val_acc": []}
        for epoch in range(self.n_epochs):
            print(f"Epoch {epoch + 1}/{self.n_epochs}")
            train_info = self._train_epoch(train_loader)
            val_info = self._val_epoch(val_loader)
            self.history["train_loss"].extend(train_info["train_loss"])
            self.history["val_loss"].append(val_info["loss"])
            self.history["val_acc"].append(val_info["acc"])
        return self.model.eval()

    def _train_epoch(self, train_loader):
        self.model.train()
        losses = []
        if self.verbose:
            train_loader = tqdm(train_loader)
        for batch in train_loader:
            self.model.zero_grad()
            texts, labels = batch
            logits = self.model.forward(texts.to(self.device))
            loss = self.loss_fn(logits, labels.to(self.device))
            loss.backward()
            self.opt.step()
            loss_val = loss.item()
            if self.verbose:
                train_loader.set_description(f"Loss={loss_val:.3}")
            losses.append(loss_val)
        if WANDBAPI:
          wandb.log({"Train Loss": sum(losses) / len(losses)})
        return {"train_loss": losses}

    def _val_epoch(self, val_loader):
        self.model.eval()
        all_logits = []
        all_labels = []
        if self.verbose:
            val_loader = tqdm(val_loader)
        with torch.no_grad():
            for batch in val_loader:
                texts, labels = batch
                logits = self.model.forward(texts.to(self.device))
                all_logits.append(logits)
                all_labels.append(labels)
        all_labels = torch.cat(all_labels).to(self.device)
        all_logits = torch.cat(all_logits)
        loss = nn.CrossEntropyLoss()(all_logits, all_labels).item()
        acc = (all_logits.argmax(1) == all_labels).float().mean().item()
        if self.verbose:
            val_loader.set_description(f"Loss={loss:.3}; Acc:{acc:.3}")
        if WANDBAPI:
          wandb.log({"Validation Loss": self.val_info["loss"], "Validation Accuracy": self.val_info["acc"]})
        return {"acc": acc, "loss": loss}

    def predict(self, test_loader):
        if self.model is None:
            raise RuntimeError("You should train the model first")
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_loader:
                texts, labels = batch
                logits = self.model.forward(texts.to(self.device))
                predictions.extend(logits.argmax(1).tolist())
        return np.asarray(predictions)

    def save(self, path: str):
        if self.model is None:
            raise RuntimeError("You should train the model first")
        checkpoint = {
            "config": self.model.config,
            "trainer_config": self.config,
            "vocab": self.model.vocab,
            "emb_matrix": self.model.emb_matrix,
            "state_dict": self.model.state_dict(),
        }
        torch.save(checkpoint, path)

    @classmethod
    def load(cls, path: str):
        ckpt = torch.load(path)
        keys = ["config", "trainer_config",
                "vocab", "emb_matrix", "state_dict"]
        for key in keys:
            if key not in ckpt:
                raise RuntimeError(f"Missing key {key} in checkpoint")
        new_model = RecurrentClassifier(
            ckpt["config"], ckpt["vocab"], ckpt["emb_matrix"]
        )
        new_model.load_state_dict(ckpt["state_dict"])
        new_trainer = cls(ckpt["trainer_config"])
        new_trainer.model = new_model
        new_trainer.model.to(new_trainer.device)
        return new_trainer

In [88]:
device = lambda: 'cuda' if torch.cuda.is_available() else 'cpu'

In [133]:
train_config = {
    "lr": 4e-3,
    "n_epochs": 10,
    "weight_decay": 1e-6,
    "batch_size": 128,
    "device": device()
}
for key,val in train_config.items():
  wandb_config[key] = val

In [135]:
train_dataloader = DataLoader(train_dataset,
                              batch_size=train_config["batch_size"], shuffle=True,
                              collate_fn=train_dataset.collate_fn)

val_dataloader = DataLoader(val_dataset,
                            batch_size=train_config["batch_size"], shuffle=False,
                            collate_fn=val_dataset.collate_fn)

test_dataloader = DataLoader(TextDataset([tok.tokenize(sent) for sent in test_df['text'].values],
                            [-1] * test_df.shape[0], vocab), shuffle=False,
                             batch_size=train_config["batch_size"],
                             collate_fn=val_dataset.collate_fn)

In [None]:
training = Trainer(train_config)
training.fit(model, train_dataloader, val_dataloader)

## inference

In [115]:
preds = training.predict(test_dataloader)

In [122]:
len(preds)

12167

In [126]:
result = pd.DataFrame({'rate':preds+1})
result.to_csv('submissions.csv', index=True, index_label='index')