<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/Transfer_learning/Comments_rating_ruBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rating NLP classification

## Install libs

In [1]:
%%capture
!pip install transformers

## Imports

In [69]:
import os
import pandas as pd
import torch
import torch.nn.functional as F

from numpy import asarray
from sklearn.model_selection import train_test_split
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertPreTrainedModel
from typing import Dict, Union, Optional, Tuple

## Get data ready

In [4]:
MAX_LEN = 200
BATCH_SIZE = 128

### Download data

In [3]:
%%capture
!wget https://raw.githubusercontent.com/Dimildizio/DS_course/main/Neural_networks/Transformers/data/train_rating.csv
!wget https://raw.githubusercontent.com/Dimildizio/DS_course/main/Neural_networks/Transformers/data/test_rating.csv

### Upload data to pd and "label encode"

In [76]:
train_data = pd.read_csv("train_rating.csv")
train_data['rate'] = train_data['rate']-1

test_data =  pd.read_csv("test_rating.csv")

### Split data

In [77]:
train, val = train_test_split(train_data, stratify=train_data["rate"], test_size=0.15, random_state=42)

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

### Class for dataset

In [78]:
class MyDataset(Dataset):
  """
    Custom dataset class for processing text data.

    Args:
        dataframe: Pandas DataFrame containing the text data.
        tokenizer: Tokenizer to encode text.
        max_seq_len: Maximum sequence length for padding.

    Attributes:
        data: DataFrame containing text data.
        text: Series containing the text.
        targets: Series containing target values (if available).
        tokenizer: Tokenizer for encoding text.
        max_seq_len: Maximum sequence length for padding.

    Methods:
        __getitem__(index):
            Retrieves a single data point by index.
        __len__():
            Returns the total number of data points.

  """
  def __init__(self, dataframe, tokenizer, max_seq_len):
    self.data = dataframe
    self.text = dataframe['text']
    self.targets = None
    if 'rate' in dataframe:
      self.targets = dataframe['rate']
    self.tokenizer = tokenizer
    self.max_seq_len = max_seq_len


  def __getitem__(self, index):
    """
        Retrieves a single data point by index.

        Args:
            index: Index of the data point.

        Returns:
            Dict: Dictionary containing 'ids', 'mask', and 'rate' (if available).
    """
    text = str(self.text[index])
    text = ' '.join(text.split())

    inputs = self.tokenizer.encode_plus(text, add_special_tokens=True,
                                        max_length=self.max_seq_len,
                                        pad_to_max_length=True,
                                        return_token_type_ids=True,
                                        truncation=True)
    ids = inputs['input_ids']
    mask = inputs['attention_mask']

    if self.targets is not None:
      return {'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(mask, dtype=torch.long),
              'rate': torch.tensor(self.targets[index], dtype=torch.long)}
    else:
      return {'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(mask, dtype=torch.long)}


  def __len__(self) -> int:
    return len(self.text)

## Model

In [79]:
class DistilBertForClassification(torch.nn.Module):
    """
    DistilBERT model for sequence classification.

    Args:
        distil_bert_model: Pre-trained DistilBERT model.
        config (Dict): Configuration dictionary for the model.

    Attributes:
        model_name (str): Name of the DistilBERT model.
        config (Dict): Configuration dictionary for the model.
        n_classes (int): Number of classes for classification.
        dropout_rate (float): Dropout rate.
        bert (DistilBertModel): Pre-trained DistilBERT model for sequence classification.
        pre_classifier (Linear): Linear layer for pre-classification.
        dropout (Dropout): Dropout layer.
        classifier (Linear): Linear layer for classification.

    Methods:
        forward(input_ids, attention_mask):
            Forward pass for the model.

    """
    def __init__(self, distil_bert_model, config: Dict):
        """
        Initialize the DistilBertForClassification model.
        Args:
            distil_bert_model: Pre-trained DistilRuBERT model.
            config (Dict): Configuration dictionary for the model.
        """
        super(DistilBertForClassification, self).__init__()
        self.model_name = 'distilrubert-small-cased-conversational'
        self.config = config
        self.n_classes = config['num_classes']
        self.dropout_rate = config['dropout_rate']
        self.bert = distil_bert_model
        self.pre_classifier = torch.nn.Linear(768, 128)
        self.dropout = torch.nn.Dropout(self.dropout_rate)
        self.classifier = torch.nn.Linear(128, self.n_classes)


    def forward(self,input_ids: Tensor, attention_mask: Tensor) -> Tensor:
        """
        Forward pass for the model.
        Args:
            input_ids (torch.Tensor): Input token IDs.
            attention_mask (torch.Tensor): Attention mask.
        Returns:
            Tensor: Logits produced by the model.
        """
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output.last_hidden_state
        hidden_state = hidden_state[:, 0]
        hidden_state = self.pre_classifier(hidden_state)
        hidden_state = torch.nn.ReLU()(hidden_state)
        hidden_state = self.dropout(hidden_state)
        output = self.classifier(hidden_state)
        return output


## Trainer to train the model

In [80]:
class Trainer:
    def __init__(self, config: Dict[str, Union[int, float, str]]):
      '''
      Initialize the Trainer with a configuration dictionary.
      '''
      self.config = config
      self.n_epochs = config['n_epochs']
      self.model = None
      self.optimizer = None
      self.opt_fn = lambda model: AdamW(model.parameters(), config['lr'])
      self.loss_fn = CrossEntropyLoss()
      self.history = None
      self.device = config['device']
      self.verbose = config.get('verbose', True)


    def batchto(self, batch: Dict[str, Tensor]) -> Tuple[Tensor, Tensor, Tensor]:
      """
      Move the batch to the specified device and convert to the appropriate data types.
      Args:
          batch (Dict[str, Tensor]): Dictionary containing 'ids', 'mask', and 'rate'.
      Returns:
          Tuple[Tensor, Tensor, Tensor]: Tuple containing 'ids', 'mask', and 'targets'.

      """
      ids = batch['ids'].to(self.device, dtype=torch.long)
      mask = batch['mask'].to(self.device, dtype=torch.long)
      targets = batch['rate'].to(self.device, dtype=torch.long)
      return ids, mask, targets


    def fit(self, model: torch.nn.Module, train_dataloader: DataLoader,
            val_dataloader: Optional[DataLoader] = None) -> torch.nn.Module:
      '''
      Train the model for a specified number of epochs.
      '''
      self.model = model.to(self.device)
      self.optimizer = self.opt_fn(model)
      scheduler = StepLR(self.optimizer, step_size=2, gamma=0.1)

      self.history = {'train_loss': [],
                        'val_loss': [],
                        'val_acc': []}

      for epoch in range(self.n_epochs):
          print(f"Epoch {epoch + 1}/{self.n_epochs}")
          train_info = self.train_epoch(train_dataloader)
          self.history['train_loss'].extend(train_info['loss'])
          if val_dataloader != None:
            val_info = self.val_epoch(val_dataloader)
            self.history['val_loss'].extend([val_info['loss']])
            self.history['val_acc'].extend([val_info['acc']])
          scheduler.step()
      return self.model.eval()


    def train_epoch(self, train_dataloader: DataLoader) -> Dict[str, list]:
        '''
        Train the model for one epoch.
        '''
        self.model.train()
        losses = []
        if self.verbose:
            train_dataloader = tqdm(train_dataloader)
        for batch in train_dataloader:
            ids, mask,targets = self.batchto(batch)

            outputs = self.model(ids, mask)
            loss = self.loss_fn(outputs, targets)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            iloss = loss.item()
            if self.verbose:
                train_dataloader.set_description(f"Loss: {iloss:.4}")
            losses.append(iloss)
        return {'loss': losses}


    def val_epoch(self, val_dataloader: DataLoader) -> Dict[str, Union[float, int]]:
      '''
      Evaluate the model on the validation set for one epoch.
      '''
      self.model.eval()
      logits = []
      labels = []
      val_dataloader = tqdm(val_dataloader)
      with torch.no_grad():
        for batch in val_dataloader:
          ids, mask, targets = self.batchto(batch)

          outputs = self.model(ids, mask)
          logits.append(outputs)
          labels.append(targets)

      labels = torch.cat(labels).to(self.device)
      logits = torch.cat(logits).to(self.device)
      loss = self.loss_fn(logits, labels).item()
      acc = (logits.argmax(1) == labels).float().mean().item()
      print('Accuracy:', acc)
      val_dataloader.set_description(f"Loss={loss: .4}\nAccuracy:{acc:.4}")
      metrics = {'acc': acc, 'loss': loss}
      return metrics

    def predict(self, test_dataloader):
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch in test_dataloader:
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                outputs = self.model(ids, mask)
                predictions.extend(outputs.argmax(1).tolist())
        return asarray(predictions)


## Create instances for the  model and tokenizer

### Set up configuration and parametes

In [81]:
train_params = { "batch_size": BATCH_SIZE,
                "shuffle": True,
                 "num_workers": 0}

test_params = {"batch_size": BATCH_SIZE,
               "shuffle": False,
               "num_workers": 0}

config = {"num_classes": 5,
          "dropout_rate": 0.1,
          "dim": 768}

trainer_config = {"batch_size": 128,
                  "n_epochs": 5,
                  "lr": 4e-6,
                  "weight_decay": 1e-6,
                  "device": "cuda" if torch.cuda.is_available() else "cpu"}

### Create tokenizer and the model

In [82]:
# Load the Russian tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('DeepPavlov/distilrubert-small-cased-conversational')
# Load the Russian DistilBERT model
russian_model = DistilBertModel.from_pretrained('DeepPavlov/distilrubert-small-cased-conversational')
# Create model
russian_model = DistilBertForClassification(russian_model, config=config)

### Set up dataset

In [83]:
#train
train_dataset = MyDataset(train, tokenizer, MAX_LEN)
train_dataloader = DataLoader(train_dataset, **train_params)
#val
val_dataset = MyDataset(val, tokenizer, MAX_LEN)
val_dataloader = DataLoader(val_dataset, **test_params)
#test
test_dataset = MyDataset(test_data, tokenizer, MAX_LEN)
test_dataloader = DataLoader(test_dataset, **test_params)

## Train model

In [None]:
trainer = Trainer(trainer_config)
trainer.fit(russian_model, train_dataloader, val_dataloader)

### Active training

In [28]:
train_whole = MyDataset(train_data, tokenizer, MAX_LEN)
train_wholeloader = DataLoader(train_dataset, **train_params)
wholetrainer = Trainer(trainer_config)
wholetrainer.fit(russian_model, train_wholeloader, None)

RATE: True
Epoch 1/5


  0%|          | 0/324 [00:00<?, ?it/s]



Epoch 2/5


  0%|          | 0/324 [00:00<?, ?it/s]

Epoch 3/5


  0%|          | 0/324 [00:00<?, ?it/s]

Epoch 4/5


  0%|          | 0/324 [00:00<?, ?it/s]

Epoch 5/5


  0%|          | 0/324 [00:00<?, ?it/s]

DistilBertForClassification(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-1): 2 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (l

In [11]:
!nvidia-smi

Sat Nov 11 15:26:16 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Create predictions csv

In [29]:
df = test_data.copy()

predictions = wholetrainer.predict(test_dataloader)
df['rate'] = predictions+1

result_df = df[['index', 'rate']].copy()
result_df.to_csv('predictions.csv', index=False)