<a href="https://colab.research.google.com/github/EleonoraBaim/NPS_Dialogue_system/blob/main/bert_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries



In [6]:
pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 330 kB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [7]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys
import tqdm
from transformers import BertTokenizer, BertModel

project_path = "/content/drive/MyDrive/Colab_Notebooks/NPS_dialogue_system/"
dataset_folder = "ready_datasets/"
tokenizer_model_path = "BERT_models/rubert-base-cased-conversational"
classification_model_path = "BERT_models/rubert-base-cased-conversational2"
ckpt_path = str(project_path + "model_parts/curr_ckpt.pt")
best_model_path = str(project_path + "model_parts/bert/best_model.pt")
labels_path = str(project_path + "model_parts/categories.json")

In [2]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, hamming_loss, accuracy_score

In [4]:
from sklearn import metrics

In [8]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [9]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# Data Importing

In [10]:
#IMPORTING DATA
print('Importing data...', '\n')

train_df = pd.read_csv(str(project_path + dataset_folder +'train_dataset.csv'))
train_df.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

test_df = pd.read_csv(str(project_path + dataset_folder + 'test_dataset.csv'))
test_df.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

val_df = pd.read_csv(str(project_path + dataset_folder + 'val_dataset.csv'))
val_df.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

text = 'CONTEXT'
target_list = train_df.columns[(train_df.columns!='CONTEXT')&(train_df.columns!='normalized')]

Importing data... 



In [11]:
# Mean length of sentence
lens = []
for t in train_df[text]:
  lens.append(len(t))

np.array(lens).mean()

# hyperparameters
MAX_LEN = 100
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 4
LEARNING_RATE = 1e-05
#LEARNING_RATE = 0.01

# Tokenization

In [12]:
# Tokenization
print( 'Tokenizer ...', '\n')
tokenizer = BertTokenizer.from_pretrained(project_path + tokenizer_model_path)

class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['CONTEXT']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

Tokenizer ... 



In [13]:
# Creating Dataloaders
print('Creating Dataloaders ...', '\n')

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)

train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(test_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

Creating Dataloaders ... 



# Model initialization

In [14]:
# Model initialization
print('Model initialization...')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device,'\n')

Model initialization...
Using device: cpu 



In [15]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath, map_location=torch.device('cpu'))
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer
    #checkpoint['epoch']
    #, valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [16]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained(str(project_path+classification_model_path), return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, len(target_list))
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [17]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Model Training

In [None]:
val_targets=[]
val_outputs=[]

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
   
  # initialize tracker for minimum validation loss
  valid_loss_min = np.Inf
   
 
  for epoch in range(1, n_epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(training_loader):
        print('yyy epoch', batch_idx)
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if batch_idx%5000==0:
            print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print('before loss data in training', loss.item(), train_loss)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        print('after loss data in training', loss.item(), train_loss)
    
    print('############# Epoch {}: Training End     #############'.format(epoch))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################    
    # validate the model #
    ######################
 
    model.eval()
   
    with torch.no_grad():
      for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            
            outputs = model(ids, mask, token_type_ids)
            

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      print('############# Epoch {}: Validation End     #############'.format(epoch))
      # calculate average losses
      print('before cal avg train loss', train_loss)
      train_loss = train_loss/len(training_loader)
      valid_loss = valid_loss/len(validation_loader)
      # print training/validation statistics 
      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))
      
      # create checkpoint variable and add important data
      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }
        
        # save checkpoint
      save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
        # save checkpoint as best model
        save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))

  return model

In [19]:
# Загрузка модели
load_ckp(best_model_path, model, optimizer)

(BERTClass(
   (bert_model): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(119547, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0): BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
before loss data in training 0.07008767127990723 0.06982482578745455
after loss data in training 0.07008767127990723 0.06982522045035613
yyy epoch 666
before loss data in training 0.06506309658288956 0.06982522045035613
after loss data in training 0.06506309658288956 0.06981808083436293
yyy epoch 667
before loss data in training 0.06380659341812134 0.06981808083436293
after loss data in training 0.06380659341812134 0.06980908160170389
yyy epoch 668
before loss data in training 0.06882757693529129 0.06980908160170389
after loss data in training 0.06882757693529129 0.06980761447963152
yyy epoch 669
before loss data in training 0.07645534723997116 0.06980761447963152
after loss data in training 0.07645534723997116 0.06981753646882606
yyy epoch 670
before loss data in training 0.07500267773866653 0.06981753646882606
after loss data in training 0.07500267773866653 0.06982526395208961
yyy epoch 671
before loss 

# Model Testing

In [18]:
from sklearn import metrics

val_preds = (np.array(val_outputs) > 0.5).astype(int)

accuracy = metrics.accuracy_score(val_targets, val_preds)
f1_score_micro = metrics.f1_score(val_targets, val_preds, average='micro')
f1_score_macro = metrics.f1_score(val_targets, val_preds, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")
print("Hamming Loss: ", hamming_loss(val_targets, val_preds))

NameError: ignored

In [None]:
from sklearn.metrics import multilabel_confusion_matrix as mcm, classification_report
cm_labels = target_list
cm = mcm(val_targets, val_preds)
print(cm)

print(classification_report(val_targets, val_preds))

[[[24207     9]
  [  352    83]]

 [[23630   169]
  [  164   688]]

 [[23125    80]
  [  710   736]]

 [[24247     5]
  [  313    86]]

 [[23149   128]
  [  592   782]]

 [[24352     5]
  [  217    77]]

 [[23627    61]
  [  195   768]]

 [[22702   371]
  [  239  1339]]

 [[24049    50]
  [  188   364]]

 [[24155    19]
  [  204   273]]

 [[23829    72]
  [   80   670]]

 [[24283     2]
  [  308    58]]

 [[23951    46]
  [  348   306]]

 [[23008   137]
  [  201  1305]]

 [[22958   199]
  [  376  1118]]

 [[24073    35]
  [   28   515]]

 [[24007    56]
  [  407   181]]

 [[24257    13]
  [  331    50]]

 [[24200    43]
  [  287   121]]

 [[24013    20]
  [  543    75]]

 [[23303   124]
  [   81  1143]]

 [[24120    36]
  [  358   137]]

 [[24072    24]
  [  405   150]]

 [[23881    68]
  [  395   307]]

 [[24035    22]
  [  525    69]]

 [[24137    10]
  [  450    54]]

 [[24440    19]
  [   80   112]]

 [[23264    88]
  [  153  1146]]

 [[24229     2]
  [  107   313]]

 [[21495   441

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#Testing
test_targets=[]
test_outputs=[]

# Put model in evaluation mode
model.eval()

with torch.no_grad():
    for batch_idx, data in tqdm.tqdm(enumerate(test_data_loader)):
        
        input_ids = data['input_ids'].to(device, dtype=torch.long)
        attention_mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        
        output = model(input_ids, attention_mask, token_type_ids)
        
        test_targets.extend(targets.cpu().detach().numpy().tolist())
        test_outputs.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())   
        
test_preds = (np.array(test_outputs) > 0.5).astype(int)

260it [42:35,  9.83s/it]


In [21]:
accuracy = metrics.accuracy_score(test_targets, test_preds)
f1_score_micro = metrics.f1_score(test_targets, test_preds, average='micro')
f1_score_macro = metrics.f1_score(test_targets, test_preds, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

from sklearn.metrics import multilabel_confusion_matrix as mcm, classification_report
cm_labels = target_list
cm = mcm(test_targets, test_preds)
print(cm)

print(classification_report(test_targets, test_preds))

Accuracy Score = 0.4275039170784621
F1 Score (Micro) = 0.6166506652036757
F1 Score (Macro) = 0.460537689590157
[[[8122   29]
  [  66   80]]

 [[7969   44]
  [  54  230]]

 [[7811    4]
  [ 462   20]]

 [[8160    4]
  [ 133    0]]

 [[7839    0]
  [ 455    3]]

 [[8201    1]
  [  56   39]]

 [[7957   19]
  [ 275   46]]

 [[7754   18]
  [ 469   56]]

 [[8091   22]
  [  23  161]]

 [[8136    1]
  [ 121   39]]

 [[8031   16]
  [  55  195]]

 [[8169    5]
  [  90   33]]

 [[8044   35]
  [  93  125]]

 [[7578  217]
  [  80  422]]

 [[7656  143]
  [ 117  381]]

 [[8116    0]
  [ 181    0]]

 [[8090   11]
  [ 160   36]]

 [[8167    3]
  [ 119    8]]

 [[8146   14]
  [ 100   37]]

 [[8080   10]
  [ 146   61]]

 [[7854   35]
  [  26  382]]

 [[8096   36]
  [  72   93]]

 [[8105    6]
  [ 105   81]]

 [[8036   27]
  [  77  157]]

 [[8092    8]
  [ 147   50]]

 [[8123    5]
  [ 129   40]]

 [[8233    0]
  [  64    0]]

 [[7807   58]
  [  53  379]]

 [[8152    4]
  [   8  133]]

 [[7161  231]
  [  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
print("Hamming Loss: ", hamming_loss(test_targets, test_preds))

Hamming Loss:  0.025912980595395926


In [None]:
# testing
example = 'км ну потому что очень часто заказываю всегда товар хороший и быстрая доставка даже с другого города жалоб нету вам спасибо kaspi shopping'

encodings = tokenizer.encode_plus(
    example,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

model.eval()

with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model(input_ids, attention_mask, token_type_ids)
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    #print(train_df.columns[1:].to_list()[int(np.argmax(final_output, axis=1))])

In [None]:
final_preds = (np.array(final_output) > 0.3).astype(int)

import json
labels_dict = json.load(open(labels_path, 'r', encoding='utf-8'))
pred_target_df = pd.DataFrame({'Preds':final_preds[0], "Target": list(labels_dict.values())})
list(pred_target_df[pred_target_df.Preds==1].iloc[:,1])

['Позитив -> Магазин на Kaspi.kz -> Получение -> Нравится доставка...']