In [None]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 9.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 326 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm_notebook as tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, BertTokenizerFast
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset CSV and the
# saved checkpoint from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Hugging Face checkpoint name used for both the tokenizer and the encoder.
name_model = "cointegrated/LaBSE-en-ru"

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'device: {device}')

In [None]:
# Load the dataset from the mounted Google Drive (the drive.mount cell must have run first).
df = pd.read_csv('/content/drive/MyDrive/NLP/NLP_project/Data/Dataset_with_target.csv')

In [None]:
# Preview the first rows; the columns used later are `question_text` and `target`.
df.head()

Unnamed: 0,question_text,answer_text,target
0,Добрый день! Мы бы хотели пройти бесплатное о...,Здравствуйте! Регистрируйтесь на сайте https:/...,0
1,Добрый день.Помогите пожалуйста пройти регистр...,Добрый вечер!В системе на Вашу почту зарегистр...,0
2,Здравствуйте. Начала изучать курс для учителя ...,Здравствуйте! Возможно Ваша презентация имеет ...,0
3,"13 лет,интересует стоимость",Здравствуйте! Курс бесплатный -- можете регист...,0
4,"Здравствуйте, подскажите, пожалуйста, я прошёл...","Здравствуйте! Сертификат придет Вам на почту, ...",0


In [None]:
# Stratified 70/15/15 split: hold out 30% of the rows, then cut that holdout in
# half to get the validation and test sets. Both calls stratify on `target` and
# pass a fixed random_state.
df_train, tmp = train_test_split(df, random_state=42,  stratify=df['target'], test_size=0.3)
df_val, df_test = train_test_split(tmp, random_state=42,  stratify=tmp['target'], test_size=0.5)

# Train model

In [None]:
class model_dataset(Dataset):
    """Torch dataset that tokenizes `question_text` and pairs it with `target`.

    Args:
        df: DataFrame with a `question_text` column (the text to encode) and
            a `target` column (the label, converted to a torch.long tensor).
        tokenizer: Object exposing a Hugging Face style `encode_plus` method.
        max_length: Length each sample is truncated or padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        # Extract both columns once. __getitem__ indexes these NumPy arrays by
        # position, so the frame's index labels do not matter.
        self.text = df['question_text'].values
        self.targets = df['target'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded sample at position `index` as a dict of tensors."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
        )

        return {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long).flatten(),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long).flatten(),
            'target': torch.tensor(self.targets[index], dtype=torch.long),
        }

In [None]:
# Tokenizer for the checkpoint named in `name_model`.
tokenizer = BertTokenizerFast.from_pretrained(name_model)

# Wrap the train/validation frames; every question is truncated or padded to 300 tokens.
train_dataset = model_dataset(df_train, tokenizer=tokenizer, max_length=300)
val_dataset = model_dataset(df_val, tokenizer=tokenizer, max_length=300)

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Work with a dataloader

In [None]:
# Notebook-level loaders. In the cells visible in this notebook only
# len(train_dataloader) is used (to size the LR schedule); train_epoch and
# eval_model build their own DataLoader from the dataset they receive.
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

Classification

In [None]:
class QuestionsClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head."""

    def __init__(self, n_classes):
        super().__init__()
        # The attribute names below become state_dict key prefixes, so they
        # must stay stable for saved checkpoints to load.
        self.bert = AutoModel.from_pretrained(name_model)
        self.drop = nn.Dropout(p=0.2)
        hidden_dim = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_dim, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return class scores computed from the first token's hidden state."""
        encoder_out = self.bert(input_ids=input_ids,
                                attention_mask=attention_mask)
        # Position 0 along the sequence axis is used as the sentence representation.
        first_token = encoder_out['last_hidden_state'][:, 0, :]
        return self.out(self.drop(first_token))

In [None]:
# Size the classification head from the number of distinct labels in the training split.
n_class = df_train['target'].nunique(dropna=False)
print(f'n_class: {n_class}')
bert_model = QuestionsClassifier(n_class)

n_class: 2


Downloading:   0%|          | 0.00/492M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import gc
def cleanup():
    """
    Free memory between training steps.

    Runs Python's garbage collector first, then asks PyTorch to release its
    cached CUDA memory. Takes no arguments and returns nothing.
    """
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [None]:
# Move the model to the selected device; the trailing `;` suppresses the cell's output.
bert_model.to(device);

Prepare to fine-tune our model

In [None]:
from transformers import get_linear_schedule_with_warmup

# AdamW over every parameter of the model (encoder and classification head).
optimizer = AdamW(bert_model.parameters(),
                  lr = 2e-5 # fine-tuning learning rate (original note: args.learning_rate default is 5e-5)
                )

epochs = 11
# train_epoch steps the scheduler once per batch, so the schedule must span
# every batch of every epoch.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (linear schedule over total_steps, no warmup).
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
# Cross-entropy over the classifier's per-class scores.
loss_fn = nn.CrossEntropyLoss().to(device)

Function to train 1 epoch

In [None]:
def train_epoch(model, dataset, loss_fn,
                optimizer, device, scheduler, n_examples, batch_size=32):
    """Train `model` for one shuffled pass over `dataset`.

    Args:
        model: Module called as model(input_ids=..., attention_mask=...) that
            returns one score per class for each sample.
        dataset: Dataset whose items are dicts with 'input_ids',
            'attention_mask' and 'target' tensors.
        loss_fn: Loss applied to (model outputs, targets).
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for accuracy.
        batch_size: Batch size of the internal DataLoader (default 32, the
            previously hard-coded value). The scheduler's total step count
            should be derived from the same batch size.

    Returns:
        Tuple (accuracy, mean_loss): accuracy is a 0-dim float64 tensor,
        mean_loss is np.mean over the per-batch losses.
    """
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = model.train()

    losses = []
    # Tensor accumulator so the final .double() also works when the loader
    # yields no batches (a plain int has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for i, d in enumerate(tqdm(data_loader)):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['target'].to(device)

        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask)

        # Predicted class = index of the highest score.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # Periodically release cached memory during the epoch.
        if i % 100 == 0:
            cleanup()
    cleanup()
    return correct_predictions.double() / n_examples, np.mean(losses) #Accuracy and Loss

Function to evaluate our model

In [None]:
def eval_model(model, dataset, loss_fn, device, n_examples, batch_size=32):
    """Evaluate `model` on `dataset` without updating its weights.

    Args:
        model: Module called as model(input_ids=..., attention_mask=...) that
            returns one score per class for each sample.
        dataset: Dataset whose items are dicts with 'input_ids',
            'attention_mask' and 'target' tensors.
        loss_fn: Loss applied to (model outputs, targets).
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for accuracy.
        batch_size: Batch size of the internal DataLoader (default 32, the
            previously hard-coded value).

    Returns:
        Tuple (accuracy, mean_loss): accuracy is a 0-dim float64 tensor,
        mean_loss is np.mean over the per-batch losses.
    """
    # Evaluation does not need shuffling: accuracy and the loss values do not
    # depend on sample order in eval mode.
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    model = model.eval()

    losses = []
    # Tensor accumulator so the final .double() also works when the loader
    # yields no batches (a plain int has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    with torch.no_grad():
        for i, d in enumerate(tqdm(data_loader)):
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['target'].to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)

            # Predicted class = index of the highest score.
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            if i % 3000 == 0:
                cleanup()
        cleanup()

    return correct_predictions.double() / n_examples, np.mean(losses) #Accuracy and Loss



Fine-tune our model

In [None]:
# Best validation accuracy seen so far; a checkpoint is written only when it improves.
best_accuracy = 0

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model=bert_model, 
                                        dataset=train_dataset, 
                                        loss_fn=loss_fn, 
                                        optimizer=optimizer, 
                                        device=device, 
                                        scheduler=scheduler, 
                                        n_examples=len(df_train))
    
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model=bert_model, 
                                    dataset=val_dataset, 
                                    loss_fn=loss_fn,
                                    device=device,
                                    n_examples=len(df_val))
    
    print(f'Val loss:{val_loss}  accuracy:{val_acc}')

    # Compare against and update the SAME variable. The previous spelling
    # mismatch left the threshold at 0, so every epoch overwrote the checkpoint.
    if val_acc > best_accuracy:
        torch.save(bert_model.state_dict(), 'classification_bert.bin')
        best_accuracy = val_acc



Epoch 1/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.39680495256414783 accuracy 0.8626506024096386


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:0.38904238913370215  accuracy:0.8665730337078651
Epoch 2/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.34245651946044886 accuracy 0.8695783132530122


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:0.37834863299908844  accuracy:0.8637640449438202
Epoch 3/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.2822072460817603 accuracy 0.8843373493975905


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:0.4341103211045265  accuracy:0.8609550561797753
Epoch 4/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.19502259884029627 accuracy 0.9177710843373494


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:0.5086524214433588  accuracy:0.8230337078651685
Epoch 5/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.12038362097854798 accuracy 0.9557228915662651


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:0.6792508648789447  accuracy:0.8061797752808989
Epoch 6/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.05743224802203118 accuracy 0.9810240963855422


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:1.0110301770593808  accuracy:0.8581460674157303
Epoch 7/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.02841454795662475 accuracy 0.9900602409638555


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:1.206397545726403  accuracy:0.8623595505617977
Epoch 8/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.01890242706143069 accuracy 0.9945783132530122


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:1.2051043627054796  accuracy:0.8497191011235955
Epoch 9/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.007070395815704755 accuracy 0.9978915662650604


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:1.1842963967634283  accuracy:0.8441011235955056
Epoch 10/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.0015904328681874457 accuracy 0.999698795180723


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:1.2352342294610066  accuracy:0.8469101123595505
Epoch 11/11
----------


  0%|          | 0/104 [00:00<?, ?it/s]

Train loss 0.0022798331795526724 accuracy 0.999698795180723


  0%|          | 0/23 [00:00<?, ?it/s]

Val loss:1.2263037268396304  accuracy:0.8497191011235955


# Test model

In [None]:
# Same checkpoint name and device selection as the configuration cell earlier in the notebook.
name_model = "cointegrated/LaBSE-en-ru"
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')

device: cuda:0


Load our fine tuned model

In [None]:
# Rebuild the classifier with the same number of classes, then restore the fine-tuned weights.
n_class = len(df['target'].unique())
print(f'n_class = {n_class}')
bert_model = QuestionsClassifier(n_class)
tokenizer = BertTokenizerFast.from_pretrained(name_model)
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime too.
# NOTE(review): this path differs from 'classification_bert.bin' written by the
# training loop - confirm which checkpoint is intended.
bert_model.load_state_dict(
    torch.load('/content/drive/MyDrive/NLP/NLP_project/good work/Data/classification_bert1.bin',
               map_location=device)
)
bert_model.eval();

n_class = 2


Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Move the reloaded model to the selected device; the trailing `;` suppresses the cell's output.
bert_model.to(device);

In [None]:
# Loss used to report the test loss, and the held-out test split wrapped with
# the same max_length (300) as the train/validation datasets.
loss_fn = nn.CrossEntropyLoss().to(device)
test_dataset = model_dataset(df_test, tokenizer=tokenizer, max_length=300)

Evaluation of the BERT model on the test set

In [None]:
# Score the reloaded model on the held-out test split.
test_acc, test_loss = eval_model(model=bert_model,
                                 dataset=test_dataset,
                                 loss_fn=loss_fn,
                                 device=device,
                                 n_examples=len(df_test))

# Report the loss under the loss label (the accuracy was previously printed twice).
print(f'Test loss:{test_loss}  accuracy:{test_acc}')

  0%|          | 0/23 [00:00<?, ?it/s]

Test loss:0.8497191011235955  accuracy:0.8497191011235955
