In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [4]:
import torch
import random

RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


In [5]:
from datasets import load_dataset

dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes")

Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/699k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [6]:
unique, counts = np.unique(dataset['train']['label'], return_counts=True)

In [7]:
counts

array([4265, 4265])

Нет дисбаланса классов, их строго одинаковое количество. 

Поскольку нет дисбаланса классов и при этом классы равнозначны, т.е. цена ошибки на обоих классах одинакова, то будем в дальнейшем в качестве метрики классификации использовать accuracy

Перейдем к модели

In [8]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [10]:
def tokenize_function(example):
    return tokenizer(example["text"])

In [11]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorWithPadding

2024-06-14 06:34:33.070298: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-14 06:34:33.070402: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-14 06:34:33.185768: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [16]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [17]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [18]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [19]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Device used: {}.".format(device))

Device used: cuda.


In [20]:
from torch import nn

in_features = 768
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class BertWithClassifier(nn.Module):
    def __init__(self, linear_size):
        super(BertWithClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.head = nn.Sequential(
            nn.Dropout(),
            nn.Linear(in_features=in_features, out_features=linear_size),
            nn.BatchNorm1d(num_features=linear_size),
            nn.Dropout(p=0.8),
            nn.Linear(in_features=linear_size, out_features=1),
            # nn.BatchNorm1d(num_features=1),
            nn.Sigmoid()
        )
        
    def forward(self, tokens, attention_mask):
        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        y = self.head(bert_output[1]) 
        return y
        
    def freeze_bert(self):
        for param in self.bert.named_parameters():
            param[1].requires_grad=False
    
    def unfreeze_bert(self):
        for param in self.bert.named_parameters():
            param[1].requires_grad=True



Пока потренируем только голову, это будет бейзлайном

In [21]:
# parameters
num_of_epochs = 24
learning_rate = 27e-6
batch_size = 16
hidden_layers = 8

print("Epochs: {}".format(num_of_epochs))
print("Learning rate: {:.6f}".format(learning_rate))
print("Batch size: {}".format(batch_size))
print("The number of hidden layers in the custom head: {}".format(hidden_layers))

Epochs: 24
Learning rate: 0.000027
Batch size: 16
The number of hidden layers in the custom head: 8


In [22]:
model = BertWithClassifier(linear_size=hidden_layers)
model.to(device)

BertWithClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [23]:
from transformers import AdamW


optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()



In [24]:
from transformers import get_scheduler

num_training_steps = num_of_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

25608


In [25]:
model.train()
model.freeze_bert()

In [25]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 49]),
 'token_type_ids': torch.Size([8, 49]),
 'attention_mask': torch.Size([8, 49])}

In [26]:
from tqdm.auto import tqdm

In [26]:


progress_bar = tqdm(range(num_training_steps))

model.train()
model.freeze_bert()

for epoch in range(num_of_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = torch.flatten(model(tokens=batch['input_ids'], attention_mask=batch['attention_mask']))
        loss = loss_fn(outputs, batch['labels'].float())
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/25608 [00:00<?, ?it/s]

In [27]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def eval_prediction(y_batch_actual, y_batch_predicted):
    """Return batches of accuracy and f1 scores."""
    y_batch_actual_np = y_batch_actual.cpu().detach().numpy()
    y_batch_predicted_np = np.round(y_batch_predicted.cpu().detach().numpy())
    
    acc = accuracy_score(y_true=y_batch_actual_np, y_pred=y_batch_predicted_np)
    f1 = f1_score(y_true=y_batch_actual_np, y_pred=y_batch_predicted_np, average='weighted')
    
    return acc, f1

In [31]:
!pip install evaluate

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [29]:
model.eval()
model.freeze_bert()

size = len(eval_dataloader)
f1, acc = 0, 0

with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        X = batch['input_ids']
        attention_mask = batch['attention_mask']
        y = batch['labels']

        pred = model(tokens=X, attention_mask=attention_mask)

        acc_batch, f1_batch = eval_prediction(y.float(), pred)                        
        acc += acc_batch
        f1 += f1_batch

    acc = acc/size
    f1 = f1/size

acc, f1

(0.6222014925373134, 0.7495407991676658)

In [30]:
# model = BertWithClassifier(linear_size=hidden_layers)
# model.to(device)

In [31]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
model.unfreeze_bert()

for epoch in range(num_of_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = torch.flatten(model(tokens=batch['input_ids'], attention_mask=batch['attention_mask']))
        loss = loss_fn(outputs, batch['labels'].float())
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/25608 [00:00<?, ?it/s]

In [32]:
model.eval()
model.freeze_bert()

size = len(eval_dataloader)
f1, acc = 0, 0

with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        X = batch['input_ids']
        attention_mask = batch['attention_mask']
        y = batch['labels']

        pred = model(tokens=X, attention_mask=attention_mask)

        acc_batch, f1_batch = eval_prediction(y.float(), pred)                        
        acc += acc_batch
        f1 += f1_batch

    acc = acc/size
    f1 = f1/size

acc, f1

(0.6231343283582089, 0.7514877742489693)

In [28]:
def training_step(dataloader, model, optimizer, loss_fn, if_freeze_bert):
    """Method to train the model"""
    
    model.train()
    model.freeze_bert() if if_freeze_bert else model.unfreeze_bert()
      
    epoch_loss = 0
    size = len(dataloader.dataset)
 
    for i, batch in enumerate(dataloader):        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        outputs = torch.flatten(model(tokens=input_ids, attention_mask=attention_mask))
                        
        optimizer.zero_grad()
        loss = loss_fn(outputs, labels.float())
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

In [29]:
def validation_step(dataloader, model, loss_fn):
    """Method to test the model's accuracy and loss on the validation set"""
    
    model.eval()
    model.freeze_bert()
    
    size = len(dataloader)
    f1, acc = 0, 0
    
    with torch.no_grad():
        for batch in dataloader:
            X = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            y = batch['labels'].to(device)
                  
            pred = model(tokens=X, attention_mask=attention_mask)
            
            acc_batch, f1_batch = eval_prediction(y.float(), pred)                        
            acc += acc_batch
            f1 += f1_batch

        acc = acc/size
        f1 = f1/size
                
    return acc, f1

In [35]:
model = BertWithClassifier(linear_size=hidden_layers)
model.to(device)

BertWithClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [30]:
# parameters
# num_of_epochs = 24
num_of_epochs = 13
# learning_rate = 1e-6
learning_rate = 1e-5
batch_size = 16
hidden_layers = 8

print("Epochs: {}".format(num_of_epochs))
print("Learning rate: {:.6f}".format(learning_rate))
print("Batch size: {}".format(batch_size))
print("The number of hidden layers in the custom head: {}".format(hidden_layers))

Epochs: 13
Learning rate: 0.000010
Batch size: 16
The number of hidden layers in the custom head: 8


In [31]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

In [38]:
from tqdm.auto import tqdm

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 5:
        if_freeze_bert = True
        print("Bert is freezed")
    else:
        if_freeze_bert = False
        print("Bert is not freezed")
    
    training_step(train_dataloader, model,optimizer, loss_fn, if_freeze_bert)
    train_acc, train_f1 = validation_step(train_dataloader, model, loss_fn)
    val_acc, val_f1 = validation_step(eval_dataloader, model, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model, path)

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.502, f1: 0.358
Validation results: 
Acc: 0.507, f1: 0.510
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.513, f1: 0.381
Validation results: 
Acc: 0.511, f1: 0.526
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.544, f1: 0.455
Validation results: 
Acc: 0.556, f1: 0.614
Epoch: #4
Bert is freezed
Training results: 
Acc: 0.554, f1: 0.479
Validation results: 
Acc: 0.564, f1: 0.634
Epoch: #5
Bert is freezed
Training results: 
Acc: 0.553, f1: 0.476
Validation results: 
Acc: 0.560, f1: 0.626
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.886, f1: 0.885
Validation results: 
Acc: 0.839, f1: 0.906
Epoch: #7
Bert is not freezed
Training results: 
Acc: 0.906, f1: 0.905
Validation results: 
Acc: 0.834, f1: 0.902
Epoch: #8
Bert is not freezed
Training results: 
Acc: 0.944, f1: 0.944
Validation results: 
Acc: 0.866, f1: 0.923
Epoch: #9
Bert is not freezed
Training results: 
Acc: 0.947, f1: 0.948
Validation results: 
Acc: 0.844, f1: 

Для базового Embedding MixUp буду опираться на эту статью: https://aclanthology.org/2020.coling-main.305.pdf

Эмбеддинги смешиваются на выходе из пулера берта, перед подачей в классификационную голову. 

In [39]:
best_acc

0.8684701492537313

In [40]:
# next(iter(train_dataloader))['input_ids'].shape

In [32]:
from torch import nn

in_features = 768
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class BertWithClassifierMixUp(nn.Module):
    def __init__(self, linear_size):
        super(BertWithClassifierMixUp, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.head = nn.Sequential(
            nn.Dropout(),
            nn.Linear(in_features=in_features, out_features=linear_size),
            nn.BatchNorm1d(num_features=linear_size),
            nn.Dropout(p=0.8),
            nn.Linear(in_features=linear_size, out_features=1),
            # nn.BatchNorm1d(num_features=1),
            nn.Sigmoid()
        )
        
    def forward(self, tokens, attention_mask):
        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        y = self.head(bert_output[1]) 
        return y
    
    def forward_mixup(self, tokens1, attention_mask1, tokens2, attention_mask2, lam):
        bert_output1 = self.bert(input_ids=tokens1, attention_mask=attention_mask1)
        bert_output2 = self.bert(input_ids=tokens2, attention_mask=attention_mask2)
    
        bert_output = lam * bert_output1[1] + (1.0 - lam) * bert_output2[1]
    
        y = self.head(bert_output) 
        return y
        
    def freeze_bert(self):
        for param in self.bert.named_parameters():
            param[1].requires_grad=False
    
    def unfreeze_bert(self):
        for param in self.bert.named_parameters():
            param[1].requires_grad=True



In [33]:
def training_step_with_mixup(dataloader, model, optimizer, loss_fn, if_freeze_bert, lam):
    """Method to train the model"""
    
    model.train()
    model.freeze_bert() if if_freeze_bert else model.unfreeze_bert()
      
    epoch_loss = 0
    size = len(dataloader.dataset)
 
    for i, batch in enumerate(dataloader):
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

#         batch2 = next(iter(dataloader))
#         input_ids2 = batch2['input_ids'].to(device)
#         attention_mask2 = batch2['attention_mask'].to(device)
#         labels2 = batch2['labels'].to(device)
        
#         print(input_ids.shape)
#         print(attention_mask.shape)
#         print(labels.shape)
        
    
        model_answer = model.forward_mixup(input_ids, attention_mask, torch.flip(input_ids, dims=(0,)), torch.flip(attention_mask, dims=(0,)), lam)
    
        outputs = torch.flatten(model_answer)
        
        mixup_labels = lam * labels.float() + (1.0 - lam) * torch.flip(labels.float(), dims=(0,))
        
        optimizer.zero_grad()
        loss = loss_fn(outputs, mixup_labels)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

In [43]:

model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)

BertWithClassifierMixUp(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [20]:
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

NameError: name 'model_mixup' is not defined

In [45]:
from tqdm.auto import tqdm

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False
lam = 1

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 5:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        lam = 0.7
        print("Bert is not freezed")
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model, path)

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.565, f1: 0.496
Validation results: 
Acc: 0.583, f1: 0.655
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.609, f1: 0.609
Validation results: 
Acc: 0.597, f1: 0.725
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.591, f1: 0.578
Validation results: 
Acc: 0.580, f1: 0.694
Epoch: #4
Bert is freezed
Training results: 
Acc: 0.609, f1: 0.608
Validation results: 
Acc: 0.608, f1: 0.738
Epoch: #5
Bert is freezed
Training results: 
Acc: 0.618, f1: 0.609
Validation results: 
Acc: 0.615, f1: 0.739
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.860, f1: 0.860
Validation results: 
Acc: 0.825, f1: 0.896
Epoch: #7
Bert is not freezed
Training results: 
Acc: 0.895, f1: 0.894
Validation results: 
Acc: 0.840, f1: 0.905
Epoch: #8
Bert is not freezed
Training results: 
Acc: 0.918, f1: 0.918
Validation results: 
Acc: 0.851, f1: 0.913
Epoch: #9
Bert is not freezed
Training results: 
Acc: 0.930, f1: 0.930
Validation results: 
Acc: 0.835, f1: 

In [46]:
best_acc

0.8572761194029851

In [50]:
model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False
lam = 1

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 5:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        lam = 0.85
        print("Bert is not freezed")
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model, path)
        
best_acc



  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.492, f1: 0.359
Validation results: 
Acc: 0.503, f1: 0.516
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.501, f1: 0.356
Validation results: 
Acc: 0.497, f1: 0.496
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.534, f1: 0.440
Validation results: 
Acc: 0.525, f1: 0.564
Epoch: #4
Bert is freezed
Training results: 
Acc: 0.557, f1: 0.494
Validation results: 
Acc: 0.552, f1: 0.628
Epoch: #5
Bert is freezed
Training results: 
Acc: 0.574, f1: 0.534
Validation results: 
Acc: 0.563, f1: 0.653
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.873, f1: 0.873
Validation results: 
Acc: 0.835, f1: 0.903
Epoch: #7
Bert is not freezed
Training results: 
Acc: 0.908, f1: 0.908
Validation results: 
Acc: 0.838, f1: 0.907
Epoch: #8
Bert is not freezed
Training results: 
Acc: 0.928, f1: 0.928
Validation results: 
Acc: 0.850, f1: 0.913
Epoch: #9
Bert is not freezed
Training results: 
Acc: 0.945, f1: 0.945
Validation results: 
Acc: 0.855, f1: 

0.855410447761194

In [48]:
model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False
lam = 1

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 3:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        lam = 0.6
        print("Bert is not freezed")
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model, path)
        
best_acc



  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.500, f1: 0.353
Validation results: 
Acc: 0.503, f1: 0.502
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.506, f1: 0.366
Validation results: 
Acc: 0.505, f1: 0.511
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.531, f1: 0.421
Validation results: 
Acc: 0.535, f1: 0.574
Epoch: #4
Bert is not freezed
Training results: 
Acc: 0.822, f1: 0.822
Validation results: 
Acc: 0.815, f1: 0.890
Epoch: #5
Bert is not freezed
Training results: 
Acc: 0.866, f1: 0.866
Validation results: 
Acc: 0.827, f1: 0.897
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.886, f1: 0.886
Validation results: 
Acc: 0.835, f1: 0.902
Epoch: #7
Bert is not freezed
Training results: 
Acc: 0.902, f1: 0.902
Validation results: 
Acc: 0.844, f1: 0.908
Epoch: #8
Bert is not freezed
Training results: 
Acc: 0.924, f1: 0.924
Validation results: 
Acc: 0.844, f1: 0.910
Epoch: #9
Bert is not freezed
Training results: 
Acc: 0.938, f1: 0.938
Validation results: 
Acc: 0.8

0.8526119402985075

In [51]:
model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model_with_mix.pt'
if_freeze_bert = False
lam = 1

num_of_epochs = 20

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 5:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        print("Bert is not freezed")
    
    if i >= 10:
        lam = 0.9
        print("mix_up_on")
        
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model_mixup, path)
        
best_acc



  0%|          | 0/20 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.568, f1: 0.545
Validation results: 
Acc: 0.584, f1: 0.705
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.557, f1: 0.492
Validation results: 
Acc: 0.571, f1: 0.649
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.572, f1: 0.526
Validation results: 
Acc: 0.590, f1: 0.683
Epoch: #4
Bert is freezed
Training results: 
Acc: 0.589, f1: 0.565
Validation results: 
Acc: 0.590, f1: 0.704
Epoch: #5
Bert is freezed
Training results: 
Acc: 0.603, f1: 0.602
Validation results: 
Acc: 0.607, f1: 0.737
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.860, f1: 0.859
Validation results: 
Acc: 0.828, f1: 0.898
Epoch: #7
Bert is not freezed
Training results: 
Acc: 0.919, f1: 0.919
Validation results: 
Acc: 0.855, f1: 0.916
Epoch: #8
Bert is not freezed
Training results: 
Acc: 0.938, f1: 0.937
Validation results: 
Acc: 0.853, f1: 0.915
Epoch: #9
Bert is not freezed
Training results: 
Acc: 0.949, f1: 0.949
Validation results: 
Acc: 0.841, f1: 

0.8600746268656716

In [53]:
lam = 0.95
if_freeze_bert = False

for i in tqdm(range(5)):
    print("Epoch: #{}".format(i+1))
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model_mixup, path)
        
best_acc

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: #1
Training results: 
Acc: 0.994, f1: 0.994
Validation results: 
Acc: 0.854, f1: 0.916
Epoch: #2
Training results: 
Acc: 0.994, f1: 0.994
Validation results: 
Acc: 0.857, f1: 0.918
Epoch: #3
Training results: 
Acc: 0.993, f1: 0.993
Validation results: 
Acc: 0.856, f1: 0.917
Epoch: #4
Training results: 
Acc: 0.995, f1: 0.995
Validation results: 
Acc: 0.851, f1: 0.915
Epoch: #5
Training results: 
Acc: 0.996, f1: 0.996
Validation results: 
Acc: 0.850, f1: 0.913


0.8600746268656716

In [30]:
model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False
lam = 1

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 5:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        lam = 0.95
        print("Bert is not freezed")
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model_mixup, path)
        
best_acc



  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.508, f1: 0.368
Validation results: 
Acc: 0.504, f1: 0.507
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.538, f1: 0.441
Validation results: 
Acc: 0.536, f1: 0.588
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.578, f1: 0.541
Validation results: 
Acc: 0.581, f1: 0.681
Epoch: #4
Bert is freezed
Training results: 
Acc: 0.554, f1: 0.476
Validation results: 
Acc: 0.561, f1: 0.630
Epoch: #5
Bert is freezed
Training results: 
Acc: 0.571, f1: 0.518
Validation results: 
Acc: 0.574, f1: 0.656
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.879, f1: 0.879
Validation results: 
Acc: 0.850, f1: 0.912
Epoch: #7
Bert is not freezed
Training results: 
Acc: 0.914, f1: 0.914
Validation results: 
Acc: 0.860, f1: 0.920
Epoch: #8
Bert is not freezed
Training results: 
Acc: 0.943, f1: 0.943
Validation results: 
Acc: 0.855, f1: 0.916
Epoch: #9
Bert is not freezed
Training results: 
Acc: 0.959, f1: 0.959
Validation results: 
Acc: 0.852, f1: 

0.8600746268656716

In [31]:
model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False
lam = 1

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 5:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        lam = 1
        print("Bert is not freezed")
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model, path)
        
best_acc



  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.502, f1: 0.357
Validation results: 
Acc: 0.497, f1: 0.496
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.503, f1: 0.360
Validation results: 
Acc: 0.499, f1: 0.499
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.514, f1: 0.387
Validation results: 
Acc: 0.503, f1: 0.512
Epoch: #4
Bert is freezed
Training results: 
Acc: 0.541, f1: 0.459
Validation results: 
Acc: 0.534, f1: 0.584
Epoch: #5
Bert is freezed
Training results: 
Acc: 0.538, f1: 0.448
Validation results: 
Acc: 0.534, f1: 0.578
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.874, f1: 0.873
Validation results: 
Acc: 0.838, f1: 0.906
Epoch: #7
Bert is not freezed
Training results: 
Acc: 0.917, f1: 0.917
Validation results: 
Acc: 0.855, f1: 0.917
Epoch: #8
Bert is not freezed
Training results: 
Acc: 0.927, f1: 0.926
Validation results: 
Acc: 0.838, f1: 0.904
Epoch: #9
Bert is not freezed
Training results: 
Acc: 0.946, f1: 0.946
Validation results: 
Acc: 0.851, f1: 

0.855410447761194

In [37]:
from torch import nn

in_features = 768
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class BertWithClassifierMixUp(nn.Module):
    def __init__(self, linear_size):
        super(BertWithClassifierMixUp, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.head = nn.Sequential(
            nn.Dropout(),
            nn.Linear(in_features=in_features, out_features=linear_size),
            nn.BatchNorm1d(num_features=linear_size),
            nn.Dropout(p=0.8),
            nn.Linear(in_features=linear_size, out_features=1),
            # nn.BatchNorm1d(num_features=1),
            nn.Sigmoid()
        )
        
#     def forward(self, tokens, attention_mask):
#         bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
#         y = self.head(bert_output[1]) 
#         return y
    
    def forward(self, tokens1, attention_mask1, tokens2, attention_mask2, lam):
        bert_output1 = self.bert(input_ids=tokens1, attention_mask=attention_mask1)
        bert_output2 = self.bert(input_ids=tokens2, attention_mask=attention_mask2)
    
        bert_output = lam * bert_output1[1] + (1.0 - lam) * bert_output2[1]
    
        y = self.head(bert_output) 
        return y
        
    def freeze_bert(self):
        for param in self.bert.named_parameters():
            param[1].requires_grad=False
    
    def unfreeze_bert(self):
        for param in self.bert.named_parameters():
            param[1].requires_grad=True



In [38]:
def training_step_with_mixup(dataloader, model, optimizer, loss_fn, if_freeze_bert, lam):
    """Method to train the model"""
    
    model.train()
    model.freeze_bert() if if_freeze_bert else model.unfreeze_bert()
      
    epoch_loss = 0
    size = len(dataloader.dataset)
 
    for i, batch in enumerate(dataloader):
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        indexes = torch.randperm(len(input_ids)).to(device)
#         print(input_ids.shape)
    
        model_answer = model(input_ids, attention_mask, input_ids[indexes], attention_mask[indexes], lam)
    
        outputs = torch.flatten(model_answer)
        
        mixup_labels = lam * labels.float() + (1.0 - lam) * labels.float()[indexes]
        
        optimizer.zero_grad()
        loss = loss_fn(outputs, mixup_labels)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

In [39]:
def validation_step_mixup(dataloader, model, loss_fn):
    """Method to test the model's accuracy and loss on the validation set"""
    
    model.eval()
    model.freeze_bert()
    
    size = len(dataloader)
    f1, acc = 0, 0
    
    with torch.no_grad():
        for batch in dataloader:
            X = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            y = batch['labels'].to(device)
                  
            pred = model(X, attention_mask, X, attention_mask, 1)
            
            acc_batch, f1_batch = eval_prediction(y.float(), pred)                        
            acc += acc_batch
            f1 += f1_batch

        acc = acc/size
        f1 = f1/size
                
    return acc, f1

In [38]:
model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False
lam = 1

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 3:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        print("Bert is not freezed")
        
    if i >= 6:
        print("Mix Up")
        lam = 0.9
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step_mixup(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step_mixup(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
#         torch.save(model_mixup, path)
        
best_acc



  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.500, f1: 0.351
Validation results: 
Acc: 0.497, f1: 0.496
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.515, f1: 0.387
Validation results: 
Acc: 0.504, f1: 0.518
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.541, f1: 0.451
Validation results: 
Acc: 0.528, f1: 0.578
Epoch: #4
Bert is not freezed
Training results: 
Acc: 0.855, f1: 0.854
Validation results: 
Acc: 0.832, f1: 0.902
Epoch: #5
Bert is not freezed
Training results: 
Acc: 0.918, f1: 0.918
Validation results: 
Acc: 0.848, f1: 0.913
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.949, f1: 0.949
Validation results: 
Acc: 0.865, f1: 0.922
Epoch: #7
Bert is not freezed
Mix Up
Training results: 
Acc: 0.959, f1: 0.959
Validation results: 
Acc: 0.860, f1: 0.920
Epoch: #8
Bert is not freezed
Mix Up
Training results: 
Acc: 0.965, f1: 0.965
Validation results: 
Acc: 0.839, f1: 0.906
Epoch: #9
Bert is not freezed
Mix Up
Training results: 
Acc: 0.976, f1: 0.976
Validati

0.8647388059701493

In [40]:
model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False
lam = 1

num_of_epochs= 13

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 3:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        print("Bert is not freezed")
        
    if i >= 6:
        print("Mix Up")
        lam = 0.5
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step_mixup(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step_mixup(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model_mixup, path)
        
best_acc



  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.501, f1: 0.360
Validation results: 
Acc: 0.501, f1: 0.506
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.563, f1: 0.539
Validation results: 
Acc: 0.559, f1: 0.673
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.576, f1: 0.560
Validation results: 
Acc: 0.571, f1: 0.692
Epoch: #4
Bert is not freezed
Training results: 
Acc: 0.885, f1: 0.885
Validation results: 
Acc: 0.840, f1: 0.907
Epoch: #5
Bert is not freezed
Training results: 
Acc: 0.926, f1: 0.925
Validation results: 
Acc: 0.848, f1: 0.912
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.948, f1: 0.948
Validation results: 
Acc: 0.856, f1: 0.918
Epoch: #7
Bert is not freezed
Mix Up
Training results: 
Acc: 0.952, f1: 0.952
Validation results: 
Acc: 0.836, f1: 0.903
Epoch: #8
Bert is not freezed
Mix Up
Training results: 
Acc: 0.965, f1: 0.965
Validation results: 
Acc: 0.855, f1: 0.917
Epoch: #9
Bert is not freezed
Mix Up
Training results: 
Acc: 0.971, f1: 0.971
Validati

0.8600746268656716

In [45]:
model_mixup = BertWithClassifierMixUp(linear_size=hidden_layers)
model_mixup.to(device)
optimizer = AdamW(model_mixup.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'train_dataloader
if_freeze_bert = False
lam = 1

num_of_epochs = 6

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 3:
        if_freeze_bert = True
        print("Bert is freezed")
        lam = 1
    else:
        if_freeze_bert = False
        lam = 0.9
        print("Bert is not freezed")
        
    
    training_step_with_mixup(train_dataloader, model_mixup, optimizer, loss_fn, if_freeze_bert, lam)
    train_acc, train_f1 = validation_step_mixup(train_dataloader, model_mixup, loss_fn)
    val_acc, val_f1 = validation_step_mixup(eval_dataloader, model_mixup, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))
    
    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
#         torch.save(model_mixup, path)
        
best_acc



  0%|          | 0/6 [00:00<?, ?it/s]

Epoch: #1
Bert is freezed
Training results: 
Acc: 0.550, f1: 0.484
Validation results: 
Acc: 0.533, f1: 0.601
Epoch: #2
Bert is freezed
Training results: 
Acc: 0.610, f1: 0.611
Validation results: 
Acc: 0.614, f1: 0.743
Epoch: #3
Bert is freezed
Training results: 
Acc: 0.603, f1: 0.586
Validation results: 
Acc: 0.617, f1: 0.732
Epoch: #4
Bert is not freezed
Training results: 
Acc: 0.879, f1: 0.878
Validation results: 
Acc: 0.848, f1: 0.913
Epoch: #5
Bert is not freezed
Training results: 
Acc: 0.919, f1: 0.919
Validation results: 
Acc: 0.853, f1: 0.914
Epoch: #6
Bert is not freezed
Training results: 
Acc: 0.922, f1: 0.922
Validation results: 
Acc: 0.825, f1: 0.893


0.8526119402985075

Финальные метрики

In [42]:
model_mixup = torch.load("/kaggle/working/best_model.pt")

In [52]:
test_dataloader = DataLoader(
    tokenized_datasets['test'], shuffle=True, batch_size=8, collate_fn=data_collator
)

In [53]:
acc, f1 = validation_step_mixup(test_dataloader, model_mixup, loss_fn)

In [46]:
print(f"{acc=}, {f1=}")

acc=0.8460820895522388, f1=0.8468752847484188
