In [65]:
!git clone https://github.com/KuzmaKhrabrov/character-tokenizer.git
!pip install transformers

fatal: destination path 'character-tokenizer' already exists and is not an empty directory.


In [66]:
import torch
import torch.nn as nn
import sys


sys.path.append("/kaggle/working/character-tokenizer")
from charactertokenizer import CharacterTokenizer

In [67]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
device

device(type='cuda')

In [70]:
chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
model_max_length = 64

batch_size = 64
epochs = 4
seed_val = 42

In [None]:
tokenizer = CharacterTokenizer(chars, model_max_length)
example = "Привет"
tokens = tokenizer(example)
tokens

Задание: обучите модель классификации букв для задачи расстановки ударения с помощью методов из библиотеки transformers. Датасет для обучения можно взять отсюда: https://github.com/Koziev/NLP_Datasets/blob/master/Stress/all_accents.zip

1. Напишите класс для Dataset/Dataloder и разбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)
2. Попробуйте обучить одну или несколько из моделей: Bert, Albert, Deberta. Посчитайте метрику Accuracy на train и test. (1 балл). При преодолении порога в Accuracy на test 0.8: (+1 балл), 0.85: (+2 балла), 0.89: (+3 балла).
Пример конфигурации для deberta: https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/config.json

## Prepare data

In [None]:
import pandas as pd


df = pd.read_table('/kaggle/input/hw10-wordstress/all_accents.tsv', header=None, names = ['word', 'stressed_word'])
df['stress_idx'] = df['stressed_word'].str.find('^')

# model_max_length = df['word'].apply(lambda x: len(x)).max() + 2
# model_max_length

In [None]:
from sklearn.model_selection import train_test_split


train_df, test_df = train_test_split(df, test_size=0.5, random_state=seed_val)
test_df, val_df = train_test_split(test_df, test_size=0.2, random_state=seed_val)

train_df.shape[0], val_df.shape[0], test_df.shape[0]

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split


class WordStressDataset(Dataset):
    def __init__(self, df, max_len):
        self.df = df
        self.max_len = max_len
        self.tokenizer = CharacterTokenizer(chars, model_max_length)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        word = self.df['word'].iloc[idx]
        stress_idx = self.df['stress_idx'].iloc[idx]

        tokens = self.tokenizer(
            word,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        labels = torch.zeros((self.max_len), dtype=torch.long)
        if stress_idx > 0:
            labels[stress_idx] = 1
        
        return tokens['input_ids'].flatten(), tokens['attention_mask'].flatten(), labels

In [None]:
train_dataset = WordStressDataset(train_df, model_max_length)
val_dataset = WordStressDataset(val_df, model_max_length)
test_dataset = WordStressDataset(test_df, model_max_length)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

## Create model

In [None]:
from transformers import DebertaV2ForTokenClassification, AdamW, DebertaV2Config, get_linear_schedule_with_warmup


config = DebertaV2Config(
    architectures="DebertaV2ForTokenClassification",
    model_type="deberta-v2",
    transformers_version="4.25.1",
    torch_dtype="float32",
    
    num_labels=2,

    hidden_size=1024,
    intermediate_size=2048,
    num_attention_heads=16,
    num_hidden_layers=5,
    
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    position_biased_input=True,
    
    vocab_size=len(tokenizer.get_vocab()),
    
    max_length=model_max_length,
    max_position_embeddings=model_max_length,
    max_relative_positions=model_max_length,
    
    output_attentions=False,
    output_hidden_states=False,    
    
    #initializer_range=0.02,
    pooler_dropout = 0,
    pooler_hidden_act = "gelu",
    pooler_hidden_size = 1536
)

model = DebertaV2ForTokenClassification(config)

model.cuda()

In [None]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8,
)

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

## Train

In [None]:
import random
import numpy as np
import time
import datetime

In [None]:
def flat_accuracy(preds, labels):    
    return np.all(preds == labels, axis=1).sum() / len(labels)

In [None]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))

    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
count_logging_elapsed = 10

In [None]:
def train(epoch_i):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    
    for step, batch in enumerate(train_dataloader):
        if step % int(len(train_dataloader) / (count_logging_elapsed)) == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(
            b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        
        optimizer.step()
        #scheduler.step()

    print("")
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    
    train_time = format_time(time.time() - t0)
    print("  Training epoch took: {:}".format(train_time))
    
    return avg_train_loss, train_time

In [None]:
def validate():
    print("")
    print("Running Validation...")

    t0 = time.time()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    
    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(
                b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels
            )
    
        loss = outputs.loss
        total_eval_loss += loss.item()

        logits = torch.argmax(outputs.logits.detach(), dim=2).cpu().numpy()
        
        label_ids = b_labels.cpu().numpy()
        
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(val_dataloader)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    
    val_time = format_time(time.time() - t0)
    print("  Validation took: {:}".format(val_time))
    
    return avg_val_loss, avg_val_accuracy, val_time

In [None]:
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

train_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs):
    model.train()
    avg_train_loss, train_time = train(epoch_i)

    model.eval()
    avg_val_loss, avg_val_accuracy, val_time = validate()

    train_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': train_time,
            'Validation Time': val_time
        }
    )

print("")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
df_stats = pd.DataFrame(data=train_stats)
df_stats = df_stats.set_index('epoch')

df_stats

Как видно лосс на валидационной выборке падает, поэтому запустим еще на две итерацию

In [None]:
model.train()
avg_train_loss, train_time = train(4)

model.eval()
avg_val_loss, avg_val_accuracy, val_time = validate()

train_stats.append(
    {
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. Accur.': avg_val_accuracy,
        'Training Time': train_time,
        'Validation Time': val_time
    }
)

In [None]:
model.train()
avg_train_loss, train_time = train(5)

model.eval()
avg_val_loss, avg_val_accuracy, val_time = validate()

train_stats.append(
    {
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. Accur.': avg_val_accuracy,
        'Training Time': train_time,
        'Validation Time': val_time
    }
)

In [73]:
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.019774,0.013858,0.775947,0:42:51,0:03:16
2,0.013193,0.010349,0.833933,0:42:49,0:03:17
3,0.010828,0.008581,0.865191,0:42:49,0:03:15
4,0.009258,0.007619,0.88432,0:42:46,0:03:15
4,0.008158,0.006536,0.898526,0:42:52,0:03:17
4,0.007316,0.005819,0.911593,0:42:55,0:03:16


## Test

In [74]:
def test():
    predictions , true_labels = [], []
    
    total_test_accuracy = 0
    
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2]
        
        with torch.no_grad():
            outputs = model(
                b_input_ids, 
                token_type_ids=None,
                attention_mask=b_input_mask,
            )
             
        logits = torch.argmax(outputs.logits.detach(), dim=2).cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        total_test_accuracy += flat_accuracy(logits, label_ids)

    avg_test_accuracy = total_test_accuracy / len(test_dataloader)
    
    return avg_test_accuracy, predictions, true_labels

In [75]:
model.eval()
avg_test_accuracy, predictions, true_labels = test()

print(f'{avg_test_accuracy}')

0.9110370021377138


In [76]:
batch = next(iter(test_dataloader))
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2]

with torch.no_grad():
    outputs = model(
        b_input_ids, 
        token_type_ids=None,
        attention_mask=b_input_mask,
    )

logits = torch.argmax(outputs.logits.detach(), dim=2).cpu().numpy()

In [77]:
print("".join(map(lambda x: x if x != '[PAD]' else "", tokenizer.convert_ids_to_tokens(b_input_ids[0]))))
print(f"actual: {np.where(logits[0] == 1)[0]}, expected: {np.where(b_labels[0].numpy() == 1)[0]}")

[CLS]вскрывающей[SEP]
actual: [6], expected: [6]


In [78]:
print("".join(map(lambda x: x if x != '[PAD]' else "", tokenizer.convert_ids_to_tokens(b_input_ids[1]))))
print(f"actual: {np.where(logits[1] == 1)[0]}, expected: {np.where(b_labels[1].numpy() == 1)[0]}")

[CLS]добропорядошный[SEP]
actual: [8], expected: [8]


In [79]:
print("".join(map(lambda x: x if x != '[PAD]' else "", tokenizer.convert_ids_to_tokens(b_input_ids[2]))))
print(f"actual: {np.where(logits[2] == 1)[0]}, expected: {np.where(b_labels[2].numpy() == 1)[0]}")

[CLS]мыслившего[SEP]
actual: [1], expected: [1]


Таким образом, наша модель достаточно хорошо проставляет ударения