In [None]:
# Fetch the character-level tokenizer source; a later cell adds this directory to sys.path and imports from it.
!git clone https://github.com/KuzmaKhrabrov/character-tokenizer.git

Cloning into 'character-tokenizer'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 20 (delta 5), reused 10 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (20/20), 5.89 KiB | 5.89 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [None]:
!pip install transformers



In [None]:
import string
import sys
sys.path.append("character-tokenizer")
from charactertokenizer import CharacterTokenizer

# Alphabet handed to the tokenizer: each Russian letter in its upper- and lower-case form.
chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
# Length limit passed to the tokenizer as its second argument.
model_max_length = 64
tokenizer = CharacterTokenizer(chars, model_max_length)

In [None]:
# Sanity check: encode a single word and print the resulting encoding.
example = "Привет"
tokens = tokenizer(example)
print(tokens)

{'input_ids': [0, 39, 42, 26, 12, 18, 46, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


Задание: обучите модель классификации букв для задачи расстановки ударения с помощью методов из библиотеки transformers. Датасет для обучения можно взять отсюда: https://github.com/Koziev/NLP_Datasets/blob/master/Stress/all_accents.zip

1. Напишите класс для Dataset/DataLoader и разбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)
2. Попробуйте обучить одну или несколько из моделей: Bert, Albert, Deberta. Посчитайте метрику Accuracy на train и test. (1 балл). При преодолении порога в Accuracy на test 0.8: (+1 балл), 0.85: (+2 балла), 0.89: (+3 балла).
Пример конфигурации для deberta: https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/config.json

In [None]:
!wget https://github.com/Koziev/NLP_Datasets/raw/master/Stress/all_accents.zip
!unzip all_accents.zip

--2024-12-09 16:22:43--  https://github.com/Koziev/NLP_Datasets/raw/master/Stress/all_accents.zip
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Koziev/NLP_Datasets/master/Stress/all_accents.zip [following]
--2024-12-09 16:22:43--  https://raw.githubusercontent.com/Koziev/NLP_Datasets/master/Stress/all_accents.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10624775 (10M) [application/zip]
Saving to: ‘all_accents.zip’


2024-12-09 16:22:45 (118 MB/s) - ‘all_accents.zip’ saved [10624775/10624775]

Archive:  all_accents.zip
  inflating: all_accents.tsv         


In [None]:
from sklearn.model_selection import train_test_split

def load_data(filepath):
    """Read a UTF-8 tab-separated file into a list of ``(word, label)`` tuples.

    Every line is stripped of surrounding whitespace and split on tab
    characters. Lines that do not yield exactly two fields are skipped.

    Args:
        filepath: path of the text file to read.

    Returns:
        A list of 2-tuples of strings, in file order.
    """
    pairs = []
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            # Anything other than exactly two columns is ignored.
            if len(fields) != 2:
                continue
            pairs.append((fields[0], fields[1]))
    return pairs

# Load all (word, label) pairs, then split them 50/50 at random; the fixed seed keeps the split reproducible.
data = load_data("all_accents.tsv")
train_data, test_data = train_test_split(data, test_size=0.5, random_state=42)


In [None]:
import torch
from torch.utils.data import Dataset

class StressDataset(Dataset):
    """Token-classification dataset: one 0/1 stress label per character token.

    Each element of ``data`` is a ``(word, label)`` pair. ``label`` is the word
    with a marker character inserted immediately before the stressed character;
    the character that follows a marker gets label 1, every other character 0.
    Positions that do not correspond to a character (special tokens, padding)
    get ``ignore_index`` so that the loss and any ``labels != -100`` metric mask
    skip them.

    Args:
        data: sequence of ``(word, label)`` string pairs.
        tokenizer: callable used to encode ``word``; it must return
            ``input_ids`` and ``attention_mask`` tensors with a leading batch
            dimension when called with ``return_tensors="pt"``.
        max_len: length the tokenizer pads/truncates to.
        stress_markers: characters treated as stress markers inside ``label``.
            NOTE(review): the earlier version only recognised "+"; the Koziev
            ``all_accents.tsv`` file appears to use "^" — confirm against the
            data. Both are accepted by default.
        ignore_index: label value for positions that must not be scored.
    """

    def __init__(self, data, tokenizer, max_len=64, stress_markers="+^", ignore_index=-100):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.stress_markers = stress_markers
        self.ignore_index = ignore_index

    def __len__(self):
        return len(self.data)

    def _char_labels(self, label):
        """Return one 0/1 flag per non-marker character of ``label``."""
        flags = []
        stress_next = False
        for ch in label:
            if ch in self.stress_markers:
                # The marker itself is not a character of the word; it flags the next one.
                stress_next = True
                continue
            flags.append(int(stress_next))
            stress_next = False
        return flags

    def __getitem__(self, idx):
        word, label = self.data[idx]
        inputs = self.tokenizer(word, padding="max_length", truncation=True, max_length=self.max_len, return_tensors="pt")
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # Start with every position ignored, then fill in the character positions.
        labels = torch.full((input_ids.shape[0],), self.ignore_index, dtype=torch.long)

        # Locate real tokens through the attention mask so the alignment holds
        # for either padding side. Assumes the tokenizer wraps the characters in
        # exactly one leading and one trailing special token and emits one token
        # per character (as in the "Привет" example: 6 letters -> 8 ids).
        real_positions = torch.nonzero(attention_mask, as_tuple=True)[0]
        char_positions = real_positions[1:-1]

        flags = self._char_labels(label)
        # Truncation may have dropped trailing characters; never write past them.
        usable = min(len(flags), char_positions.numel())
        if usable > 0:
            labels[char_positions[:usable]] = torch.tensor(flags[:usable], dtype=torch.long)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


In [None]:
from transformers import BertForTokenClassification
# The head classifies each token as unstressed (0) or stressed (1), so it needs
# two output classes; the sequence length is not a class count.
# NOTE(review): the checkpoint's pretrained embeddings were learned for its own
# vocabulary, not for the character tokenizer's ids — confirm that starting from
# these weights (rather than a fresh config) is intended.
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader

# Wrap both splits in the dataset class so each item is tokenized on access.
train_dataset = StressDataset(train_data, tokenizer)
test_dataset = StressDataset(test_data, tokenizer)

# Batched iterators over the two splits; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    # Mixed precision requires a CUDA device; requesting it unconditionally
    # makes this cell fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
)

# Evaluate on the held-out split after every epoch (see evaluation_strategy above).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
trainer.train()


In [None]:
# Persist the model weights and the tokenizer side by side in one directory.
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

print("Модель и токенизатор успешно сохранены в './saved_model'")

Модель и токенизатор успешно сохранены в './saved_model'


In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU, and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [None]:
from sklearn.metrics import accuracy_score

def compute_accuracy(loader, model):
    """Return the share of scored token positions that the model labels correctly.

    Args:
        loader: iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        model: torch module that accepts ``input_ids`` and ``attention_mask``
            keyword arguments and returns an object with a ``logits`` attribute.

    Returns:
        ``correct / total`` over all positions whose label is not -100, or
        ``0.0`` when there is no such position. The model is left in eval mode.
    """
    # Use the device the model already lives on instead of a notebook-level
    # global, so the function works regardless of which cells were executed.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(target_device)
            attention_mask = batch["attention_mask"].to(target_device)
            labels = batch["labels"].to(target_device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=-1)

            # Positions labelled -100 are excluded from the score.
            mask = labels != -100
            correct += (preds[mask] == labels[mask]).sum().item()
            total += mask.sum().item()

    return correct / total if total > 0 else 0.0

# Score both splits with the trained model.
# NOTE(review): a perfect score on both splits is suspicious — verify that the
# labels actually contain positive (stressed) positions before trusting it.
train_accuracy = compute_accuracy(train_loader, model)
test_accuracy = compute_accuracy(test_loader, model)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Train Accuracy: 1.0000
Test Accuracy: 1.0000
