In [1]:
import sys
import pandas as pd
import numpy as np

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    pipeline,
    Trainer,
    TrainingArguments,
)
import evaluate
from sklearn.model_selection import train_test_split

sys.path.append("character-tokenizer")
from charactertokenizer import CharacterTokenizer

import warnings
warnings.filterwarnings('ignore')

MODEL_NAME = "DeepPavlov/rubert-base-cased"
MODEL_MAX_LENGTH = 64

In [2]:
def parse_label(line):
    labels = []
    i = 0
    while i < len(line):
        if line[i] == "^":
            labels.append(1)
            i += 1
        else:
            labels.append(0)
        i += 1
    return " ".join([str(x) for x in labels])


def create_labels(s):
    labels = []
    flag = False
    for char in s:
        if char == "^":
            flag = True
        else:
            labels.append(1 if flag else 0)
            flag = False

    return " ".join([str(x) for x in labels])

In [3]:
data = pd.read_csv("data/all_accents.tsv", sep="\t", header=None)
data = data.rename(columns={0: "word", 1: "accent"})
data["labels"] = data["accent"].apply(lambda x: create_labels(x))

X = data.drop('labels', axis=1)
y = data['labels']

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
data.head()

Unnamed: 0,word,accent,labels
0,-де,-д^е,0 0 1
1,-ка,-к^а,0 0 1
2,-либо,-л^ибо,0 0 1 0 0
3,-нибудь,-ниб^удь,0 0 0 0 1 0 0
4,-с,-с,0 0


In [5]:
chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
tokenizer = CharacterTokenizer(chars, MODEL_MAX_LENGTH, is_split_into_words=True,)
special_tokens = list(range(6))

In [6]:
# def tokenize_and_align_labels(examples):
#     annotations = [int(i) for i in examples["labels"].strip().split(" ")]
#     print(f"annotations: {annotations}")

#     tokenized_inputs = tokenizer(examples["word"].strip())
#     print(f"tokenized_inputs: {tokenized_inputs}")

#     labels = []
#     for i, input_id in enumerate(tokenized_inputs["input_ids"]):
#         if input_id in special_tokens:
#             labels.append(-100)
#         else:
#             labels.append(annotations.pop(0))

#     tokenized_inputs["labels"] = labels

#     return tokenized_inputs


def tokenize_and_align_labels(texts, text_labels):
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, is_split_into_words=True,)

    labels = []
    for i, label in enumerate(text_labels):
        annotations = [int(j) for j in label[0].strip().split(" ")]

        label_ids = []
        for k, input_id in enumerate(tokenized_inputs["input_ids"][i]):
            if input_id in special_tokens:
                label_ids.append(-100)
            else:
                label_ids.append(annotations.pop(0))
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


class StressDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}
        # item["labels"] = torch.tensor(self.data[idx])
        return item

    def __len__(self):
        return len(self.data)

In [7]:
# texts = data["word"].tolist()
# text_labels = data["labels"].tolist()

train_texts = [[sample] for sample in X_train["word"]]
train_text_labels = [[sample] for sample in y_train]

test_texts = [[sample] for sample in X_test["word"]]
test_text_labels = [[sample] for sample in y_test]

train_data_for_dataset = tokenize_and_align_labels(train_texts, train_text_labels)
test_data_for_dataset = tokenize_and_align_labels(test_texts, test_text_labels)

train_dataset = StressDataset(train_data_for_dataset)
test_dataset = StressDataset(test_data_for_dataset)

In [8]:
train_dataset[3]

{'input_ids': tensor([ 0, 36, 18, 38, 46, 44, 46, 48, 40, 36,  8, 72,  1,  4,  4,  4,  4,  4,
          4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
          4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
          4,  4,  4,  4]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([-100,    0,    0,    0,    0,    0,    0,    1,    0,    0,    0,    0,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -

In [9]:
id2label = {0: "O", 1: "A"}
label2id = {"O": 0, "A": 1}

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=2, id2label=id2label, label2id=label2id
).to("cuda")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import wandb


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    label_names = ["O", "A"]

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    metric = evaluate.load("seqeval")
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


wandb.init(
    project="huggingface-stress",
    name=f"{MODEL_NAME}-{MODEL_MAX_LENGTH}-{'stress'}",
)

training_args = TrainingArguments(
    output_dir="./results",  # output directory
    num_train_epochs=20,  # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,  # batch size for evaluation
    learning_rate=2e-5,
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_dir="./logs",  # directory for storing logs
    logging_steps=10,
    report_to="wandb",
)

trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlaputin001[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7185212969779968, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.37209302325581395, 'eval_runtime': 1.036, 'eval_samples_per_second': 3.861, 'eval_steps_per_second': 0.965, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7156548500061035, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.4186046511627907, 'eval_runtime': 0.8367, 'eval_samples_per_second': 4.781, 'eval_steps_per_second': 1.195, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7097111344337463, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.4186046511627907, 'eval_runtime': 0.6629, 'eval_samples_per_second': 6.034, 'eval_steps_per_second': 1.509, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7007609009742737, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.4186046511627907, 'eval_runtime': 0.8377, 'eval_samples_per_second': 4.775, 'eval_steps_per_second': 1.194, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6892118453979492, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.4883720930232558, 'eval_runtime': 0.6887, 'eval_samples_per_second': 5.808, 'eval_steps_per_second': 1.452, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.675236701965332, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.5813953488372093, 'eval_runtime': 0.9268, 'eval_samples_per_second': 4.316, 'eval_steps_per_second': 1.079, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6587273478507996, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.7906976744186046, 'eval_runtime': 0.677, 'eval_samples_per_second': 5.909, 'eval_steps_per_second': 1.477, 'epoch': 7.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6398428082466125, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 0.7236, 'eval_samples_per_second': 5.528, 'eval_steps_per_second': 1.382, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6200997829437256, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 0.8624, 'eval_samples_per_second': 4.638, 'eval_steps_per_second': 1.159, 'epoch': 9.0}
{'loss': 0.6376, 'learning_rate': 4.0000000000000003e-07, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5985599160194397, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 0.6573, 'eval_samples_per_second': 6.086, 'eval_steps_per_second': 1.521, 'epoch': 10.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5754058957099915, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 1.0382, 'eval_samples_per_second': 3.853, 'eval_steps_per_second': 0.963, 'epoch': 11.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5516551733016968, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 1.0327, 'eval_samples_per_second': 3.873, 'eval_steps_per_second': 0.968, 'epoch': 12.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5275481939315796, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 1.0375, 'eval_samples_per_second': 3.855, 'eval_steps_per_second': 0.964, 'epoch': 13.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5027416944503784, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 0.8014, 'eval_samples_per_second': 4.991, 'eval_steps_per_second': 1.248, 'epoch': 14.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.47790655493736267, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 0.8797, 'eval_samples_per_second': 4.547, 'eval_steps_per_second': 1.137, 'epoch': 15.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.45363128185272217, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 0.9172, 'eval_samples_per_second': 4.361, 'eval_steps_per_second': 1.09, 'epoch': 16.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4304647147655487, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 0.8568, 'eval_samples_per_second': 4.668, 'eval_steps_per_second': 1.167, 'epoch': 17.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4086436927318573, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 1.0462, 'eval_samples_per_second': 3.823, 'eval_steps_per_second': 0.956, 'epoch': 18.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.3874046802520752, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 1.0243, 'eval_samples_per_second': 3.905, 'eval_steps_per_second': 0.976, 'epoch': 19.0}
{'loss': 0.4792, 'learning_rate': 8.000000000000001e-07, 'epoch': 20.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.3677537143230438, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9069767441860465, 'eval_runtime': 0.7651, 'eval_samples_per_second': 5.228, 'eval_steps_per_second': 1.307, 'epoch': 20.0}
{'train_runtime': 79.8453, 'train_samples_per_second': 1.002, 'train_steps_per_second': 0.25, 'train_loss': 0.558400011062622, 'epoch': 20.0}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▂▂▂▃▄▆█████████████
eval/f1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/loss,████▇▇▇▆▆▆▅▅▄▄▃▃▂▂▁▁
eval/precision,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/recall,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,█▄▁▄▂▆▁▂▅▁███▄▅▆▅██▃
eval/samples_per_second,▁▄█▄▇▃▇▆▄█▁▁▁▅▃▃▄▁▁▅
eval/steps_per_second,▁▄█▄▇▃▇▆▄█▁▁▁▅▃▃▄▁▁▅
train/epoch,▁▁▂▂▂▃▃▄▄▄▄▅▅▅▆▆▇▇▇████
train/global_step,▁▁▂▂▂▃▃▄▄▄▄▅▅▅▆▆▇▇▇████

0,1
eval/accuracy,0.90698
eval/f1,0.0
eval/loss,0.36775
eval/precision,0.0
eval/recall,0.0
eval/runtime,0.7651
eval/samples_per_second,5.228
eval/steps_per_second,1.307
train/epoch,20.0
train/global_step,20.0


In [11]:
pipe = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="none",
    ignore_labels=("NO",), 
    device="cuda"
)

text = "-нибудь"
print(text)
index = pipe(text)[0]["index"]
print(text[:index] + "'" + text[index:])

-нибудь
-'нибудь


In [12]:
model = AutoModelForTokenClassification.from_pretrained(r"D:\study\nn\nn-homeworks\hw9\results\checkpoint-20")
model = model.to("cuda")

In [13]:
texts = "мама"
tokenized_inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt").to("cuda")
with torch.no_grad():
    logits = model(**tokenized_inputs).logits
predictions = torch.argmax(logits, dim=2)
predictions, logits

(tensor([[0, 0, 0, 0, 0, 0]], device='cuda:0'),
 tensor([[[ 0.0207, -1.0284],
          [ 0.3162, -0.9342],
          [ 0.3113, -0.9281],
          [ 0.3618, -0.9046],
          [ 0.3407, -0.8975],
          [ 0.3504, -0.8938]]], device='cuda:0'))

In [14]:
texts = "-нибудь"
tokenized_inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt").to("cuda")
with torch.no_grad():
    logits = model(**tokenized_inputs).logits
predictions = torch.argmax(logits, dim=2)
predictions, logits

(tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),
 tensor([[[ 0.4738, -0.8442],
          [ 0.3427, -0.7130],
          [ 0.3059, -0.7027],
          [ 0.3410, -0.6875],
          [ 0.4057, -0.6962],
          [ 0.4588, -0.6740],
          [ 0.4262, -0.6706],
          [ 0.4436, -0.6046],
          [ 0.4316, -0.6121]]], device='cuda:0'))

По итогу модель очень плохо обучилась, предсказываются все время нули. Долго пытался понять в чем дело, в итоге модель даже не переобучается на один пример. Возможно проблема в токенизации, то что после изменения токенизотора надо очень долго обучать, либо ошибка токенизации в коде, но ее я так и не смог найти, в качестве разметки специальные токены заполнял -100, как и в примере hf, чтобы игнорировались при расчете лосса кросс энтропии, токенам с ударенем поставил метку 1, остальным - 0.

Буду рад, если оставите обратную свзяь и подскажите в чем могла быть ошибка.