In [None]:
# Run prepare_data.py on the tatoeba_dataset directory (Windows checkout; the absolute path is machine-specific).
# --num_samples -1 presumably means "use every sample" and --percent_dev 0.2 a 20% dev split — confirm in prepare_data.py.
!python D:/progamming/va/truecase/ru-punctuation-truecase/src/prepare_data.py --data_dir D:/progamming/va/truecase/ru-punctuation-truecase/data/tatoeba_dataset --num_samples -1 --percent_dev 0.2

In [None]:
# Same prepare_data.py invocation as the Windows variant of this step, for the macOS checkout (absolute path is machine-specific).
# Only one of the two shell cells needs to be run on a given machine.
!python3.10 /Users/falaputin/virtual_assistant/trucase_project/ru-punctuation-truecase/src/prepare_data.py --data_dir /Users/falaputin/virtual_assistant/trucase_project/ru-punctuation-truecase/data/tatoeba_dataset --num_samples -1 --percent_dev 0.2

In [1]:
from typing import List, Tuple
import pickle
import os

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import evaluate
metric = evaluate.load("seqeval")

import sys
sys.path.insert(0, '/Users/falaputin/virtual_assistant/trucase_project/ru-punctuation-truecase/src/')
from process_text import clean_text, clean_text_3times

# ========== Data global variables ==========
# Absolute, machine-specific root of the data directory; the cached
# encodings/labels pickles are written to and read from <PATH_TO_DATA>/cached.
PATH_TO_DATA = "/Users/falaputin/virtual_assistant/trucase_project/ru-punctuation-truecase/data"

# ========== Model global variables ==========
# Checkpoint identifier passed to AutoTokenizer / AutoModelForTokenClassification.
MODEL_NAME = "DeepPavlov/rubert-base-cased-conversational"
# Strip the organisation prefix so the name can be embedded in file and run names:
# "DeepPavlov/rubert-base-cased-conversational" -> rubert-base-cased-conversational
SHORT_MODEL_NAME = MODEL_NAME.split('/')[1] if '/' in MODEL_NAME else MODEL_NAME
# Passed to the tokenizer as model_max_length and embedded in the cache file names,
# so changing it selects a different set of cached pickles.
MODEL_MAX_LENGTH = 512

Подготовка данных для Hugging Face Transformers

Подготовка собственного датасета

In [2]:
def file2array(path_to_file: str, encoding: str = 'utf-8') -> List[List[str]]:
    """Read a text file into a list of token lists, one list per line.

    Every line is stripped of surrounding whitespace and split on single
    spaces, so an empty line produces ``['']``.

    Args:
        path_to_file: Path of the text file to read.
        encoding: Text encoding of the file. Defaults to UTF-8 instead of the
            platform default, which differs between the Windows and macOS
            machines this notebook is run on and can mis-decode Cyrillic text.

    Returns:
        A list with one entry per line; each entry is that line's tokens.
    """
    with open(path_to_file, 'r', encoding=encoding) as f:
        # Iterate the handle directly instead of materialising readlines().
        return [line.strip().split(' ') for line in f]


def textlabel2arrays(path_to_text: str, path_to_labels: str, encoding: str = 'utf-8') -> Tuple[List[List[str]], List[List[str]]]:
    """Read parallel text and label files into two aligned lists of token lists.

    Line *i* of the text file is paired with line *i* of the labels file. Each
    line is stripped and split on single spaces. If the files have a different
    number of lines, the surplus lines of the longer file are ignored.

    Args:
        path_to_text: Path of the file with one space-separated sentence per line.
        path_to_labels: Path of the file with one space-separated label sequence per line.
        encoding: Text encoding of both files. Defaults to UTF-8 instead of the
            platform default, which differs between the Windows and macOS
            machines this notebook is run on and can mis-decode Cyrillic text.

    Returns:
        ``(texts, labels)`` where both are lists with one token list per line.
    """
    texts = []
    labels = []
    with open(path_to_text, 'r', encoding=encoding) as f_text, \
            open(path_to_labels, 'r', encoding=encoding) as f_labels:
        # Stream both files in lockstep instead of loading them with readlines().
        for line_text, line_labels in zip(f_text, f_labels):
            texts.append(line_text.strip().split(' '))
            labels.append(line_labels.strip().split(' '))
    return texts, labels


def encode_tags(tags, tag2id, encodings):
    """Align per-word tags with tokenizer output, one label id per token.

    For every document, token positions whose offset starts at 0 and ends at a
    non-zero value receive the next tag id; every other position is set to
    -100. (Presumably these positions are the first sub-token of each word and
    -100 is the index ignored by the loss — confirm against the tokenizer and
    model configuration.)

    Args:
        tags: One list of tag strings per document.
        tag2id: Mapping from tag string to integer id.
        encodings: Tokenizer output exposing ``offset_mapping`` — one sequence
            of ``(start, end)`` pairs per document.

    Returns:
        A list with one list of integer labels per document, each as long as
        that document's offset mapping.

    Raises:
        KeyError: If a tag is missing from ``tag2id``.
        ValueError: If a document's tag count differs from its number of
            labelled positions. The message names the document index.
    """
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_idx, (doc_labels, doc_offset) in enumerate(zip(labels, encodings.offset_mapping)):
        # reshape keeps the column indexing below valid for an empty document.
        arr_offset = np.asarray(doc_offset).reshape(-1, 2)

        # start with every position ignored (-100)
        doc_enc_labels = np.full(len(arr_offset), -100, dtype=int)

        # positions whose first offset is 0 and whose second is not 0
        word_start = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)

        # Check explicitly: NumPy would otherwise either raise without saying
        # which document is broken, or silently broadcast a single tag over
        # every labelled position.
        n_slots = int(word_start.sum())
        if n_slots != len(doc_labels):
            raise ValueError(
                "document %d: %d tags but %d labelled token positions"
                % (doc_idx, len(doc_labels), n_slots)
            )

        doc_enc_labels[word_start] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels


def save_to_pickle(filename: str, data):
    """Pickle ``data`` to ``filename``, replacing any existing file.

    The parent directory is created when it does not exist yet, so the first
    save into a fresh cache directory succeeds. Nothing is printed: a missing
    target file is the normal case for a first save, not an error.

    Args:
        filename: Destination path of the pickle file.
        data: Any picklable object.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Mode 'wb' truncates an existing file, so no explicit removal is needed.
    with open(filename, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)


def load_from_pickle(filename: str):
    """Return the object pickled in ``filename``, or ``None`` if it is missing.

    When ``filename`` is not an existing regular file, a message is printed
    and ``None`` is returned instead of raising.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this on
    cache files produced by this project.
    """
    if not os.path.isfile(filename):
        print("Error: %s file not found" % filename)
        return None

    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [9]:
# Locations of the prepared train/dev files, relative to the notebook's working directory.
path_to_train_text = "../data/tatoeba_dataset/text_train.txt"
path_to_train_labels = "../data/tatoeba_dataset/labels_train.txt"
path_to_val_text = "../data/tatoeba_dataset/text_dev.txt"
path_to_val_labels = "../data/tatoeba_dataset/labels_dev.txt"

train_texts, train_tags = textlabel2arrays(path_to_train_text, path_to_train_labels)
val_texts, val_tags = textlabel2arrays(path_to_val_text, path_to_val_labels)

unique_labels = set(tag for doc in train_tags for tag in doc)
# Sort the tags: the iteration order of a set of strings can change between
# kernel sessions, which would give a different tag -> id mapping on every
# restart and silently invalidate label pickles and checkpoints created with
# an earlier mapping.
# NOTE: label pickles cached before this change were encoded with an
# unsorted mapping and must be regenerated.
label_names = sorted(unique_labels)
label2id = {tag: idx for idx, tag in enumerate(label_names)}
id2label = {idx: tag for tag, idx in label2id.items()}

# Sanity check: every sentence must have a matching label sequence.
print(len(train_texts) == len(train_tags))
print(len(val_texts) == len(val_tags))
print(len(train_texts), len(train_tags))
print(len(val_texts), len(val_tags))

True
True
806458 806458
201614 201614


Calculate encodings and labels, then save them to pickle files

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=MODEL_MAX_LENGTH)

# Both splits are tokenized with identical settings: the input is already
# split into words, and the offsets are kept because encode_tags needs them.
tokenize_kwargs = dict(
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

train_labels = encode_tags(train_tags, label2id, train_encodings)
val_labels = encode_tags(val_tags, label2id, val_encodings)

# Cache every artefact under <PATH_TO_DATA>/cached; the file name carries the
# model name and max length so that different configurations do not collide.
for cache_stem, cache_payload in (
    ("train_encodings", train_encodings),
    ("val_encodings", val_encodings),
    ("train_labels", train_labels),
    ("val_labels", val_labels),
):
    pickle_filename = f"{PATH_TO_DATA}/cached/{cache_stem}_{SHORT_MODEL_NAME}_{MODEL_MAX_LENGTH}.pickle"
    save_to_pickle(pickle_filename, cache_payload)

Load encodings and labels from pickle files

In [4]:
# Restore the four cached artefacts in the order they were saved:
# train/val encodings first, then train/val labels. load_from_pickle returns
# None (after printing a message) for any file that is missing.
cache_stems = ("train_encodings", "val_encodings", "train_labels", "val_labels")
train_encodings, val_encodings, train_labels, val_labels = [
    load_from_pickle(f"{PATH_TO_DATA}/cached/{stem}_{SHORT_MODEL_NAME}_{MODEL_MAX_LENGTH}.pickle")
    for stem in cache_stems
]

#### Dataset class

In [5]:
class CapitalizationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-token labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    ``labels`` is a per-sample sequence of label ids. Indexing returns one
    sample as a dict of tensors with an extra ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Drop the offsets before building the datasets so they are not part of the
# items handed to the Trainer. The None default keeps this cell re-runnable:
# without it a second execution raises KeyError because the key is already gone.
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)
train_dataset = CapitalizationDataset(train_encodings, train_labels)
val_dataset = CapitalizationDataset(val_encodings, val_labels)

In [6]:
def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall precision/recall/F1/accuracy.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The class axis of ``logits``
            is the last one; ``labels`` holds label ids, with -100 marking
            positions to skip.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy``, taken from the ``overall_*`` entries produced by the
        module-level ``metric``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    ignore_index = -100

    # Reference tags: drop ignored positions and map ids back to tag names.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[l] for l in label_row if l != ignore_index])

    # Predicted tags, kept only where the reference label is not ignored.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [label_names[p] for p, l in zip(pred_row, label_row) if l != ignore_index]
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: all_metrics[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

#### Model

In [7]:
# Load the pretrained checkpoint with a token-classification head sized to
# the label set; the id <-> tag maps are passed along to the model.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification wer

#### Train

In [8]:
import wandb

# The W&B run is named after the model, the max sequence length and the dataset.
run_name = "-".join((SHORT_MODEL_NAME, str(MODEL_MAX_LENGTH), "tatoeba_dataset"))
wandb.init(project="huggingface-punctuation-and-capitalization", name=run_name)

training_config = dict(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    learning_rate=2e-5,
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    save_strategy="epoch",           # save once per epoch ...
    evaluation_strategy="epoch",     # ... and evaluate once per epoch
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    report_to="wandb",
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,                      # the instantiated Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset,         # evaluation dataset
    compute_metrics=compute_metrics,  # metrics computed at each evaluation
)

trainer.train()
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlaputin001[0m. Use [1m`wandb login --relogin`[0m to force relogin




  0%|          | 0/151212 [00:00<?, ?it/s]

In [None]:
# Display the model object's repr as the cell output.
model

In [None]:
# Debug helper: print (and collect) the index of every training document whose
# tags cannot be aligned with its token offsets.
# NOTE: this reads train_encodings.offset_mapping, so it must run on encodings
# that still contain the offsets (i.e. before "offset_mapping" is popped).
# Fixed: the tag -> id mapping in this notebook is `label2id`; `tag2id` is
# not defined anywhere at notebook level and raised NameError.
labels = [[label2id[tag] for tag in doc] for doc in train_tags]
encoded_labels = []
broken_indices = []
for idx, (doc_labels, doc_offset) in enumerate(zip(labels, train_encodings.offset_mapping)):
    # create an empty array of -100
    doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
    arr_offset = np.array(doc_offset)

    # set labels whose first offset position is 0 and the second is not 0
    try:
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
    except (ValueError, IndexError):
        # ValueError: tag count differs from the number of labelled positions.
        # IndexError: the document has no offsets at all.
        print(idx)
        broken_indices.append(idx)
    else:
        encoded_labels.append(doc_enc_labels.tolist())

In [None]:
# Hand-recorded sample indices that failed label encoding; the suffix is
# presumably the max sequence length in use when they were found — TODO confirm.
broken_texts_128 = [36934, 80799, 80800, 99989, 119863, 126620, 130799, 131430, 139275, 160289]
broken_texts_512 = [126620, 130799, 160289]

# Show each listed validation sample followed by its word count.
for idx in broken_texts_128:
    words = val_texts[idx]
    print(words)
    print(len(words))

In [None]:
# encode_tags takes (tags, tag2id, encodings); the mapping argument was
# missing here, which raised TypeError. The encodings must still contain
# "offset_mapping" when this cell runs.
train_labels = encode_tags(train_tags, label2id, train_encodings)
val_labels = encode_tags(val_tags, label2id, val_encodings)

In [None]:
# Print the tokens returned by train_encodings.tokens() next to the labels of
# the first training sample, one "token label" pair per line.
sample_tokens = train_encodings.tokens()
sample_labels = train_labels[0]
for token, label in zip(sample_tokens, sample_labels):
    print(token, label)