In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from tqdm import tqdm

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer

#### Visualisation

In [5]:
import matplotlib
import matplotlib.pyplot as plt

In [6]:
import scienceplots

plt.style.use('science')
%config InlineBackend.figure_format = 'retina'

# Presumably font sizes for axis labels and tick labels in later plots; neither is used in the visible cells.
# NOTE(review): `lables_fs` looks like a typo for `labels_fs` — left unchanged in case other cells reference it.
lables_fs = 16
ticks_fs = 12

### [REFERENCE](https://github.com/Markusiko/RuPunctNet/blob/8bc765ddbd3f61822efc6ed6272fb5960dc8a37e/DL_experiments/bert-base.ipynb#L929)

### Prepare `train` and `test` datasets from data

In [7]:
# Directory holding the prepared dataset, given relative to the notebook's working directory.
prepared_dir = '../data/prepared'
# CSV file inside `prepared_dir` that the loading cell reads.
filename_csv = '01_punct_pushkin.csv'

In [8]:
# Read the saved dataset; the first CSV column becomes the DataFrame index.
csv_path = os.path.join(prepared_dir, filename_csv)
dataset_df = pd.read_csv(csv_path, index_col=0)
dataset_df.shape

(4456, 4)

In [9]:
# Remove the lemma / POS columns, which the rest of the notebook does not read.
# A single call with errors='ignore' keeps the cell idempotent: re-running it after the
# columns are already gone no longer raises KeyError.
dataset_df = dataset_df.drop(columns=['input_lemma', 'input_pos'], errors='ignore')

In [10]:
# Character that follows a word (a plain space or a punctuation mark) -> short class token.
PUNC_2_TOKEN = dict([(' ', 'S'), (',', 'C'), ('.', 'P'), ('!', 'EX'), ('?', 'Q')])
# Reverse lookup: class token -> character.
TOKEN_2_PUNC = dict(zip(PUNC_2_TOKEN.values(), PUNC_2_TOKEN.keys()))

In [11]:
# Class token (as it appears in the `target` column) -> entity tag; 'S' (plain space) becomes 'O'.
TOKEN_2_ENTITY = dict(S='O', C='B-,', P='B-.', EX='B-!', Q='B-?')

# Entity tag -> integer id, numbered in the order listed here.
# 'B-:' and 'B-...' are included although TOKEN_2_ENTITY never produces them.
# NOTE(review): this numbering differs from the order of `label_list` used for training,
# and neither id mapping is referenced in the visible cells — confirm before relying on it.
ENTITY_2_ID = {
    entity: entity_id
    for entity_id, entity in enumerate(['O', 'B-,', 'B-.', 'B-!', 'B-?', 'B-:', 'B-...'])
}
# Inverse mapping: integer id -> entity tag.
ID_2_ENTITY = dict(enumerate(ENTITY_2_ID))

In [12]:
# Build two parallel lists with one entry per dataset row:
#   tokens_all - the whitespace-separated words of `input`
#   labels_all - the entity tag of every token in `target`
tokens_all = []
labels_all = []

for input_text, target_text in tqdm(
    zip(dataset_df['input'], dataset_df['target']), total=dataset_df.shape[0]
):
    tokens_all.append(input_text.split())
    # A target token missing from TOKEN_2_ENTITY raises KeyError here instead of being mislabeled.
    labels_all.append([TOKEN_2_ENTITY[token] for token in target_text.split()])

100%|████████████████████████████████████| 4456/4456 [00:00<00:00, 37522.30it/s]


In [13]:
# One row per sentence: its word list and the parallel list of entity tags.
df = pd.DataFrame({'tokens': tokens_all, 'labels': labels_all})

In [14]:
# Show up to 150 characters per cell so the token / label lists stay readable in the preview.
pd.options.display.max_colwidth = 150
# Random preview of five rows; no seed is passed, so the rows differ between runs.
df.sample(5)

Unnamed: 0,tokens,labels
4019,"[никто, того, не, заметил, гости, продолжали, пить, и, уже, благовестили, к, вечерне, когда, встали, из-за, стола]","[O, O, O, B-,, O, O, B-,, O, O, O, O, B-,, O, O, O, B-.]"
1223,"[слушай, сказал, пугачев, с, каким-то, диким, вдохновением]","[B-,, O, O, O, O, O, B-.]"
4035,"[гробовщик, по, обыкновению, своему, побожился, что, лишнего, не, возьмет, значительным, взглядом, обменялся, с, приказчиком, и, поехал, хлопотать]","[B-,, O, O, B-,, B-,, O, O, O, B-,, O, O, O, O, O, O, O, B-.]"
225,"[господи, владыко, простонал, мой, савельич]","[O, B-,, O, O, B-.]"
4054,"[солнце, давно, уже, освещало, постелю, на, которой, лежал, гробовщик]","[O, O, O, O, B-,, O, O, O, B-.]"


#### Dataset

In [15]:
splitting_random_state = 78
val_test_ratio = 0.2  # share of all rows held out of training (validation + test together)
test_ratio = 0.25     # share of the held-out rows set aside as the test part

# First split: training rows vs. held-out rows.
df_train, df_val_test = train_test_split(
    df, test_size=val_test_ratio,
    random_state=splitting_random_state
)
# Second split: held-out rows into validation and test.
df_val, df_test = train_test_split(
    df_val_test, test_size=test_ratio,
    random_state=splitting_random_state
)  # validation and test

# Every row must end up in exactly one of the three parts.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

# NOTE: the 'test' entry holds the validation part (df_val); df_test is not placed in `data`.
# The key name is kept because later cells read tokenized_datasets["test"].
data = DatasetDict({
    'train': Dataset.from_pandas(df_train, preserve_index=False),
    'test': Dataset.from_pandas(df_val, preserve_index=False)
})

data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3564
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 669
    })
})

### Tokenization

In [16]:
from transformers import AutoTokenizer

In [17]:
# Checkpoint name used for both the tokenizer and the model; the trailing comment keeps an alternative name.
model_name = 'markusiko/rubert-base-punctuation'  # 'ai-forever/ruBert-base'

In [18]:
# Tokenizer loaded from the same checkpoint name as the model, so both come from `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [26]:
# The label set could be derived from the training data, e.g.
#     label_list = sorted({label for labels in df_train['labels'] for label in labels})
# but it is fixed by hand instead. It also lists 'B-...' and 'B-:', which TOKEN_2_ENTITY
# never produces — presumably to keep the class count of the pretrained checkpoint (TODO confirm).
# The position of a tag in this list is the integer class id used for training.

label_list = ['B-!', 'B-,', 'B-.', 'B-...', 'B-:', 'B-?', 'O']
label_list

['B-!', 'B-,', 'B-.', 'B-...', 'B-:', 'B-?', 'O']

In [27]:
def tokenize_and_align_labels(pair, label_all_tokens=False):
    """Tokenize a batch of pre-split sentences and spread the word-level tags over the tokens.

    Args:
        pair: batch mapping with "tokens" (one word list per sentence) and 'labels'
            (one tag list per sentence, parallel to the words).
        label_all_tokens: when True every token of a word gets the word's label;
            when False only the first token does and the remaining ones get -100.

    Returns:
        The tokenizer output with an added "labels" entry: per sentence, one integer per
        token, either the position of the tag in ``label_list`` or -100.
    """
    tokenized_inputs = tokenizer(pair["tokens"], truncation=True, is_split_into_words=True)

    def _align(batch_index, word_labels):
        # Walk over the word id of every token of one sentence.
        aligned = []
        last_word = None
        for word in tokenized_inputs.word_ids(batch_index=batch_index):
            if word is None:
                # Special tokens carry no word id; -100 keeps them out of the loss.
                aligned.append(-100)
            elif word != last_word or label_all_tokens:
                # First token of a word (or any token when label_all_tokens is set).
                aligned.append(word_labels[word])
            else:
                # Continuation token of the same word.
                aligned.append(-100)
            last_word = word
        # Tags (strings) become their position in label_list; -100 passes through unchanged.
        return [label_list.index(item) if isinstance(item, str) else item for item in aligned]

    tokenized_inputs["labels"] = [
        _align(i, label) for i, label in enumerate(pair['labels'])
    ]
    return tokenized_inputs

In [28]:
# batched=True hands whole batches to the function, which loops over the sentences of a batch itself.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3564 [00:00<?, ? examples/s]

Map:   0%|          | 0/669 [00:00<?, ? examples/s]

### Model

In [29]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

In [30]:
import logging
from transformers.trainer import logger as noisy_logger
# Raise the threshold of the Trainer module's logger so only WARNING and above are emitted.
noisy_logger.setLevel(logging.WARNING)

In [31]:
# Load the checkpoint with a token-classification head that has one output per entry of label_list.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    # presumably lets loading continue when the checkpoint's head has a different size — TODO confirm
    ignore_mismatched_sizes=True
)

In [32]:
# Attach readable tag names to the model config; a tag's id is its position in label_list.
model.config.id2label = {class_id: tag for class_id, tag in enumerate(label_list)}
model.config.label2id = {tag: class_id for class_id, tag in model.config.id2label.items()}

In [33]:
# Freeze the embedding parameters: with requires_grad off they get no gradient during fine-tuning.
for param in model.bert.embeddings.parameters():
    param.requires_grad = False

In [34]:
# Training hyper-parameters.
batch_size = 20  # used for both the training and the evaluation batch size below
epochs = 3

args = TrainingArguments(
    "ner",  # first positional argument — presumably the output directory (TODO confirm)
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # compute_metrics reads p.inputs; this flag presumably makes the Trainer pass them along — confirm
    include_inputs_for_metrics=True,
    logging_steps=100,
    # hub_model_id="rubert-base-punctuation",
    save_strategy="epoch",
    push_to_hub=False
)

In [35]:
# Collator handed to the Trainer; presumably pads inputs and labels within each batch — TODO confirm.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [36]:
# seqeval metric object used by compute_metrics.
# NOTE(review): the saved cell output echoes this line, which looks like a deprecation warning
# for `load_metric` — confirm, and consider migrating when the environment is next updated.
metric = load_metric("seqeval", trust_remote_code=True)

  metric = load_metric("seqeval", trust_remote_code=True)


In [37]:
def compute_metrics(p):
    """Compute overall precision / recall / F1 / accuracy from an evaluation result.

    Args:
        p: object exposing ``predictions`` (per-token class scores, arg-maxed over axis 2),
            ``label_ids`` (gold class ids, -100 for ignored positions) and ``inputs``
            (token ids, row-aligned with the labels).

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        ``overall_*`` fields of the metric result.
    """
    token_ids = p.inputs
    gold_ids = p.label_ids
    predicted_ids = np.argmax(p.predictions, axis=2)

    def _is_scored(gold, token_id):
        # The tokenizer is consulted only for labelled positions, so ids at ignored
        # positions are never converted.
        if gold == -100:
            return False
        return not tokenizer.convert_ids_to_tokens(int(token_id)).startswith('##')

    # Keep only labelled positions whose token is not a '##' continuation piece.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row, token_row in zip(predicted_ids, gold_ids, token_ids):
        kept = [
            (label_list[predicted], label_list[gold])
            for predicted, gold, token_id in zip(predicted_row, gold_row, token_row)
            if _is_scored(gold, token_id)
        ]
        true_predictions.append([predicted_tag for predicted_tag, _ in kept])
        true_labels.append([gold_tag for _, gold_tag in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=1)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [38]:
# Trainer wiring: model, arguments, datasets, collator, tokenizer and the metric callback.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    # the "test" split was built from df_val, i.e. this is the validation part
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [39]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1157,0.095493,0.95092,0.892943,0.92102,0.967852
2,0.0651,0.093192,0.941379,0.917427,0.929249,0.971044
3,0.0461,0.102711,0.938079,0.923668,0.930818,0.9715


TrainOutput(global_step=537, training_loss=0.0725080227718673, metrics={'train_runtime': 513.6937, 'train_samples_per_second': 20.814, 'train_steps_per_second': 1.045, 'total_flos': 186333424126608.0, 'train_loss': 0.0725080227718673, 'epoch': 3.0})

### Metrics

In [40]:
from transformers import pipeline

In [46]:
# Inference pipeline around the fine-tuned model; its predictions are consumed by get_sentence_with_punctuation.
punct_corrector = pipeline("token-classification", tokenizer=tokenizer, model=model)

In [48]:
# make the output readable + in target form
ALL_PUNC = '.,:;!?'
UNK_TOKEN = 'U'


def get_sentence_with_punctuation(sent, model_preds):
    """Insert predicted punctuation into ``sent`` and encode the result as target tokens.

    Only predictions whose 'end' offset falls on a word boundary (followed by a space or
    by the end of the sentence) are used; predictions ending inside a word are skipped.
    NOTE(review): tokenize_and_align_labels labels only the first token of each word by
    default, so for a word split into several tokens the mark is read from a position
    that carried no training label — confirm this is intended.

    Args:
        sent: sentence without punctuation, words separated by spaces.
        model_preds: iterable of predictions for ``sent``; each item provides an 'entity'
            tag such as 'B-,' and an 'end' character offset into ``sent``.

    Returns:
        Tuple ``(sent_new, target_like)``: the sentence with marks inserted, and a
        space-joined string with one class token per word ('S' when the word is not
        followed by a mark, ``UNK_TOKEN`` for a mark missing from ``PUNC_2_TOKEN``).
    """
    sent_new = sent

    n_marks = 0  # number of characters inserted so far; shifts the offsets of later predictions
    for pred in model_preds:
        # Take everything after the tag prefix so multi-character marks ('B-...') are kept
        # whole; tags without a '-' fall back to their last character.
        punc_mark = pred['entity'].partition('-')[2] or pred['entity'][-1]

        ind_to_place = pred['end'] + n_marks
        if len(sent_new) == ind_to_place:  # last mark
            sent_new = sent_new + punc_mark
        elif sent_new[ind_to_place] == ' ':
            sent_new = sent_new[:ind_to_place] + punc_mark + sent_new[ind_to_place:]
            n_marks += len(punc_mark)

    target_like = []
    for word in sent_new.split():
        last_chr = word[-1]
        if last_chr not in ALL_PUNC:
            target_like.append(PUNC_2_TOKEN[' '])
        else:
            # Marks outside PUNC_2_TOKEN (':' and ';') are reported as UNK_TOKEN.
            target_like.append(PUNC_2_TOKEN.get(last_chr, UNK_TOKEN))

    return sent_new, ' '.join(target_like)

In [50]:
sent = 'в тридцати шагах промаху в карту не дам разумеется из знакомых пистолетов'

get_sentence_with_punctuation(sent, punct_corrector(sent))  # prediction

('в тридцати шагах промаху в карту не дам, разумеется, из знакомых пистолетов.',
 'S S S S S S S C C S S P')

In [63]:
# Inference on the held-out rows: df_val_test contains the validation and the test part.
# The index labels of df_val_test are used as row positions in dataset_df.

model_preds = []
model_preds_as_target = []

for index, row in tqdm(df_val_test.iterrows(), total=df_val_test.shape[0]):
    held_out_sentence = dataset_df.iloc[index]['input']
    sent_with_punc, target_like = get_sentence_with_punctuation(
        held_out_sentence, punct_corrector(held_out_sentence)
    )
    model_preds.append(sent_with_punc)
    model_preds_as_target.append(target_like)

100%|█████████████████████████████████████████| 892/892 [00:27<00:00, 32.15it/s]


In [65]:
# Gold targets of the held-out rows next to the model output, row by row.
val_targets_vs_preds = pd.DataFrame({'target': dataset_df.iloc[df_val_test.index]['target']})
val_targets_vs_preds['pred'] = model_preds_as_target

In [66]:
val_targets_vs_preds.sample(5)

Unnamed: 0,target,pred
3154,S S S S S S C S S P,S S S S S S C S S P
444,S S S C S Q,S S S C S Q
128,C S S S C S P,C S S S C S P
2106,S S C C S S S P,S S C S S S S P
2378,S S S S S C S S S S S S S P,S S S S S C S S S S S S S P


#### Functions

In [71]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [67]:
# Class tokens in two groups — presumably sentence-final marks vs. marks inside a sentence.
END_PUNC = ['P', 'EX', 'Q']
INTR_PUNC = ['S', 'C']

# Column titles for the metrics table, keyed by class token.
NAMES_PUNC = dict(
    S='space (` `)',
    C='comma (`,`)',
    P='point (`.`)',
    EX='excl. (`!`)',
    Q='question (`?`)',
    U='other',
)

# All class tokens of both groups, in alphabetic order.
CLASSES = sorted([*END_PUNC, *INTR_PUNC])

In [68]:
def return_separate_punct(target_vs_pred_df, pred_col_name='output_punc'):
    """Flatten per-sentence target / prediction strings into two parallel token lists.

    Args:
        target_vs_pred_df: DataFrame with a 'target' column and a prediction column, both
            holding space-separated class tokens (one per word).
        pred_col_name: name of the prediction column.

    Returns:
        Tuple ``(targets, predictions)`` of equally long flat lists of class tokens.

    Raises:
        AssertionError: if any row has a different number of target and predicted tokens.
            The check is done per row because mismatches in different rows can cancel out
            in the totals and would then silently pair the wrong tokens.
    """
    test_all_punc_target = []  # list of all punctuation
    test_all_punc_preds = []

    rows = zip(
        target_vs_pred_df.index,
        target_vs_pred_df['target'],
        target_vs_pred_df[pred_col_name],
    )
    for row_label, target_this, predicted_this in rows:
        target_tokens = target_this.split(' ')
        predicted_tokens = predicted_this.split(' ')
        if len(target_tokens) != len(predicted_tokens):
            raise AssertionError(
                f'row {row_label!r}: {len(target_tokens)} target tokens '
                f'vs {len(predicted_tokens)} predicted tokens'
            )
        test_all_punc_target.extend(target_tokens)
        test_all_punc_preds.extend(predicted_tokens)

    return test_all_punc_target, test_all_punc_preds


def get_all_metrics(test_df, pred_col_name='output_punc'):
    """Print a per-class precision / recall / F1 table for the predicted class tokens.

    Args:
        test_df: DataFrame with a 'target' column and a prediction column of
            space-separated class tokens.
        pred_col_name: name of the prediction column.

    Returns:
        None; the table is written to standard output, one column per entry of CLASSES.
    """
    test_all_punc_target, test_all_punc_preds = return_separate_punct(
        test_df, pred_col_name=pred_col_name
    )

    # The label order is pinned to CLASSES so every score lands under its own header column,
    # even when a class is missing from the data or an extra label (e.g. 'U') shows up in
    # the predictions.
    # precision = TP / (TP + FP)
    precision = precision_score(
        test_all_punc_target, test_all_punc_preds,
        labels=CLASSES, average=None, zero_division=np.nan
    )
    # recall = TP / (TP + FN)
    recall = recall_score(
        test_all_punc_target, test_all_punc_preds,
        labels=CLASSES, average=None, zero_division=np.nan
    )
    # f1 = 2TP / (2TP + FP + FN)
    f1 = f1_score(test_all_punc_target, test_all_punc_preds, labels=CLASSES, average=None)

    # PRINT
    metrics = {'Precision': precision, 'Recall': recall, 'F1 score': f1}
    col_w = 16  # width of every table column, in characters

    print(' ' * col_w + '|' + ''.join(f"{NAMES_PUNC[token].ljust(col_w)}|" for token in CLASSES))  # header
    print(''.join('-' * col_w + '|' for _ in range(len(CLASSES) + 1)))
    for metric_name, scores in metrics.items():
        row = f"{metric_name.ljust(col_w)}|"
        for score in scores:
            score_str = f'{score:.6f}'
            row += f"{score_str.ljust(col_w)}|"
        print(row)

#### Result on validation set

In [72]:
%%time
get_all_metrics(val_targets_vs_preds, pred_col_name='pred')

                |comma (`,`)     |excl. (`!`)     |point (`.`)     |question (`?`)  |space (` `)     |
----------------|----------------|----------------|----------------|----------------|----------------|
Precision       |0.936912        |0.500000        |0.958587        |0.770492        |0.981751        |
Recall          |0.912668        |0.166667        |0.978856        |0.810345        |0.987088        |
F1 score        |0.924631        |0.250000        |0.968615        |0.789916        |0.984412        |
CPU times: user 98.9 ms, sys: 12.9 ms, total: 112 ms
Wall time: 118 ms


#### Result on the whole dataset

In [76]:
# Inference on the whole dataset (this includes the rows the model was trained on).

model_all_preds = []

for input_sentence in tqdm(dataset_df['input'], total=dataset_df.shape[0]):
    _, target_like = get_sentence_with_punctuation(
        input_sentence, punct_corrector(input_sentence)
    )
    model_all_preds.append(target_like)

100%|███████████████████████████████████████| 4456/4456 [01:09<00:00, 63.82it/s]


In [77]:
# Adds the predictions as a new column of dataset_df in place; model_all_preds has one entry per row.
dataset_df['pred'] = model_all_preds

In [78]:
dataset_df.sample(5)

Unnamed: 0,input,target,pred
770,они отряхиваясь подходили к руке пугачева который объявлял им прощение и принимал в свою шайку,C C S S S C S S S S S S S S P,C C S S S C S S S S S S S S P
3755,малое число книг найденных мною под шкафами и в кладовой были вытвержены мною наизусть,S S C S S S S S S C S S S P,S S C S S S S S S C S S S P
3101,несколько троек наполненных разбойниками разъезжали днем по всей губернии останавливали путешественников и почту приезжали в селы грабили помещичь...,S C S C S S S S C S S S C S S C S S S S S S P,S C S C S S S S C S S S C S S C S S S S S S P
575,василиса егоровна тотчас захотела отправиться в гости к попадье и по совету ивана кузмича взяла с собою и машу чтоб ей не было скучно одной,S S S S S S S S S C S S S C S S S S C S S S S S P,S S S S S S S S S C S S S C S S S S C S S S S S P
262,я велел ехать к коменданту и через минуту кибитка остановилась перед деревянным домиком выстроенным на высоком месте близ деревянной же церкви,S S S S C S S S S S S S C S S S C S S S P,S S S S C S S S S S S S C S S S C S S S P


In [79]:
%%time
get_all_metrics(dataset_df, pred_col_name='pred')

                |comma (`,`)     |excl. (`!`)     |point (`.`)     |question (`?`)  |space (` `)     |
----------------|----------------|----------------|----------------|----------------|----------------|
Precision       |0.971699        |0.802198        |0.973858        |0.860294        |0.992787        |
Recall          |0.965740        |0.414773        |0.989327        |0.932271        |0.994070        |
F1 score        |0.968710        |0.546816        |0.981532        |0.894837        |0.993428        |
CPU times: user 363 ms, sys: 8.72 ms, total: 372 ms
Wall time: 379 ms
