# PolEval 2022 Task 1 - Punctuation prediction from conversational language
This notebook shows the code used for training and inference of the model that achieved the best results during the competition.
The model reaches a weighted average F1 score of 83.30 over the punctuation signs on the reference test set.

The code was run in Google Colaboratory because of its high computational requirements; the GPU used was an NVIDIA A100 with 40 GB of VRAM. The notebook form makes it easy to reproduce the results with minimal setup.

Note that, besides additional data processing, various trial-and-error and fine-tuning approaches were tried before the aforementioned model was created.

In [1]:
# using A100 GPU, 40GB VRAM
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Sep 14 19:27:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

The data used for training and testing, as well as the model itself, are available here:

https://drive.google.com/drive/folders/1MWseXH6Qs19Ej86sYNpSdPHIA5IiBszC?usp=sharing

In [None]:
# from google.colab import drive # way to use your own GDrive as a local disk
# drive.mount('/content/drive')

## Needed software

In [None]:
!pip install transformers
!pip install sacremoses
!pip install simpletransformers

from collections import defaultdict
import os
import sys
import time

import pandas as pd
import torch
import sklearn.metrics
from simpletransformers.ner import NERArgs, NERModel

In [None]:
# use the first CUDA device when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

## Parser for train data

In [None]:
def parse_train_data(path_file=r"2022-punctuation-prediction/dev-0", save_path=None):
    """
    Build token-level punctuation labels for one dataset split.

    Reads ``in.tsv`` (the text is taken from the third tab-separated column) and
    ``expected.tsv`` from ``path_file`` (resolved against the current working
    directory) and aligns them token by token. A token's label is the punctuation
    that ``expected.tsv`` appends to it, or 'O' when nothing is appended.

    Args:
        path_file: directory containing ``in.tsv`` and ``expected.tsv``.
        save_path: optional file name, created inside ``path_file``, that receives
            one ``token<TAB>label`` line per token followed by a blank line after
            every sentence. When None (the default) no file is written.

    Returns:
        pandas.DataFrame with the columns ``sentence_id``, ``words`` and ``labels``.

    Raises:
        AssertionError: if a sentence has a different number of tokens in the two files.
    """
    base_dir = os.path.join(os.getcwd(), path_file)
    in_path = os.path.join(base_dir, "in.tsv")
    exp_path = os.path.join(base_dir, "expected.tsv")
    print(in_path, exp_path)

    list_data = []
    out_lines = []
    # explicit UTF-8: the corpus is Polish text, so do not depend on the platform default encoding
    with open(in_path, 'r', encoding='utf-8') as f1, open(exp_path, 'r', encoding='utf-8') as f2:
        # NOTE: zip stops at the shorter file; both files are expected to have one line per sentence
        for sent_id, (in_line, expected) in enumerate(zip(f1, f2)):
            text = in_line.rstrip('\n').split('\t')[2]
            text = text.replace("'", " ")  # remove apostrophe since it's not in the expected output
            text = text.replace("…", "...")  # use the same triple-dot form as the expected file

            # every input token is cut at its first ':'; only the part before it is the word
            tokens = [token.split(':')[0] for token in text.split()]
            expected_tokens = expected.rstrip('\n').split()

            if len(tokens) != len(expected_tokens):  # show the offending pair before failing
                print(tokens)
                print(expected_tokens)
            assert len(tokens) == len(expected_tokens), \
                f'token count mismatch in sentence {sent_id}'

            for in_token, expected_token in zip(tokens, expected_tokens):
                if in_token == expected_token:  # no punctuation sign expected
                    label = 'O'
                elif in_token == expected_token[:-1]:  # single-character punctuation sign
                    label = expected_token[-1]
                elif expected_token.endswith('...') and in_token == expected_token[:-3]:  # triple dot
                    label = '...'
                else:
                    # tokens differ in more than a trailing punctuation sign
                    print('WARNING', in_token, expected_token, file=sys.stderr)
                    label = 'O'
                out_lines.append(f'{in_token}\t{label}\n')
                list_data.append([sent_id, in_token, label])
            out_lines.append('\n')  # blank line separates sentences

    if save_path is not None:
        with open(os.path.join(base_dir, save_path), 'w', encoding='utf-8') as out:
            out.writelines(out_lines)

    return pd.DataFrame(data=list_data, columns=['sentence_id', 'words', 'labels'])


# parse the train and dev-0 splits; each call also writes a token<TAB>label file next to the inputs
train_df = parse_train_data(path_file=r"2022-punctuation-prediction/train", save_path='output_train.txt')
dev_df = parse_train_data(path_file=r"2022-punctuation-prediction/dev-0", save_path='output_dev.txt')

In [42]:
# preview of the train dataframe: one row per token with its sentence id, word and punctuation label
train_df

Unnamed: 0,sentence_id,words,labels
0,0,I,O
1,0,teraz,O
2,0,mamy,O
3,0,drugi,O
4,0,dzień,O
...,...,...,...
94024,10599,zostawię,O
94025,10599,tutaj,.
94026,10600,Brawo,!
94027,10600,Brawo,!


## model training

In [16]:
# more punctuation signs than we need; however, this yields better results than a smaller list of labels
labels = ['O', ':', ';', ',', '.', '-', '...', '?', '!']


def merge_data(fn, y_true, y_pred, **kwargs):
    """
    Flatten per-sentence label lists and hand them to a scorer.

    ``y_true`` and ``y_pred`` are iterables of per-sentence sequences; each is
    concatenated into one flat list and ``fn(flat_true, flat_pred, **kwargs)``
    is returned.
    """
    flat_true = [label for sentence in y_true for label in sentence]
    flat_pred = [label for sentence in y_pred for label in sentence]
    return fn(flat_true, flat_pred, **kwargs)


def f1_per_label(y_true, y_pred):
    """
    F1 score for each entry of ``labels[1:]`` (the global label list without its first entry).

    Returns a dict whose keys are the positions within ``labels[1:]`` as strings
    ('0', '1', ...) and whose values are the corresponding scores.
    """
    scores = merge_data(
        sklearn.metrics.f1_score,
        y_true,
        y_pred,
        labels=labels[1:],
        average=None,
        zero_division=0,
    )
    return dict((str(position), score) for position, score in enumerate(scores))


def pr_per_label(y_true, y_pred):
    """
    Precision for each entry of ``labels[1:]`` (the global label list without its first entry).

    Returns a dict whose keys are the positions within ``labels[1:]`` as strings
    ('0', '1', ...) and whose values are the corresponding scores.
    """
    scores = merge_data(
        sklearn.metrics.precision_score,
        y_true,
        y_pred,
        labels=labels[1:],
        average=None,
        zero_division=0,
    )
    return dict((str(position), score) for position, score in enumerate(scores))


def rc_per_label(y_true, y_pred):
    """
    Recall for each entry of ``labels[1:]`` (the global label list without its first entry).

    Returns a dict whose keys are the positions within ``labels[1:]`` as strings
    ('0', '1', ...) and whose values are the corresponding scores.
    """
    scores = merge_data(
        sklearn.metrics.recall_score,
        y_true,
        y_pred,
        labels=labels[1:],
        average=None,
        zero_division=0,
    )
    return dict((str(position), score) for position, score in enumerate(scores))


def get_args(model_type, model_name, output_dir):
    """
    Build the simpletransformers ``NERArgs`` used for fine-tuning.

    Args:
        model_type: value stored in ``ner_args.model_type``.
        model_name: value stored in ``ner_args.model_name``.
        output_dir: value stored in ``ner_args.output_dir``.

    Returns:
        The configured ``NERArgs`` instance.
    """
    ner_args = NERArgs()
    ner_args.model_type = model_type
    ner_args.model_name = model_name

    # NOTE(review): an early-stopping metric is configured, but `use_early_stopping` is not
    # switched on in this function — confirm whether early stopping was actually intended.
    ner_args.early_stopping_metric = 'f1_weighted'
    ner_args.early_stopping_metric_minimize = False  # higher F1 is better

    ner_args.train_batch_size = 20
    ner_args.eval_batch_size = 20

    ner_args.learning_rate = 2e-5
    ner_args.num_train_epochs = 5
    ner_args.evaluate_during_training = True
    ner_args.evaluate_during_training_steps = 500
    ner_args.max_seq_length = 256
    ner_args.warmup_steps = 1
    ner_args.save_eval_checkpoints = False
    ner_args.use_multiprocessing = False
    ner_args.use_multiprocessing_for_evaluation = False

    ner_args.loss_type = 'focal'
    ner_args.loss_args = {
        'alpha': 0.25,
        'gamma': 2,
        'reduction': 'mean',
        'ignore_index': -100,  # presumably the label id of positions excluded from the loss — confirm
    }

    # fix: this used to be a bare `ner_args.output_dir` expression, so the parameter was never stored
    ner_args.output_dir = output_dir

    return ner_args


In [18]:
model_type = 'herbert'
model_name = 'allegro/herbert-base-cased'

# Extra scorers handed to train_model as keyword arguments. All of them score labels[1:] only,
# i.e. the global label list without its first entry.
metrics = {
    'f1_micro': lambda y_true, y_pred: merge_data(
        sklearn.metrics.f1_score, y_true, y_pred, labels=labels[1:], average='micro', zero_division=0),
    'f1_macro': lambda y_true, y_pred: merge_data(
        sklearn.metrics.f1_score, y_true, y_pred, labels=labels[1:], average='macro', zero_division=0),
    'f1_weighted': lambda y_true, y_pred: merge_data(
        sklearn.metrics.f1_score, y_true, y_pred, labels=labels[1:], average='weighted', zero_division=0),
    'confusion_matrix': lambda y_true, y_pred: merge_data(
        sklearn.metrics.confusion_matrix, y_true, y_pred, labels=labels[1:]),
    # the per-label scorers already take (y_true, y_pred), so they are passed directly
    'f1_class': f1_per_label,
    'pr_class': pr_per_label,
    'rc_class': rc_per_label,
}

# model_name contains a '/', so this resolves to a nested directory: model_dir_allegro/herbert-base-cased_<timestamp>
output_dir = f'model_dir_{model_name}_{time.time()}'
ner_args = get_args(model_type, model_name, output_dir)

model = NERModel(model_type, model_name, labels=labels, args=ner_args, use_cuda=True)

model.train_model(train_df, output_dir=output_dir, eval_data=dev_df, **metrics)

## prediction

In [None]:
# load a fine-tuned checkpoint for inference
model_type = 'herbert'
# presumably a local checkpoint directory produced by a training run — adjust to your own path
model_name = 'herbert-base-cased_1678740852.5044909/checkpoint-1660-epoch-5'
# same label list as in the training section, repeated so this section does not rely on that cell
labels = ['O', ':', ';', ',', '.', '-', '...', '?', '!']

model = NERModel(model_type,
                 model_name,
                 labels=labels,
                 use_cuda=True)

In [7]:
def read_tsv_in_file(path_file):
    """
    Load the sentences of one dataset split for inference.

    Reads ``in.tsv`` from ``path_file`` (resolved against the current working
    directory), takes the third tab-separated column and keeps, for every
    whitespace-separated token, only the part before its first ':'.

    Args:
        path_file: directory containing ``in.tsv``.

    Returns:
        list of str, one space-joined sentence per input row. A row with an empty
        text column yields an empty string.
    """
    import csv  # local import: only needed for the quoting constant

    df = pd.read_csv(
        os.path.join(os.getcwd(), path_file, "in.tsv"),
        sep="\t",
        header=None,
        dtype=str,  # never let pandas turn a text field into a number
        keep_default_na=False,  # keep empty fields and words like "NA" as plain strings
        quoting=csv.QUOTE_NONE,  # quote characters are ordinary text, as in parse_train_data
    )

    return [' '.join(token.split(':')[0] for token in text.split()) for text in df[2]]

# load the sentences of the dev-0, test-A and test-B splits
texts_dev = read_tsv_in_file(r"2022-punctuation-prediction/dev-0")
texts_test = read_tsv_in_file(r"2022-punctuation-prediction/test-A")
texts_test_b = read_tsv_in_file(r"2022-punctuation-prediction/test-B")

In [8]:
def merge_sent(texts):
    """
    Merge sentences that appear to be one longer sentence (this trick improves results).

    A sentence starts a new group when its first character is uppercase or is not
    a letter; any other sentence is appended to the group opened most recently.
    The very first sentence always starts a group, and an empty sentence is
    attached to the current group with a length of 0.

    Args:
        texts: list of space-separated sentences.

    Returns:
        (merged_sents, sentences_tokens_map):
        merged_sents is the list of merged, space-joined sentences;
        sentences_tokens_map maps the index of the sentence that opened a group to
        a list of ``(original_index, token_count)`` tuples, in input order, so the
        merge can be undone later.
    """
    sentences_tokens_map = defaultdict(list)
    last_capital = 0
    merged_sents = []
    for i, sent in enumerate(texts):
        tokens = sent.split()
        first_char = tokens[0][0] if tokens else ''
        looks_like_start = bool(first_char) and (first_char.isupper() or not first_char.isalpha())
        # fix: without the `not merged_sents` guard a lowercase (or empty) first sentence
        # had no previous group to join and raised IndexError
        if not merged_sents or looks_like_start:
            last_capital = i
            merged_sents.append(tokens)
        else:
            merged_sents[-1] = merged_sents[-1] + tokens
        sentences_tokens_map[last_capital].append((i, len(tokens)))

    merged_sents = [' '.join(group) for group in merged_sents]
    return merged_sents, sentences_tokens_map


# merge the sentences of every split; the maps are kept so the merge can be undone after prediction
merged_texts_dev, map_dev = merge_sent(texts_dev)
merged_texts_test, map_test = merge_sent(texts_test)
merged_texts_test_b, map_test_b = merge_sent(texts_test_b)

In [None]:
# run inference on the merged sentences of every split
# NOTE(review): despite the `logits_*` names, element [0] of each result is consumed later as the
# per-sentence predictions (lists of {word: label} dicts) — confirm against the predict() docs
logits_dev = model.predict(merged_texts_dev, split_on_space=True)
logits_test = model.predict(merged_texts_test, split_on_space=True)
logits_test_b = model.predict(merged_texts_test_b, split_on_space=True)


In [51]:
def process_logits_list(logits):
    """
    Turn predicted labels back into punctuated text.

    ``logits[0]`` is read as a list of sentences, each a list of ``{word: label}``
    dicts. A word labelled 'O' is kept as is; any other label is appended directly
    to the word. Returns one space-joined string per sentence.
    """
    out = []
    for sentence in logits[0]:
        pieces = []
        for token_prediction in sentence:
            for word, label in token_prediction.items():
                pieces.append(word if label == 'O' else word + label)
        out.append(' '.join(pieces).strip())
    return out


# convert the model output of every split into punctuated text
pred_dev = process_logits_list(logits_dev)
pred_test = process_logits_list(logits_test)
pred_test_b = process_logits_list(logits_test_b)

In [53]:
def rebuild(results, sentences_map):
    """
    Undo the sentence merge so the output lines up with the original input rows.

    ``results`` holds one punctuated string per merged group, in the iteration
    order of ``sentences_map``. Each map value is a list of
    ``(sentence_index, token_count)`` pairs. A group with a single entry is
    copied through unchanged; otherwise the group's tokens are cut into
    consecutive chunks of the recorded token counts.
    """
    restored = []
    for group_index, parts in enumerate(sentences_map.values()):
        merged_text = results[group_index]
        if len(parts) == 1:
            restored.append(merged_text)
            continue

        tokens = merged_text.split()
        start = 0
        for _, token_count in parts:
            restored.append(' '.join(tokens[start:start + token_count]))
            start += token_count
    return restored


# split the merged predictions back into one line per original input sentence
pred_dev2 = rebuild(pred_dev, map_dev)
pred_test2 = rebuild(pred_test, map_test)
pred_test2_b = rebuild(pred_test_b, map_test_b)

In [None]:
def save_results(results, out_path):
    """
    Write the predictions to ``out_path``, one sentence per line.

    Args:
        results: iterable of sentences; each one is written followed by a newline.
        out_path: path of the file to create (an existing file is overwritten).
    """
    # explicit UTF-8 so Polish characters do not depend on the platform default encoding
    with open(out_path, 'w', encoding='utf-8') as out:
        out.writelines(f'{sent}\n' for sent in results)


# write the predictions for every split into the current working directory
save_results(pred_dev2, 'dev_res_herbert1.tsv')
save_results(pred_test2, 'test_res_herbert1.tsv')
save_results(pred_test2_b, 'test_res_b_herbert1.tsv')