In [1]:
import numpy as np
import pandas as pd

In [2]:
import random

In [3]:
from tqdm import tqdm

In [4]:
import re
import os

In [5]:
import time

In [6]:
from collections import Counter

In [7]:
from sklearn.model_selection import train_test_split

#### PyTorch

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, Subset
from torch.utils.data.sampler import SequentialSampler
from torch.utils.data import DataLoader

import torch.optim as optim

In [9]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [10]:
import torchinfo

#### Embeddings

In [11]:
# Should be used by default. Shows best results on intrinsic evaluations.
# Model was trained on large corpus of an literature (~150GB).

# !wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

In [12]:
from navec import Navec

#### Metrics

In [13]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [14]:
from Levenshtein import distance as levenshtein_distance

#### Visualisation

In [15]:
import matplotlib
import matplotlib.pyplot as plt

In [16]:
import scienceplots

plt.style.use('science')
%config InlineBackend.figure_format = 'retina'

lables_fs = 16
ticks_fs = 12

In [17]:
import seaborn as sns

## Load data

In [18]:
prepared_dir = '../data/prepared'
filename_csv = '02_punct_pushkin.csv'

In [19]:
# load saved dataset
data_df = pd.read_csv(os.path.join(prepared_dir, filename_csv), index_col=0)
data_df.shape

(4456, 3)

In [20]:
pd.options.display.max_colwidth = 150
data_df.sample(5)

Unnamed: 0,input,input_lemma,new_target
3319,ну вот же вам ваши деньги отправляйтесь назад,ну вот же вы ваш деньга отправляться назад,S S S S S C S F
4094,эй дуня закричал смотритель поставь самовар да сходи за сливками,эй дуня закричать смотритель поставить самовар да сходить за сливка,C C S C S S S S S F
2540,к вечеру буря утихла и ветер оборотился в противную сторону,к вечер буря утихнуть и ветер оборотиться в противный сторона,S S S C S S S S S F
3605,он упал у колеса разбойники окружили его,он упасть у колесо разбойник окружить он,S S S C S S F
2690,пугачев ночевал на месте сражения на другой день занял дубовку и двинулся к царицыну,пугачев ночевать на место сражение на другой день занять дубовка и двинуться к царицын,S S S S C S S S S S S S S F


### Pretrained Embeddings

In that implementation we will use [`navec`](https://github.com/natasha/navec#evaluation) library of pretrained word embeddings for Russian language.

In [26]:
navec_path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec_embed = Navec.load(navec_path)

In [27]:
EMBED_SIZE = 300

PAD = '<pad>'
UNK = '<unk>'

### DataFrame of bigrams

In [54]:
def get_bigrams_df(data_df):
    bigrams_df = pd.DataFrame()  # dataframe of all bigrams
    
    bigrams_list = []
    bigrams_punc_list = []
    
    for ind_row in tqdm(data_df.index):
        sentence = data_df.loc[ind_row]['input'].split()
        sent_len = len(sentence)
    
        sent_all_punc = data_df.loc[ind_row]['new_target'].split()
    
        assert len(sent_all_punc) == sent_len
        
        for ind_word in range(sent_len):
            if ind_word == sent_len - 1:
                bigram = ' '.join([sentence[ind_word], PAD])
            else:
                bigram = ' '.join([sentence[ind_word], sentence[ind_word + 1]])
    
            bigrams_list.append(bigram)
            bigrams_punc_list.append(sent_all_punc[ind_word])
    
    bigrams_df['input'] = bigrams_list
    bigrams_df['target'] = bigrams_punc_list

    return bigrams_df

In [55]:
bigrams_df = get_bigrams_df(data_df)

100%|██████████████████████████████████████| 4456/4456 [00:00<00:00, 22117.63it/s]


In [65]:
bigrams_df.sample(10)

Unnamed: 0,input,target
39480,троекурову отымают,C
38529,здравствуй володька,C
12744,сказать что,C
52742,почталион погорельского,S
30209,удачно <pad>,F
44020,но дубровского,S
51889,чувствуя затруднительность,S
2145,я в,S
56610,уж вас,S
6847,удержания их,S


## Dataset

**In that approach** we will use _original_ (not lemmatized) sentences as input (`input` column)

In [42]:
IGNORE_ID = -1  # token id to ignore

# punctuation vocab
PUNC_2_ID = {'S': 0, 'C': 1, 'F':2}
ID_2_PUNC = {v: k for k, v in PUNC_2_ID.items()}

In [66]:
class BigramsDataset(Dataset):
    """Custom Dataset for punctuation prediction"""

    def __init__(self, df, input_col, target_col, embed):
        self.bigrams = df[input_col]  # all sentences
        self.targets = df[target_col]  # all targets

        self.embed = embed  # navec embedding

    def __len__(self):
        """Return number of sentences"""
        return len(self.bigrams)

    def __getitem__(self, index):
        """Return one Tensor pair of (input id sequence, punc id sequence)"""
        bigram = self.bigrams[index]
        target = self.targets[index]
        
        words_seq, punc_id = self._preprocess(bigram, target)
        return words_seq, punc_id

    def _preprocess(self, bigram, target):
        """Convert txt sequence to word-id-seq and punc-id-seq"""
        # INPUT
        input_tensor = None
        
        for word in bigram.split():
            if word in self.embed:  # if word in vocab
                word_embed = navec_embed[word]
            else:
                word_embed = navec_embed[UNK]
    
            assert len(word_embed) == EMBED_SIZE
            assert isinstance(word_embed, np.ndarray)
        
            word_embed_tensor = torch.tensor(word_embed)  # size: [embed_size]
            word_embed_tensor = word_embed_tensor.unsqueeze(dim=0)  # size: [1, embed_size]
        
            assert word_embed_tensor.size() == (1, EMBED_SIZE)
        
            if input_tensor is None:
                input_tensor = word_embed_tensor
            else:
                input_tensor = torch.cat(
                    (input_tensor, word_embed_tensor), 
                    dim=0
                )
        # size: [2, embed_size]
        assert input_tensor.size() == (2, EMBED_SIZE)
        
        # OUTPUT
        output = []
        for punc in target.split():
            output.append(PUNC_2_ID.get(punc))

        return input_tensor, torch.LongTensor(output)

In [67]:
bigrams_ds = BigramsDataset(
    df=bigrams_df, 
    input_col='input', 
    target_col='target',
    embed=navec_embed
)

print(len(bigrams_ds))  # dataset length

58773


### Batches

In [68]:
data_loader = DataLoader(
    bigrams_ds,
    batch_size=10, 
    drop_last=False,
    num_workers=0
)

In [69]:
next(iter(data_loader))[0].size()

torch.Size([10, 2, 300])

## Model

In [194]:
class BigramsClassifier(nn.Module):
    def __init__(
        self, num_class, 
        hidden_size=64,
        kernel_size=5,
        stride=1
    ):
        super().__init__()

        self.num_class = num_class
        # Hyper-parameters
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.stride = stride

        left_padding = stride * (kernel_size - 1)
        self.padding = nn.ZeroPad2d((left_padding, 0, 0, 0)) # left padding

        self.conv_1 = nn.Conv1d(
            in_channels=EMBED_SIZE,
            out_channels=self.hidden_size, 
            kernel_size=self.kernel_size, 
            stride=self.stride
        )
        self.conv_2 = nn.Conv1d(
            in_channels=self.hidden_size,
            out_channels=self.hidden_size,
            kernel_size=2,
            # kernel_size=self.kernel_size,
            # stride=self.stride
        )

        # self.conv_3 = nn.Conv1d(
        #     in_channels=self.hidden_size,
        #     out_channels=self.hidden_size // 2, 
        #     kernel_size=2,
        # )
        
        self.fc = nn.Linear(self.hidden_size, self.num_class)

    def forward(self, embedded_input):
        embedded_input = torch.permute(embedded_input, (0, 2, 1))
        # print(f'input: {embedded_input.size()}')
        
        out = self.conv_1(self.padding(embedded_input))
        # print(f'conv_1: {out.size()}')

        # out = self.conv_2(self.padding(out))
        out = self.conv_2(out)
        # print(f'conv_2: {out.size()}')

        # out = self.conv_3(out)
        # print(f'conv_3: {out.size()}')
        
        out = torch.permute(out, (0, 2, 1))
        out = self.fc(out)
        # print(f'output: {out.size()}')
        return out # output tensor should be of shape [batch_size, 2, n_tokens]

In [159]:
# test of conv1d
a = torch.randn(15, 100, 2)  
m = nn.Conv1d(100, 32, kernel_size=2, stride=1) 
out = m(a)
print(out.size())

torch.Size([15, 32, 1])


## Training and Evaluationg loops

In [121]:
def train_fn(model, data_loader, loss_func, optimizer,
             device='cpu', show_process=False):
    '''
    Function to train `model`
    Args:
        model: torch.nn.Module - Neural Network
        data_loader: torch.utils.data.DataLoader - loader (by batches) for the train dataset
        loss_func - loss function
        optimizer: torch.optim
        device: str - device to computate on
        show_process: bool - flag to show (or not) a progress bar
    Returns:
        mean loss by batches
    '''
    model.train()  # activate 'train' mode of a model
    train_loss = []  # to store loss for each batch

    for X, y in tqdm(data_loader, total=len(data_loader),
                                desc='train', position=0,
                                leave=True, disable=not show_process):  # [X, y] - batch
        X, y = X.to(device), y.to(device)
        
        optimizer.zero_grad()

        y_hat = model(X)  # size: [bs, max_sent_length, num_classes]

        # print(f'{y} vs {y_hat}')
        
        y_hat = y_hat.view(-1, y_hat.size(-1))
        loss = loss_func(y_hat, y.view(-1))  # loss calculation for the batch
        
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_norm)
        optimizer.step()

        train_loss.append(loss.item())  # accumulate losses for batches

    return np.mean(train_loss)  # return mean loss of the epoch

In [114]:
def validate_fn(model, data_loader, loss_func,
                device='cpu', show_process=False):
    '''
    Function to train `model`
    Args:
        model: torch.nn.Module - Neural Network
        data_loader: torch.utils.data.DataLoader - loader (by batches) for the validation dataset
        loss_func - loss function
        device: str - device to computate on
        show_process: bool - flag to show (or not) a progress bar
    Returns:
          mean loss by batches
    '''
    model.eval()  # activate 'eval' mode of a model
    val_loss = []  # to store loss for each batch

    for X, y in tqdm(data_loader, total=len(data_loader),
                                desc='validation', position=0,
                                leave=True, disable=not show_process):  # [X, y] - batch
        X, y = X.to(device), y.to(device)

        with torch.no_grad():
            y_hat = model(X)  # size: [bs, max_sent_length, num_classes]

            y_hat = y_hat.view(-1, y_hat.size(-1))
            loss = loss_func(y_hat, y.view(-1))  # loss calculation for the batch

        val_loss.append(loss.item())  # accumulate losses for batches

    return np.mean(val_loss)

## Model training

### DataLoaders

In [87]:
splitting_random_state = 78
test_ratio = 0.25

In [75]:
# data splitting
train_df, test_df = train_test_split(
    data_df, 
    test_size=test_ratio, 
    random_state=splitting_random_state
)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [76]:
bigrams_train_df = get_bigrams_df(train_df)
len(bigrams_train_df)

100%|██████████████████████████████████████| 3342/3342 [00:00<00:00, 25928.50it/s]


44358

In [77]:
bigrams_test_df = get_bigrams_df(test_df)
len(bigrams_test_df)

100%|██████████████████████████████████████| 1114/1114 [00:00<00:00, 15730.21it/s]


14415

In [78]:
len(bigrams_train_df) / (len(bigrams_df))

0.7547343167781124

In [83]:
# datasets
train_ds = BigramsDataset(
    df=bigrams_train_df, 
    input_col='input', 
    target_col='target',
    embed=navec_embed
)

test_ds = BigramsDataset(
    df=bigrams_test_df, 
    input_col='input', 
    target_col='target',
    embed=navec_embed
)

In [175]:
train_bs = 250
val_bs = 1000

train_loader = DataLoader(
    train_ds,
    batch_size=train_bs, 
    drop_last=False,
    num_workers=0
)

val_loader = DataLoader(
    test_ds,
    batch_size=val_bs, 
    drop_last=False,
    num_workers=0
)

### Training model

In [176]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [191]:
# model parameters
num_class = len(PUNC_2_ID)

hidden_size = 32
kernel_size = 3
stride = 1

In [197]:
# MODEL
model = BigramsClassifier(
    num_class, 
    hidden_size,
    kernel_size, stride
).to(device)

# criterion
loss_func = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_ID)
# optimizer
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=0.0
)
# scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    # factor=0.5,  # default: 0.1
    # patience=2,  # default: 10
)

In [198]:
n_epochs = 20
print_each = 1

start_time = time.time()
prev_val_loss = 100
for epoch in range(n_epochs):
    start_epoch_time = time.time()
    if (epoch == 0) or ((epoch + 1) % print_each == 0):
        print(f'Epoch #{epoch + 1}: ', end='')

    # torch.manual_seed(48)  # for reproducibility
    mean_train_loss = train_fn(model, train_loader, loss_func,
                               optimizer,
                               device=device,
                               show_process=False
                              )  # train the model
    mean_val_loss = validate_fn(model, val_loader, loss_func,
                                device=device,
                                show_process=False
                               )  # evaluate the model
    
    if (epoch == 0) or ((epoch + 1) % print_each == 0):
        log_info = (f'\ttrain - {mean_train_loss:.6f}; ' +
                    f'\tval - {mean_val_loss:.6f}' + 
                    f'\t\ttime - {(time.time() - start_time):.3f} s'
                   )
        print(log_info)

    if prev_val_loss < mean_val_loss:
            break
    prev_val_loss = mean_val_loss

    if scheduler:
        scheduler.step(mean_val_loss)

Epoch #1: 	train - 0.564869; 	val - 0.482642		time - 4.767 s
Epoch #2: 	train - 0.470776; 	val - 0.468384		time - 9.364 s
Epoch #3: 	train - 0.458943; 	val - 0.465848		time - 13.904 s
Epoch #4: 	train - 0.453823; 	val - 0.464603		time - 18.379 s
Epoch #5: 	train - 0.450574; 	val - 0.463622		time - 23.338 s
Epoch #6: 	train - 0.448198; 	val - 0.462771		time - 28.212 s
Epoch #7: 	train - 0.446341; 	val - 0.462022		time - 32.721 s
Epoch #8: 	train - 0.444836; 	val - 0.461363		time - 38.435 s
Epoch #9: 	train - 0.443589; 	val - 0.460785		time - 44.494 s
Epoch #10: 	train - 0.442539; 	val - 0.460281		time - 53.700 s
Epoch #11: 	train - 0.441639; 	val - 0.459859		time - 65.438 s
Epoch #12: 	train - 0.440861; 	val - 0.459519		time - 77.800 s
Epoch #13: 	train - 0.440207; 	val - 0.459176		time - 89.745 s
Epoch #14: 	train - 0.439577; 	val - 0.458992		time - 99.441 s
Epoch #15: 	train - 0.438985; 	val - 0.459039		time - 109.344 s


#### Model metrics

In [199]:
model_test = model

In [200]:
END_PUNC = ['F']
INTR_PUNC = ['S', 'C']

NAMES_PUNC = {
    'S': 'space (` `)',
    'C': 'comma (`,`)',
    'F': 'end of sent',
}

CLASSES = sorted(END_PUNC + INTR_PUNC)  # alphabetic order

In [211]:
def get_predictions_df(model, test_ds):
    test_loader = DataLoader(
        test_ds,
        batch_size=1, 
        drop_last=False,
        num_workers=0
    )  # test DataLoader

    all_test_targets = []  # by markers
    all_test_preds = []
    
    model.eval()
    
    for i, (data) in enumerate(test_loader):
        padded_input, padded_target = data
        all_test_targets.append(' '.join([ID_2_PUNC[ix.item()] for ix in padded_target[0]]))
        
        pred = model(padded_input)
        pred = torch.argmax(pred.view(-1, pred.size(-1)), dim=1)
        all_test_preds.append(' '.join([ID_2_PUNC[ix.item()] for ix in pred]))

        assert len(pred) == len(padded_target[0])

    # DataFrame with results
    target_vs_pred_df = pd.DataFrame()

    target_vs_pred_df['target'] = all_test_targets
    target_vs_pred_df['predicted'] = all_test_preds

    return target_vs_pred_df


def return_separate_punct(target_vs_pred_df):
    test_all_punc_target = []  # list of all punctuation
    test_all_punc_preds = []
    
    for target_this, predicted_this in zip(target_vs_pred_df['target'], target_vs_pred_df['predicted']):
        test_all_punc_target.extend(target_this.split(' '))
        test_all_punc_preds.extend(predicted_this.split(' '))
    
    assert len(test_all_punc_target) == len(test_all_punc_preds)
    
    return test_all_punc_target, test_all_punc_preds


def get_all_metrics(model, test_df):
    target_vs_pred_df = get_predictions_df(model, test_df)
    test_all_punc_target, test_all_punc_preds = return_separate_punct(target_vs_pred_df)

    cm = confusion_matrix(test_all_punc_target, test_all_punc_preds)
    # precision = TP / (TP + FP)
    precision = precision_score(test_all_punc_target, test_all_punc_preds, average=None, zero_division=np.nan)
    # recall = TP / (TP + FN)
    recall = recall_score(test_all_punc_target, test_all_punc_preds, average=None, zero_division=np.nan)
    # f1 = 2TP / (2TP + FP + FN)
    f1 = f1_score(test_all_punc_target, test_all_punc_preds, average=None)

    # PRINT
    metrics_names = ['Precision', 'Recall', 'F1 score']
    metrics = {'Precision': precision, 'Recall': recall, 'F1 score': f1}
    col_w = 18
    
    print(' ' * col_w + '|' + ''.join([f"{NAMES_PUNC[token] + (col_w - len(NAMES_PUNC[token])) * ' '}|" for token in CLASSES]))  # header
    print(''.join(['-' * col_w + '|' for _ in range(len(CLASSES) + 1)]) )
    for ind, metric_name in enumerate(metrics_names):
        row = f"{metric_name + (col_w - len(metric_name)) * ' '}|"
        for score in metrics[metric_name]:
            score_str = f'{score:.6f}'
            row += f"{score_str + (col_w - len(score_str)) * ' '}|"
        print(row)

    # Levenshtein distance
    print('\nLevenshtein distance:')
    target_vs_pred_df['levenshtein'] = target_vs_pred_df.apply(
        lambda row: levenshtein_distance(row.target, row.predicted),
        axis = 1
    )
    print(f"\tMean: {target_vs_pred_df['levenshtein'].mean()}")
    print(f"\tMIN : {target_vs_pred_df['levenshtein'].min()}")
    print(f"\tMAX : {target_vs_pred_df['levenshtein'].max()}\n")

In [212]:
%%time
get_all_metrics(model_test, test_ds)

                  |comma (`,`)       |end of sent       |space (` `)       |
------------------|------------------|------------------|------------------|
Precision         |0.617602          |0.639752          |0.849541          |
Recall            |0.352839          |0.647217          |0.924959          |
F1 score          |0.449103          |0.643463          |0.885647          |

Levenshtein distance:
	Mean: 0.18806798473812
	MIN : 0
	MAX : 1

CPU times: user 6.3 s, sys: 11.2 s, total: 17.5 s
Wall time: 2.42 s
