## Installing modules

In [None]:
import sys
# Invoke pip through the interpreter that runs this kernel (sys.executable),
# so the packages are installed for the same Python that executes the notebook.
!{sys.executable} -m pip install transformers datasets textaugment gensim==3.6.0

In [None]:
!git clone https://github.com/EugGolovanov/TorchClippedOptimizers.git
!mv /content/TorchClippedOptimizers/optimizers.py ./optimizers.py 

## Imports

In [None]:
from typing import List, Dict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.optim import Adam, AdamW
from optimizers import clipped_SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, MultiStepLR
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from datasets import concatenate_datasets, DatasetDict
from optimizers_refactored import clipped_SGD
import torch
import numpy as np
import random
from transformers import AutoModel, AutoTokenizer
from torch import nn
from torch.utils.data import Dataset, DataLoader
from gensim import downloader
from textaugment.word2vec import Word2vec
from collections import Counter
from datasets import load_dataset

device = "cuda" if torch.cuda.is_available() else "cpu"

## Set seed

In [None]:
seed = 0xCAFEC0DE
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

## Dataset preview

In [None]:
# Load the three GLUE tasks used in this notebook, keyed by their GLUE config name.
datasets_for_classification = {
    task_name: load_dataset("glue", task_name)
    for task_name in ("cola", "sst2", "rte")
}

# Convenience aliases for the individual datasets.
cola_dataset = datasets_for_classification["cola"]
sst_dataset = datasets_for_classification["sst2"]
rte_dataset = datasets_for_classification["rte"]

In [None]:
# Print the label distribution of every split of every loaded dataset.
for name, dataset_for_classification in datasets_for_classification.items():
    print(f"Dataset {name}")

    for subdataset_type in dataset_for_classification.keys():
        counts = Counter(dataset_for_classification[subdataset_type]["label"])
        summary = ", ".join(f"{key} - {value}" for key, value in counts.items())
        print(f"Subdataset {subdataset_type}: {summary}")

    # Visual separator between datasets.
    print("-" * 100)

Dataset cola
Subdataset train: 1 - 6023, 0 - 2528
Subdataset validation: 1 - 721, 0 - 322
Subdataset test: -1 - 1063
----------------------------------------------------------------------------------------------------
Dataset sst2
Subdataset train: 0 - 29780, 1 - 37569
Subdataset validation: 1 - 444, 0 - 428
Subdataset test: -1 - 1821
----------------------------------------------------------------------------------------------------
Dataset rte
Subdataset train: 1 - 1241, 0 - 1249
Subdataset validation: 1 - 131, 0 - 146
Subdataset test: -1 - 3000
----------------------------------------------------------------------------------------------------


## Load pretrained word embeddings for Word2Vec-style text augmentation

In [None]:
# Load the 'fasttext-wiki-news-subwords-300' vectors through gensim's downloader API.
# NOTE(review): despite the variable name, the model id refers to fastText vectors.
word2vec_model = downloader.load('fasttext-wiki-news-subwords-300')

In [None]:
# Wrap the loaded embeddings in textaugment's Word2vec augmenter.
# NOTE(review): `p` presumably controls how aggressively words are replaced —
# confirm against the textaugment documentation.
word2vec_augmenter = Word2vec(model=word2vec_model, p=0.5)

## Text augmentation

In [None]:
def augment_dataset(word2vec_augmenter):
    """Build a per-example mapping function that augments the sentence text.

    Args:
        word2vec_augmenter: object exposing ``augment(sentence)``.

    Returns:
        A function that takes an example with ``"sentence"`` and ``"label"``
        keys and returns a new dict with the augmented sentence and the
        original label.
    """
    def augment(x):
        augmented_sentence = word2vec_augmenter.augment(x["sentence"])
        return dict(sentence=augmented_sentence, label=x["label"])

    return augment

# Build a CoLA variant whose training split is extended with augmented copies
# of the label-0 examples; the validation split is left untouched.
concated_cola_dataset = DatasetDict()
is_augmented = {"train": True, "validation":False}

for dataset_type in ["train", "validation"]:
    if not is_augmented[dataset_type]:
        # Splits that are not augmented are passed through unchanged.
        concated_cola_dataset[dataset_type] = cola_dataset[dataset_type]
        continue

    # Keep only label-0 rows and rewrite their sentences with the augmenter.
    dataset_for_concatenation = (
        cola_dataset[dataset_type]
        .filter(lambda x: x["label"] == 0)
        .map(augment_dataset(word2vec_augmenter))
    )

    # Original split followed by the augmented label-0 copies.
    concated_cola_dataset[dataset_type] = concatenate_datasets(
        [cola_dataset[dataset_type], dataset_for_concatenation]
    )

## Functions for preparing datasets

In [None]:
def create_prepare_tokens_function(tokenizer, max_length=256):
    """Build a function that tokenizes the ``"sentence"`` field of a batch.

    Args:
        tokenizer: callable tokenizer accepting ``padding``, ``truncation``
            and ``max_length`` keyword arguments.
        max_length: fixed length every tokenized sequence is padded or
            truncated to.

    Returns:
        A function taking ``samples`` (a mapping with a ``"sentence"`` entry)
        and returning whatever the tokenizer returns for it.
    """
    def prepare_tokens(samples):
        # Truncate as well as pad: without truncation a sentence longer than
        # `max_length` would yield a longer row than the others, and rows of
        # unequal length cannot be stacked into a batch.
        return tokenizer(samples["sentence"],
                         padding="max_length",
                         truncation=True,
                         max_length=max_length)

    return prepare_tokens

In [None]:
class ClassificationDataset(Dataset):
    """Torch dataset view over tokenized rows.

    Each row of the wrapped container must provide ``"input_ids"``,
    ``"attention_mask"`` and ``"label"`` entries.
    """

    # Row fields that are converted to LongTensors and fed to the model.
    _FEATURE_KEYS = ("input_ids", "attention_mask")

    def __init__(self, data_container: Dataset):
        self.data = data_container

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx``."""
        record = self.data[idx]

        features = {}
        for field in self._FEATURE_KEYS:
            features[field] = torch.LongTensor(record[field])

        return features, record["label"]

## Functions for training one epoch and for validation

In [None]:
def train_fn(model, dataloader: DataLoader,
             loss_fn: callable, metrics: Dict[str, callable], optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: module called as ``model(**batch)`` whose output has ``.logits``.
        dataloader: yields ``(data, label)`` pairs, ``data`` being a dict of tensors.
        loss_fn: callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        metrics: mapping from metric name to ``metric(y_true, y_pred)``; metrics
            whose name contains "precision" or "recall" are additionally called
            with ``zero_division=0``.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``.

    Returns:
        Tuple ``(mean_loss, metrics_values, batch_losses)``: the sum of batch
        losses divided by the number of batches, a dict of per-batch metric
        values, and the list of per-batch losses.
    """
    model.train()

    batch_losses = []
    metrics_values = {key: [] for key in metrics}

    for data, label in dataloader:
        # Move the model inputs to the compute device.
        inputs = {key: value.to(device) for key, value in data.items()}

        optimizer.zero_grad()

        output = model(**inputs).logits
        preds = output.argmax(axis=-1)

        loss = loss_fn(output, label.to(device))
        loss.backward()

        # NOTE: no explicit nn.utils.clip_grad_norm_ call is made here; any
        # clipping presumably happens inside the optimizer (e.g. clipped_SGD).
        optimizer.step()

        # Per-batch loss: used both for the epoch average and for plotting.
        batch_losses.append(loss.item())

        # Metrics are computed on CPU against the (CPU) labels of this batch.
        preds_cpu = preds.cpu()
        for metric_name, metric_fn in metrics.items():
            needs_zero_division = "precision" in metric_name or "recall" in metric_name
            extra_kwargs = {"zero_division": 0} if needs_zero_division else {}
            metrics_values[metric_name].append(metric_fn(label, preds_cpu, **extra_kwargs))

    return sum(batch_losses) / len(dataloader), metrics_values, batch_losses

def eval_fn(model, dataloader: DataLoader,
             loss_fn: callable, metrics: Dict[str, callable]):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: module called as ``model(**batch)`` whose output has ``.logits``.
        dataloader: yields ``(data, label)`` pairs, ``data`` being a dict of tensors.
        loss_fn: callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        metrics: mapping from metric name to ``metric(y_true, y_pred)``; metrics
            whose name contains "precision" or "recall" are additionally called
            with ``zero_division=0``.

    Returns:
        Tuple ``(mean_loss, metrics_values)``: the sum of batch losses divided
        by the number of batches and a dict of per-batch metric values.
    """
    model.eval()
    running_loss = 0
    metrics_values = {key: [] for key in metrics}

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for data, label in dataloader:
            # Move the inputs to the compute device and feed *these* tensors to
            # the model; passing the un-moved batch fails when the model lives
            # on a different device.
            inputs = {key: value.to(device) for key, value in data.items()}

            output = model(**inputs).logits
            preds = output.argmax(axis=-1).cpu()

            loss = loss_fn(output, label.to(device))
            running_loss += loss.item()

            # Metrics are computed on CPU against the (CPU) labels of this batch.
            for metric_name, metric_fn in metrics.items():
                if "precision" in metric_name or "recall" in metric_name:
                    metrics_values[metric_name].append(metric_fn(label, preds, zero_division=0))
                else:
                    metrics_values[metric_name].append(metric_fn(label, preds))

    return running_loss / len(dataloader), metrics_values


## Build a pipeline from the training and validation functions

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig

def calculate_weights(labels):
    """Compute inverse-frequency class weights.

    The weight of class ``c`` is ``2 * len(labels) / count(c)``. Labels are
    used as list indices, so they must be integers in ``0..k-1`` where ``k``
    is the number of distinct labels.

    Args:
        labels: sized iterable of integer class labels.

    Returns:
        ``torch.FloatTensor`` with one weight per class, indexed by label.
    """
    n_samples = len(labels)
    class_counts = Counter(labels)

    # One slot per distinct class, filled in by label index below.
    per_class = [0] * len(class_counts)
    for class_id, frequency in class_counts.items():
        per_class[class_id] = 2 * n_samples / frequency

    return torch.FloatTensor(per_class)


def check_hypothesis(model_name, dataset, epochs, batch_size=32,
                     weighted=False, metrics_for_logging=None,
                     lr=1e-2, n_classes=2, optimizer=None, opt_params=None):
    """Fine-tune a pretrained sequence classifier and collect training logs.

    Args:
        model_name (str): Hugging Face Hub model identifier.
        dataset: mapping with "train" and "validation" splits; each split must
            provide "sentence" and "label" columns and support
            ``dataset.map(fn, batched=True)``.
        epochs (int): number of training epochs.
        batch_size (int): training batch size (validation always uses 32).
        weighted (bool): if True, weight the loss by class frequency computed
            from the training labels; otherwise all classes get weight 1.
        metrics_for_logging (dict | None): metric name -> ``metric(y_true, y_pred)``.
            Any entry whose name contains "f1" is replaced by a micro-averaged
            ``f1_score``. The caller's dict is not modified.
        lr (float): currently unused; the learning rate is taken from
            ``opt_params``. Kept for interface compatibility.
        n_classes (int): number of target classes.
        optimizer: optimizer class/factory called as
            ``optimizer(model.parameters(), **opt_params)``. Required.
        opt_params (dict | None): keyword arguments for the optimizer.

    Returns:
        Tuple of
        ``(model, train_losses, train_metrics, eval_losses, eval_metrics,
        long_train_losses, long_train_accs, long_eval_accs)`` where
        ``train_losses`` / ``eval_losses`` hold one mean loss per epoch,
        ``train_metrics`` / ``eval_metrics`` hold one dict of batch-averaged
        metrics per epoch, ``long_train_losses`` holds every training batch
        loss, and ``long_train_accs`` / ``long_eval_accs`` hold every per-batch
        value of the "accuracy" metric (empty if that metric is not logged).

    Raises:
        TypeError: if ``optimizer`` is None.
    """
    # Fail early, before downloading the model, with a readable message.
    if optimizer is None:
        raise TypeError("check_hypothesis() requires `optimizer` (an optimizer class or factory)")

    # Work on copies so that neither a shared default nor the caller's
    # dictionaries are mutated.
    metrics_for_logging = dict(metrics_for_logging or {})
    opt_params = dict(opt_params or {})

    # Every "f1" metric is replaced by micro-averaged F1.
    # NOTE(review): for single-label classification micro-F1 equals accuracy —
    # confirm this averaging mode is intended.
    for key in metrics_for_logging:
        if "f1" in key:
            metrics_for_logging[key] = lambda x, y: f1_score(x, y, average="micro")

    # Load the pretrained model and tokenizer from the Hugging Face Hub.
    model_config = AutoConfig.from_pretrained(model_name, num_labels=n_classes)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_config,
                                                              ignore_mismatched_sizes=True)
    model = model.to(device)

    # Tokenize the dataset and wrap the splits for torch data loading.
    tokenized_dataset = dataset.map(create_prepare_tokens_function(tokenizer), batched=True)
    train_dataset = ClassificationDataset(tokenized_dataset["train"])
    eval_dataset = ClassificationDataset(tokenized_dataset["validation"])

    # Class weights for the loss: frequency-based or uniform.
    if weighted:
        weights = calculate_weights(dataset["train"]["label"])
    else:
        weights = torch.FloatTensor([1 for _ in range(n_classes)])
    # The loss does not change between epochs, so build it once.
    loss_fn = CrossEntropyLoss(weights.to(device))

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=False)

    # Per-epoch logs (for the results table) and per-batch logs (for plots).
    train_losses, train_metrics, eval_losses, eval_metrics = [], [], [], []
    long_train_losses, long_train_accs, long_eval_accs = [], [], []

    optimizer = optimizer(model.parameters(), **opt_params)
    scheduler = MultiStepLR(optimizer, milestones=[3, 5, 7, 9], gamma=0.25)

    for _ in tqdm(range(epochs)):
        train_loss, train_metric, long_train_loss = train_fn(
            model, train_dataloader, loss_fn, metrics_for_logging, optimizer)
        eval_loss, eval_metric = eval_fn(
            model, eval_dataloader, loss_fn, metrics_for_logging)
        scheduler.step()

        # Per-batch logs for plotting.
        long_train_losses.extend(long_train_loss)
        long_train_accs.extend(train_metric.get('accuracy', []))
        long_eval_accs.extend(eval_metric.get('accuracy', []))

        # Per-epoch logs for the results table.
        train_losses.append(train_loss)
        eval_losses.append(eval_loss)
        train_metrics.append({key: np.mean(value) for key, value in train_metric.items()})
        eval_metrics.append({key: np.mean(value) for key, value in eval_metric.items()})

    return model, train_losses, train_metrics, eval_losses, eval_metrics, long_train_losses, long_train_accs, long_eval_accs


## Optimizer configurations

In [None]:
# Optimizer configurations swept by the experiment loop.
# Each entry holds:
#   'name'   - callable mapping a learning rate to a human-readable label,
#   'params' - keyword arguments for the optimizer constructor
#              ('lr' is overwritten by the sweep loop for every run),
#   'opt'    - the optimizer class to instantiate.
optimizers = [
    {
        'name': lambda lr: f"Adam, {lr}, eps 1e-8, weight_decay 0.0005",
        'params': {'lr': 1e-4, 'eps': 1e-8, 'weight_decay':0.0005},
        'opt': torch.optim.Adam
    },
    {
        'name': lambda lr: f"SGD, {lr}, 0.9",
        'params': {'lr':0.005, 'momentum':0.9},
        'opt': torch.optim.SGD
    },
    {
        'name': lambda lr: f"clipped_SGD, {lr}, 0.9, norm, 3",
        'params': {'lr':3e-3, 'momentum':0.9, 
                    'clipping_type':"norm", 'clipping_level':3},
        'opt': clipped_SGD
    },
    {
        # NOTE(review): the label mentions "2" but no 'clipping_level' is passed
        # for this configuration — confirm the label.
        'name': lambda lr: f"clipped_SGD, {lr}, 0.9, autoclip, 2, 0.75",
        'params': {'lr': 3e-3, 'momentum': 0.9, 'clipping_type': 'auto_clip',
                    'p_autoclip': 0.75},
        'opt': clipped_SGD
    },
    {
        'name': lambda lr: f"clipped_SGD, {lr}, 0.9, linear_stoch_clip_norm, 2, 0.85",
        'params': {'lr': 3e-3, 'momentum': 0.9,
                      'clipping_type': "linear_stoch_norm", 'clipping_level': 2.0, 'beta': 0.85},
        'opt': clipped_SGD
    },
    {
        'name': lambda lr: f"clipped_SGD, {lr}, 0.9, quadratic_stoch_norm, 2, 0.85",
        'params': {'lr': 3e-3, 'momentum': 0.9,
                      'clipping_type': "quadratic_stoch_norm", 'clipping_level': 2.0, 'beta': 0.85},
        'opt': clipped_SGD
    },
    {
        # NOTE(review): the label says "2, 0.85, 0.75" while the parameters use
        # clipping_level 1.0 and p_autoclip 0.25 — confirm which is intended.
        'name': lambda lr: f"clipped_SGD, {lr}, 0.9, linear_stoch_autoclip, 2, 0.85, 0.75",
        'params': {'lr': 3e-3, 'momentum': 0.9,
                    'clipping_type': "linear_stoch_autoclip", 'clipping_level': 1.0, 'p_autoclip': 0.25, 'beta':0.85},
        'opt': clipped_SGD
    },
    {
        # NOTE(review): same label/parameter mismatch as the previous entry
        # (label "2, 0.85, 0.75" vs clipping_level 1.0, p_autoclip 0.25).
        'name': lambda lr: f"clipped_SGD, {lr}, 0.9, quadratic_stoch_autoclip, 2, 0.85, 0.75",
        'params': {'lr': 3e-3, 'momentum': 0.9,
                    'clipping_type': "quadratic_stoch_autoclip", 'clipping_level': 1.0, 'p_autoclip': 0.25, 'beta':0.85},
        'opt': clipped_SGD
    }
]

## Check and log hypotheses

In [None]:
import pandas as pd

# One row per (optimizer, dataset, model, learning rate) run; every list in
# `logging_data` receives exactly one value per run.
logging_data = {x: [] for x in ("Model name", "Dataset", "Loss", "Lr",
                               "accuracy", "f1", "precision", "recall", 'opt_name')}
# Per-batch curves of every run, used for plotting.
plot_logging_data = []

# Metrics evaluated on every batch during training and validation.
metric_functions = {"accuracy": accuracy_score, "f1": f1_score,
                    "precision": precision_score, "recall": recall_score}

for opt_data in optimizers:
    for dataset_name, dataset in [("sst", sst_dataset)]:
        for model_name in ['bert-base-uncased']:
            for lr in [2e-2, 3e-3, 5e-5, 5e-3, 1e-4]:
                # Build this run's optimizer settings on a copy, so the shared
                # configuration in `optimizers` keeps its original values.
                optimizer = opt_data['opt']
                opt_params = {**opt_data['params'], 'lr': lr}
                opt_name = opt_data['name'](lr)

                # A fresh copy of the metric dict is passed for every run, so a
                # callee that rewrites entries cannot affect later runs.
                (model, train_losses, train_metrics, eval_losses, eval_metrics,
                 long_train_losses, long_train_metric, long_eval_metric) = check_hypothesis(
                    model_name, dataset, 5, 32, True,
                    dict(metric_functions),
                    optimizer=optimizer, opt_params=opt_params)

                # Table logs: best validation loss and best validation accuracy.
                logging_data["Model name"].append(model_name)
                logging_data["Dataset"].append(dataset_name)
                # `eval_losses` may be empty; log NaN instead of crashing the sweep.
                logging_data["Loss"].append(min(eval_losses, default=float("nan")))
                logging_data["Lr"].append(lr)
                logging_data['opt_name'].append(opt_name)
                logging_data["accuracy"].append(max(epoch_metrics["accuracy"] for epoch_metrics in eval_metrics))

                # Plot logs: per-batch curves of this run.
                plot_logging_data.append({
                    'name': opt_name,
                    'train_accuracy': long_train_metric,
                    'val_accuracy': long_eval_metric,
                    'train_loss': long_train_losses
                })

                # f1 / precision / recall are all taken from the epoch with the
                # best validation F1, so the three values belong together.
                f1_maximum_index = np.argmax([epoch_metrics["f1"] for epoch_metrics in eval_metrics])

                for key, value in eval_metrics[f1_maximum_index].items():
                    if key != "accuracy":
                        logging_data[key].append(value)

                # Echo the row that was just logged.
                for key, value in logging_data.items():
                    print(f"{key}: {value[-1]}", end=", ")
                print()
            

## Plot logs

In [None]:
def make_plot(main_title, losses, train_acc, val_acc, names_optimizers,
              metric_name="accuracy"):
    """Plot training loss and train/validation metric curves per optimizer.

    The figure has a large "Train loss" panel on the left and two smaller
    panels ("Train <metric>" and "Valid <metric>") stacked on the right.

    Args:
        main_title: figure title.
        losses: iterable of loss curves, one per optimizer.
        train_acc: iterable of training metric curves, one per optimizer.
        val_acc: iterable of validation metric curves, one per optimizer.
        names_optimizers: iterable of legend labels, aligned with the curves.
        metric_name: metric name shown in the panel titles.
    """
    # NOTE(review): `plt` is not imported in the visible notebook cells;
    # presumably `import matplotlib.pyplot as plt` is required — confirm.

    # Materialize the inputs: callers may pass generators, which support
    # neither indexing nor repeated iteration (the labels are reused by all
    # three panels).
    losses = list(losses)
    train_acc = list(train_acc)
    val_acc = list(val_acc)
    names_optimizers = list(names_optimizers)

    fig = plt.figure(figsize=(20, 8))
    fig.suptitle(main_title, fontsize=20)

    ax1 = plt.subplot2grid((2, 5), (0, 0), rowspan=2, colspan=3)
    ax2 = plt.subplot2grid((2, 5), (0, 3), colspan=2)
    ax3 = plt.subplot2grid((2, 5), (1, 3), colspan=2)

    fontdict = {'fontsize': 14, 'fontweight': 'medium'}
    panels = (
        (ax1, "Train loss", losses),
        (ax2, f"Train {metric_name}", train_acc),
        (ax3, f"Valid {metric_name}", val_acc),
    )

    for ax, title, curves in panels:
        ax.set_title(title, fontdict=fontdict)
        for i, curve in enumerate(curves):
            ax.plot(curve, label=names_optimizers[i], alpha=0.5)
        ax.legend()
        ax.grid()

In [None]:
def item_from_arrofdict(arr_dct, item):
    """Lazily yield ``dct[item]`` for every dict in ``arr_dct``, in order."""
    yield from (entry[item] for entry in arr_dct)

In [None]:
# `make_plot` needs sized, indexable sequences, so the lazily generated
# columns are materialized into lists before being passed in.
make_plot('BERT + CoLA&SST-2',
          list(item_from_arrofdict(plot_logging_data, 'train_loss')),
          list(item_from_arrofdict(plot_logging_data, 'train_accuracy')),
          list(item_from_arrofdict(plot_logging_data, 'val_accuracy')),
          list(item_from_arrofdict(plot_logging_data, 'name')))

## Export log as DataFrame

In [None]:
# Turn the per-run logs into a table: one column per key of `logging_data`.
df = pd.DataFrame(logging_data)

In [None]:
# Display the first ten logged runs.
df.head(10)

In [None]:
# Save the results table to disk without the DataFrame index column.
df.to_csv("logs_classification.csv", index=False)