In [None]:
import pandas as pd
import numpy as np
import project_functions as pf
import torch
import time
import datetime

from transformers import (AutoTokenizer,
                          BertForSequenceClassification,
                          RobertaForSequenceClassification,
                          get_linear_schedule_with_warmup)

from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [None]:
# Dataset wrapper that pairs tokenizer encodings with their label rows

class TextDataset(Dataset):
    """Map-style dataset over pre-tokenized texts and their labels.

    `encodings` is a mapping from field name to a per-sample indexable
    sequence; `labels` is an indexable sequence holding one entry per sample.
    Each item is a dict of tensors: one per encoding field plus `'labels'`.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of freshly built tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

    def get_labels(self):
        """Return the label sequence exactly as it was passed in."""
        return self.labels

In [None]:
# Read the raw CSV splits: combined train/validation data and held-out test data
train_val_path, test_path = 'train_val.csv', 'test.csv'
train_val_data = pd.read_csv(train_val_path)
test_data = pd.read_csv(test_path)

In [None]:
# Train set: build a sentence-level frame (re-indexed from 0) and a
# whole-review frame using the project helpers.
sentences_train = pf.transform_sentences(train_val_data)
sentences_train = sentences_train.reset_index(drop=True)
texts_train = pf.transform_text(train_val_data)

In [None]:
# Test set: build the sentence-level frame and the whole-review frame with the
# same project helpers used for the train set (their exact output schema is
# defined in project_functions, which is not visible here).
# NOTE(review): unlike the train cell, the sentence frame's index is not reset here.
sentences_test = pf.transform_sentences(test_data)
texts_test = pf.transform_text(test_data)

In [None]:
# Augment the sentence-level training set with back-translated sentences.
# The label columns of `sentences_train` (everything after the first column)
# are attached to the translated rows positionally, then the translated rows
# are appended below the originals.
# NOTE(review): presumably translated_train.csv holds one row per original
# training sentence, in the same order — confirm against how it was generated.
translated_data = pd.read_csv('translated_train.csv')

# pd.concat(axis=1) aligns on the index, so a row-count mismatch would silently
# create NaN texts/labels that only fail later. Fail here with a clear message.
# This also trips if the cell is re-run after `sentences_train` has already
# been extended (it would then be twice as long as the CSV).
if len(translated_data) != len(sentences_train):
    raise ValueError(
        f'translated_train.csv has {len(translated_data)} rows but sentences_train has '
        f'{len(sentences_train)}; they must match one-to-one. If this cell was already '
        f'run, re-create sentences_train first.'
    )

translated_data = pd.concat([translated_data, sentences_train.iloc[:, 1:]], axis=1)
sentences_train = pd.concat([sentences_train, translated_data], axis=0).reset_index(drop=True)

In [None]:
# --- Sentence model: checkpoint, hyper-parameters and seed ---
model_path = 'sdadas/polish-roberta-large-v2'
ModelClass = RobertaForSequenceClassification

# hardware / batching
use_gpu_if_available = True
batch_size_train = batch_size_eval = 8
max_tokenizer_length = 512

# optimisation schedule
num_epochs, warming_steps = 5, 100
lr, weight_decay = 2e-5, 0.01
eval_steps_per_epoch = 1

# fixed torch seed so that runs can be compared
torch.manual_seed(123)

In [None]:
# Pick the compute device: CUDA only when it is available AND enabled in the config
if torch.cuda.is_available() and use_gpu_if_available:
    device = 'cuda'
else:
    device = 'cpu'
print(f'Training device set to: {device}')

In [None]:
# Label bookkeeping for the multi-label task.
# Every column of `sentences_train` after the first is treated as one label.
classes = sentences_train.columns[1:]
labels = [name.strip() for name in classes]

NUM_LABELS = len(labels)

# Index i in these maps must name column i of the label matrix, because the
# training targets are taken from these columns in DataFrame order and output
# i of the model is trained against column i. The maps therefore keep column
# order; sorting the names would mislabel outputs whenever the columns are not
# already alphabetical.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Load the tokenizer and the sequence-classification model for `model_path`.
# `model_max_length` is the tokenizer's setting for its maximum sequence
# length; a plain `max_length` keyword at load time does not set that limit.
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=max_tokenizer_length)
model = ModelClass.from_pretrained(model_path, num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)
model.to(device)
# Switch to training mode; the trailing `;` stops the notebook from printing
# the full module repr that `train()` returns.
model.train();

In [None]:
# Sentence-level inputs: raw strings from the 'text' column, and integer label
# matrices from every column after the first.
train_texts = list(sentences_train['text'])
test_texts = list(sentences_test['text'])

train_label_frame = sentences_train.iloc[:, 1:]
test_label_frame = sentences_test.iloc[:, 1:]
train_labels = train_label_frame.astype(int).to_numpy()
test_labels = test_label_frame.astype(int).to_numpy()

In [None]:
# Tokenize both splits with identical settings: truncate to the configured
# maximum length and pad within each tokenizer call.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=max_tokenizer_length)
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
test_encodings = tokenizer(test_texts, **tokenizer_kwargs)

In [None]:
# Wrap the encodings and labels in datasets, then in batch loaders.
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TextDataset(encodings=test_encodings, labels=test_labels)

# Training batches are shuffled; test batches keep their original order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size_train,
    shuffle=True,
    pin_memory=True,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size_eval,
    shuffle=False,
    pin_memory=True,
)

In [None]:
# Optimiser, LR schedule, loss, and the in-epoch steps at which progress is reported
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * num_epochs

# Evenly spaced 1-based step numbers within an epoch; the leading 0 is dropped.
eval_boundaries = np.linspace(0, steps_per_epoch, eval_steps_per_epoch + 1).astype(int)
eval_step_list = eval_boundaries.tolist()[1:]

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warming_steps,
    num_training_steps=total_steps,
)
# Multi-label objective: independent sigmoid + binary cross-entropy per label.
criterion = torch.nn.BCEWithLogitsLoss()

In [None]:
# Training loop: fine-tune the model on the sentence-level data.

# Ensure training mode (dropout active) even when this cell is re-run after a
# cell that may have switched the model to eval mode.
model.train()

for epoch in range(num_epochs):
    # Running loss since the last progress report.
    total_train_loss = 0.0
    loss_step_counter = 0

    for step, batch in enumerate(tqdm(train_dataloader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # BCEWithLogitsLoss needs float targets; the label matrix is integer.
        # Named `batch_labels` so the notebook-level `labels` list of class
        # names is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device).float()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, batch_labels)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the LR schedule advances once per batch

        total_train_loss += loss.item()
        loss_step_counter += 1

        if (step + 1) in eval_step_list:
            # NOTE(review): nothing runs between the two timestamps below, so
            # the reported "Eval Time" is effectively zero; this looks like a
            # placeholder for an in-training validation pass.
            eval_time = time.perf_counter()

            eval_time = time.perf_counter() - eval_time
            print(f'Epoch [{epoch + 1}/{num_epochs}], '
                  f'Step [{step + 1}/{len(train_dataloader)}], '
                  f'Train Loss: {total_train_loss / loss_step_counter:.2f}, '
                  f'lr: {optimizer.param_groups[0]["lr"]:.3e}, '
                  f'Eval Time: {eval_time:.2f}')

            total_train_loss = 0.0
            loss_step_counter = 0

# saving the model
# NOTE(review): `path` is not defined in the visible notebook; set it before enabling these lines.
# model.save_pretrained(f'{path}roberta_translation_{epoch}')
# tokenizer.save_pretrained(f'{path}roberta_translation_{epoch}')

In [None]:
# Run the current `model` over the test dataloader through the project helper;
# its result is unpacked as (true labels, raw model predictions).
# NOTE(review): whether the predictions are logits or sigmoid probabilities is
# decided inside pf.get_test_data — confirm before interpreting the threshold below.
labels_test, predictions_test = pf.get_test_data(model, test_dataloader, device)

In [None]:
# Binarise the raw predictions with a fixed cut-off: strictly above -> 1, otherwise 0
# NOTE(review): presumably the predictions are probabilities in [0, 1] — confirm
# against pf.get_test_data.
split_val = 0.25

labels_test = np.array(labels_test)
predictions_test = np.array(predictions_test)

above_threshold = predictions_test > split_val
preds_test = above_threshold.astype(int)

In [None]:
# Final evaluation of the sentence model: pass true labels, binarised
# predictions and the label column names to the project helper and keep its result.
# NOTE(review): the contents of the returned object are defined in
# project_functions (not visible here); the name suggests a dict of metrics.
test_dict = pf.get_test_evaluation(labels_test, preds_test, classes)

### Text model

In [None]:
# --- Text (whole-review) model: start from the sentence model, new hyper-parameters ---
# NOTE(review): `translation_model_path` is not assigned anywhere in the
# visible notebook (the save calls after the sentence training loop are
# commented out), so this line raises NameError on a fresh kernel. Define it
# to point at the saved sentence model before running.
model_path = translation_model_path
ModelClass = RobertaForSequenceClassification

# hardware / batching
use_gpu_if_available = True
batch_size_train = batch_size_eval = 2
max_tokenizer_length = 512

# optimisation schedule
num_epochs, warming_steps = 4, 100
lr, weight_decay = 2e-5, 0.01
eval_steps_per_epoch = 1

# fixed torch seed so that runs can be compared
torch.manual_seed(123)

In [None]:
# Pick the compute device: CUDA only when it is available AND enabled in the config
if torch.cuda.is_available() and use_gpu_if_available:
    device = 'cuda'
else:
    device = 'cpu'
print(f'Training device set to: {device}')

In [None]:
# Label bookkeeping for the multi-label task.
# Every column of `sentences_train` after the first is treated as one label.
classes = sentences_train.columns[1:]
labels = [name.strip() for name in classes]

NUM_LABELS = len(labels)

# Index i in these maps must name column i of the label matrix, because the
# training targets are taken from the label columns in DataFrame order and
# output i of the model is trained against column i. The maps therefore keep
# column order; sorting the names would mislabel outputs whenever the columns
# are not already alphabetical.
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# Load the tokenizer and the sequence-classification model for `model_path`.
# `model_max_length` is the tokenizer's setting for its maximum sequence
# length; a plain `max_length` keyword at load time does not set that limit.
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=max_tokenizer_length)
model = ModelClass.from_pretrained(model_path, num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)
model.to(device)
# Switch to training mode; the trailing `;` stops the notebook from printing
# the full module repr that `train()` returns.
model.train();

In [None]:
# Whole-review inputs: raw strings from the 'text' column, and integer label
# matrices from every column after the first.
train_texts = list(texts_train['text'])
test_texts = list(texts_test['text'])

train_label_frame = texts_train.iloc[:, 1:]
test_label_frame = texts_test.iloc[:, 1:]
train_labels = train_label_frame.astype(int).to_numpy()
test_labels = test_label_frame.astype(int).to_numpy()

In [None]:
# Tokenize both splits with identical settings: truncate to the configured
# maximum length and pad within each tokenizer call.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=max_tokenizer_length)
train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
test_encodings = tokenizer(test_texts, **tokenizer_kwargs)

In [None]:
# Wrap the encodings and labels in datasets, then in batch loaders.
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TextDataset(encodings=test_encodings, labels=test_labels)

# Training batches are shuffled; test batches keep their original order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size_train,
    shuffle=True,
    pin_memory=True,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size_eval,
    shuffle=False,
    pin_memory=True,
)

In [None]:
# Optimiser, LR schedule, loss, and the in-epoch steps at which progress is reported
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * num_epochs

# Evenly spaced 1-based step numbers within an epoch; the leading 0 is dropped.
eval_boundaries = np.linspace(0, steps_per_epoch, eval_steps_per_epoch + 1).astype(int)
eval_step_list = eval_boundaries.tolist()[1:]

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warming_steps,
    num_training_steps=total_steps,
)
# Multi-label objective: independent sigmoid + binary cross-entropy per label.
criterion = torch.nn.BCEWithLogitsLoss()

In [None]:
# Training loop: fine-tune the model on the whole-review texts.

# Ensure training mode (dropout active) even when this cell is re-run after a
# cell that may have switched the model to eval mode.
model.train()

for epoch in range(num_epochs):
    # Running loss since the last progress report.
    total_train_loss = 0.0
    loss_step_counter = 0

    for step, batch in enumerate(tqdm(train_dataloader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # BCEWithLogitsLoss needs float targets; the label matrix is integer.
        # Named `batch_labels` so the notebook-level `labels` list of class
        # names is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device).float()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, batch_labels)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the LR schedule advances once per batch

        total_train_loss += loss.item()
        loss_step_counter += 1

        if (step + 1) in eval_step_list:
            # NOTE(review): nothing runs between the two timestamps below, so
            # the reported "Eval Time" is effectively zero; this looks like a
            # placeholder for an in-training validation pass.
            eval_time = time.perf_counter()

            eval_time = time.perf_counter() - eval_time
            print(f'Epoch [{epoch + 1}/{num_epochs}], '
                  f'Step [{step + 1}/{len(train_dataloader)}], '
                  f'Train Loss: {total_train_loss / loss_step_counter:.2f}, '
                  f'lr: {optimizer.param_groups[0]["lr"]:.3e}, '
                  f'Eval Time: {eval_time:.2f}')

            total_train_loss = 0.0
            loss_step_counter = 0

# saving the model
# NOTE(review): `path` is not defined in the visible notebook; set it before enabling these lines.
# model.save_pretrained(f'{path}roberta_translation_text_{epoch}')
# tokenizer.save_pretrained(f'{path}roberta_translation_text_{epoch}')

In [None]:
# Run the current `model` over the test dataloader through the project helper;
# its result is unpacked as (true labels, raw model predictions).
# NOTE(review): whether the predictions are logits or sigmoid probabilities is
# decided inside pf.get_test_data — confirm before interpreting the threshold below.
labels_test, predictions_test = pf.get_test_data(model, test_dataloader, device)

In [None]:
# Binarise the raw predictions with a fixed cut-off: strictly above -> 1, otherwise 0
# NOTE(review): presumably the predictions are probabilities in [0, 1] — confirm
# against pf.get_test_data.
split_val = 0.20

labels_test = np.array(labels_test)
predictions_test = np.array(predictions_test)

above_threshold = predictions_test > split_val
preds_test = above_threshold.astype(int)

In [None]:
# Final evaluation of the text model: pass true labels, binarised predictions
# and the label column names to the project helper and keep its result.
# NOTE(review): the contents of the returned object are defined in
# project_functions (not visible here); the name suggests a dict of metrics.
test_dict = pf.get_test_evaluation(labels_test, preds_test, classes)