# Semeval offense notebook

# Overview

The tasks were given by
Subtask 1: offensive language detection

# Configuration

In [1]:
# Directory holding the reference data. Forward slashes are used on purpose:
# the previous literal '.\inputDir\ref' contained '\r', which Python reads as a
# carriage-return escape, so the resulting path was silently corrupted.
DATA_PATH = './inputDir/ref'

# Output directory root (the Trainer writes its files under './save/...').
SAVE_PATH = './save'

# Training TSV, located inside the DATA_PATH directory.
TRAIN_PATH = './inputDir/ref/dontpatronizeme_pcl.tsv'


# Function Import

In [2]:
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import emoji
import wordsegment
from dont_patronize_me import DontPatronizeMe
import copy
import datetime
from typing import Dict, List
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
from tqdm import tqdm

# Utilities 

In [3]:
def save(toSave, filename, mode='wb'):
    """Pickle ``toSave`` into ``filename``, creating parent directories first.

    Args:
        toSave: Any picklable object.
        filename: Destination path. May be a bare file name, in which case the
            file is written to the current working directory.
        mode: Mode passed to ``open``; must be a binary write mode for pickle.
    """
    dirname = os.path.dirname(filename)
    # A bare file name has an empty dirname, and os.makedirs('') raises, so
    # only create directories when there is one to create.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    # The context manager closes the handle even if pickling fails midway.
    with open(filename, mode) as file:
        pickle.dump(toSave, file)

def load(filename, mode='rb'):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a file previously written with ``pickle``.
        mode: Mode passed to ``open``; must be a binary read mode for pickle.

    Returns:
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on files
    # you produced yourself or otherwise trust.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, mode) as file:
        return pickle.load(file)

def tokenizationFunction(sents, pad_token):
    """Right-pad every sequence in ``sents`` to the length of the longest one.

    Despite its name this function does not tokenize; it only pads.

    Args:
        sents: List of lists (e.g. token-id lists). Not modified.
        pad_token: Value appended to the shorter sequences.

    Returns:
        A new list of lists that all share the same length. An empty ``sents``
        yields an empty list (previously ``max()`` raised ``ValueError``).
    """
    if len(sents) == 0:
        return []
    # Lengths are computed inline (same result as lensFinderFunction).
    lens = [len(sent) for sent in sents]
    max_len = max(lens)
    return [sent + [pad_token] * (max_len - l) for sent, l in zip(sents, lens)]

def sortingFunction(sents, reverse=True):
    """Sort ``sents`` in place by element length and return the same list.

    Args:
        sents: List of sized items; it is reordered in place.
        reverse: When true (the default) the longest items come first.

    Returns:
        The list object that was passed in, now sorted.
    """
    sents.sort(key=len, reverse=reverse)
    return sents

def maskFinderFunction(sents, unmask_idx=1, mask_idx=0):
    """Build a padding mask for a batch of variable-length sequences.

    Args:
        sents: List of sized sequences (e.g. token-id lists).
        unmask_idx: Value written at positions covered by a real element.
        mask_idx: Value written at padded positions.

    Returns:
        One list per input sequence, each as long as the longest sequence:
        ``unmask_idx`` repeated ``len(sequence)`` times, then ``mask_idx``.
        An empty ``sents`` yields an empty list (previously ``max()`` raised
        ``ValueError``).
    """
    if len(sents) == 0:
        return []
    # Lengths are computed inline (same result as lensFinderFunction).
    lens = [len(sent) for sent in sents]
    max_len = max(lens)
    return [[unmask_idx] * l + [mask_idx] * (max_len - l) for l in lens]

def lensFinderFunction(sents):
    """Return a list with the length of every item in ``sents``."""
    return list(map(len, sents))

#def getMaskLength(sents):
#    max_len = max([len(sent) for sent in sents])
#    return max_len

#def truncateLengthArray(sents, length):
#    sents = [sent[:length] for sent in sents]
#    return sents

def get_loss_weight(labels, label_order):
    """Return the relative frequency of each label as a tensor.

    Args:
        labels: Array-like of labels. It is converted with ``np.asarray`` so
            that a plain list is compared element-wise; with a raw list the
            comparison ``labels == lo`` is a single ``False`` and every count
            would silently be 0.
        label_order: Iterable of label values; the output follows this order.

    Returns:
        ``torch.Tensor`` whose i-th entry is the fraction of ``labels`` equal
        to ``label_order[i]``.
    """
    labels = np.asarray(labels)
    total = len(labels)
    nums = [np.sum(labels == lo) for lo in label_order]
    loss_weight = torch.tensor([n / total for n in nums])
    return loss_weight


# Data.py

Data import

In [4]:
# Build the project-local loader: first argument '.', second the TSV file name.
dpm = DontPatronizeMe('.', 'dontpatronizeme_pcl.tsv')
# Load the subtask 1 data into the loader object.
dpm.load_task1()

# The loaded data is exposed as the dataframe `train_task1_df`.
# NOTE(review): this .head() is not the last expression of the cell, so its
# result is discarded and nothing is displayed.
dpm.train_task1_df.head()

data=dpm.train_task1_df
# Hold out 20% of the rows as testData. With shuffle=False the row order is
# kept, so trainData is the leading part and testData the trailing part.
# NOTE(review): scikit-learn documents random_state as controlling the
# shuffling, so it presumably has no effect here - confirm.
trainData, testData = train_test_split(data, test_size=0.2, random_state=42, shuffle=False)

# Load wordsegment's data. The only wordsegment user visible in this notebook
# (segment_hashtag) is currently disabled inside a string literal.
wordsegment.load()


def readPatronizationFile(filepath: str):
    """Return the training split as parallel arrays.

    Note that ``filepath`` is accepted but never read: all values come from the
    module-level ``trainData`` dataframe.

    Returns:
        Tuple ``(ids, nums, text, label_1)``: the ``'ids'`` column, the number
        of rows, the ``'text'`` column after ``textProcessingFunction``, and
        the ``'labels'`` column.
    """
    row_count = len(trainData)
    id_column = np.array(trainData['ids'].values)
    label_column = np.array(trainData['labels'].values)
    # np.array copies the column, so the in-place cleaning steps do not touch
    # the dataframe itself.
    cleaned_text = textProcessingFunction(np.array(trainData['text'].values))
    return id_column, row_count, cleaned_text, label_column



def testDataCreationFunction(task, tokenizer, truncate=512):
    """Encode the held-out split (module-level ``testData``) for evaluation.

    Args:
        task: Accepted but not used by this function.
        tokenizer: Object providing ``encode(text=..., add_special_tokens=...,
            max_length=...)`` and a ``pad_token_id`` attribute.
        truncate: Forwarded to ``tokenizer.encode`` as ``max_length``.

    Returns:
        Tuple ``(ids, token_ids, lens, mask, label_1)`` where ``token_ids`` and
        ``mask`` are padded arrays and ``lens`` lists the unpadded lengths.
    """
    ids = np.array(testData['ids'].values)
    label_1 = np.array(testData['labels'].values)
    texts = textProcessingFunction(np.array(testData['text'].values))

    encoded = []
    for row in range(len(testData)):
        encoded.append(
            tokenizer.encode(text=texts[row], add_special_tokens=True, max_length=truncate)
        )

    lens = lensFinderFunction(encoded)
    mask = np.array(maskFinderFunction(encoded))
    token_ids = np.array(tokenizationFunction(encoded, tokenizer.pad_token_id))
    return ids, token_ids, lens, mask, label_1
'''
def testDataCreationFunction_all(tokenizer, truncate=512):
    df = pd.read_csv(os.path.join(DATA_PATh, 'testset-levela.tsv'), sep='\t')
    df_a = pd.read_csv(os.path.join(DATA_PATh, 'labels-levela.csv'), sep=',')
    ids = np.array(df['id'].values)
    textLines = np.array(df['tweet'].values)
    label_a = np.array(df_a['label'].values)
    nums = len(df)

    # Process textLines
    textLines = textProcessingFunction(textLines)

    df_b = pd.read_csv(os.path.join(DATA_PATh, 'labels-levelb.csv'), sep=',')
    df_c = pd.read_csv(os.path.join(DATA_PATh, 'labels-levelc.csv'), sep=',')
    label_data_b = dict(zip(df_b['id'].values, df_b['label'].values))
    label_data_c = dict(zip(df_c['id'].values, df_c['label'].values))
    #label_b = [label_data_b[id] if id in label_data_b.keys() else 'NULL' for id in ids]
    #label_c = [label_data_c[id] if id in label_data_c.keys() else 'NULL' for id in ids]

    token_ids = [tokenizer.encode(text=textLines[i], add_special_tokens=True, max_length=truncate) for i in range(nums)]
    mask = np.array(maskFinderFunction(token_ids))
    lens = lensFinderFunction(token_ids)
    token_ids = np.array(tokenizationFunction(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, label_a#, label_b, label_c'''

def textProcessingFunction(textLines):
    """Run the active cleaning steps over ``textLines`` and return an array.

    Active steps, in order: ``remove_replicates`` and then
    ``remove_useless_punctuation``. The emoji2word, replace_rare_words and
    segment_hashtag steps are currently switched off.
    """
    cleaned = remove_useless_punctuation(remove_replicates(textLines))
    return np.array(cleaned)

#def emoji2word(sents):
#    return [emoji.demojize(sent) for sent in sents]

def remove_useless_punctuation(sents):
    """Replace ':', '_' and '...' with a single space in every sentence.

    The container is updated in place and also returned.
    """
    for position in range(len(sents)):
        cleaned = sents[position]
        for junk in (':', '_', '...'):
            cleaned = cleaned.replace(junk, ' ')
        sents[position] = cleaned
    return sents

def remove_replicates(sents):
    """Collapse repeated ``@USER`` mentions into one leading ``@USERS`` token.

    A sentence containing ``@USER`` more than once has every occurrence
    removed and ``'@USERS '`` prepended; other sentences are left untouched.
    The container is updated in place and also returned.
    """
    mention = '@USER'
    for position, sent in enumerate(sents):
        # '@USER' cannot overlap itself, so counting occurrences is the same
        # as checking that the first and last match positions differ.
        if sent.count(mention) > 1:
            sents[position] = '@USERS ' + sent.replace(mention, '')
    return sents
'''
def replace_rare_words(sents):
    rare_words = {
        'URL': 'http'
    }
    for i, sent in enumerate(sents):
        for w in rare_words.keys():
            sents[i] = sent.replace(w, rare_words[w])
    return sents

def segment_hashtag(sents):
    # E.g. '#LunaticLeft' => 'lunatic left'
    for i, sent in enumerate(sents):
        sent_tokens = sent.split(' ')
        for j, t in enumerate(sent_tokens):
            if t.find('#') == 0:
                sent_tokens[j] = ' '.join(wordsegment.segment(t))
        sents[i] = ' '.join(sent_tokens)
    return sents
'''
def all_tasks(filepath: str, tokenizer, truncate=512):
    """Encode the training split for the multi-task setting.

    Args:
        filepath: Forwarded to ``readPatronizationFile``.
        tokenizer: Object providing ``encode(text=..., add_special_tokens=...,
            max_length=...)`` and a ``pad_token_id`` attribute.
        truncate: Forwarded to ``tokenizer.encode`` as ``max_length``.

    Returns:
        Tuple ``(ids, token_ids, lens, mask, label_a, label_b, label_c)``.
        ``label_b`` and ``label_c`` are ``None``: ``readPatronizationFile``
        returns a single label array, so there is nothing to fill them with.
        NOTE(review): supply real secondary labels here once a loader for them
        exists.
    """
    # readPatronizationFile returns (ids, nums, text, labels); the previous
    # unpacking swapped ids and nums, which made range(nums) fail.
    ids, nums, textLines, label_a = readPatronizationFile(filepath)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids = [tokenizer.encode(text=textLines[i], add_special_tokens=True, max_length=truncate) for i in range(nums)]
    mask = np.array(maskFinderFunction(token_ids))
    lens = lensFinderFunction(token_ids)
    token_ids = np.array(tokenizationFunction(token_ids, tokenizer.pad_token_id))

    # Placeholders keep the 7-tuple shape; these names were previously
    # undefined and raised NameError.
    label_b = None
    label_c = None
    return ids, token_ids, lens, mask, label_a, label_b, label_c

# The function above will need to be redefined
def task_1(filepath: str, tokenizer, truncate=512):
    """Encode the training split for task 1.

    Args:
        filepath: Forwarded to ``readPatronizationFile``.
        tokenizer: Object providing ``encode(text=..., add_special_tokens=...,
            max_length=...)`` and a ``pad_token_id`` attribute.
        truncate: Forwarded to ``tokenizer.encode`` as ``max_length``.

    Returns:
        Tuple ``(ids, token_ids, lens, mask, label_1)`` where ``token_ids`` and
        ``mask`` are padded arrays and ``lens`` lists the unpadded lengths.
    """
    ids, nums, textLines, label_1 = readPatronizationFile(filepath)

    encoded = []
    for row in range(nums):
        encoded.append(
            tokenizer.encode(text=textLines[row], add_special_tokens=True, max_length=truncate)
        )

    lens = lensFinderFunction(encoded)
    mask = np.array(maskFinderFunction(encoded))
    token_ids = np.array(tokenizationFunction(encoded, tokenizer.pad_token_id))
    return ids, token_ids, lens, mask, label_1

# The function below will need to be fixed up to use the other path
def task_2(filepath: str, tokenizer, truncate=512):
    """Encode the rows of the training split whose label is not ``'NULL'``.

    Args:
        filepath: Forwarded to ``readPatronizationFile``.
        tokenizer: Object providing ``encode(text=..., add_special_tokens=...,
            max_length=...)`` and a ``pad_token_id`` attribute.
        truncate: Forwarded to ``tokenizer.encode`` as ``max_length``.

    Returns:
        Tuple ``(ids, token_ids, lens, mask, label_b)`` restricted to the rows
        kept by the ``'NULL'`` filter.
    """
    # readPatronizationFile returns four values (ids, nums, text, labels); the
    # previous three-name unpacking always raised ValueError. The row count is
    # recomputed after filtering, so the returned one is ignored.
    ids, _, textLines, label_b = readPatronizationFile(filepath)
    # Only part of the textLines are useful for task b

    useful = label_b != 'NULL'
    ids = ids[useful]
    textLines = textLines[useful]
    label_b = label_b[useful]

    nums = len(label_b)
    # Tokenize
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids = [tokenizer.encode(text=textLines[i], add_special_tokens=True, max_length=truncate) for i in range(nums)]
    # Get mask
    mask = np.array(maskFinderFunction(token_ids))
    # Get lengths
    lens = lensFinderFunction(token_ids)
    # Pad tokens
    token_ids = np.array(tokenizationFunction(token_ids, tokenizer.pad_token_id))

    return ids, token_ids, lens, mask, label_b


# Datasets.py

In [5]:
import torch
from torch.utils.data import Dataset
#from config import LABEL_DICT

class PatronizationDataset(Dataset):
    """Map-style dataset over pre-encoded, pre-padded examples.

    ``input_ids`` and ``mask`` are converted to tensors on construction (the
    mask as float32); ``lens``, ``labels`` and ``task`` are stored as given.
    ``labels`` must expose ``.shape`` because ``__len__`` relies on it.
    """

    def __init__(self, input_ids, lens, mask, labels, task):
        self.task = task
        self.labels = labels
        self.lens = lens
        self.input_ids = torch.tensor(input_ids)
        self.mask = torch.tensor(mask, dtype=torch.float32)

    def __len__(self):
        """Number of examples, taken from the first dimension of ``labels``."""
        sample_count = self.labels.shape[0]
        return sample_count

    def __getitem__(self, idx):
        """Return ``(input_ids, length, mask, label)`` for position ``idx``.

        The label is returned exactly as stored; no label-dictionary mapping
        is applied.
        """
        return (
            self.input_ids[idx],
            self.lens[idx],
            self.mask[idx],
            self.labels[idx],
        )
'''
class HuggingfaceMTDataset(Dataset):
    def __init__(self, input_ids, lens, mask, labels, task):
        self.input_ids = torch.tensor(input_ids)
        self.lens = lens
        self.mask = torch.tensor(mask, dtype=torch.float32)
        self.labels = labels

    def __len__(self):
        return self.labels['a'].shape[0]

    def __getitem__(self, idx):
        input = self.input_ids[idx]
        mask = self.mask[idx]
        length = self.lens[idx]
        label_1 = self.labels[idx]
        #label_A = torch.tensor(LABEL_DICT['a'][self.labels['a'][idx]])
        #label_B = torch.tensor(LABEL_DICT['b'][self.labels['b'][idx]])
        #label_C = torch.tensor(LABEL_DICT['c'][self.labels['c'][idx]])
        #return input, length, mask, label_A, label_B, label_C
        return input, length, mask, label_1
'''
class ImbalancedDatasetSampler(torch.utils.data.sampler.Sampler):
    """
    Draws indices with replacement, weighting each one by the inverse
    frequency of its label so that rarer labels are drawn more often.

    Arguments:
        dataset: object exposing an indexable ``labels`` attribute
        indices (list, optional): candidate indices; defaults to every
            position of ``dataset.labels``
        num_samples (int, optional): draws per iteration; defaults to
            ``len(indices)``
    """

    def __init__(self, dataset, indices=None, num_samples=None):
        if indices is None:
            indices = list(range(len(dataset.labels)))
        self.indices = indices

        if num_samples is None:
            num_samples = len(self.indices)
        self.num_samples = num_samples

        # Look every label up once, then tally how often each one occurs.
        picked_labels = [self._get_label(dataset, idx) for idx in self.indices]
        label_to_count = {}
        for label in picked_labels:
            label_to_count[label] = label_to_count.get(label, 0) + 1

        # Per-sample weight: reciprocal of the size of the sample's class.
        self.weights = torch.DoubleTensor(
            [1.0 / label_to_count[label] for label in picked_labels]
        )

    def _get_label(self, dataset, id_):
        """Label lookup hook: reads ``dataset.labels[id_]``."""
        return dataset.labels[id_]

    def __iter__(self):
        drawn = torch.multinomial(self.weights, self.num_samples, replacement=True)
        return (self.indices[i] for i in drawn)

    def __len__(self):
        return self.num_samples


# trainer.py

In [16]:
class Trainer():
    '''
    The trainer for training models.
    It can be used for both single and multi task training.
    Every class function ends with _m is for multi-task training.

    Result attributes (all created in ``__init__``):
        train_losses / test_losses: mean loss per epoch.
        train_f1 / test_f1: macro-F1 per epoch (a ``[A, B, C]`` list per epoch
            in multi-task mode).
        best_train_f1 / best_test_f1: best single-task macro-F1 so far.
        best_train_f1_m / best_test_f1_m: best multi-task macro-F1 per task.

    Earlier revisions mixed two spellings of these attributes (for example
    ``train_losses`` in ``__init__`` but ``trainLosses`` in the loops), which
    raised ``AttributeError`` at the end of the first epoch. Every method now
    uses the names created in ``__init__``.

    ``patience`` is stored but no method of this class reads it, so no early
    stopping takes place.
    '''
    def __init__(
        self,
        model: nn.Module,
        epochs: int,
        dataloaders: Dict[str, DataLoader],
        criterion: nn.Module,
        loss_weights: List[float],
        clip: bool,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler,
        device: str,
        patience: int,
        task_name: str,
        model_name: str,
        seed: int
    ):
        '''
        Store the training configuration and reset all result trackers.

        Args:
            model: Module called as ``model(inputs, lens, mask, labels)`` in
                single-task mode and ``model(inputs, lens, mask)`` in
                multi-task mode.
            epochs: Number of passes over ``dataloaders['train']``.
            dataloaders: Mapping with the keys ``'train'`` and ``'test'``.
            criterion: Loss called as ``criterion(logits, labels)``.
            loss_weights: Per-task multipliers; only indices 0-2 are read, and
                only by the multi-task methods.
            clip: When truthy, gradients are clipped to a max norm of 10.
            optimizer: Optimizer stepped once per batch.
            scheduler: Optional scheduler stepped once per batch; may be None.
            device: Target passed to ``.to(device=...)`` for every batch.
            patience: Stored only (see the class docstring).
            task_name: Used in output file names; the value ``'all'`` selects
                the multi-task model file name.
            model_name: Used in output file names.
            seed: Used in output file names.
        '''
        self.model = model
        self.epochs = epochs
        self.dataloaders = dataloaders
        self.criterion = criterion
        self.loss_weights = loss_weights
        self.clip = clip
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.patience = patience
        self.task_name = task_name
        self.model_name = model_name
        self.seed = seed
        # NOTE(review): this timestamp becomes part of result file names and
        # contains ':', which is not allowed in file names on Windows -
        # confirm the target platform.
        self.datetimestr = datetime.datetime.now().strftime('%Y-%b-%d_%H:%M:%S')

        # Evaluation results
        self.train_losses = []
        self.test_losses = []
        self.train_f1 = []
        self.test_f1 = []
        self.best_train_f1 = 0.0
        self.best_test_f1 = 0.0

        # Evaluation results for multi-task
        self.best_train_f1_m = np.array([0, 0, 0], dtype=np.float64)
        self.best_test_f1_m = np.array([0, 0, 0], dtype=np.float64)

    def train(self):
        '''Run single-task training for ``epochs`` epochs, then save the results.'''
        for epoch in range(self.epochs):
            print(f'Epoch number {epoch}')
            print('=' * 20)
            print('/' * 10,'\\'*10)
            self.train_one_epoch()
            self.test()
            print(f'Best test f1: {self.best_test_f1:.4f}')
            print('\\'*10,'/' * 10)
            print('=' * 20)
        print('Saving results ...')
        save(
            (self.train_losses, self.test_losses, self.train_f1, self.test_f1, self.best_train_f1, self.best_test_f1),
            f'./save/results/single_{self.task_name}_{self.datetimestr}_{self.best_test_f1:.4f}.pt'
        )

    def train_one_epoch(self):
        '''Run one single-task pass over the training data and record loss / macro-F1.'''
        self.model.train()
        dataloader = self.dataloaders['train']
        y_pred_all = None
        labels_all = None
        loss = 0
        iters_per_epoch = 0
        for inputs, lens, mask, labels in tqdm(dataloader, desc='Training'):
            iters_per_epoch += 1

            # Collect the labels before they are moved to the device.
            if labels_all is None:
                labels_all = labels.numpy()
            else:
                labels_all = np.concatenate((labels_all, labels.numpy()))

            inputs = inputs.to(device=self.device)
            lens = lens.to(device=self.device)
            mask = mask.to(device=self.device)
            labels = labels.to(device=self.device)

            self.optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # Forward
                logits = self.model(inputs, lens, mask, labels)
                _loss = self.criterion(logits, labels)
                loss += _loss.item()
                y_pred = logits.argmax(dim=1).cpu().numpy()

                if y_pred_all is None:
                    y_pred_all = y_pred
                else:
                    y_pred_all = np.concatenate((y_pred_all, y_pred))

                # Backward
                _loss.backward()
                if self.clip:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10)
                self.optimizer.step()
                if self.scheduler is not None:
                    self.scheduler.step()

        loss /= iters_per_epoch
        f1 = f1_score(labels_all, y_pred_all, average='macro')

        print(f'loss = {loss:.4f}')
        print(f'Macro-F1 = {f1:.4f}')

        self.train_losses.append(loss)
        self.train_f1.append(f1)
        if f1 > self.best_train_f1:
            # Previously written to a stray attribute, so the best training
            # score never changed from 0.0.
            self.best_train_f1 = f1

    def test(self):
        '''Evaluate on the test data; save the model when macro-F1 improves.'''
        self.model.eval()
        dataloader = self.dataloaders['test']
        y_pred_all = None
        labels_all = None
        loss = 0
        iters_per_epoch = 0
        for inputs, lens, mask, labels in tqdm(dataloader, desc='Testing'):
            iters_per_epoch += 1

            if labels_all is None:
                labels_all = labels.numpy()
            else:
                labels_all = np.concatenate((labels_all, labels.numpy()))

            inputs = inputs.to(device=self.device)
            lens = lens.to(device=self.device)
            mask = mask.to(device=self.device)
            labels = labels.to(device=self.device)

            with torch.set_grad_enabled(False):
                logits = self.model(inputs, lens, mask, labels)
                _loss = self.criterion(logits, labels)
                y_pred = logits.argmax(dim=1).cpu().numpy()
                loss += _loss.item()

                if y_pred_all is None:
                    y_pred_all = y_pred
                else:
                    y_pred_all = np.concatenate((y_pred_all, y_pred))

        loss /= iters_per_epoch
        f1 = f1_score(labels_all, y_pred_all, average='macro')

        print(f'loss = {loss:.4f}')
        print(f'Macro-F1 = {f1:.4f}')

        self.test_losses.append(loss)
        self.test_f1.append(f1)
        if f1 > self.best_test_f1:
            self.best_test_f1 = f1
            self.save_model()

    def train_m(self):
        '''Run multi-task training for ``epochs`` epochs, then save the results.'''
        for epoch in range(self.epochs):
            print(f'Epoch {epoch}')
            print('=' * 20)
            print('/' * 10,'\\'*10)
            self.train_one_epoch_m()
            self.test_m()
            print(f'Best test results A: {self.best_test_f1_m[0]:.4f}')
            print(f'Best test results B: {self.best_test_f1_m[1]:.4f}')
            print(f'Best test results C: {self.best_test_f1_m[2]:.4f}')
            print('=' * 20)
            print('\\'*10,'/' * 10)

        print('Saving results ...')
        save(
            (self.train_losses, self.test_losses, self.train_f1, self.test_f1, self.best_train_f1_m, self.best_test_f1_m),
            f'./save/results/mtl_{self.datetimestr}_{self.best_test_f1_m[0]:.4f}.pt'
        )

    def train_one_epoch_m(self):
        '''
        Run one multi-task pass over the training data.

        NOTE(review): this method reads ``LABEL_DICT``, whose import is
        commented out in this notebook; it must be made available before the
        multi-task path can run.
        '''
        self.model.train()
        dataloader = self.dataloaders['train']

        # All six accumulators must start as None: the loop below tests each
        # of them with ``is None`` before first use.
        y_pred_all_A = None
        y_pred_all_B = None
        y_pred_all_C = None
        labels_all_A = None
        labels_all_B = None
        labels_all_C = None

        loss = 0
        iters_per_epoch = 0
        for inputs, lens, mask, label_A, label_B, label_C in tqdm(dataloader, desc='Training M'):
            iters_per_epoch += 1

            inputs = inputs.to(device=self.device)
            lens = lens.to(device=self.device)
            mask = mask.to(device=self.device)
            label_A = label_A.to(device=self.device)
            label_B = label_B.to(device=self.device)
            label_C = label_C.to(device=self.device)

            self.optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # Forward
                # logits_A, logits_B, logits_C = self.model(inputs, mask)
                all_logits = self.model(inputs, lens, mask)
                y_pred_A = all_logits[0].argmax(dim=1).cpu().numpy()
                # Only the first 2 (B) / 3 (C) logit columns take part in the
                # prediction for these two tasks.
                y_pred_B = all_logits[1][:, 0:2].argmax(dim=1)
                y_pred_C = all_logits[2][:, 0:3].argmax(dim=1)

                # Rows labelled NULL are left out of the B / C F1 statistics.
                Non_null_index_B = label_B != LABEL_DICT['b']['NULL']
                Non_null_label_B = label_B[Non_null_index_B]
                Non_null_pred_B = y_pred_B[Non_null_index_B]

                Non_null_index_C = label_C != LABEL_DICT['c']['NULL']
                Non_null_label_C = label_C[Non_null_index_C]
                Non_null_pred_C = y_pred_C[Non_null_index_C]

                labels_all_A = label_A.cpu().numpy() if labels_all_A is None else np.concatenate((labels_all_A, label_A.cpu().numpy()))
                labels_all_B = Non_null_label_B.cpu().numpy() if labels_all_B is None else np.concatenate((labels_all_B, Non_null_label_B.cpu().numpy()))
                labels_all_C = Non_null_label_C.cpu().numpy() if labels_all_C is None else np.concatenate((labels_all_C, Non_null_label_C.cpu().numpy()))

                y_pred_all_A = y_pred_A if y_pred_all_A is None else np.concatenate((y_pred_all_A, y_pred_A))
                y_pred_all_B = Non_null_pred_B.cpu().numpy() if y_pred_all_B is None else np.concatenate((y_pred_all_B, Non_null_pred_B.cpu().numpy()))
                y_pred_all_C = Non_null_pred_C.cpu().numpy() if y_pred_all_C is None else np.concatenate((y_pred_all_C, Non_null_pred_C.cpu().numpy()))

                # f1[0] += self.calc_f1(label_A, y_pred_A)
                # f1[1] += self.calc_f1(Non_null_label_B, Non_null_pred_B)
                # f1[2] += self.calc_f1(Non_null_label_C, Non_null_pred_C)

                # Weighted sum of the three task losses.
                _loss = self.loss_weights[0] * self.criterion(all_logits[0], label_A)
                _loss += self.loss_weights[1] * self.criterion(all_logits[1], label_B)
                _loss += self.loss_weights[2] * self.criterion(all_logits[2], label_C)
                loss += _loss.item()

                # Backward
                _loss.backward()
                if self.clip:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10)
                self.optimizer.step()
                if self.scheduler is not None:
                    self.scheduler.step()

        loss /= iters_per_epoch
        f1_A = f1_score(labels_all_A, y_pred_all_A, average='macro')
        f1_B = f1_score(labels_all_B, y_pred_all_B, average='macro')
        f1_C = f1_score(labels_all_C, y_pred_all_C, average='macro')

        print(f'loss = {loss:.4f}')
        print(f'A: {f1_A:.4f}')
        print(f'B: {f1_B:.4f}')
        print(f'C: {f1_C:.4f}')

        self.train_losses.append(loss)
        self.train_f1.append([f1_A, f1_B, f1_C])

        if f1_A > self.best_train_f1_m[0]:
            self.best_train_f1_m[0] = f1_A
        if f1_B > self.best_train_f1_m[1]:
            self.best_train_f1_m[1] = f1_B
        if f1_C > self.best_train_f1_m[2]:
            self.best_train_f1_m[2] = f1_C

    def test_m(self):
        '''
        Evaluate all three tasks on the test data.

        The model is saved only when the task-A macro-F1 improves. Unlike
        ``train_one_epoch_m``, no NULL filtering is applied to the B / C
        labels here.
        '''
        self.model.eval()
        dataloader = self.dataloaders['test']
        loss = 0
        iters_per_epoch = 0

        y_pred_all_A = None
        y_pred_all_B = None
        y_pred_all_C = None
        labels_all_A = None
        labels_all_B = None
        labels_all_C = None

        for inputs, lens, mask, label_A, label_B, label_C in tqdm(dataloader, desc='Test M'):
            iters_per_epoch += 1

            labels_all_A = label_A.numpy() if labels_all_A is None else np.concatenate((labels_all_A, label_A.numpy()))
            labels_all_B = label_B.numpy() if labels_all_B is None else np.concatenate((labels_all_B, label_B.numpy()))
            labels_all_C = label_C.numpy() if labels_all_C is None else np.concatenate((labels_all_C, label_C.numpy()))

            inputs = inputs.to(device=self.device)
            lens = lens.to(device=self.device)
            mask = mask.to(device=self.device)
            label_A = label_A.to(device=self.device)
            label_B = label_B.to(device=self.device)
            label_C = label_C.to(device=self.device)

            with torch.set_grad_enabled(False):
                all_logits = self.model(inputs, lens, mask)
                y_pred_A = all_logits[0].argmax(dim=1).cpu().numpy()
                y_pred_B = all_logits[1].argmax(dim=1).cpu().numpy()
                y_pred_C = all_logits[2].argmax(dim=1).cpu().numpy()

                # f1[0] += self.calc_f1(label_A, y_pred_A)
                # f1[1] += self.calc_f1(label_B, y_pred_B)
                # f1[2] += self.calc_f1(label_C, y_pred_C)

                y_pred_all_A = y_pred_A if y_pred_all_A is None else np.concatenate((y_pred_all_A, y_pred_A))
                y_pred_all_B = y_pred_B if y_pred_all_B is None else np.concatenate((y_pred_all_B, y_pred_B))
                y_pred_all_C = y_pred_C if y_pred_all_C is None else np.concatenate((y_pred_all_C, y_pred_C))

                _loss = self.loss_weights[0] * self.criterion(all_logits[0], label_A)
                _loss += self.loss_weights[1] * self.criterion(all_logits[1], label_B)
                _loss += self.loss_weights[2] * self.criterion(all_logits[2], label_C)
                loss += _loss.item()

        loss /= iters_per_epoch
        f1_A = f1_score(labels_all_A, y_pred_all_A, average='macro')
        f1_B = f1_score(labels_all_B, y_pred_all_B, average='macro')
        f1_C = f1_score(labels_all_C, y_pred_all_C, average='macro')

        print(f'loss = {loss:.4f}')
        print(f'A: {f1_A:.4f}')
        print(f'B: {f1_B:.4f}')
        print(f'C: {f1_C:.4f}')

        self.test_losses.append(loss)
        self.test_f1.append([f1_A, f1_B, f1_C])

        if f1_A > self.best_test_f1_m[0]:
            self.best_test_f1_m[0] = f1_A
            self.save_model()
        if f1_B > self.best_test_f1_m[1]:
            self.best_test_f1_m[1] = f1_B
        if f1_C > self.best_test_f1_m[2]:
            self.best_test_f1_m[2] = f1_C

    def calc_f1(self, labels, y_pred):
        '''Return ``[macro, micro, weighted]`` F1 for tensor labels / predictions.'''
        return np.array([
            f1_score(labels.cpu(), y_pred.cpu(), average='macro'),
            f1_score(labels.cpu(), y_pred.cpu(), average='micro'),
            f1_score(labels.cpu(), y_pred.cpu(), average='weighted')
        ], np.float64)

    def printing(self, loss, f1):
        '''Print the loss and the first entry of ``f1`` (macro-F1).'''
        print(f'loss = {loss:.4f}')
        print(f'Macro-F1 = {f1[0]:.4f}')
        # print(f'Micro-F1 = {f1[1]:.4f}')
        # print(f'Weighted-F1 = {f1[2]:.4f}')

    def save_model(self):
        '''Save a deep copy of the model state dict under ``./save/models/``.'''
        print('Saving model...')
        if self.task_name == 'all':
            filename = f'./save/models/{self.task_name}_{self.model_name}_{self.best_test_f1_m[0]}_seed{self.seed}.pt'
        else:
            filename = f'./save/models/{self.task_name}_{self.model_name}_{self.best_test_f1}_seed{self.seed}.pt'
        save(copy.deepcopy(self.model.state_dict()), filename)


In [10]:
def train_one_epoch(self):
    """Free-standing, truncated copy of the single-task training loop.

    Puts the model in training mode and, for every training batch, gathers the
    labels, moves the batch to ``self.device`` and clears the optimizer
    gradients. It stops there: no forward or backward pass is performed and
    nothing is returned.
    """
    self.model.train()
    batches = self.dataloaders['train']
    gathered_labels = None
    batch_count = 0
    for inputs, lens, mask, labels in tqdm(batches, desc='Training'):
        batch_count += 1

        # Labels are collected before the batch is moved to the device.
        current = labels.numpy()
        if gathered_labels is None:
            gathered_labels = current
        else:
            gathered_labels = np.concatenate((gathered_labels, current))

        inputs = inputs.to(device=self.device)
        lens = lens.to(device=self.device)
        mask = mask.to(device=self.device)
        labels = labels.to(device=self.device)

        self.optimizer.zero_grad()

In [11]:
def train_one_epoch(self):
    """Free-standing copy of the single-task training loop, without metrics.

    For every training batch this runs the forward pass, the loss, the
    backward pass, optional gradient clipping (max norm 10) and the optimizer
    and scheduler steps. The running loss and the predictions are accumulated
    locally but never reported; nothing is returned.
    """
    self.model.train()
    batches = self.dataloaders['train']
    gathered_preds = None
    gathered_labels = None
    running_loss = 0
    batch_count = 0
    for inputs, lens, mask, labels in tqdm(batches, desc='Training'):
        batch_count += 1

        # Labels are collected before the batch is moved to the device.
        current = labels.numpy()
        if gathered_labels is None:
            gathered_labels = current
        else:
            gathered_labels = np.concatenate((gathered_labels, current))

        inputs = inputs.to(device=self.device)
        lens = lens.to(device=self.device)
        mask = mask.to(device=self.device)
        labels = labels.to(device=self.device)

        self.optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            # Forward pass and loss
            logits = self.model(inputs, lens, mask, labels)
            batch_loss = self.criterion(logits, labels)
            running_loss += batch_loss.item()

            batch_preds = logits.argmax(dim=1).cpu().numpy()
            if gathered_preds is None:
                gathered_preds = batch_preds
            else:
                gathered_preds = np.concatenate((gathered_preds, batch_preds))

            # Backward pass and parameter update
            batch_loss.backward()
            if self.clip:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10)
            self.optimizer.step()
            if self.scheduler is not None:
                self.scheduler.step()

#  train.py

The code below works perfectly; we just need to add the remaining functions above.

Splitting train.py up and running it line by line 

In [12]:
#python train.py -bs=32 -lr=3e-6 -ep=20 -pa=3 --model=bert --task=a --clip --cuda=1
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, RobertaTokenizer, get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split

# Same value as the TRAIN_PATH assigned in the Configuration cell.
TRAIN_PATH = './inputDir/ref/dontpatronizeme_pcl.tsv'

In [13]:
if __name__ == '__main__':
    # Run configuration. The dict stands in for the command-line arguments of
    # the original train.py script.
    #Values for High Accuracy run
    #args = {'cuda':"1",'seed':69,'batch_size':32,'learning_rate':3e-6,'epochs':20,'patience':3,'model':'bert','task':'a','model_size':'base','truncate':100,'weight_decay':10,'hidden_dropout':0,'attention_dropout':0,'ckpt':'','scheduler':0,'loss_weights':1,'clip':1}
    args = {
        'cuda': "1",
        'seed': 69,
        'batch_size': 32,
        'learning_rate': 3e-6,
        'epochs': 20,
        'patience': 5,
        'model': 'bert',
        'task': 1,
        'model_size': 'base',
        'truncate': 50,
        'weight_decay': 0,
        'hidden_dropout': 0.2,
        'attention_dropout': 0.5,
        'ckpt': '',
        'scheduler': 0,
        'loss_weights': [1, 1, 1, 1],
        'clip': 1,
    }
    bs = args['batch_size']
    lr = args['learning_rate']
    task = args['task']
    model_name = args['model']
    model_size = args['model_size']
    truncate = args['truncate']
    epochs = args['epochs']
    wd = args['weight_decay']
    patience = args['patience']

    # Fix seed for reproducibility
    seed = args['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Set device
    os.environ["CUDA_VISIBLE_DEVICES"] = args['cuda']
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Task 2 uses 5 output classes, every other task 2.
    num_labels = 5 if task == 2 else 2

    # Set model and tokenizer for the requested architecture.
    # NOTE(review): BERT, RoBERTa, MTL_Transformer_LSTM and GatedModel are not
    # defined in the visible part of this notebook; they must be defined or
    # imported before this cell is run.
    if model_name == 'bert':
        if task == 'all':
            model = MTL_Transformer_LSTM(model_name, model_size, args=args)
        else:
            model = BERT(model_size, args=args, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
    elif model_name == 'roberta':
        if task == 'all':
            model = MTL_Transformer_LSTM(model_name, model_size, args=args)
        else:
            model = RoBERTa(model_size, args=args, num_labels=num_labels)
        tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')
    elif model_name == 'bert-gate' and task == 'all':
        model_name = model_name.replace('-gate', '')
        model = GatedModel(model_name, model_size, args=args)
        tokenizer = BertTokenizer.from_pretrained(f'bert-{model_size}-uncased')
    elif model_name == 'roberta-gate' and task == 'all':
        model_name = model_name.replace('-gate', '')
        model = GatedModel(model_name, model_size, args=args)
        tokenizer = RobertaTokenizer.from_pretrained(f'roberta-{model_size}')
    else:
        # Without this branch an unsupported combination only surfaced later
        # as a NameError on `model`.
        raise ValueError(f'Unsupported model/task combination: model={model_name!r}, task={task!r}')
    # Move model to correct device
    model = model.to(device=device)

    if args['ckpt'] != '':   #This can be removed TODO
        model.load_state_dict(load(args['ckpt']))
    # Only the single-task paths (1 and 2) prepare data in this cell; for any
    # other task value the data variables below are left undefined.
    if task in [1, 2]:
        data_methods = {1: task_1, 2: task_2}
        ids, token_ids, lens, mask, labels = data_methods[task](TRAIN_PATH, tokenizer=tokenizer, truncate=truncate)
        test_ids, test_token_ids, test_lens, test_mask, test_labels = testDataCreationFunction(task, tokenizer=tokenizer, truncate=truncate)
        _Dataset = PatronizationDataset

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
if __name__ == '__main__':
    # Wrap the encoded train / test arrays produced by the previous cell.
    datasets = {
        'train': _Dataset(input_ids=token_ids, lens=lens, mask=mask, labels=labels, task=task),
        'test': _Dataset(input_ids=test_token_ids, lens=test_lens, mask=test_mask, labels=test_labels, task=task),
    }

    # Single-task runs draw training samples through the class-balancing
    # sampler; any other task uses the DataLoader default.
    if task in [1,2]:
        sampler = ImbalancedDatasetSampler(datasets['train'])
    else:
        sampler = None

    dataloaders = {
        'train': DataLoader(dataset=datasets['train'], batch_size=bs, sampler=sampler),
        'test': DataLoader(dataset=datasets['test'], batch_size=bs),
    }

    criterion = torch.nn.CrossEntropyLoss()

    if not args['scheduler']:
        # No scheduler requested: plain Adam.
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
        scheduler = None
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
        # Cosine schedule whose warmup covers ceil(total_steps / 10) * 2 steps.
        t_total = epochs * len(dataloaders['train'])
        warmup_steps = np.ceil(t_total / 10.0) * 2
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
        )

    trainer = Trainer(
        model=model, epochs=epochs, dataloaders=dataloaders, criterion=criterion,
        loss_weights=args['loss_weights'], clip=args['clip'],
        optimizer=optimizer, scheduler=scheduler, device=device,
        patience=patience, task_name=task, model_name=model_name,
        seed=args['seed']
    )

    # Tasks 1 and 2 use the single-task loop; everything else the multi-task one.
    if task not in [1,2]:
        trainer.train_m()
    else:
        trainer.train()

Epoch number 0
////////// \\\\\\\\\\


Training:   1%|▌                                                                     | 2/266 [01:08<2:31:05, 34.34s/it]
