Link to Kaggle Challenge: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

# 1. BiLSTM

In [None]:
# importing libraries for BiLSTM

import keras
from keras.layers import Embedding
from keras.layers import Dense, Flatten, LSTM
from keras.layers import Input, GlobalMaxPool1D, Dropout
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import BatchNormalization
from keras.models import Model, Sequential
from keras.callbacks import ModelCheckpoint
from keras import optimizers
import numpy as np 
import os
import pandas as pd

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# reading data files
df_train = pd.read_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/test.csv')

In [None]:
# dividing training data into features X and label y
X_train = df_train['comment_text'] 
X_test = df_test['comment_text'] 
y_train = df_train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

### PREPROCESSING

In [None]:
# importing tokenizer class from keras api
from keras.preprocessing.text import Tokenizer

In [None]:
# calculating the vocabulary size which will be given as an input to the Embedding layer

tokens = Tokenizer() 
tokens.fit_on_texts(X_train)
vocab_size = len(tokens.word_index) + 1 
# converting our tokens into sequence of integers
tokenized_train = tokens.texts_to_sequences(X_train) 
tokenized_test = tokens.texts_to_sequences(X_test)

In [None]:
print(X_train[6]) # text
print('---------------------------------------------------------------------------------------------------')
print(tokenized_train[6]) # corresponding comment (vectorized)

COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
----------------------------------------------------------------------------------------------------
[1873, 147, 6, 3476, 324, 15, 29, 141]



we can observe the size of each vector is different,

but our model expects the size of our input data to be same,

so we will be doing padding.


In [None]:
# importing the pad_sequences class from keras api
from keras.preprocessing.sequence import pad_sequences

In [None]:
# max length of the padded sequence that we want 
max_len = 300 
# padding our sequences with zeros
padded_train = pad_sequences(tokenized_train, maxlen = max_len, padding = 'post') 
padded_test = pad_sequences(tokenized_test, maxlen = max_len)

In [None]:
padded_train[:10]

array([[  688,    75,     1, ...,     0,     0,     0],
       [96145,    52,  2635, ...,     0,     0,     0],
       [  412,   437,    73, ...,     0,     0,     0],
       ...,
       [   20,   199,     2, ...,     0,     0,     0],
       [  263,    22,     1, ...,     0,     0,     0],
       [10960,    15,    13, ...,     0,     0,     0]], dtype=int32)

### USING PRE-TRAINED WORD EMBEDDINGS

In [None]:
import numpy as np

embedding_dim = 50
vocab_size = len(tokens.word_index) + 1 
embedding_matrix = np.zeros((vocab_size, embedding_dim))



# THANK YOU SO MUCH FOR LAB 4.1 :D

with open('/content/drive/MyDrive/Colab Notebooks/glove.6B.50d.txt', encoding = 'utf-8') as f:
    for line in f:
        word, *vector = line.split()
        if word in tokens.word_index:
            idx = tokens.word_index[word]
            embedding_matrix[idx] = np.array(vector, dtype = np.float32)[:embedding_dim]

### Implementing Bidirectional LSTM's

In [None]:
# creating a sequential model
model = Sequential()
# adding an embedding layer with pre-trained weights
model.add(Embedding(vocab_size, embedding_dim, weights = [embedding_matrix], input_length = max_len, trainable = False))
# adding a bidirectional LSTM layer with 50 memory units and returning sequences
model.add(Bidirectional(LSTM(50, return_sequences = True)))
# adding a GlobalMaxPool1D layer to extract the maximum value from the sequences
model.add(GlobalMaxPool1D())
# adding a BatchNormalization layer for normalizing the inputs
model.add(BatchNormalization())
model.add(Dropout(0.1))
model.add(Dense(50, activation = "relu"))
model.add(Dropout(0.1))
model.add(Dense(32, activation = "relu"))
model.add(Dropout(0.1))
model.add(Dense(6, activation = 'sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 50)           10516900  
                                                                 
 bidirectional (Bidirection  (None, 300, 100)          40400     
 al)                                                             
                                                                 
 global_max_pooling1d (Glob  (None, 100)               0         
 alMaxPooling1D)                                                 
                                                                 
 batch_normalization (Batch  (None, 100)               400       
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                        

In [None]:
import tensorflow as tf

# areating an ExponentialDecay learning rate schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=1000,
    decay_rate=0.05)
optimizer_adam = tf.keras.optimizers.Adam(learning_rate=lr_schedule)


# compiling the model
model.compile(loss = 'binary_crossentropy', optimizer = optimizer_adam , metrics = ['accuracy'])

In [None]:
# fitting the model on the dataset
history = model.fit(padded_train, y_train, epochs = 8, batch_size = 128, validation_split = 0.3)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


### Making predictions on the test data

In [None]:
# list of the output class labels
labels_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# making predictions
y_pred = model.predict(padded_test, verbose = 1, batch_size = 128)



In [None]:
# creating submission

sample_submission = pd.read_csv("/content/drive/MyDrive/Coventry University/NLP CW 2/sample_submission.csv")

sample_submission[labels_list] = y_pred

sample_submission.to_csv("/content/drive/MyDrive/Coventry University/NLP CW 2/BiLSTM_submission.csv", index = False)

In [None]:
sample_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.977985,0.376721,0.913251,0.099233,0.807791,0.266907
1,0000247867823ef7,0.00438,4e-06,0.001858,1.2e-05,0.000617,0.000111
2,00013b17ad220c46,0.01889,0.000412,0.015617,0.000968,0.006317,0.001115
3,00017563c3f7919a,0.010935,3e-06,0.002153,1.4e-05,0.001769,1e-05
4,00017695ad8997eb,0.007744,6e-06,0.003259,3.1e-05,0.000989,1.5e-05


# 2. Bi GRU

In [None]:
# importing lobraries for BiGRU

import time
start_time = time.time()
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
np.random.seed(96)
os.environ["OMP_NUM_THREADS"] = "4"
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from tensorflow.keras.layers import InputSpec, Layer
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten

In [None]:
import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

# custom callback for evaluating ROC-AUC during training
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    # callback function called at the end of each training epoch
    def on_epoch_end(self, epoch, logs={}):
        # evaluating ROC-AUC score at specified intervals
        if epoch % self.interval == 0:
            # predicting probabilities on the validation set
            y_pred = self.model.predict(self.X_val, verbose=0)
            
            # calculating ROC-AUC score using sklearn's roc_auc_score
            score = roc_auc_score(self.y_val, y_pred)
            
            # printing the ROC-AUC score for monitoring
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [None]:
df_train = pd.read_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/test.csv')
glove_path = "/content/drive/MyDrive/Colab Notebooks/glove.6B.50d.txt"

embedding_size = 50
max_feat = 100000
max_len = 150

In [None]:
labels_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = df_train[labels_list].values
df_train['comment_text'].fillna('no comment')
df_test['comment_text'].fillna('no comment')

# 80% train 20% test split
X_train, X_val, Y_train, Y_val = train_test_split(df_train, y, test_size=0.2, random_state=96)

## Preprocessing

In [None]:
orig_text_train = X_train['comment_text'].str.lower()
orig_text_val = X_val['comment_text'].str.lower()
orig_text_test = df_test['comment_text'].str.lower()

tokenizer = Tokenizer(num_words = max_feat, lower = True)
tokenizer.fit_on_texts(orig_text_train)
X_train['comment_sequence'] = tokenizer.texts_to_sequences(orig_text_train)
X_val['comment_sequence'] = tokenizer.texts_to_sequences(orig_text_val)
df_test['comment_sequence'] = tokenizer.texts_to_sequences(orig_text_test)

X_train = pad_sequences(X_train.comment_sequence, maxlen = max_len)
X_val = pad_sequences(X_val.comment_sequence, maxlen = max_len)
df_test = pad_sequences(df_test.comment_sequence, maxlen = max_len)

In [None]:
def get_coeffs(word,*arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coeffs(*o.strip().split(" ")) for o in open(glove_path))

In [None]:
# getting word index from the tokenizer
word_index = tokenizer.word_index

# getting the number of words to consider based on max features
n_words = min(max_feat, len(word_index))

# initialising an embedding matrix with zeros
embedding_matrix = np.zeros((n_words, embedding_size))

# iterating through the word index
for word, i in word_index.items():
    # checking if the index is within the specified maximum features
    if i >= max_feat:
        continue
    
    # getting the embedding vector for the current word from the pre-trained embeddings
    embedding_vector = embedding_index.get(word)
    
    # setting it in the embedding matrix if embedding vector exists for the word
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [None]:
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

file_path = 'model_best_BiGRU.hdf5'

# model checkpoint callback to save the best model based on validation loss
check_point = ModelCheckpoint(file_path,
                              monitor = 'val_loss',
                              verbose = 1,
                              save_best_only = True,
                              mode = 'min')

# RocAucEvaluation callback for computing ROC-AUC score on validation data
ra_val = RocAucEvaluation(validation_data=(X_val, Y_val), interval = 1)

# EarlyStopping callback to stop training if validation loss does not improve for a certain number of epochs
early_stop = EarlyStopping(monitor = 'val_loss',
                           mode = 'min',
                           patience = 5)

# Implementing BiGRU

In [None]:

from keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

def BiGRU_model(optimizer=None, units=0, dr=0.0):
    # input layer for sequences of max_len length
    inp = Input(shape=(max_len,))

    # embedding layer with pre-trained word embeddings
    x = Embedding(max_feat, embedding_size, weights=[embedding_matrix], trainable=False)(inp)

    # spatial dropout layer for regularization
    x = SpatialDropout1D(dr)(x)

    # bidirectional GRU layer for capturing bidirectional dependencies
    x = Bidirectional(GRU(units, return_sequences=True))(x)

    # 1D Convolutional layer with 64 filters and kernel size 2
    x = Conv1D(64, kernel_size=2, padding='valid', kernel_initializer='he_uniform')(x)

    # global average pooling and global max pooling
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    # concatenating average pooling and max pooling outputs
    x = concatenate([avg_pool, max_pool])

    # Dense layer with sigmoid activation for multi-label classification
    x = Dense(6, activation='sigmoid')(x)

    # Create the Keras model
    model = Model(inputs=inp, outputs=x)
    
    # Return the model
    return model

    if optimizer is None:
        # if no optimizer is provided, create one with a learning rate schedule
        initial_learning_rate = 0.01
        decay_steps = 1000  # You need to set this value based on your dataset and training configuration
        decay_rate = 0.05  # Set to 0 to disable decay

        # defining the learning rate schedule
        lr_schedule = ExponentialDecay(
            initial_learning_rate=initial_learning_rate,
            decay_steps=decay_steps,
            decay_rate=decay_rate,
            staircase=True)

        # creating the Adam optimizer with the learning rate schedule
        optimizer = Adam(learning_rate=lr_schedule)

    # compiling the model with binary cross-entropy loss, the specified optimizer, and accuracy metric
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # training the model on the training data with specified batch size, epochs, and validation data
    history = model.fit(X_train, Y_train,
                        batch_size=128,
                        epochs=4,
                        validation_data=(X_val, Y_val),
                        verbose=1,
                        callbacks=[ra_val, check_point, early_stop])

    # loading the best model saved during training
    model = load_model(file_path)

    return model


# running BiGRU_model
model = BiGRU_model(units=128, dr=0.2)


Epoch 1/4
 ROC-AUC - epoch: 1 - score: 0.967607

Epoch 1: val_loss improved from inf to 0.05253, saving model to model_best_BiGRU.hdf5
Epoch 2/4
  3/998 [..............................] - ETA: 36s - loss: 0.0729 - accuracy: 0.9714

  saving_api.save_model(


 ROC-AUC - epoch: 2 - score: 0.981039

Epoch 2: val_loss improved from 0.05253 to 0.04871, saving model to model_best_BiGRU.hdf5
Epoch 3/4
  3/998 [..............................] - ETA: 37s - loss: 0.0439 - accuracy: 0.9453

  saving_api.save_model(


 ROC-AUC - epoch: 3 - score: 0.981103

Epoch 3: val_loss improved from 0.04871 to 0.04849, saving model to model_best_BiGRU.hdf5
Epoch 4/4
  1/998 [..............................] - ETA: 41s - loss: 0.0738 - accuracy: 0.9375

  saving_api.save_model(


 ROC-AUC - epoch: 4 - score: 0.981103

Epoch 4: val_loss did not improve from 0.04849


### Making predictions

In [None]:
pred = model.predict(df_test, batch_size = 1024, verbose = 1)



In [None]:
submission = pd.read_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/sample_submission.csv')
submission[labels_list] = (pred)
submission.to_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/BiGRU_submission.csv', index = False)

In [None]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.98196,0.238642,0.971401,0.040829,0.836097,0.106463
1,0000247867823ef7,0.003628,0.000121,0.00218,3e-05,0.001275,0.000493
2,00013b17ad220c46,0.00346,8.5e-05,0.00124,1.7e-05,0.001388,0.000732
3,00017563c3f7919a,0.001377,8e-06,0.000259,1.4e-05,0.000321,1.9e-05
4,00017695ad8997eb,0.006817,0.000305,0.002621,0.000231,0.001868,0.000683


# 3. DistilBERT

In [1]:
# importing DistilBERT libraries

import time
import sys
import copy
import torch
import numpy as np
import pandas as pd
from scipy.sparse import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pyarrow as pa
import torch.nn as nn
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset,DataLoader
from transformers import DistilBertConfig,DistilBertTokenizer,DistilBertModel
from sklearn.model_selection import train_test_split



In [3]:

df_train = pd.read_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/test.csv')

sample_submission = pd.read_csv("/content/drive/MyDrive/Coventry University/NLP CW 2/sample_submission.csv")
test_labels = pd.read_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/test_labels.csv')


df_train.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# initializing a tokenizer using DistilBERT pre-trained weights for tokenizing text
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [6]:
# calculating comment lengths
df_train['comment_length'] = df_train['comment_text'].apply(lambda x: len(tokenizer.tokenize(x)))

# displaying summary stats
print(df_train['comment_length'].describe())


count    159571.000000
mean         97.726022
std         155.622601
min           2.000000
25%          26.000000
50%          52.000000
75%         106.000000
max        4948.000000
Name: comment_length, dtype: float64


In [7]:
y = df_train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].astype(float)
X = df_train['comment_text']

# 80 train, 20 validation split
X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=0.2, random_state=96)

In [8]:
print('X_train shape is {}' .format({X_train.shape}))
print('X_val shape is {}' .format({X_val.shape}))
print('y_train shape is {}' .format({y_train.shape}))

X_train shape is {(127656,)}
X_val shape is {(31915,)}
y_train shape is {(127656, 6)}


In [9]:
X_train = X_train.values
X_val = X_val.values
y_train = y_train.values
y_val = y_val.values

In [10]:
# defining custom function to calculate accuracy

def accuracy_threshold(y_pred, y_true, thresh:float=0.4, sigmoid:bool=True):
    "Compute accuracy when `y_pred` and `y_true` are the same size."
    if sigmoid: y_pred = y_pred.sigmoid()
    return np.mean(((y_pred>thresh).float()==y_true.float()).float().cpu().numpy(), axis=1).sum()

In [11]:
# initializing a DistilBertConfig

config = DistilBertConfig(#vocab_size=32000,
                          dropout=0.1,
                          num_labels=6,
                          n_layers=12,
                          n_heads=12,
                          hidden_dim=300)

In [12]:
# defining a PyTorch module for fine-tuning DistilBERT for sequence classification.
# includes loading the pre-trained DistilBERT model,
# adding additional layers for classification, and specifying the forward pass logic. 


class DistilBertForSequenceClassification(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_labels = config.num_labels

        # loading pre-trained DistilBERT model
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-cased')

        # additional layers for sequence classification
        self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)

        # initializing classifier weights
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids=None, attention_mask=None, head_mask=None, labels=None):
        # forward pass through DistilBERT
        distilbert_output = self.distilbert(input_ids=input_ids,
                                            attention_mask=attention_mask,
                                            head_mask=head_mask)
        hidden_state = distilbert_output[0]

        # pooling and additional layers
        pooled_output = hidden_state[:, 0]
        pooled_output = self.pre_classifier(pooled_output)
        pooled_output = nn.ReLU()(pooled_output)
        pooled_output = self.dropout(pooled_output)

        # final classification layer
        logits = self.classifier(pooled_output)

        return logits


In [13]:
# The TextDataset class is a custom dataset class that inherits from PyTorch's Dataset class.
# It is designed to handle text data and is used in the context of training neural network models,
# particularly those using transformers like DistilBERT


max_seq_length = 128

class TextDataset(Dataset):
    def __init__(self, x, y, transform=None):

        self.x = x
        self.y = y
        self.transform = transform

    def __getitem__(self, index):

        tokenized_comment = tokenizer.tokenize(self.x[index])

        # pad the tokenized comment to the maximum sequence length
        if len(tokenized_comment) > max_seq_length:
            tokenized_comment = tokenized_comment[:max_seq_length]

        ids_review = tokenizer.convert_tokens_to_ids(tokenized_comment)

        # padding to achieve the desired sequence length
        padding = [0] * (max_seq_length - len(ids_review))
        ids_review += padding

        assert len(ids_review) == max_seq_length

        ids_review = torch.tensor(ids_review)

        # retrieving the label for the current index
        hcc = self.y[index]
        list_of_labels = [torch.from_numpy(hcc)]

        return ids_review, list_of_labels[0]

    def __len__(self):

        return len(self.x)


In [14]:
text_dataset(X_train, y_train).__getitem__(9)[1]   ### Testing index 9 to see output


tensor([0., 0., 0., 0., 0., 0.], dtype=torch.float64)

In [15]:
batch_size = 32

# creating training and test datasets
training_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_val, y_val)

# creating DataLoader instances for training and validation
dataloaders_dict = {
    'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=False),
    'val': torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
}

# storing the sizes of the training and validation datasets
dataset_sizes = {'train': len(X_train), 'val': len(X_val)}

# choosing the device; gpu if available, otherwise local cpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# initializing the DistilBERT model for sequence classification
model = DistilBertForSequenceClassification(config)

# moving the model to the chosen device
model.to(device)

# printing the chosen device (GPU or CPU)
print(device)


Downloading model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

cuda:0


In [16]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=2):
    model.train()
    since = time.time()
    print('Running')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # setting model to training mode
            else:
                model.eval()   # setting model to evaluate mode

            running_loss = 0.0
            beta_score_accuracy = 0.0
            micro_roc_auc_acc = 0.0

            # iterating over data
            for inputs, hcc in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                hcc = hcc.to(device)

                optimizer.zero_grad()


                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)

                    loss = criterion(outputs, hcc.float())

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        scheduler.step()  

                running_loss += loss.item() * inputs.size(0)
                micro_roc_auc_acc += accuracy_threshold(outputs.view(-1, 6), hcc.view(-1, 6))

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_micro_roc_acc = micro_roc_auc_acc / dataset_sizes[phase]

            print('{} Total Loss: {:.4f} '.format(phase, epoch_loss))
            print('{} micro_roc_auc_accuracy: {:.4f}'.format(phase, epoch_micro_roc_acc))


        print()

    time_elapsed = time.time() - since
    print('Training completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Accuracy: {:4f}'.format(float(best_loss)))

    # loading best model weights
    model.load_state_dict(best_model_wts)
    return model

In [17]:
lr = 0.001  # Learning rate

# define the optimizer with Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr)

# fine-tuning optimizer
ft_optimizer = optimizer

# defining the loss criterion as Binary Cross Entropy with Logits Loss
criterion = nn.BCEWithLogitsLoss()

# defining the learning rate scheduler with a step size of 3 and a gamma of 0.1
exp_lr_scheduler = lr_scheduler.StepLR(ft_optimizer, step_size=3, gamma=0.1


In [18]:
# running model
ft_model = train_model(model,
                        criterion,
                        ft_optimizer,
                        exp_lr_scheduler,
                        num_epochs=4)

Running
Epoch 1/4
----------


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


train Total Loss: 0.1480 
train micro_roc_auc_accuracy: 0.9630
val Total Loss: 0.1437 
val micro_roc_auc_accuracy: 0.9638

Epoch 2/4
----------
train Total Loss: 0.1478 
train micro_roc_auc_accuracy: 0.9632
val Total Loss: 0.1437 
val micro_roc_auc_accuracy: 0.9638

Epoch 3/4
----------
train Total Loss: 0.1477 
train micro_roc_auc_accuracy: 0.9632
val Total Loss: 0.1437 
val micro_roc_auc_accuracy: 0.9638

Epoch 4/4
----------
train Total Loss: 0.1478 
train micro_roc_auc_accuracy: 0.9632
val Total Loss: 0.1437 
val micro_roc_auc_accuracy: 0.9638

Training completed in 109m 38s
Best val Accuracy: 100.000000


# Making Predictions

In [20]:
x_test = df_test['comment_text']
y_test = np.zeros(x_test.shape[0]*6).reshape(x_test.shape[0],6)

test_dataset = text_dataset(x_test,y_test)
prediction_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

# custom function to loop over data and predict probabilities
def prediction(model,test_loader):
    predictions_list = []
    for inputs, sentiment in test_loader:
        inputs = inputs.to(device)
        sentiment = sentiment.to(device)
        with torch.no_grad():
            outputs = model(inputs)
            outputs = torch.sigmoid(outputs)
            predictions_list.append(outputs.cpu().detach().numpy().tolist())
    return predictions_list

In [21]:
predictions_list = prediction(model=ft_model, test_loader=prediction_dataloader)
predictions_list = np.array(predictions_list)[:,0]

In [22]:
submission = pd.DataFrame(predictions_list, columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
df_test[['toxic','severe_toxic','obscene','threat','insult','identity_hate']] = submission
final_df = df_test[['id','toxic','severe_toxic','obscene','threat','insult','identity_hate']]
final_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.508469,0.642524,0.599179,0.352346,0.491483,0.516694
1,0000247867823ef7,0.512211,0.604253,0.635151,0.451885,0.458584,0.487915
2,00013b17ad220c46,0.512489,0.579687,0.654915,0.4701,0.484542,0.507224
3,00017563c3f7919a,0.529257,0.60625,0.591508,0.429365,0.527066,0.496211
4,00017695ad8997eb,0.484127,0.628111,0.649418,0.481576,0.490005,0.512863


In [23]:
final_df.to_csv('/content/drive/MyDrive/Coventry University/NLP CW 2/distilBERT_submission.csv', index=False)