<a href="https://colab.research.google.com/github/Dhruv-2020EE30592/Extra-Notebooks/blob/main/Hate_Span_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import transformers
import os
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import DistilBertTokenizerFast, DistilBertModel, AdamW

In [None]:
from torch import cuda
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Print the full path of every file found under the Kaggle input directory,
# walking all sub-directories.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the train and test CSVs; both files use '|' as the field separator.
df = pd.read_csv('/kaggle/input/hatenorm/train.csv', delimiter='|')
df_test = pd.read_csv('/kaggle/input/hatenorm/test.csv', delimiter='|')
# Notebook display: the `sentence` column of the first training row.
df.head(1).sentence

In [None]:
# def remove_special_characters(text):
#     cleaned_text = ''
#     for char in text:
#         if char.isalnum() or char==' ':
#             cleaned_text += char
#     return cleaned_text

# df['sentence'] = df['sentence'].apply(lambda x: remove_special_characters(x))
# df_test['sentence'] = df_test['sentence'].apply(lambda x: remove_special_characters(x))

In [None]:
class CustomDataset_testing(Dataset):
    """Inference-time dataset: tokenised sentences plus a word-to-token-count map.

    Each item holds the padded encoding of one row of the dataframe's
    ``sentence`` column and, for every whitespace-separated word of that
    sentence, the number of tokens the word yields when tokenised on its own.
    No label column is read.
    """

    def __init__(self, dataframe):
        """Keep the ``sentence`` and ``Id`` columns and load the tokenizer."""
        self.data = dataframe
        self.input = self.data.sentence
        self.max_len = 512
        self.Id = self.data.Id
        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def __len__(self):
        """Return the number of sentences."""
        return len(self.input)

    def _count_word_tokens(self, word):
        """Return how many tokens ``word`` produces when encoded alone."""
        word_encoding = self.tokenizer.encode_plus(
            word,
            max_length=self.max_len,
            padding=False,
            truncation=True
        )
        # The encoding of a lone word is wrapped in two special tokens
        # (special tokens are added by default), so they are not counted.
        return len(word_encoding['input_ids']) - 2

    def __getitem__(self, index):
        """Encode sentence ``index``.

        Returns a dict with:
            'Id': the row's ``Id`` as a tensor,
            'ids', 'mask', 'token_type_ids': long tensors from the tokenizer,
                padded/truncated to ``self.max_len``,
            'tokens': the token strings for ``ids``,
            'word_to_token_len_dict': long tensor of length ``self.max_len``;
                entry ``i`` is the token count of word ``i`` and unused
                entries are -1.
        """
        sentence = self.input.iloc[index]
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True
        )

        ids = encoding['input_ids']
        mask = encoding['attention_mask']
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        token_type_ids = encoding['token_type_ids']

        # The map has a fixed length tied to max_len so that items can be
        # stacked into batches. Words beyond that many slots cannot be stored
        # (indexing past the end used to raise IndexError), so they are
        # skipped.
        word_to_token_len_dict = [-1] * self.max_len
        for ind, word in enumerate(sentence.split()[:self.max_len]):
            word_to_token_len_dict[ind] = self._count_word_tokens(word)

        return {
            'Id': torch.tensor(self.Id.iloc[index]),
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'tokens': tokens,
            'word_to_token_len_dict': torch.tensor(word_to_token_len_dict, dtype=torch.long)
        }

In [None]:
class CustomDataset(Dataset):
    """Training/validation dataset producing per-token binary span labels.

    Each row supplies a ``sentence`` and a ``bio`` string with one B/I/O tag
    per whitespace-separated word. Word-level tags are expanded to every token
    of the word; B and I are both treated as the positive class.
    """

    def __init__(self, dataframe):
        """Keep the ``Id``, ``sentence`` and ``bio`` columns and load the tokenizer."""
        self.data = dataframe
        self.Id = self.data.Id
        self.input = self.data.sentence
        self.output = self.data.bio
        self.max_len = 512
        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def __len__(self):
        """Return the number of sentences."""
        return len(self.input)

    def _count_word_tokens(self, word):
        """Return how many tokens ``word`` produces when encoded alone."""
        word_encoding = self.tokenizer.encode_plus(
            word,
            max_length=self.max_len,
            padding=False,
            truncation=True
        )
        # The encoding of a lone word is wrapped in two special tokens
        # (special tokens are added by default), so they are not counted.
        return len(word_encoding['input_ids']) - 2

    def __getitem__(self, index):
        """Encode sentence ``index`` and build its token-level targets.

        Returns a dict with:
            'Id': the row's ``Id`` as a tensor,
            'ids', 'mask', 'token_type_ids': long tensors from the tokenizer,
            'tokens': the token strings for ``ids``,
            'targets': long tensor (sequence length, 2); ``[1, 0]`` for tokens
                of B/I words, ``[0, 1]`` everywhere else,
            'word_to_token_len_dict': long tensor of length ``self.max_len``;
                entry ``i`` is the token count of word ``i``, unused entries
                are -1,
            'custom_targets_tags_only': long tensor (sequence length,); 0 for
                tokens of B/I words, 1 for tokens of O words, 2 for positions
                not covered by any word.

        Raises:
            KeyError: if the ``bio`` string contains a character other than
                B, I, O or a space.
        """
        sentence = self.input.iloc[index]
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True
        )
        tag_to_idx = {"B": 0, "I": 1, "O": 2}

        ids = encoding['input_ids']
        mask = encoding['attention_mask']
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        token_type_ids = encoding['token_type_ids']
        targets = [tag_to_idx[tag] for tag in self.output.iloc[index].replace(" ", "")]

        seq_len = len(ids)
        custom_targets = [[0, 1] for _ in range(seq_len)]
        custom_targets_tags_only = [2] * seq_len
        word_to_token_len_dict = [-1] * self.max_len
        original_sentence = sentence.split()

        # Word tokens start at index 1, after the leading special token. The
        # last position is never labelled: when the sentence is truncated it
        # holds the closing special token. Clipping to this bound keeps both
        # label lists at exactly seq_len entries; the previous slice
        # assignment grew them for over-long sentences.
        label_end = seq_len - 1
        start_ind = 1

        for ind, word in enumerate(original_sentence):
            num_of_word_tokens = self._count_word_tokens(word)
            # The map has a fixed number of slots; words beyond it cannot be
            # recorded (their tokens were truncated away anyway).
            if ind < len(word_to_token_len_dict):
                word_to_token_len_dict[ind] = num_of_word_tokens
            end_ind = start_ind + num_of_word_tokens

            try:
                word_tag = targets[ind]
            except IndexError:
                # Fewer tags than words: report the row and leave the default
                # labels in place for this word.
                print(f'Error occured for input: { self.input.iloc[index]}')
                print(" ")
                print(f'The original_sentence: {original_sentence}')
                print(" ")
                print(f'targets: {targets}')
                print(" ")
                print(f'ids: {ids}')
                print(" ")
                print(f'Error index: {ind} and {original_sentence[ind]}')
            else:
                # B (0) and I (1) collapse into the positive class.
                if word_tag in (0, 1):
                    one_hot, tag_only = [1, 0], 0
                else:
                    one_hot, tag_only = [0, 1], 1
                for pos in range(min(start_ind, label_end), min(end_ind, label_end)):
                    custom_targets[pos] = list(one_hot)
                    custom_targets_tags_only[pos] = tag_only

            start_ind = end_ind

        return {
            'Id': torch.tensor(self.Id.iloc[index]),
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'tokens': tokens,
            'targets': torch.tensor(custom_targets, dtype=torch.long),
            'word_to_token_len_dict': torch.tensor(word_to_token_len_dict, dtype=torch.long),
            'custom_targets_tags_only': torch.tensor(custom_targets_tags_only, dtype=torch.long)
        }

In [None]:
# from transformers import BertTokenizer

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# demo_data = {'Id':[1, 2], 'sentence': ['Hello', 'Say it loud , say it clear , illegal #immigrants are not welcome here @user'],
#         'bio': ['0', 'O O O O O O O O B I O O O O O']}
# demo_df = pd.DataFrame(demo_data)

# # Creating an instance of the dataset
# dataset = CustomDataset(demo_df)
# sentence = (dataset[1]['ids'])
# print((dataset[1]))
# print((dataset[1]['targets']))
# print((dataset[1]['ids']))
# print((dataset[1]['targets']))


In [None]:
# Hold out 30% of the labelled rows for validation; the fixed random_state
# makes the split reproducible.
train_data, val_data = train_test_split(df, test_size=0.3, random_state=0)
print(len(train_data))
train_dataset = CustomDataset(train_data)
val_dataset = CustomDataset(val_data)
# Batches of 8; only the training loader reshuffles its samples.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
# Unlabelled test set. No batch_size is passed, so DataLoader's default
# applies (1 per the PyTorch docs); order is preserved.
test_dataset = CustomDataset_testing(df_test)
test_dataloader = DataLoader(test_dataset, shuffle=False)
# print((test_dataset[0]))

In [None]:
# print((train_dataset[0]))
# print(str(train_data.head(1).bio))
# print(len(train_dataset[0]['ids']))
# print((train_dataset[1]['word_to_token_len_dict']))
# print(len(train_dataset[0]['token_type_ids']))
# print(len(train_dataset[0]['tokens']))
# print((train_dataset[0]['targets']))

# print(train_dataloader[0].shape)

In [None]:
# print(len(str(df.head(1).sentence).split(" ")))
# print((str(df.head(1).sentence).split(" ")))
# print(len(str(df.head(1).bio).split(" ")))
# print((str(df.head(1).bio).split(" ")))
# # print(len(val_data))

In [None]:
def loss_fn(outputs, targets):
    """Return the mean cross-entropy, treating the LAST dimension as classes.

    ``torch.nn.CrossEntropyLoss`` interprets dimension 1 of an input with more
    than two dimensions as the class axis. Handing it ``(batch, seq_len,
    num_classes)`` logits therefore normalises over the sequence positions
    instead of the classes. Both tensors are flattened to one row per token
    first so the class axis is unambiguous.

    Args:
        outputs: logits shaped ``(..., num_classes)``.
        targets: either class probabilities / one-hot rows with the same shape
            as ``outputs`` (floating point), or class indices shaped like
            ``outputs`` without its last dimension (long).

    Returns:
        A scalar tensor holding the loss averaged over all rows.
    """
    num_classes = outputs.size(-1)
    flat_outputs = outputs.reshape(-1, num_classes)
    if targets.dim() == outputs.dim():
        # Same rank as the logits: per-class probabilities / one-hot rows.
        flat_targets = targets.reshape(-1, num_classes)
    else:
        # One class index per row.
        flat_targets = targets.reshape(-1)
    return torch.nn.CrossEntropyLoss()(flat_outputs, flat_targets)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
def customise_output(output):
    """Return the index of the highest-scoring class along dimension 2.

    Args:
        output: score tensor with at least three dimensions, e.g.
            ``(batch, seq_len, num_classes)``.

    Returns:
        A long tensor with dimension 2 removed, holding the arg-max class of
        every position.

    Raises:
        ValueError: if ``output`` is not a tensor or has fewer than three
            dimensions. The previous bare ``except`` tried to format an
            undefined ``batch_size`` in its message and would otherwise have
            returned ``None`` silently.
    """
    if not torch.is_tensor(output):
        raise ValueError(
            'Dimension mismatch: expected a tensor shaped '
            f'(batch, seq_len, num_classes), got {type(output).__name__}'
        )
    if output.dim() < 3:
        raise ValueError(
            'Dimension mismatch: expected a tensor shaped '
            f'(batch, seq_len, num_classes), got shape {tuple(output.shape)}'
        )
    _, class_indices = torch.max(output, dim=2)
    return class_indices

class CustomModel(torch.nn.Module):
    """DistilBERT encoder followed by dropout and a two-way linear head."""

    def __init__(self):
        """Build the pretrained encoder (l1), dropout (l2) and linear head (l3)."""
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained(
            'distilbert-base-uncased',
            output_attentions=False,
            output_hidden_states=True,
        )
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 2)

    def forward(self, ids, mask):
        """Score every input position.

        Args:
            ids: token id tensor passed to the encoder.
            mask: attention mask passed to the encoder.

        Returns:
            ``(output, class_indices)``: the linear head's raw scores and the
            result of ``customise_output`` on them (arg-max along dim 2).
        """
        # With return_dict=False the encoder returns a tuple; only its first
        # element feeds the head, the second (requested through
        # output_hidden_states=True) is discarded.
        encoder_outputs = self.l1(ids, attention_mask=mask, return_dict=False)
        sequence_features, _unused_hidden_states = encoder_outputs
        output = self.l3(self.l2(sequence_features))
        return output, customise_output(output)

# Instantiate the tagger, move its parameters to the selected device and
# optimise all of them with AdamW at a learning rate of 1e-5.
model = CustomModel()
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-05)

In [None]:
def train(epoch, num_epochs):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        epoch: zero-based index of the current epoch (progress label only).
        num_epochs: total number of epochs (progress label only).
    """
    model.train()
    train_loop = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{num_epochs}, Training')
    for data in train_loop:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)
        outputs, _ = model(ids, mask)

        # loss_fn wraps torch.nn.CrossEntropyLoss, which reads dimension 1 of
        # a 3-D input as the class axis. Flatten to one row per token, exactly
        # as the evaluation code does before calling loss_fn, so the softmax
        # runs over the classes and not over the sequence positions.
        num_classes = outputs.size(-1)
        loss = loss_fn(
            outputs.reshape(-1, num_classes),
            targets.reshape(-1, num_classes),
        )
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def checkpoint(epoch):
    """Write the model and optimizer state to ``checkpoint_epoch_<epoch+1>.pth``.

    Args:
        epoch: zero-based epoch index; the file name and the stored ``epoch``
            entry both use ``epoch + 1``.
    """
    snapshot = {
        'epoch': epoch+1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    destination = f"checkpoint_epoch_{epoch+1}.pth"
    torch.save(snapshot, destination)

def validation(epoch, num_epochs):
    """Run the model over ``val_dataloader`` without gradient tracking.

    Args:
        epoch: zero-based index of the current epoch (progress label only).
        num_epochs: total number of epochs (progress label only).

    Returns:
        A 4-tuple ``(fin_outputs, fin_targets, fin_bio_outputs,
        fin_targets_one_hot_encoded)`` with one entry per validation sample:
            fin_outputs: raw model scores (logits) as nested lists,
            fin_targets: the ``custom_targets_tags_only`` labels as lists of
                floats,
            fin_bio_outputs: predicted class indices as CPU tensors,
            fin_targets_one_hot_encoded: the ``targets`` labels as nested
                lists of floats.
    """
    model.eval()
    val_loop = tqdm(val_dataloader, desc=f'Epoch {epoch+1}/{num_epochs}, Validation')
    fin_targets = []
    fin_outputs = []

    # bio tag outputs for the tokenised sentence
    fin_bio_outputs = []
    fin_targets_one_hot_encoded = []
    with torch.no_grad():
        for data in val_loop:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            outputs, classfication_indices = model(ids, mask)

            # Labels are only collected, never fed to the model, so they do
            # not need a round trip through the device.
            fin_targets_one_hot_encoded.extend(data['targets'].to(dtype=torch.float).tolist())
            fin_targets.extend(data['custom_targets_tags_only'].to(dtype=torch.float).tolist())

            # Move predictions to the CPU so the accumulated list does not pin
            # device memory for the whole pass.
            fin_bio_outputs.extend(classfication_indices.cpu())

            # Keep the raw logits: the cross-entropy functions that consume
            # these scores apply softmax themselves, so squashing them through
            # a sigmoid first (as was done before) distorts the reported loss.
            fin_outputs.extend(outputs.cpu().tolist())
    return fin_outputs, fin_targets, fin_bio_outputs, fin_targets_one_hot_encoded

def convert_islands(tags):
    """Turn the first 'I' of every run of 'I' tags into 'B'.

    An 'I' starts a run when it is the first element or when the element
    before it in the input is not 'I'. The input is not modified; a copy with
    the replacements applied is returned.
    """
    relabelled = tags[:]
    for position, label in enumerate(relabelled):
        if label != 'I':
            continue
        opens_run = position == 0 or tags[position - 1] != 'I'
        if opens_run:
            relabelled[position] = 'B'
    return relabelled

def testing():
    """Tag every sentence served by ``test_dataloader`` and build the submission.

    For each sample the per-token class predictions are folded back to one tag
    per word: a word is 'I' when any of its tokens was predicted as class 0 and
    'O' otherwise. ``convert_islands`` then marks the first 'I' of each run as
    'B'. Every sample of a batch is processed.

    Returns:
        A 5-tuple ``(fin_outputs, fin_targets, fin_bio_outputs, submission,
        submission_2)``:
            fin_outputs: sigmoid of the model scores, nested lists per sample,
            fin_targets: always an empty list (the test data has no labels),
            fin_bio_outputs: predicted class-index tensors, one per sample,
            submission: ``{Id: 'B I O ...'}`` space-joined tag strings,
            submission_2: ``{Id: [...]}`` tag lists before the 'B' conversion.
    """
    model.eval()
    test_loop = tqdm(test_dataloader, desc='Testing')
    fin_targets = []
    fin_outputs = []
    submission = {}
    submission_2 = {}
    # bio tag outputs for the tokenised sentence
    fin_bio_outputs = []
    with torch.no_grad():
        for data in test_loop:
            sample_ids = data['Id'].to(dtype=torch.long).tolist()
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            outputs, classfication_indices = model(ids, mask)

            predicted_classes = classfication_indices.cpu().tolist()
            word_token_counts = data['word_to_token_len_dict'].tolist()

            # Convert token-level predictions back to one tag per word.
            for sample_id, token_classes, token_counts in zip(
                    sample_ids, predicted_classes, word_token_counts):
                # Position 0 holds the leading special token added by
                # add_special_tokens=True. CustomDataset places word labels
                # from index 1, so the read-out must start there too; starting
                # at 0 shifted every word by one token.
                start_ind = 1
                final_tags = []
                for token_len in token_counts:
                    if token_len == -1:
                        # Unused slot of the fixed-length word map.
                        continue
                    word_tags = token_classes[start_ind:start_ind + token_len]
                    final_tags.append('I' if 0 in word_tags else 'O')
                    start_ind += token_len

                submission_2[sample_id] = final_tags
                submission[sample_id] = ' '.join(convert_islands(final_tags))

            fin_bio_outputs.extend(classfication_indices)
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets, fin_bio_outputs, submission, submission_2

In [None]:
# Walk the training loader once, which also runs every __getitem__, and report
# how many batches it yields. The counter was never incremented before, so
# this always printed 0.
count = 0
for _ in train_dataloader:
    count += 1
print(count)

In [None]:
# Train for a fixed number of epochs; after each one, save a checkpoint, run
# validation and print diagnostics plus the validation loss.
num_epochs = 3
for epoch in range(num_epochs):
    train(epoch, num_epochs)
    checkpoint(epoch)
    outputs, targets, bio_outputs, targets_one_hot_encoded  = validation(epoch, num_epochs)
    # Predicted class indices as plain nested lists (not used further below).
    bio_outputs_list =  [t.tolist() for t in bio_outputs]
    print("targets shape:", len(targets[0]))
    print("output shape: ",len(outputs[0]))
    print("targets_one_hot_encoded shape: ",type(targets_one_hot_encoded[0]), type(targets_one_hot_encoded))
    print("bio_output: ",type(bio_outputs[0]), type(bio_outputs))

    # change the targets and outputs to list
#     print("Final output size: ", (outputs))
#     print("Final target size: ", (targets))
#     outputs = np.array(outputs) >= 0.5

#     bio_outputs_tensor = torch.tensor(torch.stack(bio_outputs, dim=0), dtype=torch.float32, device=torch.device(device))
#     targets_tensor = torch.tensor( targets, dtype=torch.float32, device=torch.device(device))
    # Rebuild tensors from the collected lists and flatten them to one row of
    # two values per token, so that dimension 1 is the class axis.
    targets_tensor = torch.tensor( targets_one_hot_encoded, dtype=torch.float32, device=torch.device(device))
    outputs_tensor = torch.tensor( outputs, dtype=torch.float32, device=torch.device(device))
    targets_flat = targets_tensor.view(-1, 2)
    outputs_flat = outputs_tensor.view(-1, 2)
    print(outputs_tensor.shape)
    print(targets_tensor.shape)
    # NOTE(review): cross-entropy expects raw logits; `outputs` holds whatever
    # scores validation() returned -- confirm they are not squashed beforehand.
    # Per-token losses (reduction='none') against the arg-max of the one-hot rows.
    losses  = F.cross_entropy(input=outputs_flat, target=targets_flat.argmax(dim=1), reduction='none')
    print(losses)
    # loss_fn already returns a single value, so torch.mean leaves it unchanged.
    print("Validation Loss: ", (torch.mean(loss_fn(outputs_flat, targets_flat))))
#     print("Precision: ", precision_score(targets, fin_bio_outputs_list, average=None))
#     print("Recall: ", recall_score(targets.cpu(), fin_bio_outputs_list, average=None))
#     print("F1-score: ", f1_score(targets, fin_bio_outputs_list, average=None))
#     print("Micro Avg F1-score: ", f1_score(targets.cpu(), fin_bio_outputs_list, average='micro'))
#     print("Macro Avg F1-score: ", f1_score(targets.cpu(), fin_bio_outputs_list, average='macro'))

In [None]:
# submission = {'id': [], 'tagged_sentence' : []} # dictionary to store tag predictions
# # NOTE ---> ensure that tagged_sentence's corresponing 'id' is same as 'id' of corresponding 'untagged_sentence' in training data
# def store_submission(sent_id, tagged_sentence):

#     global submission
#     submission['id'].append(sent_id)
#     submission['tagged_sentence'].append(tagged_sentence)

# def clear_submission():
#     global submission
#     submission = {'id': [], 'tagged_sentence' : []}

In [None]:
# Run inference once over the test set and export the tag strings.
num_epochs = 3
# for epoch in range(num_epochs):
outputs, targets, bio_outputs, submission, submission_2  = testing()

# Write one row per test sentence: its Id and the space-joined tag string.
path_to_directory = '/kaggle/working/'
pd.DataFrame(list(submission.items()), columns=['Id', 'bio']).to_csv(path_to_directory +'label_submission.csv', index = False)

In [None]:
# Show the per-sentence tag lists as they were before the 'B' conversion.
print(submission_2)