# Configure the Library

In [None]:
import numpy as np
import pandas as pd
import os
import random
import time

import re
import string
import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks", context="talk")
plt.style.use('dark_background')

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as func
from torch.utils.data import dataloader, dataset

import transformers
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score, mean_squared_error, roc_auc_score, roc_curve, auc
from sklearn.model_selection import KFold 

import warnings
warnings.simplefilter('ignore')

# Load the dataset

In [None]:
# Read only the first 200 rows of each zipped CSV so the notebook stays quick to run.
train, test, test_label = (
    pd.read_csv(csv_path, nrows=200)
    for csv_path in (
        './dataset/train.csv.zip',
        './dataset/test.csv.zip',
        './dataset/test_labels.csv.zip',
    )
)
# Preview the training frame (last expression of the cell, so it is displayed).
train.head()
#test.head()
#test_label.head()

# Clean the dataset -- helper function for removing nonsense fragments

In [None]:
# Compiled once at definition time: clean_text is applied to every comment,
# and raw-string literals avoid the invalid-escape warnings that '\[' / '\S' /
# '\w' trigger in plain string literals.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Strip markup, links, punctuation and digit-bearing tokens from *text*.

    The steps run in this order, each working on the output of the previous:

    1. remove ``[...]`` spans (shortest match, within a single line);
    2. remove ``http://`` / ``https://`` URLs and ``www.`` addresses up to the
       next whitespace;
    3. remove ``<...>`` tags;
    4. remove every character in ``string.punctuation``;
    5. remove newline characters (nothing is inserted in their place, so the
       words on either side of a line break are joined);
    6. remove every word that contains at least one digit.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Whitespace left behind by removed fragments is
        not collapsed.
    """
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    # One C-level pass that deletes exactly the ASCII punctuation characters.
    text = text.translate(_PUNCTUATION_TABLE)
    text = text.replace('\n', '')
    text = _WORD_WITH_DIGIT_RE.sub('', text)
    return text


# Build the 'clean' column of each frame: cast every comment to str first,
# then scrub it with clean_text.
train['clean'] = train['comment_text'].apply(str).apply(clean_text)
test['clean'] = test['comment_text'].apply(str).apply(clean_text)

# DataLoader

In [None]:
class BertDataSet(dataset.Dataset):
    """Map-style dataset that tokenizes one text per item for a BERT classifier.

    Args:
        texts: pandas object holding the input strings; read with ``.iloc``.
        labels: pandas object holding the targets of each text, read with
            ``.iloc``. Ignored (may be ``None``) when ``if_train`` is False.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: length every sequence is padded / truncated to.
        if_train: when True each item also carries a ``'labels'`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len, if_train=True):
        self.texts = texts
        self.if_train = if_train
        # Always define the attribute so the instance is uniform in both modes.
        self.labels = labels if if_train else None
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tensors for the text at position *item*.

        Returns:
            dict with ``'input_ids'`` and ``'mask'`` (both ``torch.long``) and,
            when ``if_train`` is True, ``'labels'`` (``torch.float``).
        """
        text = str(self.texts.iloc[item])

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            truncation=True
        )

        sample = {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

        if self.if_train:
            # Convert the pandas row to a plain array first: handing a Series
            # to torch.tensor makes torch probe it through integer indexing,
            # which is unreliable for a row indexed by column names.
            label_values = np.asarray(self.labels.iloc[item], dtype=np.float32)
            sample['labels'] = torch.tensor(label_values, dtype=torch.float)

        return sample
     

# Test

In [None]:
#senten_len = []
#for sentence in tqdm(train['clean']):
#    token_words = tokenizer.encode_plus(sentence, padding=True, max_length = 768)['input_ids'] # Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors
#    senten_len.append(len(token_words))


In [None]:
# Quick sanity check of the cleaned column: show its container type and
# keep its first entry around for manual inspection.
print(type(train['clean']))
token = next(iter(train['clean']))

# Model definition

In [None]:
# Mini-batch sizes for the training and validation loaders.
train_batch = valid_batch = 32

# Number of passes over the training split of each fold.
epochs = 5

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
#device = 'cpu'
print(f"The devices using is : {device}")

# K-fold splitter: 5 shuffled folds with a fixed seed.
kf = KFold(shuffle=True, random_state=42, n_splits=5)
# fold_index = 0

# Loss histories recorded for the best model found so far.
fold_losses, fold_valid = [], []

# Book-keeping for the best model found so far.
best_model = best_loss = None
best_index = 0

# BCEWithLogitsLoss applies the sigmoid itself, one independent term per label.
loss_fn = nn.BCEWithLogitsLoss().to(device)

# Cased BERT tokenizer plus a sequence classifier with six output logits.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
model = transformers.BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=6,
).to(device)

# train

In [None]:
# Target columns; their order fixes which of the six logits belongs to which label.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
max_len = 77  # padding / truncation length handed to BertDataSet

# Mixed precision and pinned host memory are only useful when running on CUDA.
on_cuda = torch.device(device).type == 'cuda'

# CPU snapshot of the starting weights. Restoring it at the top of every fold
# keeps the folds independent; without it, later folds would be validated on
# rows the model had already been trained on in earlier folds.
initial_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
# CPU snapshot of the weights with the lowest validation loss seen so far.
# (Assigning `best_model = model` would only alias the live, still-training model.)
best_state = None

for train_index, valid_index in kf.split(train):
    train_kf = train.iloc[train_index]
    valid_kf = train.iloc[valid_index]

    train_dataset = BertDataSet(train_kf['clean'], train_kf[label_cols], tokenizer, max_len)
    valid_dataset = BertDataSet(valid_kf['clean'], valid_kf[label_cols], tokenizer, max_len)

    train_dataloader = dataloader.DataLoader(train_dataset, batch_size=train_batch, pin_memory=on_cuda, num_workers=0, shuffle=True)
    valid_dataloader = dataloader.DataLoader(valid_dataset, batch_size=valid_batch, pin_memory=on_cuda, num_workers=0, shuffle=False)

    # Fresh weights, optimizer state and loss scaler for this fold.
    model.load_state_dict(initial_state)
    optimizer = AdamW(model.parameters(), lr=1e-5)
    scaler = torch.cuda.amp.GradScaler(enabled=on_cuda)

    training_losses = []
    valid_losses = []

    for epoch in range(epochs):

        # ---- training pass ----
        model.train()
        total_loss = 0
        for batch in tqdm(train_dataloader):
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['mask'].to(device, non_blocking=True)
            toxic_labels = batch['labels'].to(device, non_blocking=True)

            # Gradients must be cleared for every batch, otherwise each step
            # is taken on the sum of all gradients since the epoch started.
            optimizer.zero_grad()

            # Only the forward pass and the loss run under autocast; the
            # backward pass goes through the GradScaler outside of it.
            with torch.cuda.amp.autocast(enabled=on_cuda):
                # No `labels=` here: the loss is computed by loss_fn, so the
                # model's own internal loss would be wasted work.
                logits = model(input_ids, attention_mask=attention_mask)['logits']
                loss = loss_fn(logits, toxic_labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        training_losses.append(avg_loss)
        print(f"Training Epoch {epoch+1}/{epochs}, Loss: {avg_loss}. {len(training_losses)}")

        # ---- validation pass ----
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for batch in valid_dataloader:
                input_ids = batch['input_ids'].to(device, non_blocking=True)
                attention_mask = batch['mask'].to(device, non_blocking=True)
                toxic_labels = batch['labels'].to(device, non_blocking=True)

                with torch.cuda.amp.autocast(enabled=on_cuda):
                    logits = model(input_ids, attention_mask=attention_mask)['logits']
                    loss = loss_fn(logits, toxic_labels)
                valid_loss += loss.item()

        avg_valid_loss = valid_loss / len(valid_dataloader)
        valid_losses.append(avg_valid_loss)
        #f1 = f1_score(true_labels, predictions, average='weighted')  #Error : Classification metrics can't handle a mix of multilabel-indicator and binary targets
        print(f"Validating Epoch {epoch+1}/{epochs}, Loss: {avg_valid_loss}")

        # Lower validation loss is better.
        if best_loss is None or avg_valid_loss < best_loss:
            best_loss = avg_valid_loss
            best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
            # These are references to this fold's history lists, so they end up
            # holding the complete curves of the fold that produced the best epoch.
            fold_losses = training_losses
            fold_valid = valid_losses
            print("Find the better model")


# Put the best weights back into the model before exporting it.
if best_state is not None:
    model.load_state_dict(best_state)
best_model = model
best_model.save_pretrained('./fintune_bert')
    

In [None]:
# draw the losses of the best model
# Both curves come from the histories recorded for the best model
# (fold_losses / fold_valid); valid_losses would be the last fold's history.
best_train_losses = fold_losses
best_valid_losses = fold_valid

plt.plot(best_train_losses, color='navy', label='train')
plt.plot(best_valid_losses, color='blue', label='validation')
plt.xlabel('Epoch')
plt.ylabel('Average Loss')
plt.title('Loss Plot')
# The two lines use similar colours, so label them explicitly.
plt.legend()
plt.show()


In [None]:
# Number of entries recorded in the training-loss history of the best model.
print(len(best_train_losses))

# Evaluation

In [25]:
# Drop the references that keep the trained network alive so that
# torch.cuda.empty_cache() can actually hand cached GPU memory back.
# The tokenizer is deliberately NOT cleared: the evaluation cell still passes
# it to BertDataSet.
best_model = None
model = None
# The optimizer was built from model.parameters() and would otherwise keep
# those tensors referenced.
optimizer = None
torch.cuda.empty_cache()

In [None]:
from transformers import BertConfig, BertForSequenceClassification, BertModel

# The tokenizer may have been cleared by the cleanup cell; reload the same
# vocabulary that was used for training.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')

#loading test_dataset
test_dataset = BertDataSet(test['clean'], None, tokenizer, 77, if_train=False)
# shuffle=False keeps the predictions in the same row order as `test`, so they
# can be compared row by row with the labels in `test_label`.
test_dataloader = dataloader.DataLoader(test_dataset, batch_size=valid_batch, pin_memory=True, num_workers=0, shuffle=False)

model_dir = './fintune_bert'
config_path = './fintune_bert/config.json'
model_path = './fintune_bert/model.safetensors'  # weights file inside model_dir; not loaded directly

config = BertConfig.from_json_file(config_path)

# Load the fine-tuned *classifier*. The bare BertModel has no classification
# head, so its last_hidden_state cannot be read as per-label scores.
best_model = BertForSequenceClassification.from_pretrained(model_dir, config=config).to(device)
best_model.eval()  # disable dropout for inference

batch_predictions = []
with torch.no_grad():
    # `batch`, not `test`: the loop variable must not overwrite the test DataFrame.
    for batch in test_dataloader:

        ids = batch['input_ids'].to(device)
        mask = batch['mask'].to(device)
        outputs = best_model(ids, attention_mask=mask)

        results = outputs['logits']
        raw_predictions = torch.sigmoid(results)

        # 1 = label predicted present, 0 = absent (same encoding as the training targets).
        prediction = (raw_predictions > 0.5).long()
        print(prediction)
        batch_predictions.append(prediction.cpu())

# All test predictions stacked in dataset order, one row per test comment.
predictions = torch.cat(batch_predictions)

In [None]:
# Reference labels for the test rows as a tensor, one column per target,
# in the same column order used for the training targets.
ground_truth = torch.tensor(
    test_label.loc[
        :,
        ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    ].to_numpy()
)
print(ground_truth)

#