### Import Libraries

In [10]:
import torch
import pandas as pd
import numpy as np
import transformers
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel

### Setting up Config Variables

In [2]:
# Hyper-parameters shared by the cells below.
MAX_LEN = 320                                # token budget per comment passed to the tokenizer
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 32     # same batch size for training and validation loaders
EPOCHS = 2
LEARNING_RATE = 1e-5

# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'
print(DEVICE)

cpu


## Load dataframe

In [3]:
# The six binary target columns of the multi-label task.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Read the cleaned comments and discard every row containing a missing value.
train_data = pd.read_csv('cleaned_data.csv').dropna()

# Disabled alternative kept for reference: collapse the targets into one list column
# and drop the auxiliary columns.
#train_data['labels'] = train_data[label_columns].values.tolist()
#train_data = train_data.drop(label_columns+['id','comment_text','cleaned_text_ps','cleaned_text_sb','comment_length'],axis=1)

In [4]:
#train_data.to_csv('train_data.csv',index=False)

In [5]:
# Fraction of rows used for training; the remainder becomes the validation set.
train_size = 0.85

# Draw the training rows with a fixed seed, then take everything that was not
# drawn as validation. Both frames get a fresh 0..n-1 index afterwards.
train_df = train_data.sample(frac=train_size, random_state=42)
val_df = train_data.drop(index=train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

print(
    "Orig Dataset: {}".format(train_data.shape),
    "Training Dataset: {}".format(train_df.shape),
    "Validation Dataset: {}".format(val_df.shape),
    sep="\n",
)

Orig Dataset: (159525, 12)
Training Dataset: (135596, 12)
Validation Dataset: (23929, 12)


### Creating a custom PyTorch dataset

In [6]:
# Preprocess data
class ToxicCommentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer here).
        texts: indexable collection of comments; each entry is passed through ``str``.
        labels: indexable collection of label vectors, aligned with ``texts``.
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, tokenizer, texts, labels, max_len):
        self.tokenizer, self.texts = tokenizer, texts
        self.labels, self.max_len = labels, max_len

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the encoded comment at ``index`` together with its labels.

        Returns:
            dict with long tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids``, plus a float32 ``labels`` tensor.
        """
        encoding = self.tokenizer.encode_plus(
            str(self.texts[index]),
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        # Integer model inputs share one dtype; labels are floats for the BCE loss.
        sample = {
            field: torch.tensor(encoding[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['labels'] = torch.tensor(self.labels[index], dtype=torch.float32)
        return sample

### Creating dataloaders from the custom datasets

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

# DataLoader settings: only the training loader shuffles.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 4
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 4
                }

# Wrap the training frame in a Dataset. A dedicated name is used so the
# `train_data` DataFrame from the loading cell is not overwritten, which keeps
# the earlier split cell re-runnable.
full_train_dataset = ToxicCommentDataset(tokenizer, train_df['comment_text'], train_df[label_columns].values, MAX_LEN)

# Carve an 80/20 train/test partition out of the training frame. The generator
# is seeded so the partition is identical on every run.
n_train = int(len(full_train_dataset) * 0.8)
n_test = len(full_train_dataset) - n_train
train_dataset, test_dataset = torch.utils.data.random_split(
    full_train_dataset,
    [n_train, n_test],
    generator=torch.Generator().manual_seed(42),
)
train_dataloader = DataLoader(train_dataset, **train_params)

# Create a DataLoader for the validation set
val_data = ToxicCommentDataset(tokenizer, val_df['comment_text'], val_df[label_columns].values, MAX_LEN)
val_dataloader = DataLoader(val_data, **val_params)

# Create a DataLoader for the held-out test partition (no shuffling)
test_dataloader = DataLoader(test_dataset, **val_params)

###  Model Design

In [8]:
# NOTE: the full-size BERT variant below is wrapped in a string literal, so none
# of it executes; the cell only evaluates (and echoes) the string itself.
"""# Define BERT model architecture
class BERTModel(nn.Module):
    def __init__(self):
        super(BERTModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 6)
        )
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        out = hidden_state[:, 0]
        out = self.classifier(out)
        return out
    
model = BERTModel()
model.to(DEVICE);"""

"# Define BERT model architecture\nclass BERTModel(nn.Module):\n    def __init__(self):\n        super(BERTModel, self).__init__()\n        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')\n        self.classifier = torch.nn.Sequential(\n            torch.nn.Linear(768, 768),\n            torch.nn.ReLU(),\n            torch.nn.Dropout(0.1),\n            torch.nn.Linear(768, 6)\n        )\n        \n    def forward(self, input_ids, attention_mask, token_type_ids):\n        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n        hidden_state = output_1[0]\n        out = hidden_state[:, 0]\n        out = self.classifier(out)\n        return out\n    \nmodel = BERTModel()\nmodel.to(DEVICE);"

In [9]:
#Smaller version of the Bert Transformer model.
class DistilBERTClass(torch.nn.Module):
    """Pretrained DistilBERT encoder topped with a two-layer classification head.

    The head maps a 768-wide vector to 6 raw logits (no sigmoid is applied here).
    """

    def __init__(self):
        super().__init__()

        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        head_layers = [
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 6),
        ]
        self.classifier = torch.nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the head's logits for a batch.

        ``token_type_ids`` is accepted but not forwarded to the encoder.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0
        # (presumably the [CLS] slot, since special tokens are added upstream).
        first_position = encoder_out[0][:, 0]
        return self.classifier(first_position)

# Build the classifier and place it on the configured device in one step
# (Module.to returns the module itself).
model = DistilBERTClass().to(DEVICE)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Training, prediction, and evaluation of the model

In [12]:
# Loss for multi-label classification: an independent sigmoid + BCE per label.
loss_fn = torch.nn.BCEWithLogitsLoss()

def train(epochs=EPOCHS, lr=LEARNING_RATE, device=DEVICE, accumulation_steps=4):
    """Fine-tune the module-level ``model`` on ``train_dataloader``.

    Gradients are accumulated over ``accumulation_steps`` consecutive batches
    before each optimizer update.

    Args:
        epochs: number of passes over ``train_dataloader``.
        lr: Adam learning rate.
        device: device the batch tensors are moved to (the model must already
            live there).
        accumulation_steps: number of batches whose gradients are summed
            before one optimizer step; must be >= 1.

    Raises:
        ValueError: if ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    num_batches = len(train_dataloader)

    # Train the model
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        # Start every epoch with clean gradients; they are cleared again only
        # after an optimizer step so that accumulation actually happens.
        optimizer.zero_grad()
        for i, batch in enumerate(train_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            # Compute the loss; log the unscaled value exactly once per batch
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

            # Backward pass on the scaled loss so the summed gradient of one
            # accumulation window matches the mean over its batches.
            (loss / accumulation_steps).backward()

            # Step at the end of each window, and also on the final batch so a
            # trailing partial window is not silently discarded.
            window_full = (i + 1) % accumulation_steps == 0
            last_batch = (i + 1) == num_batches
            if window_full or last_batch:
                optimizer.step()
                optimizer.zero_grad()

            if (i + 1) % 100 == 0:
                print(f'Epoch {epoch + 1}/{epochs}, Batch {i + 1} / {len(train_dataloader)}, Loss: {total_loss / (i + 1)}')

        print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {total_loss / len(train_dataloader)}')
        

def evaluate(model, dataloader, device, threshold=None, criterion=None):
    """Run ``model`` over ``dataloader`` and collect loss, targets and predictions.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments, returning logits.
        dataloader: iterable of batches (dicts holding those three inputs plus
            ``labels``).
        device: device the batch tensors are moved to.
        threshold: ``None`` (default) keeps the legacy behaviour of returning
            the arg-max column index per sample. A float switches to
            multi-label output: a 0/1 matrix with ``sigmoid(logit) > threshold``.
        criterion: loss callable ``(logits, labels) -> scalar tensor``; falls
            back to the module-level ``loss_fn`` when omitted.

    Returns:
        ``(avg_loss, true_labels, pred_labels)`` where ``avg_loss`` is the mean
        of the per-batch losses, ``true_labels`` is a list with one NumPy label
        row per sample, and ``pred_labels`` is a NumPy array as described under
        ``threshold``.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    if criterion is None:
        criterion = loss_fn

    model.eval()
    predictions, true_labels = [], []
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for data in dataloader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

            # Accumulate the loss of every batch, not just the last one.
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

            # Tensors must be on the CPU before NumPy conversion; calling
            # .numpy() on a CUDA tensor raises.
            predictions.extend(outputs.detach().cpu().numpy())
            true_labels.extend(labels.detach().cpu().numpy())

    if num_batches == 0:
        raise ValueError("evaluate() received an empty dataloader")

    logits = np.asarray(predictions)
    if threshold is None:
        # Legacy single-index output, kept as the default for compatibility.
        pred_labels = np.argmax(logits, axis=1)
    else:
        # Multi-label decision, one independent sigmoid per column.
        probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
        pred_labels = (probs > threshold).astype(np.int64)

    avg_loss = total_loss / num_batches

    return avg_loss, true_labels, pred_labels

def predict(model, dataloader, device):
    """Return hard 0/1 predictions for every sample yielded by ``dataloader``.

    Each batch's logits are passed through a sigmoid and rounded; the per-batch
    results are stacked along the first axis and returned as one CPU tensor.
    """
    model.eval()
    per_batch = []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device),
            )
            hard = torch.sigmoid(logits).round()
            per_batch.append(hard.detach().cpu().numpy())

    stacked = np.concatenate(per_batch, axis=0)
    return torch.from_numpy(stacked)

def predict_text(text, model, tokenizer, device=DEVICE, max_len=128):
    """Score a single comment and return one probability per toxicity label.

    Args:
        text: the raw comment to classify.
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments, returning six logits per row.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer here).
        device: device the input tensors are moved to.
        max_len: length the encoded comment is padded / truncated to. Defaults
            to 128, the value this function has always used.

    Returns:
        dict mapping each of the six label names to its sigmoid probability,
        rounded to four decimals.
    """
    encoded_text = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True
    )

    # Add a batch dimension of 1 to each input. The attention mask comes
    # straight from the tokenizer (integer dtype, as in the dataset class)
    # rather than being rebuilt from `input_ids != 0`, which assumed that the
    # padding id is 0.
    input_ids = torch.tensor(encoded_text['input_ids'], dtype=torch.long).unsqueeze(0).to(device)
    attention_mask = torch.tensor(encoded_text['attention_mask'], dtype=torch.long).unsqueeze(0).to(device)
    token_type_ids = torch.tensor(encoded_text['token_type_ids'], dtype=torch.long).unsqueeze(0).to(device)

    # Make the prediction
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    # Convert the logits to per-label probabilities (no thresholding is applied)
    probs = torch.sigmoid(outputs).detach().cpu().numpy()[0]

    # Pair each probability with its label name
    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    results = {label: round(prob.item(), 4) for label, prob in zip(labels, probs)}

    return results


In [13]:
# Training and saving model state.
# NOTE: the statements below sit inside a string literal, so they do not run;
# remove the surrounding triple quotes to retrain and write the checkpoint.
"""
train(EPOCHS)
PATH = "toxic_comment_1.pkl"
torch.save(model.state_dict(), PATH)"""

'#Training and saving model state\ntrain(EPOCHS)\nPATH = "toxic_comment_1.pkl"\ntorch.save(model.state_dict(), PATH)'

### Loading Saved model

In [14]:
# Rebuild the architecture and restore the weights stored in the checkpoint file.
# The checkpoint is always deserialised onto the CPU (this works whether or not a
# GPU is present); the final .to(DEVICE) then moves the model to the active device.
model = DistilBERTClass()
model.load_state_dict(
    torch.load("toxic_comment_1.pkl", map_location=torch.device('cpu'))
)
model = model.to(DEVICE)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Score one sample comment; the returned dict is shown as the cell output.
text = "nobody it sucks"
predict_text(text, model, tokenizer, device=DEVICE)

{'toxic': 0.6366,
 'severe_toxic': 0.0076,
 'obscene': 0.1221,
 'threat': 0.0095,
 'insult': 0.1998,
 'identity_hate': 0.0225}