Bert finetuning

In [None]:
## Plotting history 
import matplotlib.pyplot as plt

def plot_training_history(history):
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history['accuracy'])
    plt.plot(history['val_accuracy'])
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

    # Plotting training & validation loss values
    plt.subplot(1, 2, 2)
    plt.plot(history['train_loss'])
    plt.plot(history['val_loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

    plt.tight_layout()
    plt.show()


In [None]:
## Plotting confusion matrix

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(y_val, y_pred, title):
    y_pred_classes = y_pred
    y_true_classes = y_val

    # Create and plot confusion matrix
    cm = confusion_matrix(y_true_classes, y_pred_classes)

    plt.figure(figsize=(10,7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix:  ' + title)
    plt.show()



In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("../../data/mendeley/HateSpeechDatasetBalanced.csv")
df.columns = df.columns.str.lower()

df.head()

Unnamed: 0,content,label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1


In [15]:
from transformers import BertTokenizer

# Create a tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize and trim content
def preprocess_text(text):
    tokens = tokenizer.tokenize(text)
    tokens = tokens[:128]  # 
    return tokens

df["tokens"] = df["content"].apply(preprocess_text)
df.head()

Unnamed: 0,content,label,tokens,input_features
0,denial of normal the con be asked to comment o...,1,"[denial, of, normal, the, con, be, asked, to, ...","{'input_ids': [tensor(14920), tensor(1997), te..."
1,just by being able to tweet this insufferable ...,1,"[just, by, being, able, to, t, ##wee, ##t, thi...","{'input_ids': [tensor(2074), tensor(2011), ten..."
2,that is retarded you too cute to be single tha...,1,"[that, is, re, ##tar, ##ded, you, too, cute, t...","{'input_ids': [tensor(2008), tensor(2003), ten..."
3,thought of a real badass mongol style declarat...,1,"[thought, of, a, real, bad, ##ass, mongol, sty...","{'input_ids': [tensor(2245), tensor(1997), ten..."
4,afro american basho,1,"[afro, american, bash, ##o]","{'input_ids': [tensor(17694), tensor(2137), te..."


In [16]:
import torch

def create_input_features(tokens):
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(input_ids)  #Single sentence, all zeros
    padding_length = 128 - len(input_ids)
    
    # Pad sequences
    input_ids += [0] * padding_length
    input_mask += [0] * padding_length
    segment_ids += [0] * padding_length

    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(input_mask),
        "token_type_ids": torch.tensor(segment_ids)
    }

df["input_features"] = df["tokens"].apply(create_input_features)

# Now df["input_features"] contains the BERT input features for each row
df.head()

Unnamed: 0,content,label,tokens,input_features
0,denial of normal the con be asked to comment o...,1,"[denial, of, normal, the, con, be, asked, to, ...","{'input_ids': [tensor(14920), tensor(1997), te..."
1,just by being able to tweet this insufferable ...,1,"[just, by, being, able, to, t, ##wee, ##t, thi...","{'input_ids': [tensor(2074), tensor(2011), ten..."
2,that is retarded you too cute to be single tha...,1,"[that, is, re, ##tar, ##ded, you, too, cute, t...","{'input_ids': [tensor(2008), tensor(2003), ten..."
3,thought of a real badass mongol style declarat...,1,"[thought, of, a, real, bad, ##ass, mongol, sty...","{'input_ids': [tensor(2245), tensor(1997), ten..."
4,afro american basho,1,"[afro, american, bash, ##o]","{'input_ids': [tensor(17694), tensor(2137), te..."


In [17]:
# Extract labels
from torch.utils.data import TensorDataset

labels = torch.tensor(df["label"].values)

# Create TensorDataset
all_input_ids = torch.stack(df["input_features"].apply(lambda x: x['input_ids']).values.tolist())
all_attention_mask = torch.stack(df["input_features"].apply(lambda x: x['attention_mask']).values.tolist())
all_token_type_ids = torch.stack(df["input_features"].apply(lambda x: x['token_type_ids']).values.tolist())

dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, labels)


In [18]:
from torch.utils.data import DataLoader, random_split

# Split data into train and validation sets
train_size = int(0.9 * len(df))
val_size = len(df) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoader for training and validation
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [19]:
import pickle

# Save train_dataset
with open("../../preprocessed/bert_mendeley_train_dataset.pkl", "wb") as train_file:
    pickle.dump(train_dataset, train_file)

# Save val_dataset
with open("../../preprocessed/bert_mendeley_val_dataset.pkl", "wb") as val_file:
    pickle.dump(val_dataset, val_file)
    
print("Datasets saved successfully.")



Datasets saved successfully.


## Finetuning BERT model

In [2]:
import pickle
from torch.utils.data import DataLoader

# Load datasets
with open("../../preprocessed/bert_mendeley_train_dataset.pkl", "rb") as train_file:
    train_dataset = pickle.load(train_file)

with open("../../preprocessed/bert_mendeley_val_dataset.pkl", "rb") as val_file:
    val_dataset = pickle.load(val_file)

# Create DataLoader for training and validation
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

print("Datasets loaded successfully.")

Datasets loaded successfully.


In [3]:
## Prepare bert model
import torch
from transformers import BertForSequenceClassification, AdamW

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
## Train and validate the model
from tqdm import tqdm
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Initialize lists to store training history
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# Training loop
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    correct_train_preds = 0
    total_train_samples = 0
    
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        token_type_ids = batch[2].to(device)
        labels = batch[3].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_train_loss += loss.item()
        
        preds = torch.argmax(logits, dim=1).flatten()
        correct_train_preds += (preds == labels).sum().item()
        total_train_samples += labels.size(0)

        loss.backward()
        optimizer.step()
        scheduler.step()
    
    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = correct_train_preds / total_train_samples
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)

    # Validation
    model.eval()
    total_val_loss = 0
    correct_val_preds = 0
    total_val_samples = 0
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{epochs}"):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            labels = batch[3].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            total_val_loss += loss.item()

            preds = torch.argmax(logits, dim=1).flatten()
            correct_val_preds += (preds == labels).sum().item()
            total_val_samples += labels.size(0)

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = correct_val_preds / total_val_samples
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Save the model
model.save_pretrained('../../models/bert_mendeley_model/model')
tokenizer.save_pretrained('../../models/bert_mendeley_model/tokenizer')

# Save training history
history = {
    'loss': train_losses,
    'val_loss': val_losses,
    'accuracy': train_accuracies,
    'val_accuracy': val_accuracies
}
with open("../../histories/bert_mendeley_finetuned_history.pkl", "wb") as f:
    pickle.dump(history, f)

print("Training completed and model saved successfully.")


In [None]:
## plot training history
import matplotlib.pyplot as plt
import pickle

# Load training history
with open("training_history.pkl", "rb") as f:
    history = pickle.load(f)

# Plot training and validation loss
plt.figure(figsize=(12, 6))
plt.plot(history['loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Plot training and validation accuracy
plt.figure(figsize=(12, 6))
plt.plot(history['accuracy'], label='Train Accuracy')
plt.plot(history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Load pre-trained BERT model
model_path = "../../models/bert_mendeley_model/model" 
model = BertForSequenceClassification.from_pretrained(model_path)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load validation data (assuming you have val_dataset)
val_input_ids, val_attention_mask, val_labels = val_dataset.values()

# Predictions
with torch.no_grad():
    logits = model(val_input_ids, attention_mask=val_attention_mask).logits
    predicted_labels = torch.argmax(logits, dim=1)

# Create confusion matrix
conf_matrix = confusion_matrix(val_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(val_labels, predicted_labels)
print("\nClassification Report:")
print(class_report)

In [None]:
plot_training_history(history)  # Replace with your actual function

# Plot confusion matrix (use your existing function)
plot_confusion_matrix(val_labels, predicted_labels, title="Bert finetuned")  # Replace with your actual 

### Gemini solution

In [15]:
import pickle
import transformers
from datasets import load_dataset, load_metric, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader


In [16]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:

# Load datasets
with open("../../preprocessed/bert_mendeley_train_dataset.pkl", "rb") as train_file:
    train_dataset = pickle.load(train_file)

with open("../../preprocessed/bert_mendeley_val_dataset.pkl", "rb") as val_file:
    val_dataset = pickle.load(val_file)

# Create DataLoader for training and validation
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

print("Datasets loaded successfully.")

Datasets loaded successfully.


In [26]:
train_dataset[:10]

(tensor([[ 1045,  2572,  1999,  ...,     0,     0,     0],
         [22555, 11924, 18059,  ...,     0,     0,     0],
         [ 1045,  2001,  2025,  ...,     0,     0,     0],
         ...,
         [ 8011,  2080,  2017,  ...,     0,     0,     0],
         [ 2515,  2025,  2655,  ...,     0,     0,     0],
         [ 1051,  5737,  3844,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([0, 0, 0, 0, 1, 0, 1, 1, 1, 1]))

In [18]:

training_args = TrainingArguments(
    output_dir="../../models/bert_mendeley_hatespeech_finetuned",
    logging_dir="../../models/bert_mendeley_hatespeech_finetuned/logs",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
)

In [28]:
from datasets import Dataset

# Convert tensors in tuples to lists
input_features = [t[0].tolist() for t in train_dataset]
attention_masks = [t[1].tolist() for t in train_dataset]

# Create a dictionary with list values
data_dict = {'input_features': input_features, 'attention_masks': attention_masks}

# Convert to Hugging Face Dataset
train_data = Dataset.from_dict(data_dict)


In [29]:
# Convert tensors in tuples to lists for validation data
val_input_features = [t[0].tolist() for t in val_dataset]
val_attention_masks = [t[1].tolist() for t in val_dataset]

# Create a dictionary with list values
val_data_dict = {'input_features': val_input_features, 'attention_masks': val_attention_masks}

# Convert to Hugging Face Dataset
val_data = Dataset.from_dict(val_data_dict)


In [30]:

# Define the evaluation metric
metric = load_metric("glue", "mrpc")

# Define the evaluation function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


In [None]:

# prompt: save the history of bert as training accuracy, train loss, val accuracy, val loss, plot the accuracy and loss separately, then computer confusion matrix and plot that too

# Save the history of bert as training accuracy, train loss, val accuracy, val loss
history = trainer.state.history

# Plot the accuracy and loss separately
import matplotlib.pyplot as plt

plt.plot(history["train_accuracy"], label="train_accuracy")
plt.plot(history["val_accuracy"], label="val_accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

plt.plot(history["train_loss"], label="train_loss")
plt.plot(history["val_loss"], label="val_loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Compute confusion matrix and plot that too
from sklearn.metrics import confusion_matrix

y_pred = trainer.predict(val_data).predictions.argmax(-1)
y_true = val_data["labels"]

cm = confusion_matrix(y_true, y_pred)

plt.imshow(cm, cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()