In [None]:
#BERT IMPLEMENTATION with Data Augmentation
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
import torch.optim as optim
import nltk
from nltk.corpus import wordnet
import random

# Download the NLTK resources named 'wordnet' and 'omw-1.4' at import time;
# `wordnet` (imported above) is queried for synsets by Config.synonym_replacement.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Config Class
class Config:
    """
    Configuration for model training and data augmentation.

    Holds file paths, hyperparameters and column names as class attributes, plus
    two static helpers used to build the (optionally augmented) training file.

    Synonym replacement based on the WordNet lexicon in NLTK is inspired by the
    Easy Data Augmentation (EDA) techniques described by Wei and Zou (2019) in
    "EDA: Easy Data Augmentation Techniques for Boosting Performance on Text
    Classification Tasks".
    """

    train_tsv = "train.tsv"
    val_tsv = "validation_preprocessed.tsv"
    test_tsv = "test_preprocessed.tsv"
    batch_size = 16
    max_len = 128  # to capture context
    num_epochs = 10
    learning_rate = 0.00001  # learning rate for smoother optimization
    weight_decay = 0.0005  # weight decay for better regularization
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    label_col = "2_way_label"
    text_col = "clean_title"
    patience = 3  # Patience for early stopping
    augment_prob = 0.2  # Probability for synonym replacement
    augment_ratio = 0.3  # Fraction of data to augment

    @staticmethod
    def synonym_replacement(sentence, prob=0.2):
        """
        Replace words in the sentence with a WordNet synonym with a given probability.

        Each whitespace-separated word is independently selected with probability
        `prob`. A selected word is replaced by the first lemma of its first synset
        when one exists and differs from the word; otherwise it is kept.

        Args:
            sentence (str): Input sentence.
            prob (float): Probability of replacing a word with its synonym.
        Returns:
            str: Sentence with selected words replaced, joined by single spaces.
        """
        augmented_sentence = []

        for word in sentence.split():
            if random.random() < prob:
                synonyms = wordnet.synsets(word)
                if synonyms:
                    # WordNet joins multi-word lemmas with underscores
                    # (e.g. "hot_dog"); turn them back into plain words.
                    synonym = synonyms[0].lemmas()[0].name().replace("_", " ")
                    if synonym != word:
                        augmented_sentence.append(synonym)
                        continue
            augmented_sentence.append(word)
        return " ".join(augmented_sentence)

    @staticmethod
    def preprocess_tsv(input_tsv, output_tsv, augment=False):
        """
        Load a TSV file, keep only the text and label columns, and write it back.

        Rows missing the text or the label are dropped. When `augment` is true,
        a random `Config.augment_ratio` fraction of the remaining rows has its
        text rewritten in place by `Config.synonym_replacement`.

        Args:
            input_tsv (str): Input TSV file path.
            output_tsv (str): Output TSV file path.
            augment (bool): Whether to perform data augmentation (default False).
        Returns:
            None
        """
        df = pd.read_csv(input_tsv, sep='\t')
        df = df.dropna(subset=[Config.text_col, Config.label_col])
        # dropna leaves gaps in the index labels; renumber so that the positions
        # sampled below are valid labels for `df.at`. The index is not written
        # to the output file, so this does not change the result on disk.
        df = df[[Config.text_col, Config.label_col]].reset_index(drop=True)

        if augment:
            augment_size = int(Config.augment_ratio * len(df))
            for idx in random.sample(range(len(df)), augment_size):
                # str(): a title that parsed as a number still has to be split.
                original_text = str(df.at[idx, Config.text_col])
                df.at[idx, Config.text_col] = Config.synonym_replacement(
                    original_text, Config.augment_prob
                )

        df.to_csv(output_tsv, sep='\t', index=False)


# Build the training file at import time: read "train_preprocessed.tsv", rewrite a
# random subset of titles via synonym replacement, and save it as "train.tsv"
# (the path Config.train_tsv points to).
Config.preprocess_tsv("train_preprocessed.tsv", "train.tsv", augment=True)

# Dataset Class
class TextOnlyDataset(Dataset):
    """Dataset that serves tokenized text and its label, one TSV row per item."""

    def __init__(self, tsv_file, tokenizer, max_len):
        """
        Read `tsv_file` and keep the rows that have both a text and a label.

        Parameters:
        - tsv_file: Path of the tab-separated file to load.
        - tokenizer: Callable used to encode each text in __getitem__.
        - max_len: Length every encoded sequence is padded/truncated to.
        """
        frame = pd.read_csv(tsv_file, sep='\t')
        required_columns = [Config.text_col, Config.label_col]
        self.df = frame.dropna(subset=required_columns)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows kept after dropping incomplete ones."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """
        Encode the row at position `idx`.

        Returns a dict with 'input_ids' and 'attention_mask' (the tokenizer
        output with its leading batch dimension squeezed away) and 'label'
        (a long tensor built from the label column).
        """
        record = self.df.iloc[idx]
        raw_text = str(record[Config.text_col])
        raw_label = record[Config.label_col]

        tokens = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': torch.tensor(raw_label, dtype=torch.long),
        }

# Tokenizer Initialization
# Tokenizer matching the 'bert-base-uncased' checkpoint that the model below loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Datasets and DataLoaders for the training and validation sets.
# Both share the tokenizer and pad/truncate every text to Config.max_len tokens.
train_dataset = TextOnlyDataset(Config.train_tsv, tokenizer, Config.max_len)
val_dataset = TextOnlyDataset(Config.val_tsv, tokenizer, Config.max_len)

# Only the training loader shuffles; validation keeps file order.
train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=Config.batch_size, shuffle=False, num_workers=4)

# Model Definition
class TextClassifier(nn.Module):
    """
    Fine-tuning wrapper that puts a two-class classification head on an encoder.

    Based on the architecture and recommendations outlined in Devlin et al.
    (2019) for BERT: the final hidden state of the first token of each sequence
    is fed to a small feed-forward head.
    """

    def __init__(self, text_model):
        """
        Parameters:
        - text_model: Encoder module called as
          `text_model(input_ids=..., attention_mask=...)` whose output exposes
          `last_hidden_state`. Its `config.hidden_size`, when available, sets
          the input width of the head; otherwise 768 is assumed.
        """
        super().__init__()
        self.text_model = text_model
        # Read the encoder width instead of hard-coding it so that encoders
        # other than the 768-wide base model can be plugged in unchanged.
        hidden_size = getattr(getattr(text_model, "config", None), "hidden_size", 768)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.6),
            nn.Linear(256, 2)
        )

    def forward(self, input_ids, attention_mask):
        """Return the class logits (two per sequence) for a batch of token ids."""
        outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of every sequence is used as the sequence representation.
        cls_emb = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_emb)
        return logits

# Load Pretrained BERT Model
# Load the pretrained encoder and wrap it with the classification head; both the
# encoder and the wrapper are moved to Config.device.
text_model = BertModel.from_pretrained('bert-base-uncased').to(Config.device)
model = TextClassifier(text_model).to(Config.device)

# Loss Function and Optimizer with Weight Decay
# NOTE(review): Adam here receives weight_decay directly; confirm whether the
# decoupled variant (AdamW) was intended for fine-tuning.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=Config.learning_rate, weight_decay=Config.weight_decay)

# Learning Rate Scheduler
# total_steps counts optimizer steps (batches per epoch * epochs); the first 10%
# of them are warmup. train_model advances this scheduler once per batch.
total_steps = len(train_loader) * Config.num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)

# Training and Evaluation Functions
def train_model(model, dataloader, optimizer, criterion):
    """
    Run one training epoch with gradient clipping to ensure stable updates.

    Parameters:
    - model: The PyTorch model to be trained.
    - dataloader: DataLoader for the training dataset; every batch is a dict
      with 'input_ids', 'attention_mask' and 'label' entries.
    - optimizer: The optimizer used for updating model parameters.
    - criterion: The loss function applied to the model outputs and labels.

    Returns:
    - (avg_loss, accuracy): loss averaged over all samples seen, and the
      fraction of samples whose arg-max prediction equals the label.

    Notes:
    - Batches are moved to Config.device, and the module-level `scheduler` is
      advanced once after every optimizer step.
    - Raises ZeroDivisionError when the dataloader yields no samples.
    """
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(Config.device)
        attention_mask = batch['attention_mask'].to(Config.device)
        labels = batch['label'].to(Config.device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Weight the batch loss by its size so that the epoch average is per
        # sample even when the last batch is smaller.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        preds = outputs.argmax(dim=1)
        total_correct += (preds == labels).sum().item()
        total_samples += batch_size

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    return avg_loss, accuracy

def evaluate_model(model, dataloader, criterion):
    """
    Evaluate the model on a given dataset without updating its parameters.

    Parameters:
    - model: The PyTorch model to be evaluated.
    - dataloader: DataLoader for the evaluation dataset; every batch is a dict
      with 'input_ids', 'attention_mask' and 'label' entries.
    - criterion: The loss function used for evaluation.

    Returns:
    - (avg_loss, accuracy): loss averaged over all samples seen, and the
      fraction of samples whose arg-max prediction equals the label.

    Notes:
    - Batches are moved to Config.device.
    - Raises ZeroDivisionError when the dataloader yields no samples.
    """
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(Config.device)
            attention_mask = batch['attention_mask'].to(Config.device)
            labels = batch['label'].to(Config.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            # Weight the batch loss by its size so that the average is per
            # sample even when the last batch is smaller.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            preds = outputs.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += batch_size

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    return avg_loss, accuracy

# Training Loop with Early Stopping
if __name__ == "__main__":
    # Early-stopping state: lowest validation loss seen so far and the number
    # of consecutive epochs that failed to improve on it.
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(Config.num_epochs):
        train_loss, train_acc = train_model(model, train_loader, optimizer, criterion)
        val_loss, val_acc = evaluate_model(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{Config.num_epochs}")
        print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}")
        print(f"Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_acc:.4f}")

        # The learning-rate scheduler is deliberately NOT stepped here. It is a
        # step-indexed warmup/decay schedule that train_model already advances
        # once per batch; calling scheduler.step(val_loss) would hand it the
        # loss as a step index and rewind the schedule every epoch.

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # Checkpoint only on improvement so the file always holds the best weights.
            torch.save(model.state_dict(), "best_model.pt")
            print("Validation loss improved. Saved the best model!")
        else:
            patience_counter += 1
            print(f"No improvement in validation loss. Patience counter: {patience_counter}/{Config.patience}")
            if patience_counter >= Config.patience:
                print("Early stopping triggered. Stopping training.")
                break

    print("Training complete. Best model saved as 'best_model.pt'")

# Testing the Model
test_dataset = TextOnlyDataset(Config.test_tsv, tokenizer, Config.max_len)
test_loader = DataLoader(test_dataset, batch_size=Config.batch_size, shuffle=False, num_workers=4)

# Restore the best checkpoint. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only host.
# NOTE(review): consider weights_only=True if the installed torch supports it.
model.load_state_dict(torch.load("best_model.pt", map_location=Config.device))
model.to(Config.device)

test_loss, test_acc = evaluate_model(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_acc:.4f}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Epoch 1/10
Train Loss: 0.6820 | Train Accuracy: 0.5570
Validation Loss: 0.6433 | Validation Accuracy: 0.6130
Validation loss improved. Saved the best model!
Epoch 2/10
Train Loss: 0.6360 | Train Accuracy: 0.6440
Validation Loss: 0.5822 | Validation Accuracy: 0.6470
Validation loss improved. Saved the best model!
Epoch 3/10
Train Loss: 0.5795 | Train Accuracy: 0.6830
Validation Loss: 0.5348 | Validation Accuracy: 0.7450
Validation loss improved. Saved the best model!
Epoch 4/10
Train Loss: 0.5209 | Train Accuracy: 0.7530
Validation Loss: 0.4992 | Validation Accuracy: 0.7610
Validation loss improved. Saved the best model!
Epoch 5/10
Train Loss: 0.4599 | Train Accuracy: 0.8110
Validation Loss: 0.4853 | Validation Accuracy: 0.7640
Validation loss improved. Saved the best model!
Epoch 6/10
Train Loss: 0.4023 | Train Accuracy: 0.8430
Validation Loss: 0.4733 | Validation Accuracy: 0.7800
Validation loss improved. Saved the best model!
Epoch 7/10
Train Loss: 0.3380 | Train Accuracy: 0.8740
Val

  model.load_state_dict(torch.load("best_model.pt"))


Test Loss: 0.4528 | Test Accuracy: 0.8020


In [None]:
def predict_and_display_with_text(model, dataloader, tokenizer, label_mapping):
    """
    Run the model over a dataloader and print each text with both labels.

    The displayed text is decoded back from the batch's token ids (special
    tokens skipped), so it is the tokenizer's rendering of the input rather
    than the raw string from the data file.

    Parameters:
    - model: The trained PyTorch model
    - dataloader: DataLoader whose batches are dicts with 'input_ids',
      'attention_mask' and 'label' entries
    - tokenizer: Tokenizer used for encoding (e.g., BertTokenizer); its
      batch_decode method is used to recover the text
    - label_mapping: Dictionary mapping label indices to human-readable labels
      (e.g., {0: "Real", 1: "Fake"})

    Returns:
    - None; results are written to standard output.
    """
    model.eval()
    shown_texts = []
    predicted_ids = []
    true_ids = []

    with torch.no_grad():
        for batch in dataloader:
            token_ids = batch['input_ids'].to(Config.device)
            mask = batch['attention_mask'].to(Config.device)
            targets = batch['label'].to(Config.device)

            logits = model(input_ids=token_ids, attention_mask=mask)
            batch_preds = logits.argmax(dim=1)

            # Decode from the batch's own (un-moved) ids to recover readable text.
            batch_texts = tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True)

            predicted_ids.extend(batch_preds.cpu().numpy())
            true_ids.extend(targets.cpu().numpy())
            shown_texts.extend(batch_texts)

    print("Predictions with Text:")
    for text, pred, actual in zip(shown_texts, predicted_ids, true_ids):
        # Resolve both names before printing anything for this item.
        pred_label = label_mapping[pred]
        actual_label = label_mapping[actual]
        print(f"News: {text}")
        print(f"Predicted: {pred_label}, Actual: {actual_label}\n")

# Human-readable names for the two label indices.
label_mapping = {0: "Real", 1: "Fake"}

# Load the best saved model. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only host.
# NOTE(review): consider weights_only=True if the installed torch supports it.
model.load_state_dict(torch.load("best_model.pt", map_location=Config.device))
model.to(Config.device)

# Call the prediction function for the test set
predict_and_display_with_text(model, test_loader, tokenizer, label_mapping)

  model.load_state_dict(torch.load("best_model.pt"))


Predictions with Text:
News: the pick
Predicted: Real, Actual: Real

News: me jumping in a big leaf pile
Predicted: Real, Actual: Fake

News: this lemon that has a face
Predicted: Fake, Actual: Fake

News: amazon is selling cck rings now
Predicted: Fake, Actual: Real

News: hot farmgirl mocking a cow
Predicted: Fake, Actual: Fake

News: saddam hussein is executed
Predicted: Real, Actual: Real

News: the one yellow kernel in my white corn
Predicted: Fake, Actual: Fake

News: no more mr nice guy
Predicted: Real, Actual: Real

News: bill oreilly says same sex marriage foes are just a bunch of bible thumpers
Predicted: Real, Actual: Fake

News: beer me
Predicted: Real, Actual: Real

News: leaked documents from kavanaughs time in white house discuss abortion and affirmative action
Predicted: Fake, Actual: Fake

News: its money making good
Predicted: Real, Actual: Real

News: parent of the year when his daughter got her first period this super dad shit his pants and explained its pretty much