In [None]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')

In [None]:


# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [None]:

# Define paths
DATA_DIR = "liar_dataset"  # Change this to your LIAR dataset directory
OUTPUT_DIR = "distilbert_fake_news_model"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Define label mapping for LIAR dataset
# LIAR has 6 labels: pants-fire, false, barely-true, half-true, mostly-true, true
LABEL_MAP = {
    'pants-fire': 0,
    'false': 1,
    'barely-true': 2,
    'half-true': 3,
    'mostly-true': 4,
    'true': 5
}


In [None]:

# Dataset class for LIAR
class LiarDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Convert batch dimension tensor to regular tensor
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding['labels'] = torch.tensor(label)

        return encoding


In [None]:

def load_liar_data(data_dir):
    # LIAR dataset has these columns:
    # id, label, statement, subject, speaker, job_title, state_info, party_affiliation,
    # barely_true_counts, false_counts, half_true_counts, mostly_true_counts, pants_on_fire_counts, context

    # Loading training data
    train_path = os.path.join(data_dir, "/content/train.tsv")
    train_df = pd.read_csv(train_path, sep='\t', header=None)

    # Loading validation data
    val_path = os.path.join(data_dir, "/content/valid.tsv")
    val_df = pd.read_csv(val_path, sep='\t', header=None)

    # Loading test data
    test_path = os.path.join(data_dir, "/content/test.tsv")
    test_df = pd.read_csv(test_path, sep='\t', header=None)

    # Extract relevant columns (label and statement)
    train_texts = train_df[2].tolist()  # statement is 3rd column (index 2)
    train_labels = train_df[1].map(LABEL_MAP).tolist()  # label is 2nd column (index 1)

    val_texts = val_df[2].tolist()
    val_labels = val_df[1].map(LABEL_MAP).tolist()

    test_texts = test_df[2].tolist()
    test_labels = test_df[1].map(LABEL_MAP).tolist()

    print(f"Train examples: {len(train_texts)}")
    print(f"Validation examples: {len(val_texts)}")
    print(f"Test examples: {len(test_texts)}")

    return train_texts, train_labels, val_texts, val_labels, test_texts, test_labels


In [18]:

def main():
    # Load tokenizer and model
    model_name = "distilbert-base-uncased"
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)

    # Load LIAR dataset
    train_texts, train_labels, val_texts, val_labels, test_texts, test_labels = load_liar_data(DATA_DIR)

    # Create datasets
    train_dataset = LiarDataset(train_texts, train_labels, tokenizer)
    val_dataset = LiarDataset(val_texts, val_labels, tokenizer)
    test_dataset = LiarDataset(test_texts, test_labels, tokenizer)

    # Load pre-trained model with classification head
    num_labels = len(LABEL_MAP)
    model = DistilBertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Move model to device
    model = model.to(device)

    # Define training arguments
    training_args = TrainingArguments(
      output_dir=OUTPUT_DIR,
      num_train_epochs=2,  # Reduce epochs (faster but may affect performance)
      per_device_train_batch_size=32,  # Increase batch size (if GPU memory allows)
      per_device_eval_batch_size=128,  # Increase for faster evaluation
      gradient_accumulation_steps=2,  # Accumulate gradients to simulate larger batch sizes
      warmup_steps=0,  # Remove warmup unless necessary
      weight_decay=0.01,
      logging_dir=os.path.join(OUTPUT_DIR, "logs"),
      logging_steps=500,  # Reduce logging frequency for faster training
      eval_steps=1000,  # Evaluate less frequently
      save_steps=1000,  # Save checkpoints less often
      evaluation_strategy="epoch",  # Evaluate only at the end of each epoch
      save_strategy="epoch",  # Save only at epoch level
      load_best_model_at_end=True,
      metric_for_best_model="accuracy",
      fp16=True,  # Enable mixed precision training (faster on GPUs)
      dataloader_num_workers=4,  # Use multiple workers for data loading

    )

    # Define compute_metrics function
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        accuracy = accuracy_score(labels, predictions)
        return {"accuracy": accuracy}

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model
    print("Starting training...")
    trainer.train()

    # Save the model
    model.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))
    tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))

    # Evaluate on test set
    print("Evaluating on test set...")
    trainer.eval_dataset = test_dataset
    test_results = trainer.evaluate()
    print(f"Test results: {test_results}")

    # Generate predictions for a more detailed evaluation
    test_predictions = trainer.predict(test_dataset)
    preds = np.argmax(test_predictions.predictions, axis=1)

    # Reverse label mapping for readable report
    reverse_label_map = {v: k for k, v in LABEL_MAP.items()}
    label_names = [reverse_label_map[i] for i in range(len(LABEL_MAP))]

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(test_labels, preds, target_names=label_names))

if __name__ == "__main__":
    main()

Train examples: 10240
Validation examples: 1284
Test examples: 1267


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.692194,0.262461


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.692194,0.262461
2,No log,1.685267,0.257009


Evaluating on test set...


Test results: {'eval_loss': 1.677376389503479, 'eval_accuracy': 0.26203630623520124, 'eval_runtime': 244.2125, 'eval_samples_per_second': 5.188, 'eval_steps_per_second': 0.041, 'epoch': 2.0}

Classification Report:
              precision    recall  f1-score   support

  pants-fire       0.00      0.00      0.00        92
       false       0.27      0.38      0.31       249
 barely-true       0.28      0.08      0.12       212
   half-true       0.25      0.32      0.28       265
 mostly-true       0.26      0.53      0.35       241
        true       0.31      0.04      0.08       208

    accuracy                           0.26      1267
   macro avg       0.23      0.22      0.19      1267
weighted avg       0.25      0.26      0.22      1267



In [32]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Define paths to model and tokenizer
model_path = "/content/distilbert_fake_news_model/final_model"  # Adjust if different
tokenizer_path = "/content/distilbert_fake_news_model/tokenizer"

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)

# Load trained model
model = DistilBertForSequenceClassification.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Define label mapping
LABEL_MAP = {
    0: 'pants-fire',
    1: 'false',
    2: 'barely-true',
    3: 'half-true',
    4: 'mostly-true',
    5: 'true'
}

def predict(text):
    """Function to predict the class of a given text."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move to device

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()

    return LABEL_MAP[predicted_label]

# Example Usage
text = "NASA confirms that the moon is not made of cheese."
prediction = predict(text)
print(f"Prediction: {prediction}")


Prediction: false
