In [1]:
import pandas as pd
import numpy as np
import random
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import confusion_matrix, classification_report




In [2]:
# Set random seeds for reproducibility.
# Seeding only Python's `random` module leaves the NumPy and PyTorch RNGs
# unseeded, so all three are seeded here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load dataset
# NOTE(review): absolute, machine-specific location; point DATA_DIR at your
# local copy of the archive before running.
DATA_DIR = "D:/Masters/Sem 2/AI/Project/Dataset/archive"
fake_df = pd.read_csv(f"{DATA_DIR}/Fake.csv")
true_df = pd.read_csv(f"{DATA_DIR}/True.csv")

# Label convention used by the rest of the notebook: 1 = fake, 0 = real
fake_df["label"] = 1
true_df["label"] = 0

# Stack both sources and keep only the model input and the target
combined_df = pd.concat([fake_df, true_df], ignore_index=True)
combined_df = combined_df[["text", "label"]]

# Missing-value count per column (shown as the cell output)
combined_df.isna().sum()

text     0
label    0
dtype: int64

In [3]:
# Wrap the pandas frame in a Hugging Face `Dataset`, then shuffle every row
# once with a fixed seed so the order is repeatable.
dataset = Dataset.from_pandas(combined_df)
shuffled_dataset = dataset.shuffle(seed=42)

# Show the concrete type produced by the conversion
print(type(dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [4]:
# Report how many rows are about to be split.
# None of the cells visible above drops or alters rows, so the message no
# longer claims the data has been cleaned.
total_size = len(shuffled_dataset)
print(f"Total size: {total_size}")

# Split into train / validation / test (70% / 15% / 15%) in two steps.
# Step 1: hold out 30% of the rows ("test" here is the combined val+test pool).
train_val_test_split = shuffled_dataset.train_test_split(test_size=0.3, seed=42)
# Step 2: cut the held-out pool in half -> 15% validation, 15% test.
val_test_split = train_val_test_split["test"].train_test_split(test_size=0.5, seed=42)

Total size after cleaning: 44898


In [5]:
# Gather the three splits under stable keys
dataset = dict(
    train=train_val_test_split["train"],   # 70% of total_size
    validation=val_test_split["train"],    # 15% of total_size
    test=val_test_split["test"],           # 15% of total_size
)

# Print one size line per split, then a raw training example
for split_title, split_key in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{split_title} size: {len(dataset[split_key])}")
print("Sample:", dataset["train"][0])

Train size: 31428
Validation size: 6735
Test size: 6735
Sample: {'text': 'A protester tried to attack Trump today at a rally in Dayton, OH but was subdued by the Secret Service. The crowd went nuts! Trump s reaction is PRICELESS:', 'label': 1}


In [6]:
# Compute device: the CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer for the "distilbert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# tokenizer function
def preprocess_function(examples):
    """Tokenize the "text" field of a batch for use with `Dataset.map`.

    Args:
        examples: Mapping with a "text" entry holding the raw strings of the
            batch (the notebook calls this through `map(..., batched=True)`).

    Returns:
        Whatever the module-level `tokenizer` returns for the batch, with
        every sequence truncated or padded to exactly 128 tokens.
    """
    # No `return_tensors` here: the values are stored by `Dataset.map` and the
    # notebook converts them to torch tensors afterwards with `set_format`,
    # so building tensors inside the callback is redundant work.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

In [7]:
# Tokenize every split by mapping the tokenizer function over it in batches
tokenized_dataset = {
    split_name: split_data.map(preprocess_function, batched=True)
    for split_name, split_data in dataset.items()
}
print(tokenized_dataset["train"][0])

Map:   0%|          | 0/31428 [00:00<?, ? examples/s]

Map:   0%|          | 0/6735 [00:00<?, ? examples/s]

Map:   0%|          | 0/6735 [00:00<?, ? examples/s]

{'text': 'A protester tried to attack Trump today at a rally in Dayton, OH but was subdued by the Secret Service. The crowd went nuts! Trump s reaction is PRICELESS:', 'label': 1, 'input_ids': [101, 1037, 6186, 2121, 2699, 2000, 2886, 8398, 2651, 2012, 1037, 8320, 1999, 14700, 1010, 2821, 2021, 2001, 20442, 2011, 1996, 3595, 2326, 1012, 1996, 4306, 2253, 12264, 999, 8398, 1055, 4668, 2003, 3976, 3238, 1024, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
# Return only the model inputs and the label, as PyTorch tensors
torch_columns = ("input_ids", "attention_mask", "label")
for split_data in tokenized_dataset.values():
    split_data.set_format(type="torch", columns=list(torch_columns))

# sample
print("Sample from train split:", tokenized_dataset["train"][0])

Sample from train split: {'label': tensor(1), 'input_ids': tensor([  101,  1037,  6186,  2121,  2699,  2000,  2886,  8398,  2651,  2012,
         1037,  8320,  1999, 14700,  1010,  2821,  2021,  2001, 20442,  2011,
         1996,  3595,  2326,  1012,  1996,  4306,  2253, 12264,   999,  8398,
         1055,  4668,  2003,  3976,  3238,  1024,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
     

In [9]:
# Load DistilBERT with a two-class head and place it on the selected device
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,  # Binary classification: 0 (real), 1 (fake)
).to(device)

# Report where the parameters actually live
model_device = next(model.parameters()).device
print(f"Model is on: {model_device}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


In [10]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",             # Where to save model checkpoints
    eval_strategy="epoch",              # Evaluate at the end of each epoch
    learning_rate=2e-5,                 # Standard learning rate for BERT models
    per_device_train_batch_size=8,      # Batch size for training
    per_device_eval_batch_size=8,       # Batch size for evaluation
    num_train_epochs=1,                 # Number of epochs to train
    weight_decay=0.01,                  # Regularization to prevent overfitting
    logging_dir="./logs",               # Where to save training logs
    logging_steps=10,                   # Log every 10 steps
    # Mixed precision needs a GPU. The notebook falls back to CPU when CUDA is
    # missing, and a hard-coded fp16=True would make this constructor raise
    # there, so enable it only when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    save_strategy="epoch",              # Save model at the end of each epoch
    load_best_model_at_end=True,        # Load the best model based on validation
    metric_for_best_model="accuracy",   # Use accuracy to pick the best model
    report_to="none"                    # Disable WandB to avoid overhead
)

In [11]:
# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation result.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        dict with a single "accuracy" entry: the fraction of rows whose
        arg-max over the last logits axis equals the label.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hits = predicted == gold
    return dict(accuracy=hits.mean())

In [12]:
# Bundle everything the Trainer needs, then construct it
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["validation"],   # scored during training
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [13]:
# Fine-tune the model (validation is scored per epoch, per the training args)
trainer.train()

# Run the held-out test split through the model ONCE. `predict` returns the
# logits, the label ids and the metrics together, so a separate
# `trainer.evaluate(...)` call would only repeat the same inference pass.
# `metric_key_prefix="eval"` keeps the metric names as `eval_*`.
predictions = trainer.predict(tokenized_dataset["test"], metric_key_prefix="eval")
eval_results = predictions.metrics
print("Test set evaluation:", eval_results)

preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids

# Pin the class ids so the matrix rows/columns and the report always line up
# with target_names (0 = Real, 1 = Fake), even if a class is absent.
class_ids = [0, 1]
print(confusion_matrix(labels, preds, labels=class_ids))
print(classification_report(labels, preds, labels=class_ids, target_names=["Real", "Fake"]))

# NOTE(review): a perfect test score after a single epoch is suspicious.
# Check for leakage (e.g. duplicate articles across splits, or source-specific
# boilerplate in one class) before trusting this number -- TODO confirm.

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0001,0.006105,0.999109


Test set evaluation: {'eval_loss': 0.0001681193825788796, 'eval_accuracy': 1.0, 'eval_runtime': 126.1276, 'eval_samples_per_second': 53.398, 'eval_steps_per_second': 6.676, 'epoch': 1.0}
[[3192    0]
 [   0 3543]]
              precision    recall  f1-score   support

        Real       1.00      1.00      1.00      3192
        Fake       1.00      1.00      1.00      3543

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735

