In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DebertaV2Tokenizer
from transformers import DebertaV2ForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import os

# Switch off Weights & Biases reporting for the Trainer used later in this notebook.
# NOTE(review): transformers reports this variable as deprecated (see the warning
# printed when TrainingArguments is built) and recommends `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# 1. LOAD DATA
def load_dataset(path="../WELFake_Dataset.csv"):
    """Read the news CSV and split it into training and validation lists.

    A short preview, the frame summary and the label distribution are printed
    for inspection. Rows whose ``text`` is missing are dropped before the
    split, because the tokenizer cannot encode NaN values.

    Args:
        path: Location of the CSV file. It must provide ``text`` and
            ``label`` columns.

    Returns:
        A 4-tuple ``(train_texts, val_texts, train_labels, val_labels)`` of
        plain Python lists, split 80/20 with a fixed random seed.
    """
    df = pd.read_csv(path)
    print(df.head())
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit an extra "None" line.
    df.info()
    print(df['label'].value_counts())

    # Only rows with actual text can be tokenized.
    df = df.dropna(subset=['text'])

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['text'].tolist(),
        df['label'].tolist(),
        test_size=0.2,
        random_state=42,  # fixed seed keeps the split reproducible across runs
    )
    print(f"Training samples: {len(train_texts)}")
    print(f"Validation samples: {len(val_texts)}")
    return train_texts, val_texts, train_labels, val_labels


In [None]:
# 2. TOKENIZE TEXTS
def tokenize_texts(train_texts, val_texts,
                   model_name="microsoft/deberta-v3-small", max_length=512):
    """Encode the training and validation texts with a DeBERTa tokenizer.

    Args:
        train_texts: List of training documents (strings).
        val_texts: List of validation documents (strings).
        model_name: Checkpoint whose tokenizer is loaded. Defaults to the
            checkpoint this notebook fine-tunes.
        max_length: Maximum number of tokens kept per document; longer
            inputs are truncated. Defaults to 512.

    Returns:
        A pair ``(train_encodings, val_encodings)`` as returned by the
        tokenizer call for each list.
    """
    tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)

    def _encode(texts):
        # Both splits must be encoded with identical settings.
        return tokenizer(texts, truncation=True, padding=True, max_length=max_length)

    train_encodings = _encode(train_texts)
    val_encodings = _encode(val_texts)
    print("Tokenization complete.")
    return train_encodings, val_encodings

In [None]:
# 3. CREATE CUSTOM DATASET
class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable
            collection (anything exposing ``.items()``).
        labels: Indexable collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``labels`` entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# 4. MAIN ENTRY POINT
if __name__ == "__main__":
    # Read the CSV, split it, then encode both splits for DeBERTa.
    train_texts, val_texts, train_labels, val_labels = load_dataset()
    train_encodings, val_encodings = tokenize_texts(train_texts, val_texts)

    # Pair each split's encodings with its labels as a PyTorch dataset.
    train_dataset, val_dataset = (
        FakeNewsDataset(encodings, labels)
        for encodings, labels in (
            (train_encodings, train_labels),
            (val_encodings, val_labels),
        )
    )
    print("Datasets wrapped and ready ✅")

   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Tokenization complete.
Datasets wrapped and ready ✅


In [None]:
# 5. Load DeBERTa Model
# Pretrained encoder plus a classification head sized for two labels.
model = DebertaV2ForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=2,
)

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 6. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # where the Trainer writes its outputs
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    # Explicitly disable external logging integrations; this is the
    # replacement the WANDB_DISABLED deprecation warning recommends.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# 7. Define Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and F1 from an evaluation prediction pair.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the index of its largest logit along axis 1.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(targets, predicted),
        "f1": f1_score(targets, predicted),
    }

In [None]:
# 8. Initialize Trainer
# Bundle the model, its hyper-parameters, both data splits and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# 9. Train the model
# Runs the fine-tuning loop configured by `training_args` on `train_dataset`.
# As the last expression of the cell, the returned TrainOutput (global step,
# training loss, runtime metrics) is displayed below the cell.
trainer.train()

Step,Training Loss
500,0.1723
1000,0.0855
1500,0.047
2000,0.0593
2500,0.0628
3000,0.0381
3500,0.0412
4000,0.0448
4500,0.0393
5000,0.0401


Step,Training Loss
500,0.1723
1000,0.0855
1500,0.047
2000,0.0593
2500,0.0628
3000,0.0381
3500,0.0412
4000,0.0448
4500,0.0393
5000,0.0401


TrainOutput(global_step=14420, training_loss=0.03301420286194461, metrics={'train_runtime': 9943.6665, 'train_samples_per_second': 11.601, 'train_steps_per_second': 1.45, 'total_flos': 1.5280923668791296e+16, 'train_loss': 0.03301420286194461, 'epoch': 2.0})

In [None]:
# Persist the fine-tuned model into the "deberta-fake-news" directory
# (relative to the current working directory).
# This writes config.json, pytorch_model.bin, etc.
# NOTE(review): newer transformers releases may write model.safetensors instead
# of pytorch_model.bin; confirm for the installed version.
# NOTE(review): the Trainer above was created without a tokenizer, so this
# presumably stores model files only; verify before loading a tokenizer
# from this directory.
trainer.save_model("deberta-fake-news")


In [None]:
import joblib

# Same checkpoint the training texts were tokenized with.
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-small")

# Store the tokenizer files next to the saved model weights so the
# "deberta-fake-news" directory can be loaded with from_pretrained() alone.
tokenizer.save_pretrained("deberta-fake-news")

# Kept for backward compatibility with anything that already reads this pickle.
# NOTE(review): despite the file name this is the tokenizer object, not a
# vectorizer; a pickle is tied to the installed library versions, so prefer
# the save_pretrained() directory above for reloading.
joblib.dump(tokenizer, "vectorizer.pkl")


['vectorizer.pkl']

In [None]:
# Score the fine-tuned model on the validation split; the Trainer prefixes the
# keys returned by compute_metrics with "eval_".
metrics = trainer.evaluate()

# Report the headline numbers, one per line.
print(
    "📊 Evaluation Results:",
    f"Accuracy: {metrics['eval_accuracy']:.4f}",
    f"F1 Score: {metrics['eval_f1']:.4f}",
    sep="\n",
)


📊 Evaluation Results:
Accuracy: 0.9963
F1 Score: 0.9964


In [None]:
# Load the fine-tuned weights from the directory that
# trainer.save_model("deberta-fake-news") wrote to.
model_path = "deberta-fake-news"
model = DebertaV2ForSequenceClassification.from_pretrained(model_path)
model.eval()  # inference only: make sure dropout layers are disabled

# The Trainer was created without a tokenizer, so take the tokenizer from the
# base checkpoint the training data was encoded with.
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-small")

# Your custom news headline or paragraph
news = "NASA confirms discovery of water on the moon's surface"

# Tokenize with the same 512-token cap that was applied to the training texts,
# so long articles are truncated consistently.
inputs = tokenizer(news, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Predict without tracking gradients.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()

# Show result: class 1 is reported as fake, class 0 as real.
# NOTE(review): confirm this label convention against the dataset's documentation.
if predicted_label == 1:
    print("❌ Fake News")
else:
    print("✅ Real News")
