In [8]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

In [3]:
# Read the preprocessed fake-news CSV into a DataFrame.
df = pd.read_csv(
    "../Data/preprocessed/fakenews_preprocessed.csv",
)

In [4]:
# Model inputs: the "text" column as plain strings, with missing values
# replaced by empty strings.
texts = (
    df["text"]
    .fillna("")
    .astype(str)
    .tolist()
)
# Targets: the "real" column cast to integers.
labels = (
    df["real"]
    .astype(int)
    .tolist()
)

In [5]:
# Hold out 20% of the examples for validation; the split is stratified on the
# labels and uses a fixed random_state so it is repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Load the tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased",
)



In [9]:
# Token budget per example; the tokenizer calls pad and truncate to this length.
max_len: int = 128

In [10]:
# Tokenize the training texts into fixed-length PyTorch tensors: every example
# is truncated or padded to exactly `max_len` tokens.
train_enc = tokenizer(
    train_texts,
    max_length=max_len,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

In [11]:
# Tokenize the validation texts with the same settings as the training texts
# (truncate/pad to `max_len`, return PyTorch tensors).
val_enc = tokenizer(
    val_texts,
    max_length=max_len,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

In [12]:
# Unpack the training encodings and turn the label list into a tensor.
train_input_ids, train_attention = (
    train_enc["input_ids"],
    train_enc["attention_mask"],
)
train_labels_tensor = torch.tensor(train_labels)

In [13]:
# Validation counterparts of the training tensors.
val_labels_tensor = torch.tensor(val_labels)
val_attention = val_enc["attention_mask"]
val_input_ids = val_enc["input_ids"]

In [14]:
# Prefer a CUDA GPU when one is available and fall back to the CPU otherwise.
# Kept as a plain string so every later `.to(device)` call works unchanged.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [15]:
# Load DistilBERT with a single-output classification head
# (one logit per example), then move it to the target device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=1
)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=2e-5,
)

In [17]:
# Binary cross-entropy on raw logits, averaged over the batch
# (the default reduction, spelled out here).
criterion = torch.nn.BCEWithLogitsLoss(reduction="mean")

In [18]:
# Training hyperparameters: examples per mini-batch and passes over the data.
batch_size, num_epochs = 8, 2

In [19]:
# Fine-tune for `num_epochs` passes over the training tensors, slicing out
# consecutive mini-batches by hand.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for start in range(0, len(train_labels), batch_size):
        window = slice(start, start + batch_size)
        # BCEWithLogitsLoss needs float targets, hence the cast.
        batch_labels = train_labels_tensor[window].float().to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=train_input_ids[window].to(device),
            attention_mask=train_attention[window].to(device),
        )

        # Drop the trailing dimension so the logits line up with the targets.
        logits = outputs.logits.squeeze(-1)
        loss = criterion(logits, batch_labels)

        loss.backward()
        optimizer.step()

        # Running sum of the per-batch losses for this epoch.
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")

Epoch 1/2 - Loss: 178.9397
Epoch 2/2 - Loss: 48.3060


In [20]:
# Put the model in evaluation mode before validation.
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Run validation in batches; otherwise, scoring the whole validation set in a single forward pass requires over 50GB of memory.

In [22]:
# Wrap the validation tensors so they can be fed to the model in small,
# fixed-order batches (no shuffling, so outputs stay aligned with val_labels).
# TensorDataset and DataLoader are imported in the notebook's top import cell.
val_dataset = TensorDataset(val_input_ids, val_attention)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [23]:
# Accumulator for the per-batch validation logits.
all_logits = list()

In [24]:
# Batched inference over the validation set.
# The accumulator is reset and evaluation mode is set inside this cell so that
# re-running it on its own cannot append duplicate logits (which would misalign
# predictions with val_labels) or run with the model left in training mode.
all_logits = []
model.eval()

# Gradients are not needed for validation.
with torch.no_grad():
    for ids, mask in val_loader:
        ids = ids.to(device)
        mask = mask.to(device)

        batch_logits = model(
            input_ids=ids,
            attention_mask=mask
        ).logits.squeeze(-1)

        # Collect results on the CPU for later concatenation.
        all_logits.append(batch_logits.cpu())

In [25]:
# Stitch the per-batch logits into one tensor along the first dimension.
logits = torch.cat(all_logits, dim=0)

In [26]:
# Convert logits to probabilities, then threshold at 0.5 to get 0/1 predictions.
probs = logits.sigmoid().cpu().numpy()
preds = (probs >= 0.5).astype(int)

In [27]:
# Score the thresholded predictions against the held-out labels.
acc = accuracy_score(y_true=val_labels, y_pred=preds)
prec = precision_score(y_true=val_labels, y_pred=preds)
rec = recall_score(y_true=val_labels, y_pred=preds)
f1 = f1_score(y_true=val_labels, y_pred=preds)

In [28]:
# Report the four validation metrics, one per line, to four decimal places.
print(
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1 Score:  {f1:.4f}",
    sep="\n",
)

Accuracy:  0.9971
Precision: 0.9972
Recall:    0.9978
F1 Score:  0.9975


In [29]:
# Persist the fine-tuned model and its tokenizer to their own directories.
# The tokenizer call stays last so its return value is the cell's output.
model.save_pretrained(save_directory="distilbert_fake_news4")
tokenizer.save_pretrained(save_directory="distilbert_tokenizer_fake_news")

('distilbert_tokenizer_fake_news\\tokenizer_config.json',
 'distilbert_tokenizer_fake_news\\special_tokens_map.json',
 'distilbert_tokenizer_fake_news\\vocab.txt',
 'distilbert_tokenizer_fake_news\\added_tokens.json',
 'distilbert_tokenizer_fake_news\\tokenizer.json')