In [None]:
#install required packages
!pip install transformers tqdm

import pandas as pd
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

#device setup: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [None]:
#getting the dataset
!pip install kaggle

#upload kaggle.json to Colab
from google.colab import files
files.upload()

#move kaggle.json to correct location
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

#download dataset
!kaggle datasets download -d emineyetm/fake-news-detection-datasets
!unzip fake-news-detection-datasets.zip



Saving Fake.csv to Fake.csv
Saving True.csv to True.csv
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets
License(s): unknown
Downloading fake-news-detection-datasets.zip to /content
100% 41.0M/41.0M [00:01<00:00, 35.4MB/s]
100% 41.0M/41.0M [00:01<00:00, 22.8MB/s]
Archive:  fake-news-detection-datasets.zip
  inflating: News _dataset/Fake.csv  
  inflating: News _dataset/True.csv  


In [None]:
#data preprocessing
def load_data(data_dir="News _dataset", train_frac=0.8, seed=42):
    """Load the True/Fake news CSVs and return shuffled train/test splits.

    Args:
        data_dir: Directory that contains ``True.csv`` and ``Fake.csv``.
        train_frac: Fraction of the shuffled rows assigned to the training split.
        seed: ``random_state`` passed to the shuffle so the split is reproducible.

    Returns:
        Tuple ``(train_df, test_df)``. Both frames have exactly two columns:
        ``content`` (title and text joined by a single space) and ``label``
        (0 for rows from ``True.csv``, 1 for rows from ``Fake.csv``).
    """
    import os  #local import keeps this cell runnable on its own

    #read the True and Fake datasets
    true_df = pd.read_csv(os.path.join(data_dir, "True.csv"))
    fake_df = pd.read_csv(os.path.join(data_dir, "Fake.csv"))

    #add labels
    true_df['label'] = 0  # Real news
    fake_df['label'] = 1  # Fake news

    #combine datasets
    df = pd.concat([true_df, fake_df], ignore_index=True)

    #combine title and text on the full frame, before splitting: assigning a new
    #column to a slice is what raises pandas' SettingWithCopyWarning.
    #missing values are replaced by '' so that a NaN title or text does not turn
    #the whole content into NaN (which would later be tokenized as "nan")
    title = df['title'].fillna('').astype(str)
    text = df['text'].fillna('').astype(str)
    df['content'] = title + " " + text

    #shuffle the data
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    #split into train and test (80/20 by default); .copy() hands back
    #independent frames rather than slices of df
    train_size = int(train_frac * len(df))
    train_df = df.iloc[:train_size][['content', 'label']].copy()
    test_df = df.iloc[train_size:][['content', 'label']].copy()

    print(f'Training examples: {len(train_df)}')
    print(f'Test examples: {len(test_df)}')

    return train_df, test_df

#custom Dataset class
class NewsDataset(Dataset):
    """Torch ``Dataset`` that tokenizes one news text per item.

    Args:
        texts: Sequence of texts (list, NumPy array or pandas Series).
        labels: Sequence of integer class labels, same length as ``texts``.
        tokenizer: Callable tokenizer accepting the Hugging Face ``__call__``
            keyword arguments used in ``__getitem__``.
        max_length: Every text is truncated/padded to this many tokens.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):  # Reduced max_length
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        #materialize as lists so that __getitem__ always indexes by position;
        #a pandas Series would otherwise be indexed by label and fail for any
        #index that does not start at 0 (e.g. the tail of a split frame)
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (both flattened to
            1-D) and ``label`` as a ``torch.long`` scalar tensor.
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        #calling the tokenizer directly replaces the deprecated encode_plus
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        #return_tensors='pt' yields a leading batch dimension of 1; flatten drops it
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
#BERT classifier model
class BertForFakeNews(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint name or path passed to ``BertModel.from_pretrained``.
        num_labels: Number of output classes of the linear head.
        dropout: Dropout probability applied to the pooled output.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=2, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        #head input width follows the encoder's configured hidden size
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor forwarded to the BERT encoder.
            attention_mask: Mask tensor forwarded to the BERT encoder.

        Returns:
            Logits produced by the linear head (no softmax applied).
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        #the second element of the BertModel output is the pooled output
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [None]:
#training function with progress bars
def train_model(model, train_dataloader, val_dataloader, epochs=4, lr=2e-5,
                save_path='best_model.pt'):
    """Fine-tune ``model`` and checkpoint the weights with the best validation accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        train_dataloader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``label`` tensors) that supports ``len()``.
        val_dataloader: Same batch format, used for evaluation after every epoch.
        epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate of the AdamW optimizer.
        save_path: File the best ``state_dict`` is written to.

    Returns:
        List with one dict per epoch holding ``epoch``, ``train_loss``,
        ``val_loss`` and ``val_accuracy``.
    """
    #run on whatever device the model already lives on instead of depending on
    #a global variable defined in another cell
    run_device = next(model.parameters()).device

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    #start below any reachable accuracy so the first epoch always produces a
    #checkpoint, even when its accuracy is exactly 0.0
    best_accuracy = float('-inf')
    history = []

    for epoch in range(epochs):
        print(f'\nEpoch {epoch+1}/{epochs}')

        #training
        model.train()
        total_train_loss = 0
        train_progress_bar = tqdm(train_dataloader, desc="Training")

        for batch in train_progress_bar:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(run_device)
            attention_mask = batch['attention_mask'].to(run_device)
            labels = batch['label'].to(run_device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            train_progress_bar.set_description(f"Training - Loss: {loss.item():.4f}")

        avg_train_loss = total_train_loss / len(train_dataloader)

        #validation
        model.eval()
        total_val_loss = 0
        val_preds = []
        val_true = []

        val_progress_bar = tqdm(val_dataloader, desc="Validation")

        #no gradients are needed for evaluation
        with torch.no_grad():
            for batch in val_progress_bar:
                input_ids = batch['input_ids'].to(run_device)
                attention_mask = batch['attention_mask'].to(run_device)
                labels = batch['label'].to(run_device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()

                preds = torch.argmax(outputs, dim=1).cpu().numpy()
                val_preds.extend(preds)
                val_true.extend(labels.cpu().numpy())

                val_progress_bar.set_description(f"Validation - Loss: {loss.item():.4f}")

        avg_val_loss = total_val_loss / len(val_dataloader)
        accuracy = float((np.array(val_preds) == np.array(val_true)).mean())

        print(f'\nAverage training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')
        print(f'Validation Accuracy: {accuracy:.4f}')

        history.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'val_accuracy': accuracy,
        })

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            #save best model
            torch.save(model.state_dict(), save_path)
            print('New best accuracy! Model saved.')

    return history

In [None]:
def main(batch_size=8, epochs=4, max_length=256):
    """Run the full pipeline: load data, tokenize, build the model and train it.

    The test split returned by ``load_data`` is also passed to ``train_model``
    as the validation set, so the reported validation accuracy is measured on
    the test data.

    Args:
        batch_size: Batch size of both data loaders.
        epochs: Number of training epochs.
        max_length: Token length every text is truncated/padded to.

    Returns:
        The model as it is after the final epoch (the best-scoring weights are
        written to disk by ``train_model``).
    """
    print("Loading data...")
    train_df, test_df = load_data()

    print("\nLoading BERT tokenizer...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    print("\nCreating datasets...")
    #.values hands plain arrays to the dataset, which indexes them by position
    train_dataset = NewsDataset(train_df['content'].values, train_df['label'].values,
                                tokenizer, max_length=max_length)
    test_dataset = NewsDataset(test_df['content'].values, test_df['label'].values,
                               tokenizer, max_length=max_length)

    print("\nCreating dataloaders...")
    #only the training data is reshuffled each epoch
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print("\nInitializing BERT model...")
    model = BertForFakeNews()
    model = model.to(device)

    print("\nStarting training...")
    train_model(model, train_loader, test_loader, epochs=epochs)

    return model

#run the whole pipeline; the trained network is kept in the module-level
#variable `model` so that later cells can use it
if __name__ == "__main__":
    model = main()


Loading data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['content'] = train_df['title'] + " " + train_df['text']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['content'] = test_df['title'] + " " + test_df['text']


Training examples: 35918
Test examples: 8980

Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


Creating datasets...

Creating dataloaders...

Initializing BERT model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


Starting training...

Epoch 1/4


Training - Loss: 0.0005: 100%|██████████| 4490/4490 [31:56<00:00,  2.34it/s]
Validation - Loss: 0.0003: 100%|██████████| 1123/1123 [03:09<00:00,  5.92it/s]



Average training loss: 0.0100
Average validation loss: 0.0026
Validation Accuracy: 0.9996
New best accuracy! Model saved.

Epoch 2/4


Training - Loss: 0.0002: 100%|██████████| 4490/4490 [31:57<00:00,  2.34it/s]
Validation - Loss: 0.0544: 100%|██████████| 1123/1123 [03:12<00:00,  5.84it/s]



Average training loss: 0.0037
Average validation loss: 0.0085
Validation Accuracy: 0.9994

Epoch 3/4


Training - Loss: 0.0000: 100%|██████████| 4490/4490 [31:59<00:00,  2.34it/s]
Validation - Loss: 0.0000: 100%|██████████| 1123/1123 [03:12<00:00,  5.83it/s]



Average training loss: 0.0034
Average validation loss: 0.0010
Validation Accuracy: 0.9997
New best accuracy! Model saved.

Epoch 4/4


Training - Loss: 0.0000: 100%|██████████| 4490/4490 [31:58<00:00,  2.34it/s]
Validation - Loss: 0.0000: 100%|██████████| 1123/1123 [03:11<00:00,  5.88it/s]



Average training loss: 0.0026
Average validation loss: 0.0007
Validation Accuracy: 0.9998
New best accuracy! Model saved.
