In [None]:
#install required packages
!pip install transformers tqdm

import pandas as pd
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold

#device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [5]:
#getting the dataset
!pip install kaggle

#upload kaggle.json to Colab
from google.colab import files
files.upload()

#move kaggle.json to correct location
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

#download dataset
!kaggle datasets download -d emineyetm/fake-news-detection-datasets
!unzip fake-news-detection-datasets.zip



Saving Fake.csv to Fake.csv
Saving True.csv to True.csv
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets
License(s): unknown
Downloading fake-news-detection-datasets.zip to /content
 98% 40.0M/41.0M [00:02<00:00, 24.3MB/s]
100% 41.0M/41.0M [00:02<00:00, 14.9MB/s]
Archive:  fake-news-detection-datasets.zip
  inflating: News _dataset/Fake.csv  
  inflating: News _dataset/True.csv  


In [6]:
#data preprocessing
def load_data():
    true_df = pd.read_csv("News _dataset/True.csv")
    fake_df = pd.read_csv("News _dataset/Fake.csv")

    true_df['label'] = 0
    fake_df['label'] = 1

    df = pd.concat([true_df, fake_df], ignore_index=True)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df['content'] = df['title'] + " " + df['text']

    #split data
    train_size = int(0.8 * len(df))
    train_df = df[:train_size]
    test_df = df[train_size:]

    print(f'Training examples: {len(train_df)}')
    print(f'Test examples: {len(test_df)}')

    return train_df[['content', 'label']], test_df[['content', 'label']]

#custom Dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):  #reduced max_length
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [7]:
#BERT classifier model
class BertForFakeNews(nn.Module):
    def __init__(self):
        super(BertForFakeNews, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [9]:
def train_model(model, dataset, tokenizer, n_splits=5, epochs=2):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
        print(f'\nFold {fold+1}/{n_splits}')

        train_data = dataset.iloc[train_idx]
        val_data = dataset.iloc[val_idx]

        train_dataset = NewsDataset(train_data['content'].values, train_data['label'].values, tokenizer)
        val_dataset = NewsDataset(val_data['content'].values, val_data['label'].values, tokenizer)

        train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=8)

        model.apply(lambda m: m.reset_parameters() if hasattr(m, 'reset_parameters') else None)
        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()

        best_accuracy = 0

        for epoch in range(epochs):
            print(f'\nEpoch {epoch+1}/{epochs}')

            # Training phase
            model.train()
            total_train_loss = 0
            for batch in tqdm(train_dataloader, desc="Training"):
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()

                total_train_loss += loss.item()

            # Validation phase
            model.eval()
            total_val_loss = 0
            val_preds = []
            val_true = []

            with torch.no_grad():
                for batch in tqdm(val_dataloader, desc="Validation"):
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['label'].to(device)

                    outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)
                    total_val_loss += loss.item()

                    preds = torch.argmax(outputs, dim=1).cpu().numpy()
                    val_preds.extend(preds)
                    val_true.extend(labels.cpu().numpy())

            avg_train_loss = total_train_loss / len(train_dataloader)
            avg_val_loss = total_val_loss / len(val_dataloader)
            accuracy = (np.array(val_preds) == np.array(val_true)).mean()

            print(f'\nAverage training loss: {avg_train_loss:.4f}')
            print(f'Average validation loss: {avg_val_loss:.4f}')
            print(f'Validation Accuracy: {accuracy:.4f}')

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                torch.save(model.state_dict(), f'best_model_fold{fold+1}.pt')
                print(f'New best accuracy for fold {fold+1}!')

        fold_scores.append(best_accuracy)

    print("\nCross-validation results:")
    print(f"Mean accuracy: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
    print(f"Individual fold accuracies: {fold_scores}")

In [None]:
def main():
    print("Loading data...")
    train_df, test_df = load_data()

    print("\nLoading BERT tokenizer...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    print("\nInitializing BERT model...")
    model = BertForFakeNews()
    model = model.to(device)

    print("\nStarting cross-validation training...")
    train_model(model, train_df, tokenizer)

if __name__ == "__main__":
    model = main()

Loading data...
Training examples: 35918
Test examples: 8980

Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


Initializing BERT model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


Starting cross-validation training...

Fold 1/5

Epoch 1/2


Training: 100%|██████████| 3592/3592 [25:30<00:00,  2.35it/s]
Validation: 100%|██████████| 898/898 [02:30<00:00,  5.95it/s]



Average training loss: 0.1517
Average validation loss: 0.0126
Validation Accuracy: 0.9964
New best accuracy for fold 1!

Epoch 2/2


Training: 100%|██████████| 3592/3592 [25:37<00:00,  2.34it/s]
Validation: 100%|██████████| 898/898 [02:29<00:00,  6.00it/s]



Average training loss: 0.0207
Average validation loss: 0.0289
Validation Accuracy: 0.9957

Fold 2/5

Epoch 1/2


Training:  53%|█████▎    | 1921/3592 [13:42<11:56,  2.33it/s]