In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Read the news CSV and discard every row that contains a missing value.
# The chained form builds `data` in one expression instead of mutating it in place.
data = pd.read_csv('/content/data.csv').dropna()
data

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
...,...,...,...,...
4003,http://beforeitsnews.com/u-s-politics/2017/10/...,CNN and Globalist Exposed - Steve Quayle and A...,"Vietnam Is in Great Danger, You Must Publish a...",0
4004,http://beforeitsnews.com/sports/2017/09/trends...,Trends to Watch,Trends to Watch\n% of readers think this story...,0
4005,http://beforeitsnews.com/u-s-politics/2017/10/...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4007,https://www.reuters.com/article/us-china-pharm...,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1


In [None]:
# Initialize tokenizers once
# Each pretrained tokenizer is created a single time here at module level, so
# the same instance can be handed to every dataset object that needs it rather
# than being re-created per dataset.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

class DistilBertFakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes article bodies for sequence classification.

    Args:
        data: DataFrame with a ``'Body'`` column (article text) and a
            ``'Label'`` column (class id). Rows are addressed by position,
            so a non-contiguous index (e.g. after ``dropna``) is fine.
        tokenizer: Callable tokenizer; it is invoked as ``tokenizer(text, ...)``
            and must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``index``.

        The first two items are the tokenizer outputs with their leading
        batch dimension removed; ``label`` is a scalar ``torch.long`` tensor.
        """
        # Materialize the row once instead of once per column.
        row = self.data.iloc[index]
        # Call the tokenizer directly: `encode_plus` is documented as deprecated
        # in favour of `__call__`, which accepts the same keyword arguments.
        inputs = self.tokenizer(
            str(row['Body']),  # guard against non-string cells
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # int() turns numpy / float-typed labels into a plain Python integer.
        label = torch.tensor(int(row['Label']), dtype=torch.long)
        # return_tensors='pt' adds a batch dimension of size 1; drop it so the
        # DataLoader can stack items itself.
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0), label

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
class T5FakeNewsDataset(Dataset):
    """Map-style dataset that frames fake-news detection as text-to-text.

    The article body is the encoder input and the class is rendered as the
    target word ``'true'`` (label equal to 1) or ``'false'`` (anything else).

    Args:
        data: DataFrame with a ``'Body'`` column (article text) and a
            ``'Label'`` column. Rows are addressed by position.
        tokenizer: Tokenizer that is callable as ``tokenizer(text, ...)``
            (returning ``'input_ids'`` / ``'attention_mask'``) and provides
            ``encode`` for the target word.
        max_len: Length every encoded input is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, target_ids)`` for the row at position ``index``.

        All three tensors have the tokenizer's leading batch dimension removed.
        """
        # Materialize the row once instead of once per column.
        row = self.data.iloc[index]
        label_text = 'true' if row['Label'] == 1 else 'false'
        # Call the tokenizer directly: `encode_plus` is documented as deprecated
        # in favour of `__call__`, which accepts the same keyword arguments.
        inputs = self.tokenizer(
            str(row['Body']),  # guard against non-string cells
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # NOTE(review): max_length=2 presumably leaves room for exactly one
        # label token plus the end-of-sequence token - confirm that both
        # 'true' and 'false' are single tokens in the vocabulary in use.
        # If padding were ever produced here, the pad ids would be scored by
        # the loss unless they are replaced with the ignore index (-100).
        target = self.tokenizer.encode(
            label_text,
            max_length=2,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0), target.squeeze(0)


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Wrap each split in the model-specific dataset classes
distilbert_train_dataset = DistilBertFakeNewsDataset(data=train_data, tokenizer=distilbert_tokenizer)
distilbert_test_dataset = DistilBertFakeNewsDataset(data=test_data, tokenizer=distilbert_tokenizer)
t5_train_dataset = T5FakeNewsDataset(data=train_data, tokenizer=t5_tokenizer)
t5_test_dataset = T5FakeNewsDataset(data=test_data, tokenizer=t5_tokenizer)

# Batch the datasets: training batches are shuffled, test batches keep their order
distilbert_train_loader = DataLoader(dataset=distilbert_train_dataset, batch_size=32, shuffle=True)
distilbert_test_loader = DataLoader(dataset=distilbert_test_dataset, batch_size=32, shuffle=False)
t5_train_loader = DataLoader(dataset=t5_train_dataset, batch_size=32, shuffle=True)
t5_test_loader = DataLoader(dataset=t5_test_dataset, batch_size=32, shuffle=False)


In [None]:
class DistilBertFakeNewsClassifier(nn.Module):
    """Wrapper around a pretrained DistilBERT sequence-classification model.

    Args:
        num_labels: Number of output classes forwarded to ``from_pretrained``.
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.bert = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only the ``logits`` of its output."""
        result = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return result.logits


In [None]:
class T5FakeNewsClassifier(nn.Module):
    """Wrapper around the pretrained ``t5-small`` conditional-generation model."""

    def __init__(self):
        super().__init__()
        self.t5 = T5ForConditionalGeneration.from_pretrained('t5-small')

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its ``(loss, logits)`` pair.

        ``labels`` is passed straight through to the wrapped model.
        """
        result = self.t5(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return result.loss, result.logits


In [None]:
def train_epoch_distilbert(model, optimizer, criterion, train_loader, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=...)``,
            returning class logits with the class axis at dim 1.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_batch_loss, accuracy)``. Accuracy is the fraction of the
        samples actually processed that were classified correctly. Both
        values are ``0.0`` when the loader yields no batches.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    n_batches = 0

    for input_ids, attention_mask, labels in tqdm(train_loader, desc='Training DistilBERT'):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        n_correct += (logits.argmax(dim=1) == labels).sum().item()
        # Count what was really seen: len(loader.dataset) over-counts when the
        # loader skips samples (drop_last=True, subset samplers, ...).
        n_seen += labels.size(0)
        n_batches += 1

    # Guard the divisions so an empty loader does not raise ZeroDivisionError.
    train_loss = loss_sum / n_batches if n_batches else 0.0
    train_acc = n_correct / n_seen if n_seen else 0.0

    return train_loss, train_acc

In [None]:
def train_epoch_t5(model, optimizer, criterion, train_loader, device):
    """Train the text-to-text ``model`` for one pass over ``train_loader``.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``,
            returning ``(loss, logits)`` with the vocabulary axis at dim 2.
        optimizer: Optimizer stepped once per batch.
        criterion: Unused - the loss comes from ``model`` itself. Kept so the
            signature matches ``train_epoch_distilbert``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_batch_loss, accuracy)``. A sample counts as correct only if
        every non-ignored target position is predicted exactly. The
        predictions come from the same forward pass that receives ``labels``,
        not from free-running generation. Both values are ``0.0`` when the
        loader yields no batches.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    n_batches = 0

    for input_ids, attention_mask, labels in tqdm(train_loader, desc='Training T5'):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        loss, logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        preds = torch.argmax(logits, dim=2)
        # Positions labelled -100 are treated as "ignore" (the convention used
        # for masked targets), so they must not count as mismatches.
        ignored = labels == -100
        n_correct += ((preds == labels) | ignored).all(dim=1).sum().item()
        # Count what was really seen: len(loader.dataset) over-counts when the
        # loader skips samples (drop_last=True, subset samplers, ...).
        n_seen += labels.size(0)
        n_batches += 1

    # Guard the divisions so an empty loader does not raise ZeroDivisionError.
    train_loss = loss_sum / n_batches if n_batches else 0.0
    train_acc = n_correct / n_seen if n_seen else 0.0

    return train_loss, train_acc

In [None]:
def eval_epoch_distilbert(model, criterion, test_loader, device):
    """Evaluate ``model`` over ``test_loader`` without computing gradients.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=...)``,
            returning class logits with the class axis at dim 1.
        criterion: Loss called as ``criterion(logits, labels)``.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_batch_loss, accuracy)``. Accuracy is the fraction of the
        samples actually processed that were classified correctly. Both
        values are ``0.0`` when the loader yields no batches.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    n_batches = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(test_loader, desc='Evaluating DistilBERT'):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss_sum += criterion(logits, labels).item()
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
            # Count what was really seen: len(loader.dataset) over-counts when
            # the loader skips samples (drop_last=True, subset samplers, ...).
            n_seen += labels.size(0)
            n_batches += 1

    # Guard the divisions so an empty loader does not raise ZeroDivisionError.
    test_loss = loss_sum / n_batches if n_batches else 0.0
    test_acc = n_correct / n_seen if n_seen else 0.0

    return test_loss, test_acc

In [None]:
def eval_epoch_t5(model, criterion, test_loader, device):
    """Evaluate the text-to-text ``model`` over ``test_loader`` without gradients.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``,
            returning ``(loss, logits)`` with the vocabulary axis at dim 2.
        criterion: Unused - the loss comes from ``model`` itself. Kept so the
            signature matches ``eval_epoch_distilbert``.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_batch_loss, accuracy)``. A sample counts as correct only if
        every non-ignored target position is predicted exactly. The
        predictions come from the same forward pass that receives ``labels``,
        not from free-running generation, so this is not a
        generation-accuracy figure. Both values are ``0.0`` when the loader
        yields no batches.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    n_batches = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(test_loader, desc='Evaluating T5'):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            loss, logits = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            loss_sum += loss.item()
            preds = torch.argmax(logits, dim=2)
            # Positions labelled -100 are treated as "ignore" (the convention
            # used for masked targets), so they must not count as mismatches.
            ignored = labels == -100
            n_correct += ((preds == labels) | ignored).all(dim=1).sum().item()
            # Count what was really seen: len(loader.dataset) over-counts when
            # the loader skips samples (drop_last=True, subset samplers, ...).
            n_seen += labels.size(0)
            n_batches += 1

    # Guard the divisions so an empty loader does not raise ZeroDivisionError.
    test_loss = loss_sum / n_batches if n_batches else 0.0
    test_acc = n_correct / n_seen if n_seen else 0.0

    return test_loss, test_acc



In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch's RNGs before building the model so the freshly initialised
# classifier head and the shuffled batch order are repeatable across runs
# (42 mirrors the random_state used for the train/test split).
torch.manual_seed(42)

# DistilBERT model
distilbert_model = DistilBertFakeNewsClassifier().to(device)
distilbert_optimizer = optim.AdamW(distilbert_model.parameters(), lr=2e-5)
distilbert_criterion = nn.CrossEntropyLoss()

# Fine-tune for three epochs, reporting the training metrics after each one
for epoch in range(3):
    train_loss, train_acc = train_epoch_distilbert(distilbert_model, distilbert_optimizer, distilbert_criterion, distilbert_train_loader, device)
    print(f'Epoch {epoch + 1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}')

# Persist the fine-tuned weights (plain string: there is nothing to interpolate)
torch.save(distilbert_model.state_dict(), 'distilbert_model.pth')


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training DistilBERT: 100%|██████████| 100/100 [01:11<00:00,  1.40it/s]


Epoch 1: Train Loss=0.2514, Train Acc=0.8966


Training DistilBERT: 100%|██████████| 100/100 [01:10<00:00,  1.42it/s]


Epoch 2: Train Loss=0.0501, Train Acc=0.9846


Training DistilBERT: 100%|██████████| 100/100 [01:11<00:00,  1.40it/s]


Epoch 3: Train Loss=0.0137, Train Acc=0.9969


In [None]:
# Restore the saved weights. map_location remaps the stored tensors onto the
# active device, so a checkpoint written on a GPU runtime also loads on a
# CPU-only one.
distilbert_model.load_state_dict(torch.load('distilbert_model.pth', map_location=device))
# Score the restored model on the held-out test loader
distilbert_test_loss, distilbert_test_acc = eval_epoch_distilbert(distilbert_model, distilbert_criterion, distilbert_test_loader, device)
print(f'DistilBERT Test Loss={distilbert_test_loss:.4f}, Test Acc={distilbert_test_acc:.4f}')

Evaluating DistilBERT: 100%|██████████| 25/25 [00:11<00:00,  2.09it/s]

DistilBERT Test Loss=0.0543, Test Acc=0.9837





In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch's RNGs so the shuffled batch order and dropout noise are
# repeatable across runs (42 mirrors the random_state used for the split).
torch.manual_seed(42)

# T5 model
t5_model = T5FakeNewsClassifier().to(device)
t5_optimizer = optim.AdamW(t5_model.parameters(), lr=2e-5)
# The T5 epoch helpers accept this criterion only for signature parity with
# the DistilBERT helpers; the loss actually optimised is the one the model
# returns.
t5_criterion = nn.CrossEntropyLoss()

# Fine-tune for eight epochs, reporting the training metrics after each one
for epoch in range(8):
    train_loss, train_acc = train_epoch_t5(t5_model, t5_optimizer, t5_criterion, t5_train_loader, device)
    print(f'Epoch {epoch + 1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}')

# Persist the fine-tuned weights (plain string: there is nothing to interpolate)
torch.save(t5_model.state_dict(), 't5_model.pth')


Training T5: 100%|██████████| 100/100 [00:33<00:00,  2.98it/s]


Epoch 1: Train Loss=7.1364, Train Acc=0.0000


Training T5: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Epoch 2: Train Loss=3.1122, Train Acc=0.0533


Training T5: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Epoch 3: Train Loss=0.8503, Train Acc=0.5185


Training T5: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Epoch 4: Train Loss=0.3317, Train Acc=0.7539


Training T5: 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]


Epoch 5: Train Loss=0.1945, Train Acc=0.8605


Training T5: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Epoch 6: Train Loss=0.1507, Train Acc=0.8940


Training T5: 100%|██████████| 100/100 [00:32<00:00,  3.11it/s]


Epoch 7: Train Loss=0.1159, Train Acc=0.9207


Training T5: 100%|██████████| 100/100 [00:31<00:00,  3.15it/s]


Epoch 8: Train Loss=0.1009, Train Acc=0.9354


In [None]:
# Restore the saved weights. map_location remaps the stored tensors onto the
# active device, so a checkpoint written on a GPU runtime also loads on a
# CPU-only one.
t5_model.load_state_dict(torch.load('t5_model.pth', map_location=device))
# Score the restored model on the held-out test loader
t5_test_loss, t5_test_acc = eval_epoch_t5(t5_model, t5_criterion, t5_test_loader, device)
print(f'T5 Test Loss={t5_test_loss:.4f}, Test Acc={t5_test_acc:.4f}')

Evaluating T5: 100%|██████████| 25/25 [00:04<00:00,  5.94it/s]

T5 Test Loss=0.0714, Test Acc=0.9524



