# Import Required Libraries


In [1]:
!pip install transformers



In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from transformers import DistilBertTokenizer

from transformers import DistilBertForSequenceClassification

# Load the Dataset

Load the dataset into a pandas DataFrame

Dataset structure (labels):

Fake (unreliable) : 1

True (reliable) : 0

In [3]:
# Read both CSV files into DataFrames: the training split first, then the test split.
train_data, test_data = [
    pd.read_csv(csv_path)
    for csv_path in ('Dataset/train.csv', 'Dataset/test.csv')
]

In [5]:
test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [1]:
train_data.head()

NameError: name 'train_data' is not defined

In [4]:
# Discard every training row that has a missing value in any column.
train_data = train_data.dropna()


# Data preparation
Prepare the data for the PyTorch model. First, let's define a custom dataset class.

In [7]:
class FakeNewsDataset(Dataset):
    """Tokenised view over a DataFrame of news articles.

    Indexing returns an ``(input_ids, attention_mask, label)`` triple. The text
    of the row is tokenised with padding/truncation to ``max_len`` and the
    leading batch dimension added by ``return_tensors='pt'`` is squeezed away.

    Args:
        data: DataFrame with a ``text`` column and, optionally, a ``label``
            column.
        max_len: Sequence length every example is padded or truncated to.
        tokenizer: Optional ready-made tokenizer to use. When omitted, the
            ``distilbert-base-uncased`` ``DistilBertTokenizer`` is loaded, as
            before. Passing one lets several datasets share a single tokenizer.

    Notes:
        * Rows whose ``text`` is missing (NaN/None) are tokenised as the empty
          string instead of handing a float to the tokenizer.
        * When ``data`` has no ``label`` column (e.g. an unlabelled test
          split), the label slot holds ``UNLABELED`` (-1) so batches keep the
          same three-element layout. That placeholder is not a class index
          and must not be passed to a loss function.
    """

    # Placeholder stored in the label slot for rows that carry no label.
    UNLABELED = -1

    def __init__(self, data, max_len=128, tokenizer=None):
        self.data = data
        self.max_len = max_len
        if tokenizer is None:
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for the row at ``index``."""
        # Fetch the row once instead of calling iloc separately for each column.
        row = self.data.iloc[index]

        text = row['text']
        text = '' if pd.isna(text) else str(text)

        label = int(row['label']) if 'label' in row.index else self.UNLABELED

        # Calling the tokenizer directly is the documented replacement for
        # encode_plus. Token type ids are not requested because only the ids
        # and the attention mask are returned from this method.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return (
            encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            torch.tensor(label, dtype=torch.long),
        )

In [8]:
# Hold out 20% of the rows as a validation set; the fixed seed makes the split repeatable.
# NOTE(review): `train_data` is rebound to the remaining 80%, so running this cell a
# second time without reloading the data would split the already-reduced frame again.
train_data, val_data = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

In [9]:
val_data.head()

Unnamed: 0,id,title,author,text,label
11784,11784,Russian Spies and Americas Reality TV Electio...,Finian Cunningham,Russian Spies and Americas Reality TV Electi...,1
6997,6997,A Peek Inside the Strange World of Fake Academ...,Kevin Carey,The caller ID on my office telephone said the ...,0
14903,14903,A Rediscovered Mark Twain Fairy Tale Is Coming...,Alexandra Alter,"One night nearly 140 years ago, Samuel Clemens...",0
14381,14381,’Gays for Trump’ Banned from Participating in ...,Katherine Rodriguez,Members of a gay group say they have been ba...,0
16567,16567,SNIP creó mercado negro e informal,voltairenet.org,Páginas Libres\nSNIP creó mercado negro e info...,1


In [11]:
val_data.shape

(3657, 5)

In [12]:
train_data.shape

(14628, 5)

In [13]:
# Wrap each split in a FakeNewsDataset.
train_dataset = FakeNewsDataset(train_data)
val_dataset = FakeNewsDataset(val_data)
test_dataset = FakeNewsDataset(test_data)

# Batch each dataset in groups of 32; only the training loader shuffles its examples.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

# Define the Model
Define the PyTorch model. We'll wrap the `DistilBertForSequenceClassification` model from the `transformers` library in a small `nn.Module`:

In [14]:
class FakeNewsClassifier(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained DistilBERT sequence classifier.

    Args:
        num_labels: Number of output classes requested from the pretrained
            ``distilbert-base-uncased`` checkpoint (defaults to 2).
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.bert = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output.

        The training code in this notebook feeds that element to a
        cross-entropy loss and takes ``argmax(1)`` of it, i.e. treats it as
        per-class scores.
        """
        model_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_output = model_output[0]
        return first_output

# Train the Model

With the data and model prepared, we can now train the model using PyTorch. We'll define a function to train the model for one epoch

In [15]:
def train_epoch(model, optimizer, criterion, train_loader):
    """Train ``model`` for one full pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output is passed to ``criterion`` together with the labels.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable taking ``(outputs, labels)``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches that supports ``len()`` and exposes ``.dataset``.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches and the
        fraction of examples in ``train_loader.dataset`` whose ``argmax(1)``
        prediction equals the label.
    """
    model.train()

    # Take the device from the model itself rather than from a notebook-level
    # `device` global, so the function works regardless of cell execution order.
    model_device = next(model.parameters()).device

    running_loss = 0
    correct = 0

    for input_ids, attention_mask, labels in tqdm(train_loader, desc='Training'):
        input_ids = input_ids.to(model_device)
        attention_mask = attention_mask.to(model_device)
        # Move the labels once and reuse them for both the loss and the accuracy.
        labels = labels.to(model_device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()

    train_loss = running_loss / len(train_loader)
    train_acc = correct / len(train_loader.dataset)

    return train_loss, train_acc

This function takes in the model, optimizer, loss function, and data loader, and performs a forward pass through the model, calculates the loss, and performs backpropagation and gradient descent to update the model parameters

We'll also define a function to evaluate the model on the validation set:

In [16]:
def eval_epoch(model, criterion, val_loader):
    """Evaluate ``model`` on every batch of ``val_loader`` without updating it.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output is passed to ``criterion`` together with the labels.
        criterion: Loss callable taking ``(outputs, labels)``.
        val_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches that supports ``len()`` and exposes ``.dataset``.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches and the
        fraction of examples in ``val_loader.dataset`` whose ``argmax(1)``
        prediction equals the label.
    """
    model.eval()

    # Take the device from the model itself rather than from a notebook-level
    # `device` global, so the function works regardless of cell execution order.
    model_device = next(model.parameters()).device

    running_loss = 0
    correct = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(val_loader, desc='Validation'):
            input_ids = input_ids.to(model_device)
            attention_mask = attention_mask.to(model_device)
            # Move the labels once and reuse them for both the loss and the accuracy.
            labels = labels.to(model_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()

    val_loss = running_loss / len(val_loader)
    val_acc = correct / len(val_loader.dataset)

    return val_loss, val_acc

This function takes in the model, loss function, and data loader, and performs a forward pass through the model to calculate the loss and accuracy on the validation set.

We define the main training loop, using a learning rate of 2e-5:

In [17]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model, optimiser and loss used by every epoch below.
model = FakeNewsClassifier().to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Highest validation accuracy seen so far; decides when a checkpoint is written.
best_val_acc = 0

for epoch in range(5):
    train_loss, train_acc = train_epoch(model, optimizer, criterion, train_loader)
    val_loss, val_acc = eval_epoch(model, criterion, val_loader)

    print(f'Epoch {epoch + 1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}')

    # Skip checkpointing unless this epoch strictly beats the best accuracy so far.
    if not val_acc > best_val_acc:
        continue

    torch.save(model.state_dict(), 'best_model.pt')
    best_val_acc = val_acc

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 458/458 [06:09<00:00,  1.24it/s]
Validation: 100%|██████████| 115/115 [01:05<00:00,  1.77it/s]


Epoch 1: Train Loss=0.1281, Train Acc=0.9486, Val Loss=0.0553, Val Acc=0.9820


Training: 100%|██████████| 458/458 [06:10<00:00,  1.24it/s]
Validation: 100%|██████████| 115/115 [01:03<00:00,  1.81it/s]


Epoch 2: Train Loss=0.0247, Train Acc=0.9915, Val Loss=0.0337, Val Acc=0.9891


Training: 100%|██████████| 458/458 [06:11<00:00,  1.23it/s]
Validation: 100%|██████████| 115/115 [01:04<00:00,  1.78it/s]


Epoch 3: Train Loss=0.0135, Train Acc=0.9959, Val Loss=0.0343, Val Acc=0.9907


Training: 100%|██████████| 458/458 [06:11<00:00,  1.23it/s]
Validation: 100%|██████████| 115/115 [01:03<00:00,  1.81it/s]


Epoch 4: Train Loss=0.0069, Train Acc=0.9979, Val Loss=0.0355, Val Acc=0.9918


Training: 100%|██████████| 458/458 [06:13<00:00,  1.23it/s]
Validation: 100%|██████████| 115/115 [01:04<00:00,  1.77it/s]

Epoch 5: Train Loss=0.0040, Train Acc=0.9988, Val Loss=0.0558, Val Acc=0.9904





In [20]:
import torch

# Print one of two messages depending on whether PyTorch reports CUDA as available.
print(
    "GPU is available and PyTorch can use it."
    if torch.cuda.is_available()
    else "GPU is not available or PyTorch cannot use it."
)

GPU is available and PyTorch can use it.


In [21]:
import torch

# Build a small tensor and report the device it was created on.
tensor = torch.tensor([1, 2, 3])
print(f"Tensor device: {tensor.device}")

# Rebind the name to the result of moving it to `device` (set in the training cell),
# then report the device again.
tensor = tensor.to(device=device)
print(f"Tensor device after moving: {tensor.device}")

Tensor device: cpu
Tensor device after moving: cuda:0


In [22]:
import torch

# Ask PyTorch how many CUDA devices it can see and report the count.
num_gpus = torch.cuda.device_count()
print(f"Number of available GPUs: {num_gpus}")

# Any positive count means at least one GPU is usable.
print("GPU is available." if num_gpus > 0 else "GPU is not available.")

Number of available GPUs: 1
GPU is available.


In [23]:
# Train and evaluate the model.
# This cell does not create a model or an optimizer: it reuses the `model`,
# `optimizer`, `device`, `train_loader` and `val_loader` objects already present
# in the notebook namespace, so training continues from the model's current weights.

# Imported once here instead of on every epoch inside the loop.
from sklearn.metrics import classification_report

epochs = 3
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    # ---- Train ----
    model.train()
    total_train_loss = 0
    for step, batch in enumerate(train_loader):
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        batch_inputs, batch_masks, batch_labels = (item.to(device) for item in batch)

        optimizer.zero_grad()
        outputs = model(batch_inputs, attention_mask=batch_masks)
        loss = loss_fn(outputs, batch_labels)
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()

        # Progress line every 100 batches.
        if step % 100 == 0:
            print(f"Epoch {epoch+1} / {epochs} - Batch {step} / {len(train_loader)} - Loss: {loss.item()}")

    # ---- Evaluate ----
    model.eval()
    total_validation_loss = 0
    predictions, true_labels = [], []
    # A single no_grad context covers the whole validation pass.
    with torch.no_grad():
        for batch in val_loader:
            batch_inputs, batch_masks, batch_labels = (item.to(device) for item in batch)

            outputs = model(batch_inputs, attention_mask=batch_masks)
            loss = loss_fn(outputs, batch_labels)
            total_validation_loss += loss.item()

            # Collect plain Python ints for the classification report.
            predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            true_labels.extend(batch_labels.cpu().tolist())

    # Per-batch average losses for this epoch.
    average_train_loss = total_train_loss / len(train_loader)
    average_validation_loss = total_validation_loss / len(val_loader)
    print(f"Epoch {epoch+1} / {epochs} - Average training loss: {average_train_loss}")
    print(f"Epoch {epoch+1} / {epochs} - Average validation loss: {average_validation_loss}")

    # Per-class precision / recall / F1 on the validation set.
    print(classification_report(true_labels, predictions))

    # Write a separate checkpoint after every epoch.
    torch.save(model.state_dict(), f"distilbert-fake-news-{epoch+1}.pth")

Epoch 1 / 3 - Batch 0 / 458 - Loss: 0.00013349564687814564
Epoch 1 / 3 - Batch 100 / 458 - Loss: 9.491672972217202e-05
Epoch 1 / 3 - Batch 200 / 458 - Loss: 0.00030501955188810825
Epoch 1 / 3 - Batch 300 / 458 - Loss: 0.00012072759272996336
Epoch 1 / 3 - Batch 400 / 458 - Loss: 8.530859486199915e-05
Epoch 1 / 3 - Average training loss: 0.0021938661417891334
Epoch 1 / 3 - Average validation loss: 0.05415762053231921
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2082
           1       0.99      0.99      0.99      1575

    accuracy                           0.99      3657
   macro avg       0.99      0.99      0.99      3657
weighted avg       0.99      0.99      0.99      3657

Epoch 2 / 3 - Batch 0 / 458 - Loss: 0.00010365120397182181
Epoch 2 / 3 - Batch 100 / 458 - Loss: 9.809255425352603e-05
Epoch 2 / 3 - Batch 200 / 458 - Loss: 0.0005651573301292956
Epoch 2 / 3 - Batch 300 / 458 - Loss: 0.000528122705873102
Epoch 2 / 3 - Ba