In [5]:
# Delete all files and folders in the current Colab environment
# !rm -rf /content/*

# ***STEP:1 _ Setting-up the Environment***

In [6]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# ***STEP 2: Uploading & Extracting Dataset***

In [7]:
df = pd.read_csv("fake_news_dataset.zip")

# Check the columns and data types
print(df.columns)
print(df.head())

Index(['title', 'text', 'date', 'source', 'author', 'category', 'label'], dtype='object')
                                  title  \
0               Foreign Democrat final.   
1   To offer down resource great point.   
2          Himself church myself carry.   
3                  You unit its should.   
4  Billion believe employee summer how.   

                                                text        date    source  \
0  more tax development both store agreement lawy...  2023-03-10  NY Times   
1  probably guess western behind likely next inve...  2022-05-25  Fox News   
2  them identify forward present success risk sev...  2022-09-01       CNN   
3  phone which item yard Republican safe where po...  2023-02-07   Reuters   
4  wonder myself fact difficult course forget exa...  2023-04-03       CNN   

                 author    category label  
0          Paula George    Politics  real  
1           Joseph Hill    Politics  fake  
2        Julia Robinson    Business  fake  
3  Mr.

In [8]:
# EXTRACTING THE ZIP FILE

import zipfile

with zipfile.ZipFile("fake_news_dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("FakeNewsData")

In [9]:
import os

os.listdir("FakeNewsData")

['fake_news_dataset.csv']

In [10]:
# Drop rows with missing text or label (important fields)
df = df.dropna(subset=['text', 'label']).reset_index(drop=True)

# For label column, make sure it's lower case and mapped properly
df['label'] = df['label'].str.lower()

label_mapping = {'fake': 0, 'real': 1}
df['label'] = df['label'].map(label_mapping)

# Check if mapping is successful
print(df['label'].unique())
print(f"Dataset size after cleaning: {df.shape[0]}")


[1 0]
Dataset size after cleaning: 20000


In [11]:
df['content'] = df['title'] + " " + df['text']


In [12]:
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

In [13]:
print(df['label'].unique())

print(df['label'].isnull().sum())

print(f"Final dataset size: {df.shape[0]} samples")


[1 0]
0
Final dataset size: 20000 samples


# ***STEP 3 : Splitting the Dataset***

In [14]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['content'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)


# ***STEP 4: Preprocessing & Tokenization***

In [15]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=128
)

val_encodings = tokenizer(
    val_texts, truncation=True, padding=True, max_length=128
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# ***STEP 5: Creating Dataset Class***

In [16]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)


# ***STEP 6: Preparing DataLoaders***

In [17]:
train_dataset = FakeNewsDataset(train_encodings, train_labels)
val_dataset = FakeNewsDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)


# ***STEP 7: Loading the Model***

In [18]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Load the fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Load the classification model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [19]:
optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


# ***STEP 8: Training the Model***

In [20]:
pip install accelerate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [21]:
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

model.train()

for epoch in range(num_epochs):
    total_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        with autocast():  # Mixed precision forward pass
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()  # Scaled backprop
        scaler.step(optimizer)         # Scaled optimizer step
        scaler.update()                # Update scaler

        lr_scheduler.step()

        total_loss += loss.item()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

    torch.cuda.empty_cache()


  scaler = GradScaler()
  with autocast():  # Mixed precision forward pass
Epoch 1: 100%|██████████| 4000/4000 [06:11<00:00, 10.77it/s, loss=0.694]


Epoch 1 - Average Loss: 0.6969


Epoch 2: 100%|██████████| 4000/4000 [05:35<00:00, 11.91it/s, loss=0.677]


Epoch 2 - Average Loss: 0.6949


Epoch 3: 100%|██████████| 4000/4000 [05:29<00:00, 12.14it/s, loss=0.673]

Epoch 3 - Average Loss: 0.6941





# ***STEP 9: Evaluating the Model***

In [23]:
from sklearn.metrics import confusion_matrix, classification_report

model.eval()

predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(val_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch['labels'].cpu().numpy())

print("Classification Report:")
print(classification_report(true_labels, predictions))

print("Confusion Matrix:")
print(confusion_matrix(true_labels, predictions))


100%|██████████| 1000/1000 [00:29<00:00, 34.12it/s]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67      2011
           1       0.00      0.00      0.00      1989

    accuracy                           0.50      4000
   macro avg       0.25      0.50      0.33      4000
weighted avg       0.25      0.50      0.34      4000

Confusion Matrix:
[[2011    0]
 [1989    0]]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# ***STEP 10: Saving the Model***

In [24]:
output_dir = "./saved_bert_model/"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to ./saved_bert_model/


In [25]:
%%writefile requirements.txt
torch
transformers
scikit-learn
tqdm

Writing requirements.txt


In [27]:
!pip freeze > requirements.txt

In [28]:
output_dir = "./saved_bert_model/"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./saved_bert_model/tokenizer_config.json',
 './saved_bert_model/special_tokens_map.json',
 './saved_bert_model/vocab.txt',
 './saved_bert_model/added_tokens.json',
 './saved_bert_model/tokenizer.json')

In [29]:
!zip -r model_files.zip saved_bert_model/

  adding: saved_bert_model/ (stored 0%)
  adding: saved_bert_model/special_tokens_map.json (deflated 42%)
  adding: saved_bert_model/model.safetensors (deflated 7%)
  adding: saved_bert_model/config.json (deflated 49%)
  adding: saved_bert_model/tokenizer.json (deflated 71%)
  adding: saved_bert_model/vocab.txt (deflated 53%)
  adding: saved_bert_model/tokenizer_config.json (deflated 75%)


In [31]:
from google.colab import files

files.download("model_files.zip")
files.download("requirements.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# ***Step 11: predicting on new data***

In [32]:
from transformers import BertForSequenceClassification, BertTokenizer

model_path = "./saved_bert_model/"

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [40]:
new_texts = [
    "Breaking news: Scientists discover new element.",
    "i am standing on the moon right now",
    "Click here to win a million dollars easily!"
]


In [41]:
inputs = tokenizer(
    new_texts,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt"
).to(device)


In [42]:
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)

print("Predicted Labels:", predictions.cpu().numpy())


Predicted Labels: [0 0 0]
