In [1]:
#Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [2]:
!pip install transformers datasets peft accelerate

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuspars

In [3]:
!pip install wandb



In [4]:
from kaggle_secrets import UserSecretsClient
import wandb

# 1. Get the secret key from Kaggle secrets
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("WANDB_API_KEY")

# 2. Login to Weights & Biases using the key
wandb.login(key=wandb_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manjilakshetri[0m ([33manjilakshetri-kathmandu-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Load the Dataset

In [5]:
import json

# Read the file
with open('/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json') as f:
    data = [json.loads(line) for line in f]

#Convert that into Dataframe or easier inspection
df = pd.DataFrame(data)

# View the first item
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [6]:
# Total Number of data
print(f'Total Number of data : {len(df)}')

#Take only headline and short_description
df = df[['headline', 'short_description', 'category']]

#Combine them both in single columns
df['news'] = df['headline'] + ' - ' + df['short_description']

#Remove the headline and short_description
df = df.drop(columns = ['headline', 'short_description'])
df.head()

Total Number of data : 209527


Unnamed: 0,category,news
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li..."
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...
3,PARENTING,The Funniest Tweets From Parents This Week (Se...
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...


In [7]:
print(df['news'][0])
print('-' * 110)
print(df['news'][10])
print('-' * 110)
print(df['news'][20])

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters - Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
--------------------------------------------------------------------------------------------------------------
World Cup Captains Want To Wear Rainbow Armbands In Qatar - FIFA has come under pressure from several European soccer federations who want to support a human rights campaign against discrimination at the World Cup.
--------------------------------------------------------------------------------------------------------------
Golden Globes Returning To NBC In January After Year Off-Air - For the past 18 months, Hollywood has effectively boycotted the Globes after reports that the HFPA’s 87 members of non-American journalists included no Black members.


## Cleaning the Text

In [8]:
import re

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation and special characters (except words and spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['news'] = df['news'].apply(clean_text)

print(df['news'][0])
print('-' * 110)
print(df['news'][10])
print('-' * 110)
print(df['news'][20])

over 4 million americans roll up sleeves for omicrontargeted covid boosters health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the us ordered for the fall
--------------------------------------------------------------------------------------------------------------
world cup captains want to wear rainbow armbands in qatar fifa has come under pressure from several european soccer federations who want to support a human rights campaign against discrimination at the world cup
--------------------------------------------------------------------------------------------------------------
golden globes returning to nbc in january after year offair for the past 18 months hollywood has effectively boycotted the globes after reports that the hfpas 87 members of nonamerican journalists included no black members


### Device

In [9]:
import torch
import torch.nn as nn

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


## Label Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label'] = le.fit_transform(df['category'])
print(le.classes_)  # array of category names in order
print(df['label'].value_counts())  # distribution of encoded labels

['ARTS' 'ARTS & CULTURE' 'BLACK VOICES' 'BUSINESS' 'COLLEGE' 'COMEDY'
 'CRIME' 'CULTURE & ARTS' 'DIVORCE' 'EDUCATION' 'ENTERTAINMENT'
 'ENVIRONMENT' 'FIFTY' 'FOOD & DRINK' 'GOOD NEWS' 'GREEN' 'HEALTHY LIVING'
 'HOME & LIVING' 'IMPACT' 'LATINO VOICES' 'MEDIA' 'MONEY' 'PARENTING'
 'PARENTS' 'POLITICS' 'QUEER VOICES' 'RELIGION' 'SCIENCE' 'SPORTS' 'STYLE'
 'STYLE & BEAUTY' 'TASTE' 'TECH' 'THE WORLDPOST' 'TRAVEL' 'U.S. NEWS'
 'WEDDINGS' 'WEIRD NEWS' 'WELLNESS' 'WOMEN' 'WORLD NEWS' 'WORLDPOST']
label
24    35602
38    17945
10    17362
34     9900
30     9814
22     8791
16     6694
25     6347
13     6340
3      5992
5      5400
28     5077
2      4583
17     4320
23     3955
33     3664
36     3653
39     3572
6      3562
18     3484
8      3426
40     3299
20     2944
37     2777
15     2622
41     2579
26     2577
29     2254
27     2206
32     2104
31     2096
21     1756
0      1509
11     1444
12     1401
14     1398
35     1377
1      1339
4      1144
19     1130
7      1074
9      1

In [11]:
# Calculate class counts
counts = df['label'].value_counts().sort_index().values  # sorted by label index
print(counts)

# Compute class weights inversely proportional to frequency
class_weights = 1.0 / counts
class_weights = class_weights / class_weights.sum() * len(class_weights)  # normalize

# Convert to torch tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Use in loss
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

[ 1509  1339  4583  5992  1144  5400  3562  1074  3426  1014 17362  1444
  1401  6340  1398  2622  6694  4320  3484  1130  2944  1756  8791  3955
 35602  6347  2577  2206  5077  2254  9814  2096  2104  3664  9900  1377
  3653  2777 17945  3572  3299  2579]


## FineTuning

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

2025-06-27 13:22:34.061218: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751030554.239691      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751030554.292770      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=42)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Convert to HuggingFace Dataset

In [14]:
from datasets import Dataset

hf_dataset = Dataset.from_pandas(df[["news", "label"]])

## Tokenization

In [15]:
def tokenize_function(examples):
    return tokenizer(examples["news"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/209527 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

## Splitting the Dataset

In [16]:
#Train-Test Split
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

## Training Arguments

In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True
)

## Train and Evaluate the Model

In [18]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

In [19]:
import time
import psutil

def train_and_evaluate(model, name):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )
    
    print(f"\n🟢 Starting: {name}")
    start_time = time.time()
    trainer.train()
    end_time = time.time()
    
    metrics = trainer.evaluate()
    accuracy = metrics['eval_accuracy'] if 'eval_accuracy' in metrics else 'N/A'
    
    print(f"🔍 {name} Results")
    print(f"- Accuracy: {accuracy}")
    print(f"- Training Time: {(end_time - start_time):.2f} sec")
    print(f"- Max Memory Used: {psutil.Process().memory_info().rss / 1024 ** 2:.2f} MB")

In [20]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)

In [21]:
if torch.cuda.is_available():
    print(f"- GPU Memory Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

- GPU Memory Allocated: 0.00 MB


In [22]:
# Full Fine-Tuning
full_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=42)
train_and_evaluate(full_model, "Full Fine-Tuning")

# Frozen Encoder
frozen_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=42)
for param in frozen_model.base_model.parameters():
    param.requires_grad = False
train_and_evaluate(frozen_model, "Frozen Encoder")

# LoRA
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=42)
lora_model = get_peft_model(base_model, lora_config)
train_and_evaluate(lora_model, "LoRA")

# BitFit
bitfit_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=42)
for name, param in bitfit_model.named_parameters():
    param.requires_grad = "bias" in name
train_and_evaluate(bitfit_model, "BitFit")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🟢 Starting: Full Fine-Tuning


[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250627_132327-76khdsh3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./results[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/anjilakshetri-kathmandu-university/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/anjilakshetri-kathmandu-university/huggingface/runs/76khdsh3[0m


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1742,1.138239,0.675035
2,0.9702,1.038257,0.700854
3,0.7916,1.02146,0.709588


🔍 Full Fine-Tuning Results
- Accuracy: 0.7095881258053739
- Training Time: 9803.10 sec
- Max Memory Used: 3576.51 MB


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🟢 Starting: Frozen Encoder


Epoch,Training Loss,Validation Loss,Accuracy
1,3.1974,3.174602,0.186131
2,3.1317,3.086037,0.2235
3,3.0897,3.065223,0.224455


🔍 Frozen Encoder Results
- Accuracy: 0.22445473201928126
- Training Time: 3293.46 sec
- Max Memory Used: 3925.92 MB


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.



🟢 Starting: LoRA


Epoch,Training Loss,Validation Loss,Accuracy
1,2.2289,2.109552,0.463514
2,1.9073,1.82364,0.526225
3,1.8451,1.769621,0.537727


🔍 LoRA Results
- Accuracy: 0.5377272944208467
- Training Time: 6890.96 sec
- Max Memory Used: 4218.41 MB


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🟢 Starting: BitFit


Epoch,Training Loss,Validation Loss,Accuracy
1,3.1377,3.070875,0.279196
2,3.0128,2.922835,0.320718
3,2.9508,2.882922,0.338615


🔍 BitFit Results
- Accuracy: 0.33861499546604307
- Training Time: 6855.09 sec
- Max Memory Used: 4555.88 MB


## Inference

In [23]:
# Choose the model you want to use for inference
inference_model = lora_model  # or full_model, bitfit_model, frozen_model
inference_model.eval()
inference_model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DebertaV2ForSequenceClassification(
      (deberta): DebertaV2Model(
        (embeddings): DebertaV2Embeddings(
          (word_embeddings): Embedding(128100, 768, padding_idx=0)
          (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): DebertaV2Encoder(
          (layer): ModuleList(
            (0-11): 12 x DebertaV2Layer(
              (attention): DebertaV2Attention(
                (self): DisentangledSelfAttention(
                  (query_proj): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
        

In [24]:
# Example texts for prediction
texts = [
    "Stock markets show signs of recovery.",
    "New health guidelines released by WHO.",
    "The Lakers win the NBA championship.",
    "Government unveils new education policy.",
    "Tech giants release new AI tools.",
    "Travel restrictions lifted in Europe.",
    "Heavy rains flood several cities.",
    "NASA announces new Moon mission.",
    "Actor wins award for best performance.",
    "Debate intensifies over climate policy."
]

# Perform inference
for i, text in enumerate(texts):
    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(device)

    # Disable gradient tracking
    with torch.no_grad():
        outputs = inference_model(**inputs)
        predicted_label = torch.argmax(outputs.logits, dim=1).item()

    # Decode label
    category = le.inverse_transform([predicted_label])[0]

    # Print result
    print(f"\n🔎 Example {i+1}")
    print("📝 Text:", text)
    print("📚 Predicted category:", category)


🔎 Example 1
📝 Text: Stock markets show signs of recovery.
📚 Predicted category: BUSINESS

🔎 Example 2
📝 Text: New health guidelines released by WHO.
📚 Predicted category: HEALTHY LIVING

🔎 Example 3
📝 Text: The Lakers win the NBA championship.
📚 Predicted category: SPORTS

🔎 Example 4
📝 Text: Government unveils new education policy.
📚 Predicted category: BUSINESS

🔎 Example 5
📝 Text: Tech giants release new AI tools.
📚 Predicted category: HEALTHY LIVING

🔎 Example 6
📝 Text: Travel restrictions lifted in Europe.
📚 Predicted category: TRAVEL

🔎 Example 7
📝 Text: Heavy rains flood several cities.
📚 Predicted category: SCIENCE

🔎 Example 8
📝 Text: NASA announces new Moon mission.
📚 Predicted category: HEALTHY LIVING

🔎 Example 9
📝 Text: Actor wins award for best performance.
📚 Predicted category: ENTERTAINMENT

🔎 Example 10
📝 Text: Debate intensifies over climate policy.
📚 Predicted category: POLITICS
