In [None]:
# ================================
# Fine-Tune SmolLM-135M on News Category Dataset
# ================================

!pip install -q transformers datasets accelerate evaluate bitsandbytes pandas

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
from google.colab import drive

In [None]:
# ----------------
# 1. Load dataset
# ----------------
# Mount Google Drive so the dataset file stored there is reachable.
drive.mount('/content/drive')
data_path = '/content/drive/MyDrive/LLM & SLM/News_Category_Dataset_v3.json'

# Read the file as line-delimited JSON and keep only the text column
# ("headline") and the target column ("category").
df = pd.read_json(data_path, lines=True)[["headline", "category"]]
print("Sample rows:\n", df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Sample rows:
                                             headline   category
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY
3  The Funniest Tweets From Parents This Week (Se...  PARENTING
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS


In [None]:
# ----------------
# 2. Encode labels
# ----------------
# Build a deterministic label vocabulary: the unique categories in sorted
# order, each mapped to its position in that order (and the reverse mapping).
label_list = sorted(df["category"].unique().tolist())
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

df["label"] = df["category"].map(label2id)

# Train/test split: 80% of the rows are sampled for training, and every row
# not selected for training goes to the test frame.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

# Subsample for Colab speed (optional). The requested size is capped at the
# number of available rows so that a smaller input does not make sample() raise.
train_df = train_df.sample(n=min(10000, len(train_df)), random_state=42)   # up to 10k samples
test_df = test_df.sample(n=min(2000, len(test_df)), random_state=42)       # up to 2k samples

# preserve_index=False keeps the (shuffled) pandas index from being stored as
# an extra "__index_level_0__" column in the resulting datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# ----------------
# 3. Load tokenizer & model
# ----------------
model_id = "HuggingFaceTB/SmolLM-135M"

# Tokenizer first: padding requires a pad token, so reuse the EOS token when
# the loaded tokenizer does not define one.
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Sequence-classification model sized to the number of categories, created
# with both label mappings.
label_config = dict(
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(model_id, **label_config)

# Keep the model config's pad token id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM-135M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ----------------
# 4. Preprocess data
# ----------------
def tokenize_function(examples):
    """Tokenize the "headline" field of a batch of examples.

    Args:
        examples: Mapping that provides the headlines under the
            "headline" key.

    Returns:
        The result of calling the module-level ``tokenizer`` on those
        headlines, truncated and padded to a fixed length of 64.
    """
    headlines = examples["headline"]
    encoded = tokenizer(
        headlines,
        max_length=64,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Apply tokenize_function to both splits in batched mode; the training split
# is processed first, then the test split.
train_dataset, test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# ----------------
# 5. Evaluation metric
# ----------------
# Load the "accuracy" metric via the `evaluate` library; compute_metrics
# calls its .compute() with the predictions and the reference labels.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions and labels.
    """
    logits, labels = eval_pred
    predicted_ids = torch.as_tensor(logits).argmax(dim=-1)
    return accuracy.compute(references=labels, predictions=predicted_ids)

In [None]:
# ----------------
# 6. Training setup
# ----------------
# Hyperparameters for the run. Evaluation and checkpointing both happen once
# per epoch, so load_best_model_at_end can choose between per-epoch
# checkpoints using the "accuracy" metric.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # lr_scheduler_type="cosine",
    logging_dir='./logs',
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

# The tokenizer is passed as `processing_class`: the `tokenizer` keyword of
# Trainer.__init__ is deprecated and emits a warning on recent transformers
# releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# ----------------
# 7. Train
# ----------------
# Start fine-tuning with the Trainer built in the training-setup step.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.8639,1.784052,0.529
2,1.1888,1.679349,0.5435
3,0.7852,1.710902,0.558


TrainOutput(global_step=7500, training_loss=1.4636306193033854, metrics={'train_runtime': 1443.9922, 'train_samples_per_second': 20.776, 'train_steps_per_second': 5.194, 'total_flos': 1223742504960000.0, 'train_loss': 1.4636306193033854, 'epoch': 3.0})

In [None]:
# ----------------
# 8. Evaluate
# ----------------
# Run the Trainer's evaluation (no arguments passed) and print the metrics
# it returns.
results = trainer.evaluate()
print("Final evaluation:", results)

Final evaluation: {'eval_loss': 1.7109018564224243, 'eval_accuracy': 0.558, 'eval_runtime': 22.5277, 'eval_samples_per_second': 88.779, 'eval_steps_per_second': 22.195, 'epoch': 3.0}


In [None]:
# ----------------
# 9. Save fine-tuned model to Drive
# ----------------
save_path = "/content/drive/MyDrive/LLM & SLM/smollm-news"

# Write the model first, then the tokenizer, into the same Drive folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model saved to {save_path}")

Model saved to /content/drive/MyDrive/LLM & SLM/smollm-news


In [None]:
# ----------------
# 10. Test with a sample (inference)
# ----------------
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to evaluation mode so training-only behaviour (e.g. dropout) is off
# regardless of which cells were run before this one.
model.eval()

text = "Stocks rally as Federal Reserve signals possible rate cuts."
# Use the same truncation settings as tokenize_function so inference inputs
# are limited to the length the model was fine-tuned on.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=64,
).to(device)

# No gradients are needed for a prediction; skipping them saves memory.
with torch.no_grad():
    outputs = model(**inputs)

# The predicted class is the index of the largest logit, mapped back to its
# category name.
prediction = torch.argmax(outputs.logits, dim=-1).item()
print("Predicted category:", id2label[prediction])


Predicted category: BUSINESS
