# LLM Project Notebook: Topic Classification on 20 Newsgroups
This notebook implements a complete NLP pipeline including data loading, preprocessing, vectorization, baseline modeling, fine-tuning a transformer model, and pushing it to the Hugging Face Hub.

In [2]:
# ✅ Step 1: Import Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import evaluate
from huggingface_hub import notebook_login


In [3]:
# ✅ Step 2: Load Dataset
dataset = load_dataset("SetFit/20_newsgroups")
df = pd.DataFrame(dataset['train'])
df.head()


Repo card metadata block was not found. Setting CardData to empty.


Unnamed: 0,text,label,label_text
0,I was wondering if anyone out there could enli...,7,rec.autos
1,A fair number of brave souls who upgraded thei...,4,comp.sys.mac.hardware
2,"well folks, my mac plus finally gave up the gh...",4,comp.sys.mac.hardware
3,\nDo you have Weitek's address/phone number? ...,1,comp.graphics
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,sci.space


In [4]:
# ✅ Step 3: Preprocessing
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r"<.*?>", "", text)  # remove HTML
    text = re.sub(r"[^a-zA-Z]", " ", text)  # keep only letters
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return " ".join(tokens)

df["clean_text"] = df["text"].apply(clean_text)
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/snoopy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/snoopy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/snoopy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label,label_text,clean_text
0,I was wondering if anyone out there could enli...,7,rec.autos,wondering anyone could enlighten car saw day d...
1,A fair number of brave souls who upgraded thei...,4,comp.sys.mac.hardware,fair number brave soul upgraded clock oscillat...
2,"well folks, my mac plus finally gave up the gh...",4,comp.sys.mac.hardware,well folk mac plus finally gave ghost weekend ...
3,\nDo you have Weitek's address/phone number? ...,1,comp.graphics,weitek address phone number like get informati...
4,"From article <C5owCB.n3p@world.std.com>, by to...",14,sci.space,article tombaker world std com tom baker under...


In [5]:
# ✅ Step 4: Vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]


In [6]:
# ✅ Step 5: Train Baseline Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(clf, "tfidf_logreg_model.pkl")


Accuracy: 0.7344233318603623
              precision    recall  f1-score   support

           0       0.62      0.66      0.64        97
           1       0.66      0.73      0.69       104
           2       0.71      0.70      0.70       115
           3       0.67      0.64      0.66       123
           4       0.83      0.63      0.71       126
           5       0.76      0.86      0.81       106
           6       0.65      0.73      0.69       109
           7       0.80      0.74      0.77       139
           8       0.73      0.75      0.74       122
           9       0.52      0.85      0.65       102
          10       0.90      0.81      0.85       108
          11       0.90      0.85      0.87       125
          12       0.68      0.68      0.68       114
          13       0.81      0.82      0.81       119
          14       0.80      0.82      0.81       127
          15       0.73      0.78      0.75       122
          16       0.78      0.76      0.77       12

['tfidf_logreg_model.pkl']

In [7]:
# ✅ Step 6: Try a Pretrained Inference Pipeline
pipe = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")
sample_texts = df["text"].sample(5).tolist()
for text in sample_texts:
    result = pipe(text[:512])[0]
    print(f"Text: {text[:100]}")
    print(f"Prediction: {result['label']} (Confidence: {result['score']:.4f})\n")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Text: 
Oh, Your Highness?   And exactly why "should" the quotation
marks enclose "laws," not "must."

In c
Prediction: joy (Confidence: 0.3811)

Text: Hi again!

    Okay, am getting an old AT type together as well.
Anyone have a 16 bit MFM HDC they'd
Prediction: joy (Confidence: 0.9962)

Text: 
----------
Prediction: anger (Confidence: 0.6872)

Text: #>In article <1993Apr15.222600.11690@research.nj.nec.com>  
#>>  ...
#>> 	Several chemists already h
Prediction: anger (Confidence: 0.6898)

Text: Oh, excuse me for wasting the bandwidth, but I was referring to 
the original incident, not the rece
Prediction: anger (Confidence: 0.9893)



In [8]:
# ✅ Step 7: Hugging Face Login (Secure)
#notebook_login()
from huggingface_hub import login
login(token="")


In [None]:
# ✅ Step 8: Fine-Tune Transformer Model
# Disable W&B and Tokenizer Parallelism
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Force CPU (avoid MPS backend)
device = torch.device("cpu")

# Load Tiny Model and Tokenizer
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
model = AutoModelForSequenceClassification.from_pretrained(
    "prajjwal1/bert-tiny",
    num_labels=20   
).to(device)        # Move model to CPU

# Tokenization Function 
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512   
    )

# Prepare Dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

# Subset Dataset for Fast Training
train_ds = tokenized_dataset["train"].shuffle(seed=42).select(range(1000))  # Smaller training set
eval_ds = tokenized_dataset["test"].shuffle(seed=42).select(range(200))     # Smaller eval set

# Define Evaluation Metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return accuracy.compute(predictions=preds, references=labels)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,        # Safe for small model
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    logging_steps=50,
    push_to_hub=False,                     
    evaluation_strategy="epoch",           # Evaluate at the end of each epoch
    save_strategy="epoch",                 # Save model at the end of each epoch
    load_best_model_at_end=True,
    no_cuda=True                           # Force CPU (disable GPU/MPS)
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train Model
trainer.train()

# Save Model and Tokenizer Locally
model.save_pretrained("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7532 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


  0%|          | 0/63 [00:00<?, ?it/s]

{'loss': 3.001, 'grad_norm': 2.634221076965332, 'learning_rate': 4.126984126984127e-06, 'epoch': 0.79}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 3.0000641345977783, 'eval_accuracy': 0.075, 'eval_runtime': 3.8055, 'eval_samples_per_second': 52.555, 'eval_steps_per_second': 3.416, 'epoch': 1.0}
{'train_runtime': 71.3338, 'train_samples_per_second': 14.019, 'train_steps_per_second': 0.883, 'train_loss': 3.0031339251805864, 'epoch': 1.0}


('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/vocab.txt',
 './finetuned_model/added_tokens.json',
 './finetuned_model/tokenizer.json')

In [15]:
import random

import wandb

# Start a new wandb run to track this script.
run = wandb.init(
    # Set the wandb entity where your project will be logged (generally your team name).
    entity="alinatendler-self-employed",
    # Set the wandb project where this run will be logged.
    project="LLM-LHL",
    # Track hyperparameters and run metadata.
    config={
        "learning_rate": 0.02,
        "architecture": "CNN",
        "dataset": "CIFAR-100",
        "epochs": 10,
    },
)

# Simulate training.
epochs = 10
offset = random.random() / 5
for epoch in range(2, epochs):
    acc = 1 - 2**-epoch - random.random() / epoch - offset
    loss = 2**-epoch + random.random() / epoch + offset

    # Log metrics to wandb.
    run.log({"acc": acc, "loss": loss})

# Finish the run and upload any remaining data.
run.finish()

0,1
acc,▁▃▂▇█▆█▇
loss,██▆▃▂▁▂▁

0,1
acc,0.80961
loss,0.15433
