# NLP: Comparing TF-IDF, BERT, and GPT

In [1]:
! pip install datasets

Collecting datasets
  Using cached datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Using cached datasets-4.1.1-py3-none-any.whl (503 kB)
Installing collected packages: datasets
Successfully installed datasets-4.1.1


In [1]:
from datasets import load_dataset

dataset = load_dataset("ag_news")
small_dataset = dataset["train"].shuffle(seed=42).select(range(500))  # Small sample

In [2]:
small_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 500
})

In [3]:
from pprint import pprint

# Show the first 3 examples
pprint(small_dataset[:5])

{'label': [0, 1, 0, 3, 0],
 'text': ['Bangladesh paralysed by strikes Opposition activists have brought '
          'many towns and cities in Bangladesh to a halt, the day after 18 '
          'people died in explosions at a political rally.',
          'Desiring Stability Redskins coach Joe Gibbs expects few major '
          'personnel changes in the offseason and wants to instill a culture '
          'of stability in Washington.',
          'Will Putin #39;s Power Play Make Russia Safer? Outwardly, Russia '
          'has not changed since the barrage of terrorist attacks that '
          'culminated in the school massacre in Beslan on Sept.',
          'U2 pitches for Apple New iTunes ads airing during baseball games '
          'Tuesday will feature the advertising-shy Irish rockers.',
          'S African TV in beheading blunder Public broadcaster SABC '
          'apologises after news bulletin shows footage of American beheaded '
          'in Iraq.']}


In [4]:
small_dataset.features['label']

ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'])

# TF-IDF

We're training a simple text classification model using TF-IDF (Term Frequency–Inverse Document Frequency) to convert news headlines into numerical features, and then using Logistic Regression to classify them into one of four categories: World, Sports, Business, or Sci/Tech. This approach doesn't understand the context of words, but it's quick and surprisingly effective on short texts.

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [6]:
df = pd.DataFrame({
    "text": small_dataset["text"],
    "label": small_dataset["label"]
})

label_names = dataset["train"].features["label"].names
df["label_name"] = df["label"].apply(lambda x: label_names[x])

In [7]:
df.head()

Unnamed: 0,text,label,label_name
0,Bangladesh paralysed by strikes Opposition act...,0,World
1,Desiring Stability Redskins coach Joe Gibbs ex...,1,Sports
2,Will Putin #39;s Power Play Make Russia Safer?...,0,World
3,U2 pitches for Apple New iTunes ads airing dur...,3,Sci/Tech
4,S African TV in beheading blunder Public broad...,0,World


In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

In [9]:
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
# This step is just so you can see what's going on, but not necessary

feature_names = vectorizer.get_feature_names_out()
X_train_df = pd.DataFrame(X_train_vec.toarray(), columns=feature_names)
X_train_df.head()

Unnamed: 0,000,04,10,100,11,12,13,14,140,150,...,zaloudek,zap,zarqawi,zarqawis,zayed,zdnet,zealand,zee,zero,zherdev
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.26101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Logistic Regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

In [12]:
# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=label_names))

              precision    recall  f1-score   support

       World       0.82      0.56      0.67        25
      Sports       0.83      0.78      0.81        32
    Business       0.69      0.58      0.63        19
    Sci/Tech       0.57      0.88      0.69        24

    accuracy                           0.71       100
   macro avg       0.73      0.70      0.70       100
weighted avg       0.74      0.71      0.71       100



# BERT (without fine-tuning)

https://huggingface.co/models

In this section, we’ll use pretrained BERT (without fine-tuning) to classify the same news samples. Instead of training a model end-to-end, we’ll use BERT as a feature extractor — generating sentence embeddings, then training a Logistic Regression classifier on top of those embeddings.

This approach is called "BERT as frozen embeddings" and it's a great bridge between traditional ML and deep learning.

- This method uses no gradient updates — just BERT's built-in "understanding".

- You get context-aware sentence embeddings with zero training on BERT.

- It’s usually better than TF-IDF, especially with fewer data points.


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm

In [None]:
# Load tokenizer and model (we're using 'bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Important: we're NOT fine-tuning

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
def get_bert_embedding(text):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return cls_embedding.squeeze().numpy()

# Convert all texts to BERT embeddings
X_bert = np.array([get_bert_embedding(text) for text in tqdm(df["text"])])
y = df["label"]

100%|██████████| 500/500 [02:14<00:00,  3.72it/s]


In [None]:
pd.DataFrame(X_bert).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.228359,0.011324,-0.028633,-0.316455,-0.414521,0.121089,0.306685,0.896508,-0.317951,0.230683,...,0.306268,0.500503,0.229095,-0.199255,0.117591,-0.176938,-0.082321,-0.571799,0.294347,-0.105267
1,-0.412019,-0.239822,-0.229257,0.081167,-0.576244,-0.363986,0.410183,0.51556,0.096929,-0.667916,...,0.077785,0.035519,0.356043,-0.218685,-0.116622,0.386384,-0.307047,-0.793273,0.35538,0.544011
2,-0.596973,-0.440494,-0.226746,-0.330813,-0.649351,-0.391808,0.132022,0.590866,0.664307,-0.258116,...,0.207182,0.580268,0.308361,-0.274541,0.740087,-0.610487,-0.139965,0.177611,0.656913,0.291045
3,-0.230153,0.104161,-0.029236,0.316301,-0.230702,-0.23888,0.671323,0.563431,-0.002342,-0.390459,...,-0.2807,-0.108429,0.324013,-0.210438,0.316025,0.219309,-0.227503,-0.25208,0.470735,-0.097277
4,-0.798143,0.061648,0.179822,-0.282741,-0.766942,0.177523,0.916237,0.786449,-0.160493,0.204127,...,-0.026612,0.111854,0.662108,-0.391294,0.206335,-0.179098,0.131333,-0.682872,0.666757,-0.044787


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_bert, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_names))

              precision    recall  f1-score   support

       World       0.83      0.80      0.82        25
      Sports       0.96      0.81      0.88        32
    Business       0.67      0.63      0.65        19
    Sci/Tech       0.65      0.83      0.73        24

    accuracy                           0.78       100
   macro avg       0.78      0.77      0.77       100
weighted avg       0.80      0.78      0.78       100



# BERT with fine-tuning

In this section, we’ll fine-tune BERT on our news classification task. Instead of using BERT as a frozen feature extractor, we’ll now allow the model to update its internal weights based on our specific dataset. This results in significantly better performance, especially on tasks with more nuance.

We'll use Hugging Face’s Trainer API to make the process super clean.

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from datasets import Dataset

In [None]:
# Convert to HuggingFace Dataset
dataset_hf = Dataset.from_pandas(df)
dataset_hf = dataset_hf.train_test_split(test_size=0.2)

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset_hf.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text", "label_name"])
tokenized_dataset.set_format("torch")

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8717,0.685071
2,0.434,0.424346
3,0.2516,0.352086


TrainOutput(global_step=150, training_loss=0.6473554531733196, metrics={'train_runtime': 33.3398, 'train_samples_per_second': 35.993, 'train_steps_per_second': 4.499, 'total_flos': 78934734028800.0, 'train_loss': 0.6473554531733196, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.35208624601364136,
 'eval_runtime': 0.7627,
 'eval_samples_per_second': 131.11,
 'eval_steps_per_second': 17.044,
 'epoch': 3.0}

In [None]:
predictions = trainer.predict(tokenized_dataset["test"])
y_pred = predictions.predictions.argmax(axis=1)
y_true = predictions.label_ids

In [None]:
label_names = dataset["train"].features["label"].names
print(classification_report(y_true, y_pred, target_names=label_names))

              precision    recall  f1-score   support

       World       1.00      0.81      0.90        27
      Sports       1.00      1.00      1.00        21
    Business       0.95      0.87      0.91        23
    Sci/Tech       0.81      1.00      0.89        29

    accuracy                           0.92       100
   macro avg       0.94      0.92      0.92       100
weighted avg       0.93      0.92      0.92       100



# BERT with transfer learning

In this section, we’ll use a domain-specific BERT model (e.g., trained on biomedical or scientific text) and fine-tune it on the same AG News dataset. This is called Transfer Learning — using a model that already understands a certain domain and adapting it to your specific task.

While AG News isn’t medical or legal, this demo will help you see how choosing the right pre-trained BERT variant (BioBERT, ClinicalBERT, SciBERT, etc.) can improve performance when domains match.

We'll use: allenai/scibert_scivocab_uncased
A BERT variant trained on scientific papers — great for showing domain-specific behavior, even if our AG News dataset isn’t a perfect match.

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import AutoTokenizer

# Load tokenizer for SciBERT
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Tokenize the dataset
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset_hf.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text", "label_name"]) if "label_name" in tokenized_dataset["train"].column_names else tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=4)

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results-scibert",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.9237,0.684864
2,0.5697,0.448764
3,0.3207,0.416972


TrainOutput(global_step=150, training_loss=0.6379006767272949, metrics={'train_runtime': 36.1586, 'train_samples_per_second': 33.187, 'train_steps_per_second': 4.148, 'total_flos': 78934734028800.0, 'train_loss': 0.6379006767272949, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.4169720709323883,
 'eval_runtime': 0.7199,
 'eval_samples_per_second': 138.903,
 'eval_steps_per_second': 18.057,
 'epoch': 3.0}

In [None]:
predictions = trainer.predict(tokenized_dataset["test"])
y_pred = predictions.predictions.argmax(axis=1)
y_true = predictions.label_ids

label_names = dataset["train"].features["label"].names
print(classification_report(y_true, y_pred, target_names=label_names))

              precision    recall  f1-score   support

       World       0.91      0.78      0.84        27
      Sports       0.90      0.90      0.90        21
    Business       0.84      0.91      0.88        23
    Sci/Tech       0.81      0.86      0.83        29

    accuracy                           0.86       100
   macro avg       0.87      0.86      0.86       100
weighted avg       0.86      0.86      0.86       100



# LLM approach with GPT

In this section, we’ll use GPT-3.5 (via OpenAI’s API) to perform prompt-based text classification. Instead of training or fine-tuning a model, we’ll give GPT examples in the prompt and ask it to classify new samples.

This is called few-shot or zero-shot prompting, and it's great for rapid prototyping when:

- You have little or no labeled data

- You need a model that understands natural instructions

- You want to skip training altogether

In [None]:
from openai import OpenAI
import os

In [None]:
key = 'INSERT YOURS'

In [None]:
client = OpenAI(api_key=key)

### Zero-shot prompting

In [None]:
sample_text = "NASA reveals new photos from the Mars rover."

In [None]:
prompt = f"""
Classify the following news article into one of these categories: World, Sports, Business, or Sci/Tech.

Article: "{sample_text}"

Category:
"""

In [None]:
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that classifies news articles."},
        {"role": "user", "content": prompt}
    ],
    temperature=0
)

In [None]:
gpt_output = response.choices[0].message.content.strip()
print("Predicted category:", gpt_output)

Predicted category: Category: Sci/Tech


In [None]:
categories = list(df.label_name.unique())

predicted_labels = []

for text in tqdm(df['text'], desc="Classifying with GPT"):
    prompt = f"""
    Classify the following news article into one of these categories: {', '.join(categories)}.

    Article: "{text}"

    Category:
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that classifies news articles."},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )
        prediction = response.choices[0].message.content.strip()
    except Exception as e:
        print("Error:", e)
        prediction = "Unknown"

    predicted_labels.append(prediction)

Classifying with GPT: 100%|██████████| 500/500 [04:17<00:00,  1.94it/s]


In [None]:
df['gpt_predicted'] = predicted_labels

In [None]:
df['gpt_predicted'].value_counts()

Unnamed: 0_level_0,count
gpt_predicted,Unnamed: 1_level_1
Category: Business,177
Category: Sports,115
Category: World,110
Category: Sci/Tech,74
Sports,19
"This news article would be classified under the category of ""World"".",1
This news article would fall under the category of Sports.,1
Business,1
Category: Entertainment,1
"This article does not fit into any of the provided categories (World, Sports, Sci/Tech, Business) as it appears to be a satirical or fictional piece rather than a news article reporting on real-world events.",1


In [None]:
def veredict_clean(text):
  if text == "Category: Business":
    return "Business"
  if text == "Category: Sports" or text =='Sports':
    return "Sports"
  if text == "Category: World":
    return "World"
  if text == "Category: Sci/Tech":
    return "Sci/Tech"
  return None

In [None]:
df['gtp_clean'] = df['gpt_predicted'].apply(lambda row: veredict_clean(row))

In [None]:
df_clean = df.dropna()

In [None]:
print(classification_report(df_clean['label_name'], df_clean['gtp_clean']))

              precision    recall  f1-score   support

    Business       0.62      0.97      0.76       113
    Sci/Tech       0.92      0.51      0.65       134
      Sports       0.95      0.95      0.95       134
       World       0.85      0.82      0.84       114

    accuracy                           0.81       495
   macro avg       0.84      0.81      0.80       495
weighted avg       0.84      0.81      0.80       495



# Few-Shot prompting

In [None]:
prompt = """
Classify the following news articles into one of these categories: World, Sports, Business, or Sci/Tech.

Examples:
- "The president of France met with the German chancellor to discuss economic policy." → World
- "The stock market closed higher today with gains in the tech sector." → Business
- "The Lakers defeated the Celtics in a close basketball game." → Sports
- "NASA announces a new mission to Europa, one of Jupiter's moons." → Sci/Tech

Now classify this:
"NASA reveals new photos from the Mars rover."

Category:
"""

# Summary

| Method                   | Learns on Your Data? | Understands Context? | Custom to Domain? | Training Time   |
|--------------------------|----------------------|-----------------------|-------------------|-----------------|
| TF-IDF + Logistic        | ✅ Yes               | ❌ No                | ❌ No              | ⚡ Fast          |
| BERT (frozen)            | ❌ No                | ✅ Yes               | ❌ No              | ⚡ Fast          |
| Fine-tuned BERT          | ✅ Yes               | ✅ Yes               | ✅ Yes             | 🕐 Medium        |
| Domain-Specific BERT  (transfer learning)   | ✅ Yes               | ✅ Yes               | ✅✅ Very           | 🕐 Medium        |
| LLM (GPT - zero/few-shot)| ❌ No (prompt only)  | ✅✅ Very             | ✅ Yes (via prompt)| 🚀 Instant (no training) but slower inference |
