# Custom Text Classification with BERT

Now, let’s fine-tune BERT on a real dataset.

Dataset: SMS Spam Classification https://github.com/mohitgupta-1O1/Kaggle-SMS-Spam-Collection-Dataset-/blob/master/spam.csv

We will use the SMS Spam Dataset, where each SMS is labeled as:

* ham (not spam)

* spam (unwanted message)


# Step 1: Import Libraries

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 2: Load and Preprocess Dataset

In [3]:
# import pandas as pd

# # Load dataset using the raw URL and specify the encoding
# df = pd.read_csv("https://raw.githubusercontent.com/mohitgupta-1O1/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", encoding='ISO-8859-1')

# # Rename columns
# df.rename(columns={"v1": "label", "v2": "message"}, inplace=True)

# # Convert labels to binary (0 for ham, 1 for spam)
# df["label"] = df["label"].map({"ham": 0, "spam": 1})

# # Drop unnecessary 'Unnamed' columns
# df.drop(columns=[col for col in df.columns if 'Unnamed' in col], inplace=True)

# # Check the data
# print(df.head())

In [4]:
# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     df["message"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
# )

In [5]:
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Step 3: Prepare Dataset for Training

In [6]:
# class SpamDataset(Dataset):
#     def __init__(self, texts, labels):
#         self.texts = texts
#         self.labels = labels

#     def __len__(self):
#         return len(self.texts)
    
#     def __getitem__(self, idx):
#         encoding = tokenizer(
#             self.texts[idx],
#             padding="max_length",
#             truncation=True,
#             max_length=64,
#             return_tensors="pt"
#         )
#         return {
#             "input_ids": encoding["input_ids"].squeeze(0),
#             "attention_mask": encoding["attention_mask"].squeeze(0),
#             "labels": torch.tensor(self.labels[idx], dtype=torch.long)
#         }

In [7]:
# train_dataset = SpamDataset(train_texts, train_labels)
# val_dataset = SpamDataset(val_texts, val_labels)

# Step 4: Load Pretrained Model

In [8]:
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 5: Define Metrics

In [9]:
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
#     acc = accuracy_score(labels, preds)
#     return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Step 6: Training Arguments

In [10]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     warmup_steps=100,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=10,
#     load_best_model_at_end=True,
#     metric_for_best_model="f1"
# )

# Step 7: Train Model

In [11]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics,
# )

In [12]:
# trainer.train()

# Step 8: Save Model

In [13]:
# model.save_pretrained("./bert_spam_model")
# tokenizer.save_pretrained("./bert_spam_model")

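# Step 9: Load Model for Prediction

Note: this step loads the fine-tuned model and tokenizer from `./bert_spam_model`, the directory written in Step 8. The training cells in Steps 2–8 are commented out, so uncomment and run them once if that directory does not exist yet.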

In [14]:
from transformers import AutoModelForSequenceClassification

In [15]:
# Single source of truth for the saved-model directory, so the tokenizer
# and the classifier are always restored from the same place.
MODEL_DIR = "./bert_spam_model"

# Restore the tokenizer and the sequence-classification model from disk.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# Step 10: Predict Function

In [16]:
def predict(text):
    """Classify one message with the module-level ``model`` and ``tokenizer``.

    The text is tokenized to a fixed length of 64 tokens (truncated or
    padded as needed), run through the model without gradient tracking,
    and the logits are turned into probabilities with a softmax.

    Args:
        text: The message to classify.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is ``"Spam"`` when the
        highest-probability class index is 1 and ``"Ham"`` otherwise;
        ``confidence`` is the softmax probability of that class as a float.

    Side effects:
        Switches the global model to eval mode and moves it to CUDA when
        available, otherwise to the CPU.
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(target_device)
    model.eval()

    # Tokenize first, then move the resulting tensors next to the model.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    inputs = inputs.to(target_device)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)

    predicted_class = torch.argmax(probs, dim=1).item()
    confidence = probs[0, predicted_class].item()

    verdict = "Spam" if predicted_class == 1 else "Ham"
    return (verdict, confidence)

# Step 11: Test Predictions

In [19]:
# Hand-written sample messages used to try out predict(). No labels are
# attached here; presumably the first and third are meant to look like spam
# and the other two like ordinary texts (reviewer's reading, not enforced).
messages = [
    "Congratulations! You've won a lottery. Claim now!",
    "Hey, are we meeting at 5?",
    "URGENT! Your account has been hacked. Click the link now to secure it.",
    "Don't forget the meeting tomorrow at 10am."
]

In [20]:
# Classify each sample message and print it next to its
# (label, confidence) result.
for msg in messages:
    prediction = predict(msg)
    print(f"Message: {msg} → Prediction: {prediction}")

Message: Congratulations! You've won a lottery. Claim now! → Prediction: ('Ham', 0.8677036762237549)
Message: Hey, are we meeting at 5? → Prediction: ('Ham', 0.999767005443573)
Message: URGENT! Your account has been hacked. Click the link now to secure it. → Prediction: ('Spam', 0.7624338269233704)
Message: Don't forget the meeting tomorrow at 10am. → Prediction: ('Ham', 0.9997327923774719)
