In [1]:
!pip install -q transformers datasets torch

In [2]:
import pandas as pd
from datasets import Dataset
from sklearn.utils import shuffle
import requests, zipfile, io

# Load UCI dataset
UCI_ZIP_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

def load_uci_sms():
    r = requests.get(UCI_ZIP_URL)
    with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
        with zf.open("SMSSpamCollection") as f:
            df = pd.read_csv(f, sep="\t", header=None, names=["label", "text"])
    df["label"] = df["label"].map({"ham":0,"spam":1})
    df = shuffle(df).reset_index(drop=True)
    return df

train_df = load_uci_sms()
dataset = Dataset.from_pandas(train_df)
dataset = dataset.train_test_split(test_size=0.2)


In [3]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import os

# Disable Weights & Biases prompts
os.environ["WANDB_DISABLED"] = "true"

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 1) Tokenize WITHOUT fixed padding (let the data collator handle it)
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)  # no padding here

tokenized_data = dataset.map(tokenize, batched=True)
tokenized_data = tokenized_data.rename_column("label", "labels")

# (Optional) keep only needed columns so collator sees clean inputs
cols_to_keep = ["input_ids", "attention_mask", "labels"]
tokenized_data = tokenized_data.remove_columns(
    [c for c in tokenized_data["train"].column_names if c not in cols_to_keep]
)

# 2) Dynamic padding at batch time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3) Model + training args
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./sms-bert",
    eval_strategy="epoch",     # ‚úÖ fixed argument name
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=20,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=data_collator,  # ‚Üê dynamic padding fixes the stack error
    tokenizer=tokenizer,          # enables smart padding/truncation behaviors
)

trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0767,0.037633
2,0.0169,0.039465
3,0.0152,0.032161


TrainOutput(global_step=837, training_loss=0.04031757790297092, metrics={'train_runtime': 5042.5393, 'train_samples_per_second': 2.652, 'train_steps_per_second': 0.166, 'total_flos': 220408158037200.0, 'train_loss': 0.04031757790297092, 'epoch': 3.0})

In [5]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from google.colab import files
import io, os, re
from typing import List, Optional


In [6]:
try:
    _ = model.config.num_labels
    _ = tokenizer.pad_token_id
    print("‚úÖ Using fine-tuned model & tokenizer already in memory.")
except Exception as _:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    if not os.path.isdir("./sms-bert-model"):
        raise RuntimeError(
            "Fine-tuned model not found in memory and './sms-bert-model' does not exist.\n"
            "After training, run:\n"
            "  model.save_pretrained('./sms-bert-model')\n"
            "  tokenizer.save_pretrained('./sms-bert-model')\n"
            "‚Ä¶then re-run this cell."
        )
    tokenizer = AutoTokenizer.from_pretrained("./sms-bert-model")
    model = AutoModelForSequenceClassification.from_pretrained("./sms-bert-model")
    print("‚úÖ Loaded fine-tuned model & tokenizer from ./sms-bert-model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()
print(f"üñ•Ô∏è Using device: {device}")

‚úÖ Using fine-tuned model & tokenizer already in memory.
üñ•Ô∏è Using device: cpu


In [7]:
print("üì§ Upload your CSV (expected: Aman.csv with columns like "
      "'date','time','sender','message typ/message type','message body','spam')")
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file uploaded.")
csv_name = next(iter(uploaded))
print(f"Uploaded: {csv_name}")

# Try robust reading (handles utf-8 / latin-1 etc.)
def read_csv_robust(b: bytes) -> pd.DataFrame:
    for enc in ["utf-8", "utf-8-sig", "latin-1"]:
        try:
            return pd.read_csv(io.BytesIO(b), encoding=enc)
        except Exception:
            continue
    # Fallback: let pandas guess
    return pd.read_csv(io.BytesIO(b), engine="python")

df = read_csv_robust(uploaded[csv_name])

üì§ Upload your CSV (expected: Aman.csv with columns like 'date','time','sender','message typ/message type','message body','spam')


Saving Aman_all.csv to Aman_all.csv
Uploaded: Aman_all.csv


In [8]:
def norm(s: str) -> str:
    return re.sub(r"\s+", " ", s.strip().lower())

df.columns = [norm(c) for c in df.columns]

# common variants for message/type columns
msg_candidates = ["message body", "message", "body", "message_body", "msg"]
typ_candidates = ["message typ", "message type", "type", "folder"]
date_candidates = ["date"]
time_candidates = ["time"]

def first_present(cands: List[str], cols: List[str]) -> Optional[str]:
    for c in cands:
        if c in cols:
            return c
    return None

msg_col  = first_present(msg_candidates, df.columns)
typ_col  = first_present(typ_candidates, df.columns)
date_col = first_present(date_candidates, df.columns)
time_col = first_present(time_candidates, df.columns)

if msg_col is None:
    raise ValueError(f"Couldn't find your message text column. "
                     f"Looked for: {msg_candidates}. Found: {list(df.columns)}")

# Keep original for output; build a clean text series
text = df[msg_col].astype(str).fillna("")


In [9]:
print("\n--- CSV quick stats ---")
print(f"Rows: {len(df)}")
if typ_col:
    print("Message type distribution:")
    print(df[typ_col].astype(str).str.strip().value_counts().head(10))
if date_col:
    try:
        dts = pd.to_datetime(df[date_col], errors="coerce")
        print(f"Date range: {dts.min()}  ‚Üí  {dts.max()}")
    except Exception:
        pass



--- CSV quick stats ---
Rows: 2044
Date range: 2023-01-01 00:00:00  ‚Üí  2025-08-09 00:00:00


In [14]:
# ‚úÖ REPLACEMENT for the batching/padding part
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

batch_size = 64

# Tokenize without padding; we‚Äôll pad per-batch
enc = tokenizer(
    text.tolist(),
    truncation=True,
    padding=False,
    return_tensors=None,
)

class InferenceDataset(torch.utils.data.Dataset):
    def __init__(self, enc):
        self.enc = enc
    def __len__(self):
        return len(self.enc["input_ids"])
    def __getitem__(self, idx):
        return {k: self.enc[k][idx] for k in self.enc}

# Dynamic padding at batch time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

loader = DataLoader(
    InferenceDataset(enc),
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

# Inference
probs_spam = np.zeros(len(text), dtype=np.float32)
softmax = torch.nn.Softmax(dim=1)
model.eval()

offset = 0
with torch.no_grad():
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits  # [B,2]
        p = softmax(logits)             # probs over [ham, spam]
        spam_slice = p[:, 1].detach().cpu().numpy()
        bsz = spam_slice.shape[0]
        probs_spam[offset:offset+bsz] = spam_slice
        offset += bsz

# Base labels (you can keep these if you don't want threshold tuning)
pred_labels = (probs_spam >= 0.5).astype(int)
pred_names  = np.where(pred_labels == 1, "spam", "ham")
conf_pct    = (np.maximum(probs_spam, 1.0 - probs_spam) * 100.0)


In [15]:
import re
import numpy as np
import pandas as pd

threshold = 0.70
extreme_spam_cutoff = 0.95

# Base thresholding
base_pred_spam = (probs_spam >= threshold).astype(int)

# Find columns again (normalized earlier)
cols = list(df.columns)
msg_col = next((c for c in ["message body","message","body","message_body","msg"] if c in cols), None)
sender_col = "sender" if "sender" in cols else None
texts   = df[msg_col].astype(str).fillna("")
senders = df[sender_col].astype(str).fillna("") if sender_col else pd.Series([""]*len(df))

sender_keywords = [r"\bSBIUPI\b", r"\bSBI\b", r"\bHDFCBK\b", r"\bICICIBK?\b", r"\bAXISBK?\b",
                   r"\bKOTAK\b", r"\bPNB\b", r"\bBOB\b", r"\bPAYTM\b", r"\bAMAZONPAY\b",
                   r"\bGPAY\b", r"\bPHONEPE\b"]
txn_cues  = [r"\b(credited|debited|credit of|debit of|txn|transaction|ref(?:\.|) no|utr|acct(?:\.|) ending|a/c(?:\.|) .* ending)\b",
             r"\b(upi|imps|neft|rtgs)\b",
             r"\b(otp|one[- ]time password)\b"]
promo_cues = [r"\b(offer|sale|discount|cashback|win|free|deal|limited time|coupon|subscribe)\b"]

sr = re.compile("|".join(sender_keywords), flags=re.I)
tr = re.compile("|".join(txn_cues),      flags=re.I)
pr = re.compile("|".join(promo_cues),    flags=re.I)

consider = []
for s, t, p in zip(senders, texts, probs_spam):
    consider.append(((sr.search(s) or tr.search(t)) and not pr.search(t) and (p < extreme_spam_cutoff)))

consider = np.array(consider, dtype=bool)

final_pred_spam = base_pred_spam.copy()
final_pred_spam[consider] = 0  # override to ham

pred_labels = final_pred_spam
pred_names  = np.where(pred_labels == 1, "spam", "ham")


In [16]:
# (Optional) sanity check before building 'out'
assert len(probs_spam) == len(text), "Size mismatch: tokens vs text"

out = df.copy()
out["pred_spam"]          = pred_labels                  # 0=ham, 1=spam
out["pred_label"]         = pred_names                   # "ham"/"spam"
out["spam_probability_%"] = (probs_spam * 100.0).round(2)
out["confidence_%"]       = (np.maximum(probs_spam, 1.0 - probs_spam) * 100.0).round(2)

# Your existing summary prints are fine


In [18]:
out = df.copy()
out["pred_spam"]          = pred_labels                # 0=ham, 1=spam
out["pred_label"]         = pred_names                 # "ham"/"spam"
out["spam_probability_%"] = (probs_spam * 100.0).round(2)   # P(spam) in %
out["confidence_%"]       = conf_pct.round(2)               # max class prob in %

# ---------- 6) Diagnostics: predicted distribution and simple sanity check ----------
print("\n--- Prediction summary ---")
n = len(out)
spam_frac = float(out["pred_spam"].mean()) if n else 0.0
print(f"Predicted SPAM fraction: {spam_frac*100:.2f}%  ({out['pred_spam'].sum()}/{n})")
print("Top senders (predicted as spam) sample:")
if "sender" in out.columns:
    try:
        print(out.loc[out["pred_spam"]==1, "sender"].astype(str).str.strip().value_counts().head(10))
    except Exception:
        pass

if spam_frac > 0.85:
    print("\n‚ö†Ô∏è Heads-up: Over 85% predicted as spam. This can happen if:")
    print("   - Your source inbox has lots of promotions/OTP/transactional patterns.")
    print("   - Threshold is too strict (we used 0.5). You can tune it, e.g., 0.6/0.7.")
    print("   - Your fine-tuned model overfits or your CSV messages differ a lot from UCI SMS style.")



--- Prediction summary ---
Predicted SPAM fraction: 86.69%  (1772/2044)
Top senders (predicted as spam) sample:
sender
VM-SBIUPI    198
JK-SBIUPI    103
JD-SBIUPI     89
CP-SBIUPI     77
AD-AIRTEL     58
BA-SBIUPI     56
AD-650025     56
AZ-AIRTEL     44
VK-SBIUPI     43
AX-ARTLTV     37
Name: count, dtype: int64

‚ö†Ô∏è Heads-up: Over 85% predicted as spam. This can happen if:
   - Your source inbox has lots of promotions/OTP/transactional patterns.
   - Threshold is too strict (we used 0.5). You can tune it, e.g., 0.6/0.7.
   - Your fine-tuned model overfits or your CSV messages differ a lot from UCI SMS style.


In [19]:
out_name = "aman_with_predictions.csv"
out.to_csv(out_name, index=False, encoding="utf-8")
print(f"\n‚úÖ Saved: {out_name}")
files.download(out_name)


‚úÖ Saved: aman_with_predictions.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>