In [3]:
# Install necessary libraries
!pip install transformers torch scikit-learn onnx onnxruntime

# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import onnx

# Load the spam dataset
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')
data.columns = data.columns.str.strip()  # Strip whitespace from column names
data.rename(columns={'v1': 'Category', 'v2': 'Message'}, inplace=True)
data['Spam'] = data['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Check for class distribution
print(data['Spam'].value_counts())

# Address class imbalance if present
ham_samples = data[data['Spam'] == 0]
spam_samples = data[data['Spam'] == 1]

if len(ham_samples) > len(spam_samples):
    spam_samples = spam_samples.sample(len(ham_samples), replace=True)  # Oversample spam
data = pd.concat([ham_samples, spam_samples]).sample(frac=1, random_state=42)  # Shuffle the dataset

# Split dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Message'], data['Spam'], test_size=0.25, random_state=42
)

# Define the custom dataset class for tokenization
class SpamDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):  # Corrected __init__
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):  # Corrected __len__
        return len(self.texts)

    def __getitem__(self, idx):  # Corrected __getitem__
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize BertTiny tokenizer and model for sequence classification
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2  # Binary classification for spam detection
)

# Freeze initial layers to fine-tune the classifier head
def freeze_bert_layers(model, num_layers_to_freeze=2):
    for param in model.bert.embeddings.parameters():
        param.requires_grad = False
    for layer in model.bert.encoder.layer[:num_layers_to_freeze]:
        for param in layer.parameters():
            param.requires_grad = False

freeze_bert_layers(model)

# Prepare the data loaders
train_dataset = SpamDataset(train_texts, train_labels, tokenizer)
val_dataset = SpamDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Set up the device, optimizer, and scheduler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
total_steps = len(train_loader) * 3  # Assuming 3 epochs for training
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
def train_model(model, train_loader, val_loader, device, num_epochs=3):
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Epoch {epoch + 1} - Training Loss: {avg_train_loss:.4f}")

# Start the training process
train_model(model, train_loader, val_loader, device, num_epochs=3)

# Testing and Accuracy Evaluation
def evaluate_model(model, val_loader, device):
    model.eval()
    val_preds = []
    val_labels = []
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_val_loss += outputs.loss.item()

            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(val_labels, val_preds)
    print(f'Validation Accuracy: {accuracy:.4f}')
    print("Classification Report:")
    print(classification_report(val_labels, val_preds, target_names=['Ham', 'Spam']))

# Evaluate the model on the validation set
evaluate_model(model, val_loader, device)

# Test the model on new examples
def test_model(model, tokenizer, texts, device):
    model.eval()
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
        return predictions.cpu().numpy()

# Example emails for testing
test_emails = [
    "Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize!",
    "Can we schedule a meeting for next week?",
    "Hurry!!! Halloween is Near Get a Halloween Discount Today USE PROMO CODE 'halloween 2023' ",
    "Hey, just checking in to see how you're doing."
]

# Get predictions for the test emails
predictions = test_model(model, tokenizer, test_emails, device)

# Display predictions
for email, pred in zip(test_emails, predictions):
    label = 'Spam' if pred == 1 else 'Ham'
    print(f"Email: \"{email}\" - Prediction: {label}")

# Save the model in ONNX format
onnx_file_path = "bert_spam_detection.onnx"
dummy_input = torch.ones(1, 128, dtype=torch.int64).to(device)
dummy_attention_mask = torch.ones(1, 128, dtype=torch.int64).to(device)

# Export the model to ONNX with opset version 14
torch.onnx.export(
    model,
    (dummy_input, dummy_attention_mask),
    onnx_file_path,
    input_names=['input_ids', 'attention_mask'],
    output_names=['output'],
    dynamic_axes={'input_ids': {0: 'batch_size'}, 'attention_mask': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
    opset_version=14
)

print(f"Model exported to ONNX format at: {onnx_file_path}")


Collecting transformers
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.1 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.1 kB ? eta -:--:--
     ----------------------------------- -- 41.0/44.1 kB 393.8 kB/s eta 0:00:01
     -------------------------------------- 44.1/44.1 kB 310.9 kB/s eta 0:00:00
Collecting torch
  Downloading torch-2.5.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting onnx
  Downloading onnx-1.17.0-cp312-cp312-win_amd64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.19.2-cp312-cp312-win_amd64.whl.metadata (4.7 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


Spam
0    4825
1     747
Name: count, dtype: int64


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Training Loss: 0.6314
Epoch 2 - Training Loss: 0.5662
Epoch 3 - Training Loss: 0.5309
Validation Accuracy: 0.9258
Classification Report:
              precision    recall  f1-score   support

         Ham       0.91      0.95      0.93      1216
        Spam       0.94      0.91      0.92      1197

    accuracy                           0.93      2413
   macro avg       0.93      0.93      0.93      2413
weighted avg       0.93      0.93      0.93      2413

Email: "Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize!" - Prediction: Spam
Email: "Can we schedule a meeting for next week?" - Prediction: Ham
Email: "Hurry!!! Halloween is Near Get a Halloween Discount Today USE PROMO CODE 'halloween 2023' " - Prediction: Spam
Email: "Hey, just checking in to see how you're doing." - Prediction: Ham
Model exported to ONNX format at: bert_spam_detection.onnx
