# Custom Spam Filtering

## Install Hugging Face Transformers

### Import Libraries

In [None]:
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader

## Read Data

In [None]:
# Load the tab-separated dataset into a pandas dataframe with the columns
# "label" and "text" (the file itself has no header row).
# Quoting is disabled because a message may contain an unbalanced double-quote
# character; with the default quote handling such a field can absorb the rows
# that follow it, silently merging several messages into one.
df = pd.read_csv(
    './dataset/SMSSpamCollection',
    sep='\t',
    names=["label", "text"],
    quoting=csv.QUOTE_NONE,
)

## Preprocessing

In [None]:
# Pull the raw message texts and their string labels out of the dataframe as
# array-like values, ready for encoding and tokenization.
texts, labels = (df[column].values for column in ("text", "label"))

### Label Encoding

In [None]:
# Map each distinct string label to an integer class id.
# The fitted encoder is kept so the mapping stays available afterwards.
encoder = LabelEncoder().fit(labels)
labels = encoder.transform(labels)

## Tokenization

In [None]:
# Tokenize the texts using the DistilBertTokenizer.
# Each text becomes a list of token ids, including the special tokens.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# truncation=True caps every sequence at the tokenizer's maximum model length,
# so an unusually long message cannot produce more positions than the model
# accepts.
tokenized_texts = [
    tokenizer.encode(text, add_special_tokens=True, truncation=True)
    for text in texts
]

### Padding

In [None]:
# Right-pad every token id list with zeros up to the length of the longest
# one, so that all sequences share a uniform length.
max_len = max(map(len, tokenized_texts))
padded_texts = []
for token_ids in tokenized_texts:
    missing = max_len - len(token_ids)
    padded_texts.append(token_ids + [0] * missing)

## Train Test Split

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# random_state pins the shuffle so reruns evaluate on the same held-out
# messages; stratify keeps the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    padded_texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

## Convert to Tensors

In [None]:
# Turn the four splits into PyTorch tensors, keeping the order
# (train inputs, test inputs, train labels, test labels).
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = map(
    torch.tensor,
    (X_train, X_test, y_train, y_test),
)

## Dataset and Dataloaders

In [None]:
# Pair inputs with their labels, then wrap each dataset in a DataLoader that
# yields batches of 32. Only the training loader reshuffles its samples.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

## Model

### Initialize Model

In [None]:
# Load a pretrained DistilBertForSequenceClassification model.
# The size of the classification head is taken from the fitted label encoder
# instead of relying on the library default, so it always matches the number
# of classes present in the data.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(encoder.classes_),
)

In [None]:
# Pick the compute device: the GPU when CUDA is available, the CPU otherwise,
# and move the model's parameters onto it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

### Hyperparameters

In [None]:
# Training hyperparameters.
num_epochs = 10  # number of full passes over the training set
learning_rate = 1e-5  # step size handed to the optimizer

### Loss Function and Optimizer

In [None]:
# Adam updates every model parameter using the configured learning rate;
# cross-entropy compares the model's logits against the integer class labels.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

## Train Model

In [None]:
# Train the model.
# Records the mean loss of every 50 consecutive batches in train_loss_values.

# from_pretrained() hands the model back in evaluation mode, so dropout stays
# disabled unless training mode is switched on explicitly.
model.train()

# Value the token id lists were padded with; positions holding it are masked
# out so the model does not attend to padding.
pad_token_id = 0
log_every = 50

train_loss_values = []
for epoch in range(num_epochs):
    running_loss = 0.0
    # The batch labels get their own name so the notebook-level `labels`
    # array is not overwritten by the last batch.
    for i, (inputs, batch_labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        batch_labels = batch_labels.to(device=device, dtype=torch.long)
        attention_mask = (inputs != pad_token_id).long()

        optimizer.zero_grad()
        # The first element of the model output holds the logits.
        outputs = model(inputs, attention_mask=attention_mask)[0]
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % log_every == log_every - 1:
            print(f"Epoch: {epoch} Iteration: {i} Loss: {running_loss/log_every}")
            train_loss_values.append(running_loss/log_every)
            running_loss = 0.0

### Plot Training Loss

In [None]:
# Plot the recorded training loss.
# Each entry of train_loss_values is a mean over 50 consecutive batches, so
# the x-axis counts logging steps rather than individual iterations.
plt.plot(np.arange(len(train_loss_values)), train_loss_values)
plt.title("Training Loss over Iterations")
plt.xlabel("Logging Step (mean over 50 iterations)")
plt.ylabel("Training Loss")
plt.show()

## Evaluate Model

In [None]:
# Evaluation: run the model over the test set and collect the predicted and
# true class ids as plain Python lists.
model.eval()

# Value the token id lists were padded with; positions holding it are masked
# out so the model does not attend to padding.
pad_token_id = 0

predictions = []
true_labels = []
with torch.no_grad():
    # The batch labels get their own name so the notebook-level `labels`
    # array is not overwritten, and they stay on the CPU because they are
    # only recorded, never fed to the model.
    for inputs, batch_labels in test_dataloader:
        inputs = inputs.to(device)
        attention_mask = (inputs != pad_token_id).long()
        # The first element of the model output holds the logits.
        logits = model(inputs, attention_mask=attention_mask)[0]
        predicted = logits.argmax(dim=1)
        # Save the predictions for later use
        predictions.extend(predicted.cpu().tolist())
        true_labels.extend(batch_labels.tolist())


## Performance Metrics

In [None]:
# Compute all evaluation metrics first, then print them in a fixed order:
# macro-averaged F1 score, confusion matrix, per-class classification report.
f1 = f1_score(true_labels, predictions, average='macro')
matrix = confusion_matrix(true_labels, predictions)
report = classification_report(true_labels, predictions)

print(f"F1 Score: {f1}")
for heading, result in (
    ("Confusion Matrix:", matrix),
    ("Classification Report:", report),
):
    print(heading)
    print(result)


In [None]:
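# Save a checkpoint of the classification model (epoch count, model weights,
# optimizer state and last loss) for later inference; uncomment to enable.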
# torch.save({
#             'epoch': num_epochs,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'loss': loss,
#             }, 'checkpoint.pth')