In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
import re

def preprocess_text(text):
    """Normalize a raw SMS message before it is vectorized.

    The text is lowercased, tokens starting with ``http`` or ``www`` are
    removed, every remaining character that is not ``a``-``z`` or whitespace
    is dropped, and runs of whitespace are collapsed to a single space.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example
            ``None`` or a float ``NaN`` standing in for a missing cell) is
            treated as an empty message.

    Returns:
        str: The cleaned message, or ``""`` if nothing survives cleaning.
    """
    # Non-string cells have no .lower(); without this guard a single missing
    # value would abort the whole Series.apply call with AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # URL alternatives come first so a whole link is removed as one token
    # instead of leaving its letters behind after punctuation is stripped.
    text = re.sub(r"http\S+|www\S+|[^a-z\s]", "", text)
    # Removing characters can leave gaps; squeeze them and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
# Read the labelled SMS corpus; the code below relies on the columns `SMS`
# (message text) and `CLASS` (label string).
path = 'Spam.csv'
df = pd.read_csv(path)

# Replace the raw messages with their cleaned form.
df['SMS'] = df['SMS'].apply(preprocess_text)

# Inputs are the cleaned strings; targets are 1 for 'spam' and 0 for any
# other label value.
X = df['SMS'].values
y = df['CLASS'].eq('spam').astype('int64').values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=420,
)

In [8]:
# File the fitted vectorizer is written to at the end of this cell.
vectorizer_save_path = 'tfidf_vectorizer.pkl'

# TF-IDF features capped at 5000 terms, with document-frequency cut-offs
# of min_df=2 and max_df=0.95.
vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.95,
)

# Fit on the training split only, then apply the fitted vectorizer to the
# held-out split.
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# Kept as the last expression so the cell still displays joblib's return value.
joblib.dump(vectorizer, vectorizer_save_path)

['tfidf_vectorizer.pkl']

In [13]:
class SpamDataset(Dataset):
    """Map-style dataset yielding ``(feature_vector, label)`` pairs.

    The texts are vectorized once at construction time and kept as the
    (sparse) matrix returned by ``vectorizer.transform``; only the requested
    row is converted to a dense ``float32`` tensor on access. This avoids
    re-running the vectorizer for every sample on every epoch.

    Args:
        X: Sequence of already-preprocessed text strings.
        y: Sequence of labels, index-aligned with ``X``.
        vectorizer: A fitted object whose ``transform(texts)`` returns a
            row-indexable matrix with a ``toarray()`` method (for example a
            fitted ``TfidfVectorizer``). It must not be refitted after the
            dataset has been created, because the features are cached.
    """

    def __init__(self, X, y, vectorizer):
        self.X = X
        self.y = y
        self.vectorizer = vectorizer
        # Transform everything up front. An empty corpus is skipped so that an
        # empty dataset can still be constructed without calling the vectorizer.
        self._features = vectorizer.transform(X) if len(X) > 0 else None

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the dense feature tensor and ``long`` label for one sample.

        Raises:
            IndexError: If the dataset is empty or ``index`` is out of range.
        """
        if self._features is None:
            raise IndexError("SpamDataset is empty")

        label = self.y[index]

        # A single row comes back as a 1 x n_features matrix; densify only
        # that row and drop the leading axis.
        dense_row = self._features[index].toarray().astype(np.float32)

        input_tensor = torch.from_numpy(dense_row).squeeze(0)
        label_tensor = torch.tensor(label).long()

        return input_tensor, label_tensor

In [14]:
class SpamDetector(nn.Module):
    """Three-layer fully connected network producing one raw logit per input.

    Two 64-unit hidden layers, each followed by ReLU and dropout (p=0.3),
    feed a final linear layer with a single output. No sigmoid is applied
    inside the network.

    Args:
        input_dim: Size of the last dimension of the input tensor.
    """

    def __init__(self, input_dim):
        super().__init__()

        hidden_units = 64
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, 1)

        # One dropout module is shared by both hidden layers.
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, x):
        """Return the logits for ``x``; the last dimension of the result is 1."""
        hidden = x
        # Both hidden stages are the same pipeline: linear -> ReLU -> dropout.
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(torch.relu(layer(hidden)))
        return self.fc3(hidden)

In [15]:
# Wrap each split in a dataset that turns text into feature tensors.
train_dataset = SpamDataset(X_train, y_train, vectorizer)
test_dataset = SpamDataset(X_test, y_test, vectorizer)

# Batches of 32; only the training data is reshuffled each epoch.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# The network's input width is the number of columns in the TF-IDF matrix.
input_dim = X_train_vector.shape[-1]

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device("cpu")
print(f"Device = {device}")

model = SpamDetector(input_dim)
model = model.to(device)

# The model emits raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-4,
)

Device = cuda


In [16]:
# Train for a fixed number of epochs, reporting the mean batch loss and the
# training-set accuracy every 10th epoch.
num_epochs = 100
for epoch in range(num_epochs):
    # Switch dropout back on; the periodic accuracy check below turns it off.
    model.train()
    running_loss = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        # Drop only the trailing size-1 axis. A bare squeeze() would also
        # collapse the batch axis when the final batch holds one sample,
        # leaving a 0-d tensor whose shape no longer matches the targets.
        loss = criterion(outputs.squeeze(-1), targets.float())

        # Backward pass
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        avg_loss = running_loss/len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

        # Quick accuracy check
        # Evaluate with dropout disabled so the reported accuracy reflects the
        # deterministic network rather than a randomly thinned one.
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                predicted = (torch.sigmoid(outputs.squeeze(-1)) > 0.5).float()
                total += targets.size(0)
                correct += (predicted == targets.float()).sum().item()
            print(f"Training Accuracy: {100 * correct / total:.2f}%")

Epoch [10/100], Loss: 0.0163
Training Accuracy: 99.83%
Epoch [20/100], Loss: 0.0065
Training Accuracy: 99.83%
Epoch [30/100], Loss: 0.0063
Training Accuracy: 99.83%
Epoch [40/100], Loss: 0.0054
Training Accuracy: 99.75%
Epoch [50/100], Loss: 0.0055
Training Accuracy: 99.83%
Epoch [60/100], Loss: 0.0041
Training Accuracy: 99.83%
Epoch [70/100], Loss: 0.0033
Training Accuracy: 99.83%
Epoch [80/100], Loss: 0.0061
Training Accuracy: 99.83%
Epoch [90/100], Loss: 0.0039
Training Accuracy: 99.83%
Epoch [100/100], Loss: 0.0062
Training Accuracy: 99.92%


In [None]:
def evaluate_model(mode, test_loader):
    """Placeholder for test-set evaluation; not implemented yet.

    Currently ignores both arguments, does nothing, and returns ``None``.

    NOTE(review): ``mode`` is presumably a typo for ``model`` -- confirm
    before implementing, since renaming it changes the keyword interface.
    """
