In [1]:
!pip install torch torchvision torchaudio
!pip install scikit-learn
!pip install tqdm
!pip install nltk




In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# Resources needed by word_tokenize and stopwords.words.
# Recent NLTK releases load the 'punkt_tab' tables in word_tokenize, while
# older releases load 'punkt'; fetching both keeps the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Read the labelled messages from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='combined_data.csv')

In [4]:
# Peek at the first five rows
df.head(n=5)


Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [5]:
# Shared text-preprocessing resources: a Porter stemmer and the English stop-word set
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [6]:
def preprocess_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphanumeric or that appear in ``stop_words`` are dropped,
    and the remaining tokens are stemmed with the Porter stemmer ``ps``.

    Args:
        text: The message to clean. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as an empty message;
            any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # Missing cells have no .lower(); NaN is the only float unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    tokens = word_tokenize(text.lower())  # Tokenize and convert to lower case
    # Keep alphanumeric, non-stop-word tokens and reduce each one to its stem
    stems = [ps.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(stems)

In [7]:
# Clean every message once and keep the result next to the raw text
df['processed_text'] = df['text'].map(preprocess_text)
df.head()

Unnamed: 0,label,text,processed_text
0,1,ounce feather bowl hummingbird opec moment ala...,ounc feather bowl hummingbird opec moment alab...
1,1,wulvob get your medircations online qnb ikud v...,wulvob get medirc onlin qnb ikud viagra escape...
2,0,computer connection from cnn com wednesday es...,comput connect cnn com wednesday escapenumb ma...
3,1,university degree obtain a prosperous future m...,univers degre obtain prosper futur money earn ...
4,0,thanks for all your answers guys i know i shou...,thank answer guy know check rsync manual would...


In [8]:
# Encode the label column as consecutive integer class ids
le = LabelEncoder().fit(df['label'])
df['label'] = le.transform(df['label'])  # Convert labels to 0s and 1s

In [9]:
# Bag-of-words vectorizer used to turn the processed text into count features
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Bag-of-words features restricted to the 5000 most frequent terms.
# dtype=np.float32 makes the dense matrix half the size of the default int64
# one and already matches the float32 tensors built from it later on.
vectorizer = CountVectorizer(max_features=5000, dtype=np.float32)
X = vectorizer.fit_transform(df['processed_text']).toarray()  # Convert text to numerical vectors

# Label extraction, the train/test split and the tensor/DataLoader construction
# are done in the cells that follow; repeating them here only doubled the work.

In [11]:
# Target vector: the encoded label column as a NumPy array
y = df['label'].to_numpy()

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Pair each feature matrix with its labels as float32 tensors
train_data, test_data = (
    TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(targets, dtype=torch.float32),
    )
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

# Mini-batches of 64; only the training batches are reshuffled each epoch
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=64)

In [14]:
class SpamLSTM(nn.Module):
    """LSTM followed by a linear layer and a sigmoid, producing one value per sample.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state (default 128).
        num_layers: Number of stacked LSTM layers (default 1).

    ``forward`` accepts either a 2-D tensor ``(batch, input_size)``, which is
    treated as a sequence of length one, or a 3-D tensor
    ``(batch, seq_len, input_size)``. It returns a ``(batch, 1)`` tensor of
    sigmoid outputs in ``[0, 1]``.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        if x.dim() == 2:
            # With batch_first=True the LSTM expects (batch, seq, feature);
            # axis 1 is the sequence axis, so this adds a single time step.
            x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])  # Take the output of the last time step
        return self.sigmoid(out)

# Seed PyTorch's global RNG so weight initialisation (and the shuffling that
# draws from the same generator) is repeatable from one run to the next.
torch.manual_seed(42)

input_size = X_train.shape[1]  # number of feature columns in the training matrix
model = SpamLSTM(input_size)


In [15]:
# Binary cross-entropy on the sigmoid outputs, optimised with Adam
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train for a fixed number of passes over the training set
num_epochs = 10
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for features, labels in tqdm(train_loader):
        # Forward pass: predictions and targets are both shaped [batch_size, 1]
        outputs = model(features).view(-1, 1)
        labels = labels.view(-1, 1)
        loss = criterion(outputs, labels)

        # Backward pass: clear stale gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss for the per-epoch average
        total_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')


100%|██████████| 1044/1044 [01:12<00:00, 14.38it/s]


Epoch 1/10, Loss: 0.08053225335567811


100%|██████████| 1044/1044 [01:10<00:00, 14.79it/s]


Epoch 2/10, Loss: 0.022848281867552566


100%|██████████| 1044/1044 [01:10<00:00, 14.78it/s]


Epoch 3/10, Loss: 0.011627138665849907


100%|██████████| 1044/1044 [01:13<00:00, 14.11it/s]


Epoch 4/10, Loss: 0.007514711520592427


100%|██████████| 1044/1044 [01:15<00:00, 13.77it/s]


Epoch 5/10, Loss: 0.0057525625397141355


100%|██████████| 1044/1044 [01:14<00:00, 14.05it/s]


Epoch 6/10, Loss: 0.004590811813249452


100%|██████████| 1044/1044 [01:15<00:00, 13.79it/s]


Epoch 7/10, Loss: 0.003988357767795016


100%|██████████| 1044/1044 [01:16<00:00, 13.57it/s]


Epoch 8/10, Loss: 0.0033833781730558785


100%|██████████| 1044/1044 [01:15<00:00, 13.77it/s]


Epoch 9/10, Loss: 0.003126535134409214


100%|██████████| 1044/1044 [01:15<00:00, 13.80it/s]

Epoch 10/10, Loss: 0.0028551686884602423





In [16]:
# Evaluation function
def evaluate_model(model, dataloader, threshold=0.5):
    """Score a binary classifier on every batch of ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Each output value at or above ``threshold`` is counted as class 1.

    Args:
        model: Module returning one value per sample for a batch of features.
        dataloader: Iterable yielding ``(features, labels)`` tensor pairs.
        threshold: Decision cut-off applied to the model outputs (default 0.5).

    Returns:
        tuple: ``(accuracy, precision, recall, f1)`` as computed by the
        scikit-learn metric functions with their default settings.
    """
    model.eval()
    true_labels, predictions = [], []

    with torch.no_grad():
        for features, labels in dataloader:
            # Flatten explicitly: squeeze() would turn a batch of one into a
            # 0-d tensor, whose tolist() is a scalar that extend() rejects.
            scores = model(features).reshape(-1)
            predictions.extend((scores >= threshold).int().tolist())
            true_labels.extend(labels.reshape(-1).int().tolist())

    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    return accuracy, precision, recall, f1


In [17]:
# Score the trained model on the held-out split and report the four metrics
acc, prec, recall, f1 = evaluate_model(model=model, dataloader=test_loader)
print(f'Test Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

Test Accuracy: 0.9868, Precision: 0.9861, Recall: 0.9887, F1 Score: 0.9874


In [18]:
# Persist only the learned weights (state dict), not the whole module object
torch.save(model.state_dict(), f='spam_detection_model.pth')


In [19]:
def predict_spam(text, model, vectorizer, threshold=0.5):
    """Classify a single message as spam (1) or not spam (0).

    Args:
        text: Raw message; it is cleaned with ``preprocess_text`` first.
        model: Module that yields a single value for a one-row feature batch.
        vectorizer: Fitted vectorizer whose ``transform`` result supports
            ``.toarray()``.
        threshold: Score at or above which the message is labelled spam
            (default 0.5).

    Returns:
        tuple: ``(pred, prob)`` where ``pred`` is 0 or 1 and ``prob`` is the
        model output as a Python float.
    """
    model.eval()  # inference mode
    processed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([processed_text]).toarray()
    features = torch.tensor(vectorized_text, dtype=torch.float32)
    with torch.no_grad():
        prob = model(features).item()
    pred = 1 if prob >= threshold else 0
    return pred, prob

# Smoke-test the predictor on one hand-written message
test_text = "You have won a free iPhone! Click here to claim."
pred, prob = predict_spam(text=test_text, model=model, vectorizer=vectorizer)
print(f'Prediction: {"Spam" if pred == 1 else "Not Spam"}, Probability: {prob:.4f}')


Prediction: Spam, Probability: 0.9998


In [20]:
def predict_spam(text, model, vectorizer, threshold=0.5):
    """Classify a single message as spam (1) or not spam (0).

    Args:
        text: Raw message; it is cleaned with ``preprocess_text`` first.
        model: Module that yields a single value for a one-row feature batch.
        vectorizer: Fitted vectorizer whose ``transform`` result supports
            ``.toarray()``.
        threshold: Score at or above which the message is labelled spam
            (default 0.5).

    Returns:
        tuple: ``(pred, prob)`` where ``pred`` is 0 or 1 and ``prob`` is the
        model output as a Python float.
    """
    model.eval()  # inference mode
    processed_text = preprocess_text(text)
    vectorized_text = vectorizer.transform([processed_text]).toarray()
    features = torch.tensor(vectorized_text, dtype=torch.float32)
    with torch.no_grad():
        prob = model(features).item()
    pred = 1 if prob >= threshold else 0
    return pred, prob

# Function to get user input and predict spam with a loop
def user_input_prediction(model, vectorizer):
    """Interactively classify messages typed by the user until they stop.

    Each round reads one message, prints the prediction from ``predict_spam``
    and asks whether to continue. Answering "yes" or "y" (any case,
    surrounding whitespace ignored) starts another round; any other answer,
    a closed input stream or a keyboard interrupt at a prompt ends the loop.

    Args:
        model: Trained model passed through to ``predict_spam``.
        vectorizer: Fitted vectorizer passed through to ``predict_spam``.
    """
    while True:
        try:
            user_text = input("Enter a message to check if it is spam: ")
        except (EOFError, KeyboardInterrupt):
            # No more input can be read; leave the loop instead of raising.
            print("Exiting spam detection.")
            break

        pred, prob = predict_spam(user_text, model, vectorizer)
        print(f'Prediction: {"Spam" if pred == 1 else "Not Spam"}, Probability: {prob:.4f}')

        # Ask if the user wants to check another message
        try:
            check_more = input("Do you want to check another message? (yes/no): ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            check_more = 'no'
        if check_more not in ('yes', 'y'):
            print("Exiting spam detection.")
            break

# Start the interactive spam-check session
user_input_prediction(model=model, vectorizer=vectorizer)


Enter a message to check if it is spam: Thank you replying!
Prediction: Not Spam, Probability: 0.0749
Do you want to check another message? (yes/no): yes
Enter a message to check if it is spam: Congratulation. You won 5 million dollar in lottery
Prediction: Spam, Probability: 0.7971
Do you want to check another message? (yes/no): no
Exiting spam detection.
