In [1]:
!pip install PyMuPDF nltk tensorflow scikit-learn


Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.4


In [2]:
import fitz  # PyMuPDF (for PDF text extraction)
import re
import nltk
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Download NLTK tokenizer
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Sample labeled dataset: Spam (1) and Not Spam (0)
texts = [
    "Congratulations! You won a lottery. Click here to claim your prize.",  # Spam
    "Your order has been shipped. Track it using this link.",  # Not Spam
    "Claim your free gift now by signing up on our website.",  # Spam
    "Meeting scheduled at 5 PM today. Please confirm your attendance.",  # Not Spam
]

labels = [1, 0, 1, 0]  # Spam = 1, Not Spam = 0

# Tokenize and pad
MAX_VOCAB_SIZE = 5000  # Limit vocabulary size
MAX_SEQUENCE_LENGTH = 500  # Max length of sequences

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)  # Fit tokenizer on training data

# Convert text to sequences
text_sequences = tokenizer.texts_to_sequences(texts)
padded_texts = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

# Convert labels to categorical (one-hot encoding)
labels = np.array(labels)
labels = to_categorical(labels, num_classes=2)

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(padded_texts, labels, test_size=0.2, random_state=42)

print("Training Data Shape:", X_train.shape)
print("Test Data Shape:", X_test.shape)


Training Data Shape: (3, 500)
Test Data Shape: (1, 500)


In [5]:
# Define LSTM Model
model = Sequential([
    Embedding(MAX_VOCAB_SIZE, 128, input_length=MAX_SEQUENCE_LENGTH),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')  # Two classes: Spam or Not Spam
])

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()




In [6]:
# Train the model on the dataset
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=4)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - accuracy: 0.6667 - loss: 0.6928 - val_accuracy: 0.0000e+00 - val_loss: 0.7268
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 760ms/step - accuracy: 0.6667 - loss: 0.6811 - val_accuracy: 0.0000e+00 - val_loss: 0.7672
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 737ms/step - accuracy: 0.6667 - loss: 0.6735 - val_accuracy: 0.0000e+00 - val_loss: 0.8219
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 729ms/step - accuracy: 0.6667 - loss: 0.6715 - val_accuracy: 0.0000e+00 - val_loss: 0.8821
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.6667 - loss: 0.6630 - val_accuracy: 0.0000e+00 - val_loss: 0.9448
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.6667 - loss: 0.6499 - val_accuracy: 0.0000e+00 - val_loss: 1.0201
Epoch 7/10
[1m1/1[0m [

<keras.src.callbacks.history.History at 0x7ed1bd885c10>

In [10]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text.strip()

# Example: Upload a PDF in Colab and provide its path
pdf_path = "/content/drive/MyDrive/sample.pdf"  # Ensure to upload the file in Colab
pdf_text = extract_text_from_pdf(pdf_path)
print("Extracted Text:", pdf_text[:500])  # Print first 500 characters


Extracted Text: 1. Congratulations! You've won a free iPhone. Click the link to claim your prize! 
2. Limited-time offer! Buy now and get 50% off. Click here to avail the deal. 
3. Your account has been compromised. Reset your password immediately using this link. 
4. Get rich quick! Earn $5000 per week from home. Sign up now. 
5. Free vacation for two! Just pay the processing fee and confirm your booking. 
1. Your order #12345 has been shipped. Expected delivery: 3-5 business days. 
2. Reminder: Your appointme


In [11]:
def preprocess_text(text):
    """Clean and preprocess text."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d', '', text)  # Remove numbers
    return text.strip()

# Apply preprocessing to extracted PDF text
clean_text = preprocess_text(pdf_text)
print("Preprocessed Text:", clean_text[:500])


Preprocessed Text: congratulations  you ve won a free iphone  click the link to claim your prize    limited time offer  buy now and get   off  click here to avail the deal    your account has been compromised  reset your password immediately using this link    get rich quick  earn   per week from home  sign up now    free vacation for two  just pay the processing fee and confirm your booking    your order   has been shipped  expected delivery    business days    reminder  your appointment is scheduled for tomorrow


In [12]:
def classify_text(text):
    """Classify a given text as Spam or Not Spam using trained LSTM model."""
    processed_text = preprocess_text(text)  # Preprocess text
    sequence = tokenizer.texts_to_sequences([processed_text])  # Tokenize
    padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH, padding="post")  # Pad sequence

    prediction = model.predict(padded_sequence)
    label = np.argmax(prediction, axis=1)[0]  # Get predicted label

    return "Spam" if label == 1 else "Not Spam"

# Example: Classify extracted PDF text
classification_result = classify_text(clean_text)
print("Predicted Category:", classification_result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 409ms/step
Predicted Category: Spam
