In [1]:
# Import necessary libraries
import pandas as pd
import string
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Load and preprocess the dataset
data = pd.read_csv("SPAM text message 20170820 - Data.csv")

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

# Apply the preprocessing function to the 'Message' column
data['Message'] = data['Message'].apply(preprocess_text)




In [2]:
# Text Tokenization and Padding Sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Message'])
sequences = tokenizer.texts_to_sequences(data['Message'])
padded_sequences = pad_sequences(sequences, padding='post')

# Prepare the Labels
labels = np.array(data['Category'].map({'ham': 0, 'spam': 1}))

# Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Build the CNN Model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 16
max_length = len(max(padded_sequences, key=len))

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(1, activation='sigmoid')
])

# Compile the Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the Model Summary (Architecture)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 171, 16)           154528    
                                                                 
 conv1d (Conv1D)             (None, 167, 64)           5184      
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 159,777
Trainable params: 159,777
Non-trainable params: 0
_________________________________________________________________


In [3]:
# Train the Model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the Model on the Testing Data
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)

# Print the Test Accuracy
print("\nTest Accuracy: {:.4f}".format(test_accuracy))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test Accuracy: 0.9830
