In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("spam.csv", encoding="latin-1")

# Keep only relevant columns and rename them
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

# Convert 'label' to binary (ham = 0, spam = 1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Display first few rows
print(df.head())


   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [None]:
print(df.columns)

Index(['label', 'text'], dtype='object')


In [None]:
!pip install tensorflow==2.15 keras==2.15




In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download stopwords if not already downloaded
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Function for text cleaning
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Apply text cleaning
df["cleaned_text"] = df["text"].apply(clean_text)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["cleaned_text"])

# Assign labels
y = df["label"]

# Print a sample of cleaned text
print(df[["text", "cleaned_text"]].head())

# Print TF-IDF shape
print("TF-IDF Matrix Shape:", X.shape)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                        cleaned_text  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts st ...  
3                u dun say early hor u c already say  
4        nah dont think goes usf lives around though  
TF-IDF Matrix Shape: (5572, 8448)


In [None]:
import numpy as np

# Convert sparse matrix to dense and show the first 5 rows
print(np.round(X.toarray(), 2)[:5])

# Print feature names
print(vectorizer.get_feature_names_out()[:10])  # First 10 words
  # First 10 words
 # First 10 words


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['aa' 'aah' 'aaniye' 'aaooooright' 'aathilove' 'aathiwhere' 'ab' 'abbey'
 'abdomen' 'abeg']


In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the sizes
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (4457, 8448)
Testing set size: (1115, 8448)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train the Naïve Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Make predictions
y_pred = nb_model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.967713004484305

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np


In [None]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)  # Keep the top 10,000 most frequent words
tokenizer.fit_on_texts(df["cleaned_text"])

# Convert text into sequences
X_seq = tokenizer.texts_to_sequences(df["cleaned_text"])

# Pad sequences to ensure uniform input length
max_len = 100  # Fixed length for all sequences
X_padded = pad_sequences(X_seq, maxlen=max_len, padding="post")

# Assign labels
y = np.array(df["label"])  # Labels: 0 for ham, 1 for spam

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)


In [None]:
# Build LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.2),  # Dropout to reduce overfitting
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation="sigmoid")  # Binary classification (Spam or Ham)
])

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Display model summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 spatial_dropout1d (Spatial  (None, 100, 128)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 1371701 (5.23 MB)
Trainable params: 1371701 (5.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test), verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test), verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8655


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}  # Alpha is the smoothing parameter

# Perform grid search
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Best Parameters: {'alpha': 0.1}
Best Accuracy: 0.8236432586955755


In [None]:
from tensorflow.keras.layers import GRU

# Replace LSTM with GRU
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.2),
    GRU(100, dropout=0.2, recurrent_dropout=0.2),  # GRU instead of LSTM
    Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79d4d4731dd0>

In [2]:
!git clone https://github.com/Daniel-mass/spam-text-classifier-ml-dl.git


Cloning into 'spam-text-classifier-ml-dl'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [3]:
%cd spam-text-classifier-ml-dl


/content/spam-text-classifier-ml-dl


In [4]:
!git config --global user.name "Danie-mass"

In [5]:
!git config --global user.email "danieldenision@gmail.com"

In [6]:
import os, getpass, subprocess, shlex
token = getpass.getpass('Paste your GitHub token (input hidden): ')
os.environ['GITHUB_TOKEN'] = token

# Set remote URL to include the token (only in memory for this runtime)
remote_url = "https://{}@github.com/daniel-mass/spam-text-classifier-ml-dl.git".format(os.environ['GITHUB_TOKEN'])
subprocess.run(shlex.split(f'git remote set-url origin "{remote_url}"'), check=True)

Paste your GitHub token (input hidden): ··········


CompletedProcess(args=['git', 'remote', 'set-url', 'origin', 'https://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@github.com/daniel-mass/spam-text-classifier-ml-dl.git'], returncode=0)

In [7]:
!git add -A

In [8]:
!git commit -m "Initial push from Colab"

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [9]:
!git push origin HEAD

Everything up-to-date


In [10]:
!pwd

/content/spam-text-classifier-ml-dl


In [11]:
!ls

README.md


In [12]:
!git status


On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [13]:
!git add README.md

In [14]:
!git commit -m "Add README"

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [15]:
!git push origin HEAD

To https://github.com/daniel-mass/spam-text-classifier-ml-dl.git
 [31m! [rejected]       [m HEAD -> main (fetch first)
[31merror: failed to push some refs to 'https://github.com/daniel-mass/spam-text-classifier-ml-dl.git'
[m[33mhint: Updates were rejected because the remote contains work that you do[m
[33mhint: not have locally. This is usually caused by another repository pushing[m
[33mhint: to the same ref. You may want to first integrate the remote changes[m
[33mhint: (e.g., 'git pull ...') before pushing again.[m
[33mhint: See the 'Note about fast-forwards' in 'git push --help' for details.[m


In [16]:
!git pull --rebase origin main


remote: Enumerating objects: 5, done.[K
remote: Counting objects:  20% (1/5)[Kremote: Counting objects:  40% (2/5)[Kremote: Counting objects:  60% (3/5)[Kremote: Counting objects:  80% (4/5)[Kremote: Counting objects: 100% (5/5)[Kremote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects:  50% (1/2)[Kremote: Compressing objects: 100% (2/2)[Kremote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects:  33% (1/3)Unpacking objects:  66% (2/3)Unpacking objects: 100% (3/3)Unpacking objects: 100% (3/3), 1.67 KiB | 1.67 MiB/s, done.
From https://github.com/daniel-mass/spam-text-classifier-ml-dl
 * branch            main       -> FETCH_HEAD
   2a58bb5..c9f7eb7  main       -> origin/main
Updating 2a58bb5..c9f7eb7
Fast-forward
 README.md | 53 [32m+++++++++++++++++++++++++++++++++++++++++++++++++++[m[31m--[m
 1 file changed, 51 insertions(+), 2 deletions(-)


In [17]:
!git push origin main


Everything up-to-date
