#SMS Spam



# --- Using Logistic Regression



In [2]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 2: Load dataset
# This example uses the SMS Spam Collection dataset
df = pd.read_csv("/content/sample_data/spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']

# Convert labels to binary (ham=0, spam=1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Step 4: Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Step 6: Predictions & Evaluation
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Step 7: Test with custom emails
emails = [
    "Congratulations! You've won a free ticket to the Bahamas. Claim now!",
    "Hi Sarah, are we still on for the meeting tomorrow?"
]
emails_tfidf = vectorizer.transform(emails)
predictions = model.predict(emails_tfidf)

for email, label in zip(emails, predictions):
    print(f"{email} -> {'Spam' if label == 1 else 'Ham'}")


Accuracy: 0.9524663677130045
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.97      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115

Congratulations! You've won a free ticket to the Bahamas. Claim now! -> Spam
Hi Sarah, are we still on for the meeting tomorrow? -> Ham




# --- Using Naive Bayes



In [4]:
#Step 1 – Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

#Step 2 – Load and explore the dataset
# Example: SMS spam dataset
df = pd.read_csv("/content/sample_data/spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
print(df.head())

#Step 3 – Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)

#Step 4 – Convert text into numerical features
#We use TF-IDF (Term Frequency – Inverse Document Frequency).
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

#Step 5 – Train the model
#We’ll use Multinomial Naive Bayes (fast & effective for text).
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

#Step 6 – Evaluate
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

#Step 7 – Test with new emails
emails = [
    "Congratulations! You've won a $1000 gift card. Click here to claim.",
    "Hi John, can we reschedule our meeting to tomorrow?"
]
emails_tfidf = vectorizer.transform(emails)
predictions = model.predict(emails_tfidf)

for email, label in zip(emails, predictions):
    print(f"{email} -> {'Spam' if label == 1 else 'Ham'}")

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
Accuracy: 0.9668161434977578
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Congratulations! You've won a $1000 gift card. Click here to claim. -> Spam
Hi John, can we reschedule our meeting to tomorrow? -> Ham


#Using CNN

In [14]:
# Step 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.metrics import accuracy_score, classification_report

# Step 2: Load dataset
df = pd.read_csv("/content/sample_data/spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Step 4: Tokenization
vocab_size = 5000
max_len = 100
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Step 5: Build CNN model
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len)) # Increased embedding dimension
model.add(Conv1D(filters=128, kernel_size=5, activation='relu')) # Increased filters
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu')) # Added another Conv1D layer and increased filters
model.add(MaxPooling1D(pool_size=2)) # Added another MaxPooling1D layer
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(64, activation='relu')) # Increased dense layer units
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 6: Train
history = model.fit(
    X_train_pad, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_pad, y_test),
    verbose=2
)

# Step 7: Evaluate
y_pred_cnn = (model.predict(X_test_pad) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, y_pred_cnn))
print(classification_report(y_test, y_pred_cnn))


# Step 8: Test with new emails
emails = [
    "Congratulations! You've been selected to win a free trip to Paris. Click here to claim!",
    "Hi, just checking if we are still meeting for lunch tomorrow."
]
emails_seq = tokenizer.texts_to_sequences(emails)
emails_pad = pad_sequences(emails_seq, maxlen=max_len, padding='post')

predictions = (model.predict(emails_pad) > 0.5).astype("int32")

for email, label in zip(emails, predictions):
    print(f"{email} -> {'Spam' if label == 1 else 'Ham'}")

Epoch 1/5




70/70 - 8s - 115ms/step - accuracy: 0.8952 - loss: 0.2523 - val_accuracy: 0.9803 - val_loss: 0.0781
Epoch 2/5
70/70 - 6s - 83ms/step - accuracy: 0.9874 - loss: 0.0497 - val_accuracy: 0.9821 - val_loss: 0.0628
Epoch 3/5
70/70 - 6s - 88ms/step - accuracy: 0.9957 - loss: 0.0163 - val_accuracy: 0.9839 - val_loss: 0.0632
Epoch 4/5
70/70 - 10s - 150ms/step - accuracy: 0.9984 - loss: 0.0069 - val_accuracy: 0.9830 - val_loss: 0.0813
Epoch 5/5
70/70 - 6s - 88ms/step - accuracy: 1.0000 - loss: 4.2233e-04 - val_accuracy: 0.9857 - val_loss: 0.0748
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
Accuracy: 0.9856502242152466
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.97      0.93      0.95       150

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

[1m1/1[0m [32m━━━━━━━━━━━