Machine Learning Project - 5: **Spam Email Detection using NLP**

**Load Data into Colab:**

In [17]:
import pandas as pd

# Read the labelled emails from the CSV file
df = pd.read_csv("spam_data.csv")

# Encode the text labels as integers: "Spam" -> 1, "Not Spam" -> 0
label_codes = {"Spam": 1, "Not Spam": 0}
df["Label"] = df["Label"].map(label_codes)

print(df.head())  # Preview the first rows


                                     Email Text  Label
0      "Congratulation! You have won a lottery"      1
1             "Meeting scheduled at 4 PM today"      0
2  "Earn money from home. Click this link now."      1
3           "Submit your project before Friday"      0
4    "Exclusive deal! Buy now and get 50% off!"      1


**Text Preprocessing:**

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages used below: the stopword list plus the
# tokenizer data ('punkt' and 'punkt_tab', which was reported missing otherwise)
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

# English stopwords are loaded lazily and kept here so the corpus is read only once
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE["english"]


# Function to clean text
def preprocess_text(text):
    """Normalise one email body for vectorisation.

    The text is lowercased, every non-word character is replaced by a space,
    the result is tokenized with NLTK, and English stopwords are dropped.

    Args:
        text: Raw email text. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) is treated as missing text.

    Returns:
        The remaining tokens joined by single spaces; ``""`` when the input is
        not a string or nothing survives the cleaning.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise fail on .lower()
        return ""

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    words = word_tokenize(text)  # Tokenization

    # Look the stopwords up once per call instead of reloading the list per token
    stop_words = _english_stopwords()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

# Clean every email and keep the result in a new column next to the raw text
raw_emails = df["Email Text"]
df["Processed_Text"] = raw_emails.map(preprocess_text)

print(df.head())  # Preview the cleaned text

**Convert Text to Numerical Data:**

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target vector (Spam = 1, Not Spam = 0)
y = df["Label"]

# Fit TF-IDF on the cleaned emails and encode them as a numeric feature matrix
vectorizer = TfidfVectorizer()
cleaned_emails = df["Processed_Text"]
X = vectorizer.fit_transform(cleaned_emails)


**Split Data for Training & Testing:**


In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

print(f"Training Data: {X_train.shape}, Testing Data: {X_test.shape}")


Training Data: (4, 19), Testing Data: (1, 19)


**Train the Spam Classifier Model**

In [22]:
from sklearn.naive_bayes import MultinomialNB

# Build the Naive Bayes classifier and fit it on the training split in one step
# (fit() returns the estimator itself)
model = MultinomialNB().fit(X_train, y_train)

print("Model training completed!")


Model training completed!


**Make Predictions**

In [23]:
# Predict labels for the held-out emails
y_pred = model.predict(X_test)

# Put the true labels and the predictions side by side, indexed like y_test
results = y_test.to_frame(name="Actual").assign(Predicted=y_pred)
print(results)


   Actual  Predicted
1       0          1


**Evaluate Model Performance**

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix

# List both classes explicitly so the confusion matrix is always 2x2
# (rows = actual, columns = predicted, ordered Not Spam = 0, Spam = 1).
# Without this, a small test set containing a single class yields a 1x1 matrix.
class_labels = [0, 1]

accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(cm)


Model Accuracy: 0.00%
Confusion Matrix:
[[0 1]
 [0 0]]


 **Predict if a New Email is Spam or Not**

In [25]:
new_email = ["Win a free iPhone now! Click this link."]

# Clean the email with the same function used on the training text; the
# vectorizer was fitted on cleaned text, so raw input could miss vocabulary terms
new_email_processed = [preprocess_text(email) for email in new_email]

# Convert email to numerical data using the already-fitted vectorizer
new_email_vectorized = vectorizer.transform(new_email_processed)

# Predict spam or not
prediction = model.predict(new_email_vectorized)

if prediction[0] == 1:
    print("Spam Email 🚨")
else:
    print("Not Spam ✅")


Spam Email 🚨
