<a href="https://colab.research.google.com/github/efeecllk/nlp/blob/main/spam_mail_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [10]:
# 1. Preparing the Dataset
# A tiny hand-written corpus for spam detection: the i-th entry of 'label'
# tags the i-th entry of 'message' as either 'spam' or 'ham'.
print("Creating a simple dataset...")
data = dict(
    label=['spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam'],
    message=[
        "Congratulations! You've won a free ticket to the Bahamas!",
        "Hey, are we still meeting for coffee tomorrow?",
        "Don't forget to send me the documents.",
        "You have been selected for a $500 cash prize!",
        "Let's catch up over dinner this weekend.",
        "Claim your free coupon now by clicking this link!",
        "I'll call you later today.",
        "Win a brand new car by entering this contest!",
    ],
)
# One row per message; the dict keys become the column names.
df = pd.DataFrame(data)
print("Dataset successfully created.")

Creating a simple dataset...
Dataset successfully created.


In [11]:
# Make sure the two columns carry the canonical names 'label' and 'message'
# (positional assignment: first column -> 'label', second -> 'message').
df.columns = pd.Index(["label", "message"])
print("Columns renamed to 'label' and 'message'.")


Columns renamed to 'label' and 'message'.


In [12]:
# 2. Data Preprocessing
# Convert 'ham' and 'spam' labels to numerical values (0 and 1).
#
# The lookup also maps 0 -> 0 and 1 -> 1 so that re-running this cell on an
# already-encoded column is a no-op. With only the string keys, a second run
# would turn every label into NaN and break model training further down.
label_to_id = {'ham': 0, 'spam': 1}
encoded_labels = df['label'].map({**label_to_id, 0: 0, 1: 1})

# Series.map yields NaN for any value missing from the lookup; fail loudly
# here instead of passing NaN targets on to the classifier.
unmapped = encoded_labels.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected label values: {df.loc[unmapped, 'label'].unique().tolist()}"
    )

df['label'] = encoded_labels.astype('int64')
print("Labels converted to numerical values (0 for 'ham', 1 for 'spam').")


Labels converted to numerical values (0 for 'ham', 1 for 'spam').


In [13]:
# 3. Feature Extraction
# Turn the raw message text into a TF-IDF weighted feature matrix.
#   stop_words='english' -> drop common English stopwords
#   max_df=0.7           -> ignore terms that appear in more than 70% of messages
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split performed later, so test-set vocabulary influences the
# features. Consider fitting on the training messages only.
y = df['label']  # Target variable (independent of the vectorizer)
print("Initializing TF-IDF Vectorizer...")
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)
X = tfidf.fit_transform(df['message'])  # Learn vocabulary/IDF and vectorize in one pass
print("Feature matrix created using TF-IDF Vectorizer.")


Initializing TF-IDF Vectorizer...
Feature matrix created using TF-IDF Vectorizer.


In [14]:

# 4. Splitting Training and Test Data
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified, so with very few rows a class
# could end up missing from the test set -- consider stratify=y.
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set size: {X_train.shape[0]} samples, Test set size: {X_test.shape[0]} samples.")


Splitting data into training and testing sets...
Training set size: 6 samples, Test set size: 2 samples.


In [15]:
# 5. Training the Model
# Fit a Logistic Regression classifier (default hyperparameters) on the
# training split; fit() returns the estimator itself, so it can be chained.
print("Training Logistic Regression model...")
model = LogisticRegression().fit(X_train, y_train)
print("Model training completed.")


Training Logistic Regression model...
Model training completed.


In [16]:
# 6. Evaluating the Model
# Predict on the held-out test split and report the fraction of correct
# predictions as a percentage.
print("Evaluating the model...")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Evaluating the model...
Accuracy: 100.00%


In [17]:
# Show the per-class precision/recall/F1 report followed by the confusion
# matrix, both computed from the test-set predictions.
for heading, metric in (
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))



Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Confusion Matrix:
 [[1 0]
 [0 1]]


In [18]:
# 7. Testing New Messages
# Create a function to predict whether a new message is spam or not
def predict_message(msg, vectorizer=None, classifier=None, verbose=True):
    """Classify a single message as "Spam" or "Ham".

    Args:
        msg: The message text to classify.
        vectorizer: Fitted object exposing ``transform(list_of_texts)``.
            Defaults to the notebook-level ``tfidf`` vectorizer.
        classifier: Fitted object exposing ``predict(features)``.
            Defaults to the notebook-level ``model``.
        verbose: When True (default), print progress lines before and after
            the prediction, as the function always did.

    Returns:
        "Spam" if the predicted class equals 1, otherwise "Ham".
    """
    # Resolve the defaults at call time so the function always picks up the
    # most recently fitted notebook objects, yet can be used (and tested)
    # with explicitly supplied ones.
    vectorizer = tfidf if vectorizer is None else vectorizer
    classifier = model if classifier is None else classifier

    if verbose:
        print(f"Predicting for new message: '{msg}'")

    vec_msg = vectorizer.transform([msg])  # Vectorize as a one-document batch
    prediction = classifier.predict(vec_msg)[0]  # Single prediction for that document

    # Compute the human-readable label once and reuse it for logging and return.
    label = "Spam" if prediction == 1 else "Ham"
    if verbose:
        print(f"Prediction completed. Result: {label}")
    return label


In [19]:
# Example Test
# Run the classifier on an unseen, spam-looking message and echo the verdict.
new_message = "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123xyz to claim now."
predicted_label = predict_message(new_message)  # Prints its own progress lines first
print(f"Message: '{new_message}' -> Prediction: {predicted_label}")

Predicting for new message: 'Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123xyz to claim now.'
Prediction completed. Result: Spam
Message: 'Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123xyz to claim now.' -> Prediction: Spam
