In [1]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load Dataset

In [2]:
# Read the spam dataset and make sure the label column is integer-typed
# (0 = Ham, 1 = Spam) in a single expression.
df = pd.read_csv('spam.csv').astype({'label': int})

# Text Preprocessing

In [3]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every clean_text() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a raw message before it is vectorized.

    Steps: lowercase, remove every run of digits, then delete ASCII
    punctuation characters (``string.punctuation``). Whitespace is left
    untouched.

    Args:
        text: The message. Non-string values are converted with ``str()``.
            ``None`` and float NaN (how pandas represents a missing cell)
            are treated as an empty message.

    Returns:
        The cleaned string; ``''`` for missing values.
    """
    # Without this guard str(None) / str(nan) would inject the bogus tokens
    # "none" / "nan" into the vocabulary. `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    return text.translate(_PUNCTUATION_TABLE)

# Run every message through clean_text element-wise and store the result
# back in the same column.
df['message'] = df['message'].map(clean_text)

# Convert Text to Numerical Form (TF-IDF)

In [4]:
# Learn the vocabulary and IDF weights from the training rows only, so that no
# statistics from the held-out messages leak into the features used for
# evaluation. train_test_split's shuffle depends only on the number of rows and
# random_state, so this call selects the same training rows as the X/y split in
# the "Split Data" cell -- keep test_size and random_state identical in both.
train_messages, held_out_messages = train_test_split(
    df['message'], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
vectorizer.fit(train_messages)

# Transform every row with the train-fitted vectorizer. X stays a dense array
# and y the label column, so the cells that consume them are unaffected.
X = vectorizer.transform(df['message']).toarray()
y = df['label']

In [5]:
# Bare expression: the notebook displays the vectorizer's repr as this cell's output.
vectorizer

# Split Data

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train Logistic Regression Model

In [7]:
# Logistic-regression classifier created with scikit-learn's default settings.
model = LogisticRegression()
# Fit on the training split. As the cell's last expression, the value returned
# by fit() is also displayed by the notebook.
model.fit(X_train, y_train)

# Make Predictions & Evaluate

In [8]:
# Predict a label for every row of the held-out test features; y_pred is
# compared against y_test in the evaluation cell.
y_pred = model.predict(X_test)

In [9]:
# Compute each metric once on the held-out split, then report them.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

Accuracy: 1.0
Confusion Matrix:
 [[41  0]
 [ 0 47]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        47

    accuracy                           1.00        88
   macro avg       1.00      1.00      1.00        88
weighted avg       1.00      1.00      1.00        88



In [10]:
# Classify an ad-hoc message with the trained pipeline.
text = ["Your meeting is scheduled for 3 PM today"]

# Apply the same cleaning used on the training messages (lowercase, strip digits
# and punctuation) so inference sees text preprocessed identically to training.
text_clean = [clean_text(message) for message in text]

text_tfidf = vectorizer.transform(text_clean).toarray()
prediction = model.predict(text_tfidf)
print("Spam" if prediction[0] == 1 else "Not Spam")


Not Spam


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample dataset for a standalone TF-IDF illustration.
# The demo uses demo_* names so it does not overwrite the spam DataFrame (`df`),
# the fitted `vectorizer`, or the feature matrix `X` from the cells above;
# otherwise re-running the prediction cell after this one would feed the trained
# model features built from this toy vocabulary.
data = {
    'message': [
        "win a free lottery now",
        "lottery prize waiting for you",
        "meeting at 5 pm tomorrow",
        "free vacation package available",
        "schedule meeting for next week"
    ]
}

demo_df = pd.DataFrame(data)

# Apply TF-IDF with default settings
demo_vectorizer = TfidfVectorizer()
demo_X = demo_vectorizer.fit_transform(demo_df['message'])

# Convert to DataFrame for easy understanding: one row per message, one column per term
tfidf_df = pd.DataFrame(
    demo_X.toarray(),
    columns=demo_vectorizer.get_feature_names_out(),
)

# Show TF-IDF values
display(tfidf_df)


Unnamed: 0,at,available,for,free,lottery,meeting,next,now,package,pm,prize,schedule,tomorrow,vacation,waiting,week,win,you
0,0.0,0.0,0.0,0.444002,0.444002,0.0,0.0,0.550329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.550329,0.0
1,0.0,0.0,0.388988,0.0,0.388988,0.0,0.0,0.0,0.0,0.0,0.48214,0.0,0.0,0.0,0.48214,0.0,0.0,0.48214
2,0.523358,0.0,0.0,0.0,0.0,0.422242,0.0,0.0,0.0,0.523358,0.0,0.0,0.523358,0.0,0.0,0.0,0.0,0.0
3,0.0,0.523358,0.0,0.422242,0.0,0.0,0.0,0.0,0.523358,0.0,0.0,0.0,0.0,0.523358,0.0,0.0,0.0,0.0
4,0.0,0.0,0.388988,0.0,0.0,0.388988,0.48214,0.0,0.0,0.0,0.0,0.48214,0.0,0.0,0.0,0.48214,0.0,0.0
