In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd


In [5]:
# Location of the SMS spam CSV. The original notebook hard-coded one user's
# absolute Windows path; that path is kept as the fallback so existing runs
# still work, but it can now be overridden without editing the notebook by
# setting the SPAM_CSV_PATH environment variable.
DATA_PATH = Path(
    os.environ.get(
        "SPAM_CSV_PATH",
        r"C:\Users\Gunjan\OneDrive\Documents\Datasets\spam.csv",
    )
)

# The file is decoded as windows-1252 rather than pandas' default UTF-8.
data = pd.read_csv(DATA_PATH, encoding='windows-1252')


In [6]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
# Keep only the two named columns (v1 and v2); the extra "Unnamed: N"
# columns from the CSV are dropped.
data = data.loc[:, ['v1', 'v2']]

In [9]:
# Preview the first five rows after narrowing the frame to v1 and v2.
data.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Give the two columns descriptive names (positional: first -> Label,
# second -> Message).
data = data.set_axis(['Label', 'Message'], axis=1)

In [16]:
# Preview the first five rows to confirm the new column names.
data.head(n=5)

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
#
# Series.map() with a dict turns every value that is not a key into NaN, so
# running the original one-liner a second time (on labels that are already
# 0/1) silently wiped the whole column. Falling back to the current value
# makes this cell safe to execute more than once.
label_codes = {'ham': 0, 'spam': 1}
data['Label'] = data['Label'].map(lambda label: label_codes.get(label, label))

# Fail early, with a readable message, if anything other than the two known
# classes is present (including missing values) instead of letting it reach
# model training.
unexpected_labels = set(data['Label'].unique()) - set(label_codes.values())
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'Label' column: {unexpected_labels}")


In [18]:
from sklearn.model_selection import train_test_split

# Features are the message texts; targets are the values of the Label column.
X, y = data['Message'], data['Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over lower-cased text with English stop words removed.
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto that fitted vocabulary.
# (Tuple elements are evaluated left to right, so the fit happens first.)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


In [22]:
# Candidate classifiers for the comparison, keyed by the display name that is
# printed next to each score.
#
# The tree-based models and LinearSVC draw random numbers while fitting, so
# they get a fixed seed (the same value used for the train/test split) to make
# repeated runs of the notebook repeatable.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": LinearSVC(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}


In [24]:
# Fit each candidate on the TF-IDF training matrix and report its accuracy on
# the held-out test matrix.
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} :\nAccuracy: {acc*100:.2f}%")



Naive Bayes :
Accuracy: 96.68%

Logistic Regression :
Accuracy: 95.25%

Support Vector Machine :
Accuracy: 97.85%

Decision Tree :
Accuracy: 97.22%

Random Forest :
Accuracy: 97.67%


In [25]:
from sklearn.svm import LinearSVC

# Train a standalone linear SVM (the highest-scoring model in the comparison
# output above) for the detailed evaluation and the demos that follow.
# random_state pins the estimator's internal random number generation so the
# fitted model is the same on every run; 42 matches the seed used for the split.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)


In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the standalone SVM on the held-out test matrix.
y_pred_svm = svm_model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred_svm)*100)

# The two remaining reports share the same call shape, so emit them in a loop.
for report_title, report_fn in (
    ("Confusion Matrix", confusion_matrix),
    ("Classification Report", classification_report),
):
    print(f"\n{report_title}:\n", report_fn(y_test, y_pred_svm))


Accuracy: 97.847533632287

Confusion Matrix:
 [[960   5]
 [ 19 131]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [29]:
# Hand-written examples for a quick sanity check of the trained SVM.
# The non-ASCII characters below (é, the emoji and the arrow) were stored as
# mis-decoded byte sequences ("Ã©", "ðŸš¨", "âœ…", "â†’"); they are restored to
# the intended Unicode characters here.
sample_messages = [
    "Congratulations! You have won a free lottery ticket to Paris!",
    "Hey Gunjan, are we meeting tomorrow?",
    "URGENT! You have been selected for a cash prize, click here to claim.",
    "Let's catch up at the café in the evening.",
]

# Reuse the already-fitted vectorizer so the samples are projected onto the
# same vocabulary the model was trained with.
sample_tfidf = vectorizer.transform(sample_messages)
predictions = svm_model.predict(sample_tfidf)

# Label 1 is spam, anything else is reported as ham.
for msg, pred in zip(sample_messages, predictions):
    label = "Spam 🚨" if pred == 1 else "Ham ✅"
    print(f"{label} → {msg}")


Spam 🚨 → Congratulations! You have won a free lottery ticket to Paris!
Ham ✅ → Hey Gunjan, are we meeting tomorrow?
Spam 🚨 → URGENT! You have been selected for a cash prize, click here to claim.
Ham ✅ → Let's catch up at the café in the evening.


In [30]:
# Interactive demo: classify messages typed by the user until they enter
# 'exit'. This cell blocks on input(), so it needs an interactive session.
while True:
    try:
        user_input = input("\nEnter a message (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the prompt was interrupted: leave the loop
        # cleanly instead of surfacing a traceback.
        print("Goodbye 👋")
        break

    # Ignore surrounding whitespace and letter case when checking the quit
    # keyword, so inputs such as " Exit " also end the session.
    message = user_input.strip()
    if message.lower() == 'exit':
        print("Goodbye 👋")
        break

    # A blank line has no text to classify; ask again rather than printing a
    # verdict for it.
    if not message:
        print("Please enter a non-empty message.")
        continue

    # Convert input to TF-IDF form using the vectorizer fitted on the training data
    user_tfidf = vectorizer.transform([user_input])

    # Predict (a single-row input yields a single prediction)
    pred = svm_model.predict(user_tfidf)[0]

    if pred == 1:
        print("🚨 This message is **SPAM**!")
    else:
        print("✅ This message is **NOT SPAM**.")


Enter a message (or type 'exit' to quit): heyy you won price in ceremony for olympiad.
🚨 This message is **SPAM**!

Enter a message (or type 'exit' to quit): exit
Goodbye 👋
