In [1]:
import re
import numpy as np
import pandas as pd

In [3]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
import joblib

In [10]:
def preprocess_text(text):
    """Normalize a message before it is vectorized.

    The text is lowercased, then every character that is neither an ASCII
    letter nor whitespace (digits, punctuation, accented letters, symbols)
    is deleted outright. Characters are removed, not replaced by a space,
    so "don't" becomes "dont" and "a,b" becomes "ab".

    Args:
        text: The raw message as a ``str``.

    Returns:
        The lowercased string with the disallowed characters removed.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)


In [14]:
# Load the labelled messages, decoding the file as ISO-8859-1 (Latin-1)
# instead of pandas' default UTF-8.
data = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)

In [16]:
# Show the first five rows as plain text to get a feel for the raw columns.
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [19]:
# Print the column index to confirm the names used below:
# 'v1' is read as the label and 'v2' as the message text.
print(data.columns)


Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [20]:
# Clean the message text (column 'v2') row by row, overwriting the column.
data['v2'] = data['v2'].map(preprocess_text)


In [24]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable. No `stratify` argument is passed, so the split is not
# stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    data['v2'],   # features: cleaned message text
    data['v1'],   # target: label column
    test_size=0.2,
    random_state=42,
)

In [22]:
# Two-step pipeline: raw text -> TF-IDF features -> SVM classifier,
# both with their default settings.
model = make_pipeline(
    TfidfVectorizer(),
    SVC(),
)

In [23]:
# Train the model: both pipeline steps (TF-IDF vectorizer and SVM) are
# fitted on the training split only.
model.fit(X_train, y_train)

In [25]:
# Make predictions on the held-out test set; the fitted pipeline
# vectorizes X_test before classifying it.
y_pred = model.predict(X_test)


In [26]:
# Score the test-set predictions.
# In scikit-learn's confusion matrix, rows are the true labels and columns
# the predicted labels, with labels taken in sorted order.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


In [27]:
# Report both metrics, one item per line.
print(
    f"Accuracy: {accuracy}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

Accuracy: 0.979372197309417
Confusion Matrix:
[[965   0]
 [ 23 127]]


In [28]:
# Persist the fitted pipeline (vectorizer + classifier) to disk.
# preprocess_text is applied outside the pipeline, so it is not part of
# the saved artifact: code that reloads the model must clean its input
# the same way before calling predict.
joblib.dump(value=model, filename='spam_detection_model_svm.joblib')


['spam_detection_model_svm.joblib']

In [29]:
# Interactive check: read one message, clean it exactly like the training
# text, and classify it. The model is given a one-element list because
# predict is called on a collection of documents, not a bare string.
user_input = preprocess_text(input("Enter an email text for spam detection: "))
prediction = model.predict([user_input])
print(f"The email is {'spam' if prediction[0] == 'spam' else 'not spam'}.")

Enter an email text for spam detection: i want to connect with you
The email is not spam.
