In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the labelled messages from spam.csv (resolved relative to the working directory)
df = pd.read_csv(filepath_or_buffer='spam.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# Import necessary libraries
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Encode the labels
# Keep an untouched copy of the original string labels the first time this cell
# runs, and always fit the encoder on that copy. Fitting on df['Category']
# directly made the cell non-idempotent: a second run would fit on the already
# encoded integers, so label_encoder.classes_ would no longer hold the string
# class names that the classification report later uses as target_names.
if 'Category_raw' not in df.columns:
    df['Category_raw'] = df['Category']

label_encoder = LabelEncoder()
# Overwrite 'Category' with the integer codes; downstream cells read this column.
df['Category'] = label_encoder.fit_transform(df['Category_raw'])

In [4]:
# Display the encoded label column
df.loc[:, 'Category']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64

In [5]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Function to preprocess text
def preprocess_text(text):
    """Lowercase ``text`` and strip ASCII punctuation from it.

    Parameters
    ----------
    text : str or any
        The raw message. ``None`` and float NaN (what pandas uses for an
        empty CSV field) are treated as an empty message; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The lowercased text with every character from ``string.punctuation``
        removed. Whitespace is left untouched.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Tolerate numeric or other non-string cells instead of raising AttributeError.
    if not isinstance(text, str):
        text = str(text)
    return text.lower().translate(_PUNCTUATION_TABLE)

In [6]:
# Run preprocess_text over every message and overwrite the Message column with the result
df['Message'] = df['Message'].map(preprocess_text)

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    df['Message'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer (English stop words removed) on the training
# messages only, then encode both splits with that fitted vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer.fit(x_train)
x_train_tfidf = tfidf_vectorizer.transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [8]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF training features
nb_model = MultinomialNB().fit(x_train_tfidf, y_train)
# fit() returns the estimator itself, so this shows the fitted model as the cell output
nb_model

In [9]:
# Predict an encoded label for every message in the test split
y_pred = nb_model.predict(X=x_test_tfidf)
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(1115,))

In [10]:
# Score the predictions against the held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Per-class precision/recall/F1, labelled with the encoder's class names
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=label_encoder.classes_,
)

In [11]:
# Show the accuracy as a percentage, then the per-class report, one item per line
print(
    f'Accuracy: {accuracy * 100:.2f}%',
    'Classification Report:',
    report,
    sep='\n',
)

Accuracy: 96.68%
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [12]:
# Persist the trained classifier to disk with joblib
import joblib
joblib.dump(value=nb_model, filename='spam_classifier_model.joblib')

['spam_classifier_model.joblib']

In [13]:
# Persist the fitted TF-IDF vectorizer as well, so new text can be encoded
# with the same vocabulary the classifier was trained on
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']