In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [2]:
# Load the dataset.
# The path is written with a forward slash: the earlier 'Dataset\Spam_dataset.csv'
# form depended on '\S' being an unrecognised escape sequence (which triggers a
# warning on current Python versions) and used a Windows-only separator.
# Forward slashes are accepted on Windows as well as on POSIX systems.
data = pd.read_csv('Dataset/Spam_dataset.csv')

In [3]:
# Pull the 'text' column out as the features (X) and the 'label' column as
# the labels (y)
X, y = data['text'], data['label']

In [4]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Create a CountVectorizer with its default settings
vectorizer = CountVectorizer()

# Fit on the training split only (first element), then reuse the already
# fitted vectorizer to transform the test split (second element); the tuple
# is evaluated left to right, so fitting always happens before the transform
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [6]:
# Construct the multinomial Naive Bayes classifier and fit it on the
# vectorized training data in one step (fit() hands back the estimator itself)
nb_classifier = MultinomialNB().fit(X_train_vectorized, y_train)

# Write both fitted objects to disk with joblib; the vectorizer dump is kept
# as the final expression so the cell still displays its return value
joblib.dump(nb_classifier, 'naive_bayes_model.pkl')
joblib.dump(vectorizer, 'count_vectorizer.pkl')

['count_vectorizer.pkl']

In [7]:
# Run the fitted classifier over the vectorized test split to obtain the
# predicted label for every held-out row
y_pred = nb_classifier.predict(
    X_test_vectorized
)

In [8]:
# Compare the predictions with the true test labels
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# sep="\n" places each heading on its own line directly above its table,
# producing the same text as printing the heading and the table separately
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.9719806763285024
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       742
        spam       0.96      0.95      0.95       293

    accuracy                           0.97      1035
   macro avg       0.97      0.96      0.97      1035
weighted avg       0.97      0.97      0.97      1035

Confusion Matrix:
[[729  13]
 [ 16 277]]
