In [1]:
# Install the notebook's dependencies. The explicit %pip line magic installs
# into the environment of the running kernel and does not rely on automagic.
%pip install pandas numpy scikit-learn




This notebook loads the "spam.csv" dataset from Kaggle, splits it into training and testing sets, converts the text into numerical features using CountVectorizer, and trains a Naive Bayes classifier on the training data. Finally, it evaluates the model on the testing data and prints the accuracy, confusion matrix, and classification report.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Spam detector: bag-of-words counts fed into a Multinomial Naive Bayes model.
# Tunable values are collected here instead of being buried in the calls below.
DATA_PATH = "spam.csv"  # must be in the notebook's working directory
TEST_SIZE = 0.2         # fraction of rows held out for evaluation
SEED = 42               # fixes the train/test shuffle so the run is reproducible

# Step 1: Data Preprocessing
# Load the dataset; the file is decoded as ISO-8859-1.
data = pd.read_csv(DATA_PATH, encoding="ISO-8859-1")

# Keep only the first two columns (label, message text) by position rather than
# dropping "Unnamed: 2".."Unnamed: 4" by name: the by-name drop raises KeyError
# on copies of the file that do not contain those extra columns.
data = data.iloc[:, :2].copy()
data.columns = ["label", "text"]

# Step 2: Split the data into training and testing sets
# NOTE(review): the split is not stratified. The recorded run shows far fewer
# spam than ham rows in the test set, so stratify=data["label"] may be worth
# considering -- it would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=TEST_SIZE,
    random_state=SEED,
)

# Step 3: Feature Extraction
# The vocabulary is learned from the training messages only and then reused to
# encode the test messages, so the test set does not influence the features.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Step 4: Building the Spam Detector (Naive Bayes Classifier)
spam_detector = MultinomialNB()
spam_detector.fit(X_train_counts, y_train)

# Step 5: Evaluating the Model
# Predict labels for the held-out messages and compare them with the truth.
y_pred = spam_detector.predict(X_test_counts)

# Calculate accuracy and other metrics
accuracy = accuracy_score(y_test, y_pred)
# Rows of the confusion matrix are true labels, columns are predicted labels.
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9838565022421525
Confusion Matrix:
[[963   2]
 [ 16 134]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

