In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Read the dataset from emails.csv into a DataFrame.
data = pd.read_csv('emails.csv')

# Split the 'text' column (features) and the 'label' column (targets) into
# training and test partitions using train_test_split's default proportions;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    random_state=1,
)

# Report how many rows ended up in the full set and in each partition.
print(f'Number of rows in the total set: {len(data)}')
print(f'Number of rows in the training set: {len(X_train)}')
print(f'Number of rows in the test set: {len(X_test)}')

Number of rows in the total set: 5754
Number of rows in the training set: 4315
Number of rows in the test set: 1439


In [49]:
# Bag-of-words encoder: the vocabulary is learned from the training split
# only, and that same fitted vocabulary is then used to encode the test
# split (the test data is transformed, never fitted).
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the training counts
# (fit() returns the estimator itself) and predict the test labels.
naive_bayes = MultinomialNB().fit(training_data, y_train)
predictions = naive_bayes.predict(testing_data)

# Report the evaluation metrics computed on the test split.
print(
    f'Accuracy score: {accuracy_score(y_test, predictions)}',
    f'Precision score: {precision_score(y_test, predictions)}',
    f'Recall score: {recall_score(y_test, predictions)}',
    f'F1 score: {f1_score(y_test, predictions)}',
    sep='\n',
)

Accuracy score: 0.990271021542738
Precision score: 0.9827089337175793
Recall score: 0.9770773638968482
F1 score: 0.9798850574712644


In [50]:
# Classify an email text as Spam or Not Spam
def classify_email(text, vectorizer=None, model=None):
    """Classify an e-mail as "Spam" or "Not Spam".

    The text is encoded with the same vectorizer the classifier was trained
    on; encoding it with any other vectorizer would hand the model features
    from a different feature space and make the prediction meaningless.

    Parameters
    ----------
    text : str or iterable of str
        Either a single e-mail text or an iterable of texts. When several
        texts are given, only the first one determines the returned label.
    vectorizer : object with ``transform``, optional
        Fitted vectorizer used to encode the text. Defaults to the
        notebook's ``count_vector``.
    model : object with ``predict``, optional
        Fitted classifier. Defaults to the notebook's ``naive_bayes``.

    Returns
    -------
    str
        ``"Spam"`` if the predicted label equals 1, otherwise ``"Not Spam"``.

    Raises
    ------
    ValueError
        If ``text`` is an empty iterable.
    """
    # Resolve the defaults lazily so the function always picks up the
    # currently fitted notebook objects.
    if vectorizer is None:
        vectorizer = count_vector
    if model is None:
        model = naive_bayes

    # Vectorizers expect an iterable of documents; a bare string would be
    # rejected (or iterated character by character), so wrap it.
    documents = [text] if isinstance(text, str) else list(text)
    if not documents:
        raise ValueError("classify_email() needs at least one e-mail text")

    text_vector = vectorizer.transform(documents)
    prediction = model.predict(text_vector)
    return "Spam" if prediction[0] == 1 else "Not Spam"

# Example e-mail texts. Each one is wrapped in a single-element list because
# classify_email passes its argument to the vectorizer's transform() as a
# collection of documents. The trailing comment on each line is the label
# the author expects, not verified classifier output.
email_text_1 = ["Meeting Agenda for Tomorrow"]  # Example 1: Not Spam
# The trailing tab is part of the original example text, written here as an
# explicit escape so it is visible.
email_text_2 = ["Subject: Hi Walid,\n\nDo you listen to music on Spotify...\t"]  # Example 2: Spam
# The apostrophe in "we’re" was previously mojibake (UTF-8 bytes decoded as
# cp1252), which fed a junk token to the vectorizer.
email_text_3 = ["Discogs has recently detected an increase in scammers in the Marketplace, and we’re taking action to address the issue. New sellers on Discogs will undergo a waiting period to reduce fraud attempts. We are committed to maintaining a safe and trustworthy marketplace for independent buyers and sellers. If you see suspicious activity, please report it immediately to Discogs Support through this form. Learn more about how to recognize fraud in our guide to best practices for buying safely on Discogs: Safe Buying Tips"]  # Example 3: Not Spam
email_text_4 = ["Subject: Important Updates on Your Account"]  # Example 4: Spam
email_text_5 = ["Subject: Dinner Plans for This Friday"]  # Example 5: Not Spam
email_text_6 = ["Subject: Claim Your Prize Now!"]  # Example 6: Spam

examples = [
    email_text_1,
    email_text_2,
    email_text_3,
    email_text_4,
    email_text_5,
    email_text_6,
]

# Classify every example and print its 1-based number with the verdict.
for i, example in enumerate(examples, start=1):
    result = classify_email(example)
    print(f"Example {i}: {result}")

Example 1: Not Spam
Example 2: Spam
Example 3: Spam
Example 4: Not Spam
Example 5: Not Spam
Example 6: Spam
