In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [5]:
# Load the dataset
dataset = pd.read_csv("dataset/spambase.csv")
# Show the first 10 rows of the dataset
dataset.head(10)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_edu,word_freq_table,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,spam
0,0,640,640,0,320,0,0,0,0,0,...,0,0,0,0,0,0,778,0,0,1
1,210,280,500,0,140,280,210,70,0,940,...,0,0,0,0,132,0,372,180,48,1
2,60,0,710,0,1230,190,190,120,640,250,...,60,0,0,10,143,0,276,184,10,1
3,0,0,0,0,630,0,310,630,310,630,...,0,0,0,0,137,0,137,0,0,1
4,0,0,0,0,630,0,310,630,310,630,...,0,0,0,0,135,0,135,0,0,1
5,0,0,0,0,1850,0,0,1850,0,0,...,0,0,0,0,223,0,0,0,0,1
6,0,0,0,0,1920,0,0,0,0,640,...,0,0,0,0,54,0,164,54,0,1
7,0,0,0,0,1880,0,0,1880,0,0,...,0,0,0,0,206,0,0,0,0,1
8,150,0,460,0,610,0,300,0,920,760,...,0,0,0,0,271,0,181,203,22,1
9,60,120,770,0,190,320,380,0,60,0,...,0,0,0,40,30,0,244,81,0,1


# 1. More frequent words in SPAM and NO SPAM emails

To get the most common words in spam emails.

In [6]:
# Agrupate by spam and sum the values
column_sum = dataset.groupby('spam', as_index=False).sum()
# Select only the columns that represent the frequency of words
word_columns = [col for col in dataset.columns if col.startswith('word_freq_')]
# Extract the frequencies of words in spam emails
spam_word_frequencies = column_sum.loc[1, word_columns]
# Order the frequencies from highest to lowest
most_used_words_in_spam = spam_word_frequencies.sort_values(ascending=False)
# Get the top 10 most used words in spam emails
top_10_spam_words = most_used_words_in_spam.head(10)
# Remove 'word_freq_' from the column names
top_10_spam_words.index = top_10_spam_words.index.str.replace('word_freq_', '')
top_10_spam_words

you         4105599
your        2502597
will         997100
free         939790
our          931799
all          732080
mail         635470
email        578759
business     521250
remove       499309
Name: 1, dtype: int64

To get the most common words in non-spam emails.

In [7]:
# Extract the frequencies of words in non-spam emails
non_spam_word_frequencies = column_sum.loc[0, word_columns]
# Order the frequencies from highest to lowest
most_used_words_in_non_spam = non_spam_word_frequencies.sort_values(ascending=False)
# Get the top 10 most used words in non-spam emails
top_10_non_spam_words = most_used_words_in_non_spam.head(10)
# Remove 'word_freq_' from the column names
top_10_non_spam_words.index = top_10_non_spam_words.index.str.replace('word_freq_', '')
top_10_non_spam_words

you        3541702
george     3527559
hp         2496576
will       1495268
your       1223098
hpl        1204398
re         1159138
edu         800669
address     681569
meeting     604460
Name: 0, dtype: int64

# 2. Dataset split into train and test subsets

In [8]:
# Create the X characteristics matrix by dropping the 'spam' column
X = dataset.drop('spam', axis=1)
# Create the y target vector by selecting only the 'spam' column
y = dataset['spam']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.3, random_state=42, stratify=y)
# Print the number of samples in each set
print(f"Training set has {X_train.shape[0]} samples.")
print(f"Testing set has {X_test.shape[0]} samples.")

Training set has 3220 samples.
Testing set has 1381 samples.


# 3. Naive Bayes Classifier

In [15]:
# Create the Naive Bayes classifier
clf = MultinomialNB()
# Train the classifier
clf.fit(X_train, y_train)

In [19]:
# Calculate the accuracy of the classifier with the training set
acc_train = accuracy_score(y_train, clf.predict(X_train))
# Calculate the accuracy of the classifier with the testing set
acc_test = accuracy_score(y_test, clf.predict(X_test))
# Print the accuracy metrics
print(f"Training accuracy: {acc_train}")
print(f"Test accuracy: {acc_test}")

Training accuracy: 0.8633540372670807
Test accuracy: 0.8732802317161478
