In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the labelled messages from the CSV that sits next to this notebook.
spam_df = pd.read_csv(filepath_or_buffer="spam.csv")
# Summarise the text column per label (count / unique / top / freq) to
# get a first look at how the categories are balanced.
spam_df.groupby(by="Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
# Encode the target as an integer "spam" column: 1 where Category is
# exactly "spam", 0 for every other value.
spam_df["spam"] = (spam_df["Category"] == "spam").astype("int64")

In [5]:
# Create train/test split (scikit-learn's default 75% / 25% sizes).
# - random_state pins the shuffle so the split, and every number derived
#   from it further down, is reproducible on Restart & Run All.
# - stratify keeps the ham/spam ratio the same in both splits; the
#   per-category summary above shows the data is heavily skewed towards ham.
x_train, x_test, y_train, y_test = train_test_split(
    spam_df.Message,
    spam_df.spam,
    random_state=42,
    stratify=spam_df.spam,
)
# Quick look at the training messages (count / unique / most frequent).
x_train.describe()

count                       4179
unique                      3937
top       Sorry, I'll call later
freq                          20
Name: Message, dtype: object

In [6]:
# Bag-of-words features: learn the vocabulary from the training messages
# and, in the same pass, turn them into a sparse matrix of token counts.
cv = CountVectorizer()
x_train_count = cv.fit_transform(raw_documents=x_train.values)

In [11]:
x_train_count # sparse token-count matrix: rows = training messages, columns = vocabulary learned from the training split only (4179 x 7539 in the run shown below)

<4179x7539 sparse matrix of type '<class 'numpy.int64'>'
	with 55591 stored elements in Compressed Sparse Row format>

In [8]:
# Fit a multinomial Naive Bayes classifier on the training token counts,
# using the 0/1 spam labels as the target.
model = MultinomialNB()
model.fit(X=x_train_count, y=y_train)

In [18]:
# Sanity check with a hand-written, ordinary-looking message.
# New text must go through the already-fitted vectorizer (transform only,
# no re-fit) so its columns line up with what the model was trained on.
email_ham = ["hey wanna meet up for the game?"]
email_ham_count = cv.transform(raw_documents=email_ham)
model.predict(X=email_ham_count)

array([0], dtype=int64)

In [17]:
# Sanity check with a hand-written, spam-looking message.
# As with any new text, only transform with the fitted vectorizer so the
# feature columns match the training matrix.
email_spam = ["reward money click"]
email_spam_count = cv.transform(raw_documents=email_spam)
model.predict(X=email_spam_count)

array([1], dtype=int64)

In [20]:
# Evaluate on the held-out messages: vectorize them with the vocabulary
# learned from the training split, then report mean accuracy.
# NOTE(review): the per-category summary above shows far more ham than
# spam, so accuracy alone can look optimistic; consider checking
# precision/recall for the spam class as well.
x_test_count = cv.transform(raw_documents=x_test)
model.score(X=x_test_count, y=y_test)

0.9877961234745154