In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset from a CSV file (path is relative to the working directory)
# and preview the first rows to check the columns.
df = pd.read_csv("spam_or_not_spam.csv")
df.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


### Text Processing

In [3]:
df.dropna(inplace=True)     # drop every row that contains a missing value (modifies df in place)

In [4]:
c_vectorizer = CountVectorizer()        # bag of words: per-document token counts, default settings

In [5]:
X = df["email"]     # feature set: the email text column
y = df["label"]     # label set: the target column

In [6]:
X   # display the feature Series

0        date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...
1       martin a posted tassos papadopoulos the greek ...
2       man threatens explosion in moscow thursday aug...
3       klez the virus that won t die already the most...
4        in adding cream to spaghetti carbonara which ...
                              ...                        
2995     abc s good morning america ranks it the NUMBE...
2996     hyperlink hyperlink hyperlink let mortgage le...
2997     thank you for shopping with us gifts for all ...
2998     the famous ebay marketing e course learn to s...
2999     hello this is chinese traditional 子 件 NUMBER世...
Name: email, Length: 2999, dtype: object

In [7]:
y   # display the label Series

0       0
1       0
2       0
3       0
4       0
       ..
2995    1
2996    1
2997    1
2998    1
2999    1
Name: label, Length: 2999, dtype: int64

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

In [9]:
type(X_train)   # check what container type the split returned

pandas.core.series.Series

In [10]:
X_test   # display the held-out features

1376    on wednesday NUMBER august NUMBER NUMBER NUMBE...
1026    i guess the first question here should be does...
1380     it s this section of spamassassin raw spamass...
2692    greetings you are receiving this letter becaus...
1722     neale pickett if you can spare the memory you...
                              ...                        
695      original message from james rogers jamesr bes...
2252    url URL date NUMBER NUMBER NUMBERtNUMBER NUMBE...
2741     dear sir or madam in the past you have reques...
2287    url URL date not supplied img URL wonderful ga...
1803    use perl daily headline mailer using web servi...
Name: email, Length: 900, dtype: object

In [11]:
y_train   # display the training labels

2855    1
577     0
1729    0
1417    0
847     0
       ..
1164    0
500     0
2990    1
421     0
1829    0
Name: label, Length: 2099, dtype: int64

In [12]:
y_test   # display the held-out labels

1376    0
1026    0
1380    0
2692    1
1722    0
       ..
695     0
2252    0
2741    1
2287    0
1803    0
Name: label, Length: 900, dtype: int64

In [13]:
mnb_model = MultinomialNB()     # multinomial Naive Bayes classifier with default hyperparameters

In [14]:
# Chain the vectorizer and the classifier into one estimator so that text
# can be passed directly to fit/predict/score.
v_mnb_model = Pipeline(steps=[('vectorizer', c_vectorizer), ('classifier', mnb_model)])
v_mnb_model

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier', MultinomialNB())])

In [15]:
v_mnb_model.fit(X_train, y_train)
# Pipeline.fit first calls fit_transform on the vectorizer step, then fits the
# classifier step on the transformed data, so this single call trains both steps.

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier', MultinomialNB())])

In [16]:
# Accuracy of the fitted pipeline on the held-out test set
v_mnb_model.score(X_test, y_test)

0.9922222222222222

In [17]:
predictions = v_mnb_model.predict(X_test)
accuracy_score(y_test, predictions)

# Same metric as v_mnb_model.score(X_test, y_test); either form can be used.

0.9922222222222222

### Testing on new examples

In [18]:
# Classify new message(s); the pipeline vectorizes the strings itself.
text = ["i have a question about assignment"]
predictions = v_mnb_model.predict(text)
# predict returns one label per input string. Report each label individually
# rather than testing the truth value of the whole array, which only works for
# a single-element array and raises ValueError for more than one message.
for label in predictions:
    print("Not Spam" if label == 0 else "Spam")

Not Spam


In [19]:
# Classify new message(s); the pipeline vectorizes the strings itself.
text = ["Huge savings of thousands of dollars!"]
predictions = v_mnb_model.predict(text)
# predict returns one label per input string. Report each label individually
# rather than testing the truth value of the whole array, which only works for
# a single-element array and raises ValueError for more than one message.
for label in predictions:
    print("Not Spam" if label == 0 else "Spam")

Spam


In [20]:
# Classify new message(s); the pipeline vectorizes the strings itself.
text = ["Huge savings of thousands"]
predictions = v_mnb_model.predict(text)
# predict returns one label per input string. Report each label individually
# rather than testing the truth value of the whole array, which only works for
# a single-element array and raises ValueError for more than one message.
for label in predictions:
    print("Not Spam" if label == 0 else "Spam")

Spam


In [23]:
# Classify new message(s); the pipeline vectorizes the strings itself.
text = ["Wow! amazing deals"]
predictions = v_mnb_model.predict(text)
# predict returns one label per input string. Report each label individually
# rather than testing the truth value of the whole array, which only works for
# a single-element array and raises ValueError for more than one message.
for label in predictions:
    print("Not Spam" if label == 0 else "Spam")
# Also display the raw label array as the cell output.
predictions

Not Spam


array([0], dtype=int64)

In [24]:
# Classify new message(s); the pipeline vectorizes the strings itself.
text = ["Wow! amazing deals save dollars!"]
predictions = v_mnb_model.predict(text)
# predict returns one label per input string. Report each label individually
# rather than testing the truth value of the whole array, which only works for
# a single-element array and raises ValueError for more than one message.
for label in predictions:
    print("Not Spam" if label == 0 else "Spam")
# Also display the raw label array as the cell output.
predictions

Spam


array([1], dtype=int64)