In [2]:
import pandas as pd
data = pd.read_csv("spam.csv")
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
# labelling category column as numbers using apply and lambda
data['Spam'] = data['Category'].apply(lambda x: 1 if x == "spam" else 0)
data.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data.Message, data.Spam, test_size = 0.2)

In [20]:
# we need to convert Message column into numbers somehow
# we use count vectorizer for the same
# here is a demo of how count vectorizer works
from sklearn.feature_extraction.text import CountVectorizer
document = [ 'One Geek helps Two Geeks', 
            'Two Geeks help Four Geeks', 
            'Each Geek helps many other Geeks at GeeksforGeeks.']
cv = CountVectorizer()
x = cv.fit_transform(document)
print(x.toarray())
print(cv.get_feature_names())

[[0 0 0 1 1 0 0 1 0 1 0 1]
 [0 0 1 0 2 0 1 0 0 0 0 1]
 [1 1 0 1 1 1 0 1 1 0 1 0]]
['at', 'each', 'four', 'geek', 'geeks', 'geeksforgeeks', 'help', 'helps', 'many', 'one', 'other', 'two']


In [42]:
# using it on our dataset
x_train_count = cv.fit_transform(x_train.values) # we can pass an array to count vectorizer
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
# we will use multinomial naive bayes for our problem as the features(not the target) are discrete
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(x_train_count, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [44]:
x_train_count.toarray().shape

(4457, 7775)

In [46]:
# for finding score we need to convert x_test data using count vectorizer
x_test_count = cv.transform(x_test)
x_test_count.toarray().shape
nb.score(x_test_count, y_test)


0.9919282511210762

In [49]:
emails = ["hello how are you", "win 20% discount"]
emails_count = cv.transform(emails)
nb.predict(emails_count)

array([0, 1], dtype=int64)

In [50]:
# using sklearn pipeline to compact the code
from sklearn.pipeline import Pipeline
clf = Pipeline([("vectorization",CountVectorizer()),
               ("nb", MultinomialNB())])

In [51]:
# now we can directly train clf on x_train and y_train
clf.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('vectorization',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [52]:
clf.score(x_test, y_test)

0.9919282511210762

In [53]:
clf.predict(emails)

array([0, 1], dtype=int64)