In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Train test split

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

X_train.shape

(4457,)

In [8]:
X_test.shape

(1115,)

In [9]:
type(X_train)

pandas.core.series.Series

In [13]:
X_train.loc[:4]

3635    Its a big difference.  &lt;#&gt;  versus  &lt;...
5300                              I can't make it tonight
2273    Haha awesome, I've been to 4u a couple times. ...
77      I like you peoples very much:) but am very shy...
2721                                            Ok lor...
                              ...                        
2574    Congrats 2 mobile 3G Videophones R yours. call...
2118    Wish u many many returns of the day.. Happy bi...
1656                               At 7 we will go ok na.
2899          If you r @ home then come down within 5 min
4       Nah I don't think he goes to usf, he lives aro...
Name: Message, Length: 3023, dtype: object

In [14]:
type(y_train)

pandas.core.series.Series

In [16]:
y_train.loc[:4]

3635    0
5300    0
2273    0
77      0
2721    0
       ..
2574    1
2118    0
1656    0
2899    0
4       0
Name: spam, Length: 3023, dtype: int64

In [17]:
type(X_train.values)

numpy.ndarray

## Create bag of words representation using CountVectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7747 sparse matrix of type '<class 'numpy.int64'>'
	with 59054 stored elements in Compressed Sparse Row format>

In [20]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
X_train_cv.shape

(4457, 7747)

In [22]:
v.get_feature_names_out()[1771]

'cheek'

In [23]:
v.vocabulary_

{'its': 3764,
 'big': 1358,
 'difference': 2333,
 'lt': 4244,
 'gt': 3272,
 'versus': 7271,
 'every': 2678,
 'hrs': 3533,
 'can': 1639,
 'make': 4316,
 'it': 3756,
 'tonight': 6973,
 'haha': 3308,
 'awesome': 1183,
 've': 7259,
 'been': 1302,
 'to': 6944,
 '4u': 530,
 'couple': 2049,
 'times': 6914,
 'who': 7493,
 'all': 920,
 'coming': 1927,
 'like': 4109,
 'you': 7704,
 'peoples': 5142,
 'very': 7272,
 'much': 4630,
 'but': 1584,
 'am': 943,
 'shy': 6157,
 'pa': 5038,
 'ok': 4909,
 'lor': 4200,
 'this': 6864,
 'is': 3744,
 'the': 6823,
 '2nd': 402,
 'time': 6913,
 'we': 7416,
 'have': 3365,
 'tried': 7047,
 'contact': 1994,
 'won': 7578,
 '1450': 298,
 'prize': 5418,
 'claim': 1835,
 'just': 3875,
 'call': 1616,
 '09053750005': 167,
 'b4': 1190,
 '310303': 435,
 'cs': 2107,
 'stop': 6504,
 'sms': 6276,
 '08718725756': 138,
 '140ppm': 296,
 'free': 2996,
 'entry': 2627,
 'in': 3637,
 'wkly': 7561,
 'comp': 1934,
 'win': 7520,
 'fa': 2752,
 'cup': 2133,
 'final': 2860,
 'tkts': 6931,
 

In [24]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [27]:
np.where(X_train_np[0]!=0)

(array([1358, 2333, 2678, 3272, 3533, 3764, 4244, 7271], dtype=int64),)

In [37]:
X_train_np[0][2049]

0

## Train the naive bayes model

In [31]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [32]:
X_test_cv = v.transform(X_test)

## Evaluate Performance

In [33]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       973
           1       0.98      0.92      0.95       142

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [38]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

## Train the model using sklearn pipeline and reduce number of lines of code

In [40]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [41]:
clf.fit(X_train, y_train)

In [42]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       973
           1       0.98      0.92      0.95       142

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

