In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [7]:
X_train.shape

(4457,)

In [8]:
X_test.shape

(1115,)

In [9]:
#  Create bag of words representation using CountVectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7743 sparse matrix of type '<class 'numpy.int64'>'
	with 59427 stored elements in Compressed Sparse Row format>

In [11]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
X_train_cv.shape

(4457, 7743)

In [12]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [15]:
v.get_feature_names_out(), len(v.get_feature_names_out())

(array(['00', '000', '000pes', ..., 'zoom', 'zouk', 'èn'], dtype=object), 7743)

In [13]:
v.get_feature_names_out()[1771]

'cheaper'

In [17]:
v.vocabulary_

{'hey': 3434,
 'kate': 3906,
 'hope': 3503,
 'ur': 7201,
 'ok': 4911,
 'will': 7520,
 'give': 3163,
 'buz': 1608,
 'wedlunch': 7433,
 'go': 3179,
 'outsomewhere': 5014,
 'adrink': 834,
 'in': 3651,
 'town': 7002,
 'cud': 2126,
 '2watershd': 418,
 'bit': 1386,
 'ppl': 5343,
 'fromwrk': 3042,
 'bthere': 1562,
 'love': 4225,
 'petexxx': 5169,
 'do': 2390,
 'you': 7712,
 'know': 3966,
 'why': 7506,
 'god': 3184,
 'created': 2085,
 'gap': 3093,
 'between': 1354,
 'your': 7717,
 'fingers': 2875,
 'so': 6281,
 'that': 6812,
 'one': 4934,
 'who': 7498,
 'is': 3756,
 'made': 4302,
 'for': 2954,
 'comes': 1933,
 'amp': 964,
 'fills': 2859,
 'those': 6861,
 'gaps': 3094,
 'by': 1616,
 'holding': 3483,
 'hand': 3320,
 'with': 7549,
 'dude': 2492,
 'knw': 3970,
 'also': 938,
 'telugu': 6761,
 'thts': 6881,
 'gud': 3276,
 'nyt': 4867,
 'nt': 4842,
 'joking': 3853,
 'seriously': 6024,
 'told': 6950,
 'fine': 2873,
 'if': 3612,
 'the': 6815,
 'way': 7410,
 'feel': 2817,
 'its': 3773,
 'gota': 3217,
 '

In [18]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
np.where(X_train_np[0]!=0)

(array([ 418,  834, 1386, 1562, 1608, 2126, 3042, 3163, 3179, 3434, 3503,
        3651, 3906, 4225, 4911, 5014, 5169, 5343, 7002, 7201, 7433, 7520],
       dtype=int64),)

In [21]:
X_train[:4]

4994    HEY KATE, HOPE UR OK... WILL GIVE U A BUZ WEDL...
4391    Do you know why god created gap between your f...
2775       Dude u knw also telugu..thts gud..k, gud nyt..
5390                           Nt joking seriously i told
Name: Message, dtype: object

In [27]:
#  Train the naive bayes model

In [28]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [29]:
X_test_cv = v.transform(X_test)

In [30]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       969
           1       0.97      0.95      0.96       146

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [31]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [32]:
# Train the model using sklearn pipeline and reduce number of lines of code

In [33]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [34]:
clf.fit(X_train, y_train)

In [35]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       969
           1       0.97      0.95      0.96       146

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

