In [1]:
import pandas as pd
import numpy as np


In [5]:
df = pd.read_csv('SMSSpamCollection',sep='\t',names = ['Category','Message'])
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
df['spam'] = df['Category'].apply(lambda x:1 if x == 'spam' else 0 )
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df['Message'],df['spam'],test_size=0.2,random_state=18)

In [9]:
X_train.shape

(4457,)

In [10]:
X_test.shape

(1115,)

In [11]:
type(X_train)

pandas.core.series.Series

In [12]:

X_train[:4]

1578    Remember all those whom i hurt during days of ...
1941    Dude avatar 3d was imp. At one point i thought...
4524    Actually I decided I was too hungry so I haven...
1625    500 free text msgs. Just text ok to 80488 and ...
Name: Message, dtype: object

In [13]:
y_train[:4]

1578    0
1941    0
4524    0
1625    1
Name: spam, dtype: int64

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

X_train_cv = cv.fit_transform(X_train.values)

In [17]:
X_train_cv.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
X_train_cv.shape

(4457, 7803)

In [20]:
cv.get_feature_names_out()[1000:1050]

array(['answers', 'antelope', 'antha', 'anthony', 'anti', 'any',
       'anybody', 'anyhow', 'anymore', 'anyone', 'anyones', 'anyplaces',
       'anythin', 'anything', 'anythingtomorrow', 'anytime', 'anyway',
       'anyways', 'anywhere', 'aom', 'apart', 'apartment', 'apes',
       'apeshit', 'apnt', 'apo', 'apologise', 'apologize', 'apology',
       'app', 'apparently', 'appeal', 'appear', 'appendix',
       'applausestore', 'applebees', 'application', 'apply',
       'appointment', 'appointments', 'appreciate', 'appreciated',
       'approaches', 'approaching', 'appropriate', 'approve', 'approx',
       'apps', 'appt', 'appy'], dtype=object)

In [None]:
dir(cv)

In [None]:
cv.vocabulary_

In [24]:
X_train_np = X_train_cv.toarray()
X_train_np[:4][0]

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
np.where(X_train_np[0]!=0)

(array([ 926,  977, 1095, 1282, 2195, 2287, 2507, 3141, 3200, 3580, 3656,
        3664, 3782, 3931, 4415, 4422, 4504, 4762, 4927, 5061, 5152, 5395,
        5436, 5719, 5914, 5929, 5947, 6327, 6860, 6864, 6912, 6983, 7557]),)

In [27]:
X_train[:4][1578]

'Remember all those whom i hurt during days of satanic imposter in me.need to pay a price,so be it.may destiny keep me going and as u said pray that i get the mind to get over the same.'

In [28]:
X_train_np[0][977]

1

In [29]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv,y_train)

In [30]:
X_test_cv = cv.transform(X_test)

In [32]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       977
           1       0.96      0.91      0.93       138

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [33]:
emails = [
    'Hey Harry, can we get together to watch ipl tommorow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = cv.transform(emails)
model.predict(emails_count)

array([0, 1])

In [34]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [35]:
clf.fit(X_train,y_train)

In [37]:
print(classification_report(y_test,clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       977
           1       0.96      0.91      0.93       138

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

