In [38]:
import numpy as np
import pandas as pd 
import warnings
# Silence every warning for the rest of the kernel session. This keeps cell output
# tidy, but it also hides deprecation notices raised by any library used below.
warnings.filterwarnings('ignore')


In [2]:
# Load the labelled messages; the path is relative to the notebook's working directory.
df = pd.read_csv("data/spam.csv")

# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Class balance: number of messages per label.
df["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
# Binary target: 1 when the category is exactly "spam", 0 for anything else.
# A vectorised comparison replaces the per-row Python lambda; the explicit int64
# cast keeps the same dtype the original .apply() produced.
df["spam"] = (df["Category"] == "spam").astype("int64")

In [5]:
# Re-inspect the frame now that the numeric `spam` column has been added.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"],
    df["spam"],
    test_size=0.3,
    random_state=42,
)

In [7]:
# Number of messages in the training split.
X_train.shape

(3900,)

In [8]:
# Number of messages in the held-out test split.
X_test.shape

(1672,)

In [9]:
# Check which container type the split returned for the message texts.
type(X_train)

pandas.core.series.Series

In [10]:
# First four training messages; the index shows the original row labels after shuffling.
X_train.head(4)

708     Quite late lar... Ard 12 anyway i wun b drivin...
4338                        on a Tuesday night r u 4 real
5029    Go chase after her and run her over while she'...
4921     G says you never answer your texts, confirm/deny
Name: Message, dtype: object

In [11]:
# Show the first training message in full. Positional access avoids both the
# chained indexing and the hard-coded row label (708), which is only present
# because of the particular shuffle produced by the split's random seed.
X_train.iloc[0]

'Quite late lar... Ard 12 anyway i wun b drivin...'

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder: the vocabulary is learned from the training messages only.
v = CountVectorizer()

# Learn the vocabulary and encode the training texts in a single pass.
train_messages = X_train.values
X_train_cv = v.fit_transform(train_messages)

In [19]:
# Peek at the first two encoded rows. Slice the sparse matrix first so that only
# those two rows are densified, rather than materialising the whole
# document-term matrix just to discard most of it.
X_train_cv[:2].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
# (number of training messages, number of vocabulary terms)
X_train_cv.shape

(3900, 7262)

In [23]:
# One feature name per column of the encoded matrix.
v.get_feature_names_out().shape

(7262,)

In [26]:
# Mapping from each learned token to its column index in the encoded matrix.
# NOTE(review): this displays the entire mapping (one entry per vocabulary term);
# consider showing only a small slice to keep the notebook output short.
v.vocabulary_

{'quite': 5196,
 'late': 3755,
 'lar': 3749,
 'ard': 970,
 '12': 260,
 'anyway': 932,
 'wun': 7161,
 'drivin': 2281,
 'on': 4621,
 'tuesday': 6618,
 'night': 4472,
 'real': 5260,
 'go': 2949,
 'chase': 1610,
 'after': 792,
 'her': 3180,
 'and': 890,
 'run': 5493,
 'over': 4709,
 'while': 7014,
 'she': 5707,
 'crossing': 1937,
 'the': 6373,
 'street': 6103,
 'says': 5566,
 'you': 7226,
 'never': 4449,
 'answer': 910,
 'your': 7231,
 'texts': 6355,
 'confirm': 1816,
 'deny': 2089,
 'still': 6077,
 'work': 7117,
 'going': 2960,
 'it': 3498,
 'is': 3486,
 'very': 6813,
 'small': 5861,
 'house': 3274,
 'think': 6401,
 'could': 1884,
 'stop': 6087,
 'by': 1470,
 'in': 3384,
 'like': 3837,
 'an': 887,
 'hour': 3271,
 'or': 4662,
 'so': 5896,
 'my': 4371,
 'roommate': 5461,
 'looking': 3918,
 'to': 6485,
 'stock': 6078,
 'up': 6726,
 'for': 2732,
 'trip': 6588,
 'lol': 3904,
 'great': 3017,
 'now': 4532,
 'im': 3362,
 'getting': 2912,
 'hungry': 3308,
 'that': 6370,
 'would': 7134,
 'be': 1168

In [28]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train_cv, y_train)

# Display the fitted estimator as the cell output.
model

In [29]:
# Encode the test messages with the vocabulary learned on the training split
# (transform only; the vectorizer is not refitted on test data).
X_test_cv = v.transform(X_test)

In [31]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Score the held-out split, then print the per-class metrics followed by the confusion matrix.
y_pred = model.predict(X_test_cv)
for summary in (
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
):
    print(summary)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1448
           1       0.98      0.95      0.96       224

    accuracy                           0.99      1672
   macro avg       0.99      0.97      0.98      1672
weighted avg       0.99      0.99      0.99      1672

[[1444    4]
 [  12  212]]


In [33]:
# Two hand-written messages used as a quick sanity check of the fitted model.
emails = [
    "I am sure you will the get the job",
    "Upto 20% off on parking. Exclusive offer for today. Spin and claim reward of a new iphone 15!",
]

# Encode with the already-fitted vectorizer (transform only), then predict a label per message.
emails_cv = v.transform(emails)
model.predict(emails_cv)

array([0, 1])

In [34]:
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so raw text can be passed straight to fit/predict.
pipeline_steps = [
    ("vect", CountVectorizer()),
    ("nb", MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [35]:
# Fit the whole pipeline on the raw training text; vectorisation happens inside the pipeline.
clf.fit(X_train, y_train)

In [37]:
# Predict directly from raw test text; the pipeline applies its own vectorizer.
y_pred = clf.predict(X_test)

# Same two summaries as for the standalone model, printed one after the other.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report, matrix, sep="\n")

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1448
           1       0.98      0.95      0.96       224

    accuracy                           0.99      1672
   macro avg       0.99      0.97      0.98      1672
weighted avg       0.99      0.99      0.99      1672

[[1444    4]
 [  12  212]]
