In [1]:
import pandas as pd
import numpy as np 

In [2]:
df = pd.read_csv('../dataset/spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df.shape

(5572, 2)

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:4]

3922    Do 1 thing! Change that sentence into: "Becaus...
3918    Eh ur laptop got no stock lei... He say mon mu...
3778    Mila, age23, blonde, new in UK. I look sex wit...
2175                        See? I thought it all through
Name: Message, dtype: object

In [14]:
y_train[:4]

3922    0
3918    0
3778    1
2175    0
Name: spam, dtype: int64

In [15]:
type(X_train.values)

numpy.ndarray

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values) # count vectorizer for x_train


In [17]:
X_train_cv.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
X_train_cv.shape 

(4457, 7769)

In [19]:
v.get_feature_names_out()[1294]

'bed'

In [20]:
v.vocabulary_

{'do': 2411,
 'thing': 6870,
 'change': 1742,
 'that': 6836,
 'sentence': 6043,
 'into': 3762,
 'because': 1288,
 'want': 7405,
 'concentrate': 1969,
 'in': 3686,
 'my': 4704,
 'educational': 2575,
 'career': 1676,
 'im': 3662,
 'leaving': 4103,
 'here': 3472,
 'eh': 2592,
 'ur': 7227,
 'laptop': 4053,
 'got': 3257,
 'no': 4835,
 'stock': 6505,
 'lei': 4113,
 'he': 3425,
 'say': 5946,
 'mon': 4606,
 'muz': 4702,
 'come': 1927,
 'again': 860,
 'to': 6962,
 'take': 6717,
 'look': 4236,
 'not': 4866,
 'mila': 4520,
 'age23': 865,
 'blonde': 1412,
 'new': 4796,
 'uk': 7151,
 'sex': 6067,
 'with': 7575,
 'guys': 3340,
 'if': 3649,
 'like': 4151,
 'fun': 3094,
 'me': 4452,
 'text': 6813,
 'mtalk': 4666,
 '69866': 597,
 '18': 325,
 '30pp': 439,
 'txt': 7125,
 '1st': 343,
 '5free': 563,
 '50': 539,
 'increments': 3701,
 'help08718728876': 3462,
 'see': 6005,
 'thought': 6891,
 'it': 3803,
 'all': 913,
 'through': 6901,
 'how': 3573,
 'are': 1051,
 'you': 7732,
 'miss': 4551,
 'other': 5036,
 '

In [21]:
X_train_np = X_train_cv.toarray()

X_train_np[0] #fisrt email

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
np.where(X_train_np[0]!=0)

(array([1288, 1676, 1742, 1969, 2411, 2575, 3472, 3662, 3686, 3762, 4103,
        4704, 6043, 6836, 6870, 7405], dtype=int64),)

In [24]:
X_train_np[0][969]

0

#### Modeling

In [25]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)


In [26]:
X_test_cv = v.transform(X_test)

In [27]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       944
           1       0.96      0.93      0.94       171

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [28]:
emails = [
    'Hey you, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

In [29]:
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [30]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [31]:
clf.fit(X_train, y_train)

In [32]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       944
           1       0.96      0.93      0.94       171

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [33]:
import pickle

# Simpan model ke file
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
