# Preparation

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from IPython.display import display

# 1. Load Data

In [2]:
# Read the raw SMS spam dataset from the shared datasets folder.
# NOTE(review): latin-1 is presumably needed because the file is not valid
# UTF-8 - confirm against the source file.
df = pd.read_csv(
    filepath_or_buffer='../../datasets/spam.csv',
    encoding='latin-1',
)

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


# 2. Preprocessing

## 2.1 Drop Columns

In [3]:
# Keep only the first two columns (label and message text); every column
# from position 2 onwards is discarded.
df = df.drop(columns=df.columns[2:])
df.head(n=10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


## 2.2 Data Inspection

In [4]:
# Give the two remaining columns descriptive names (the raw file calls them
# v1 / v2).
df = df.rename(columns={'v1': 'Labels', 'v2': 'SMS'})

# Class balance: how many messages carry each label.
display(df['Labels'].value_counts())
print('\n')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" below it.
df.info()
print('\n')

# Summary statistics of the columns (count / unique / top / freq).
display(df.describe())

Labels
ham     4825
spam     747
Name: count, dtype: int64



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Labels  5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


None





Unnamed: 0,Labels,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


## 2.3 Label Encoding

Mengubah label dari bentuk kategorikal (`ham` / `spam`) menjadi numerik (`0` / `1`) agar dapat diproses oleh model.

In [5]:
# Numeric code for each class label.
label_to_id = {
    'ham': 0,
    'spam': 1,
}

# Encode only while the column still holds the raw string labels. Series.map
# turns every value missing from the mapping into NaN, so re-running this cell
# on an already encoded column would silently wipe out all labels.
if not pd.api.types.is_integer_dtype(df['Labels']):
    encoded = df['Labels'].map(label_to_id)

    # Fail loudly on labels outside the mapping instead of carrying NaN
    # targets into model training.
    unknown = df.loc[encoded.isna(), 'Labels'].unique()
    if len(unknown) > 0:
        raise ValueError(f'Unexpected label values: {unknown.tolist()}')

    df['Labels'] = encoded.astype('int64')

df.head(15)

Unnamed: 0,Labels,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


## 2.4 Split Features and Label

In [6]:
# Pull the message texts (features) and their labels (target) out of the
# frame as plain arrays.
X, y = (df[column].values for column in ('SMS', 'Labels'))

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=30,
    test_size=0.2,
)

# 3. Feature Extraction

In [None]:
bow = CountVectorizer()

# The vocabulary is learned from the training messages only; the test
# messages are merely transformed with that same vocabulary.
# NOTE: X_train / X_test are rebound here from the raw inputs to the
# vectorised matrices, so re-run the train/test split cell before re-running
# this one.
X_train, X_test = bow.fit_transform(X_train), bow.transform(X_test)

In [9]:
# Report the vocabulary size and the shape of the training matrix.
print(
    f'Total kata unik: {bow.get_feature_names_out().size}',
    f'Dimensi data: {X_train.shape}',
    sep='\n',
)

Total kata unik: 7694
Dimensi data: (4457, 7694)


In [None]:
# Wrap the bag-of-words matrix in a DataFrame for inspection while keeping it
# sparse. Calling .toarray() on the whole training matrix would allocate one
# dense cell per (message, word) pair, almost all of them zero, just to look
# at a handful of rows.
X_train_df = pd.DataFrame.sparse.from_spmatrix(
    X_train,
    columns=bow.get_feature_names_out(),
)

# Show only first 5 rows
X_train_df.head()

Unnamed: 0,00,000,008704050406,0089,01223585236,01223585334,02,0207,02072069400,02073162414,...,åômorrow,ì_,ì¼1,ìä,ìï,û_,û_thanks,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 4. Model Training and Evaluation

In [10]:
# Fit a multinomial Naive Bayes classifier on the vectorised training data.
mnb = MultinomialNB().fit(X_train, y_train)

# Evaluate model on train and test data.
# The loop variables are deliberately not named X / y: those notebook-level
# names hold the full feature and label arrays, and rebinding them here would
# break re-running the train/test split cell.
for name, features, labels in [("train", X_train, y_train), ("test", X_test, y_test)]:
    accuracy = accuracy_score(labels, mnb.predict(features))
    print(f"Akurasi data {name}: {accuracy:.4f}")

Akurasi data train: 0.9930
Akurasi data test: 0.9848


# 5. Testing

In [13]:
# Two hand-written messages to sanity-check the trained classifier.
new_sms = [
    "Free tickets to Bali! Click this link to claim your prize!",
    "Hey, are we still meeting later for lunch?",
]

# Vectorise with the already fitted vectorizer so the columns line up with
# what the model was trained on.
new_sms_bow = bow.transform(new_sms)

# Classify each message (label 1 is reported as SPAM, anything else as HAM).
predictions = mnb.predict(new_sms_bow)

# Print every message followed by its predicted class.
for text, label in zip(new_sms, predictions):
    verdict = 'SPAM' if label == 1 else 'HAM'
    print(f"📩 {text}")
    print(f"➡️  Predicted as: {verdict}\n")

📩 Free tickets to Bali! Click this link to claim your prize!
➡️  Predicted as: SPAM

📩 Hey, are we still meeting later for lunch?
➡️  Predicted as: HAM



# 6. Conclusion

Pada percobaan kali ini digunakan model Multinomial Naive Bayes yang dikombinasikan dengan CountVectorizer untuk mengubah data SMS yang awalnya berupa string atau teks menjadi angka dengan menggunakan teknik **Bag-of-Words**. Model multinomial ini cocok karena bekerja dengan jumlah kemunculan setiap kata, bukan huruf, sehingga model dapat memanfaatkan informasi berapa kali sebuah kata muncul pada sebuah teks. Jika dibandingkan dengan varian Naive Bayes yang lain (misalnya Gaussian atau Bernoulli), model multinomial inilah yang paling cocok untuk fitur berupa frekuensi kata.