In [56]:
# Impor library yang diperlukan
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [57]:
# Baca dataset
df = pd.read_csv('spam_email.csv')
df.head(20)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [58]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [59]:

df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [60]:
# Pisahkan fitur dan label
X = df['Message']
y = df['Category']
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [61]:
# Pra-pemrosesan data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(X)

In [62]:
# Bagi dataset menjadi subset pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y

(<4457x8440 sparse matrix of type '<class 'numpy.float64'>'
 	with 34840 stored elements in Compressed Sparse Row format>,
 <1115x8440 sparse matrix of type '<class 'numpy.float64'>'
 	with 8689 stored elements in Compressed Sparse Row format>,
 1978    spam
 3989     ham
 3935     ham
 4078     ham
 4086    spam
         ... 
 3772     ham
 5191     ham
 5226     ham
 5390     ham
 860      ham
 Name: Category, Length: 4457, dtype: object,
 0        ham
 1        ham
 2       spam
 3        ham
 4        ham
         ... 
 5567    spam
 5568     ham
 5569     ham
 5570     ham
 5571     ham
 Name: Category, Length: 5572, dtype: object)

In [63]:
# Lakukan Pemrosesan Data Menggunakan Model Yang Digunakan Untuk Mengetahui Category Email
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# SVM
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

In [64]:
# Evaluasi model
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label='spam')
    recall = recall_score(y_true, y_pred, pos_label='spam')
    f1 = f1_score(y_true, y_pred, pos_label='spam')
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

In [65]:
print("Naive Bayes Evaluation:")
evaluate_model(y_test, nb_pred)

print("\nSVM Evaluation:")
evaluate_model(y_test, svm_pred)

print("\nRandom Forest Evaluation:")
evaluate_model(y_test, rf_pred)

Naive Bayes Evaluation:
Accuracy: 0.9775784753363229
Precision: 1.0
Recall: 0.8322147651006712
F1-score: 0.9084249084249084

SVM Evaluation:
Accuracy: 0.9775784753363229
Precision: 1.0
Recall: 0.8322147651006712
F1-score: 0.9084249084249084

Random Forest Evaluation:
Accuracy: 0.9820627802690582
Precision: 1.0
Recall: 0.8657718120805369
F1-score: 0.9280575539568345


In [66]:
# Contoh prediksi pada data baru
new_email = ["Dear valued customer, Congratulations! You have been selected to receive an exclusive offer on our premium collection of luxury watches. For a limited time only, we are offering a massive 50% discount on all our exquisite timepieces. Imagine adorning your wrist with a timeless masterpiece that exudes elegance and sophistication. Our watches are crafted with precision and attention to detail, using only the finest materials. Whether you prefer a classic design or a modern statement piece, we have the perfect watch to complement your style. To take advantage of this incredible offer, simply click on the link below to browse our stunning collection and use the discount code LUX50 at checkout. But hurry, as this offer is valid for a limited time only!"]
new_email = vectorizer.transform(new_email)

nb_pred_new = nb_model.predict(new_email)
svm_pred_new = svm_model.predict(new_email)
rf_pred_new = rf_model.predict(new_email)

print("\nNew Email Classification (Naive Bayes):", nb_pred_new[0])
print("New Email Classification (SVM):", svm_pred_new[0])
print("New Email Classification (Random Forest):", rf_pred_new[0])


New Email Classification (Naive Bayes): spam
New Email Classification (SVM): ham
New Email Classification (Random Forest): spam
