In [27]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import RandomOverSampler

In [28]:
data = pd.read_csv("C:/Users/faru0/Downloads/emails.csv/emails.csv")

In [37]:
data.head()

Unnamed: 0,text,spam
0,subject : naturally irresistible corporate ide...,1
1,subject : stock trading gunslinger fanny merri...,1
2,subject : unbelievable new home made easy im w...,1
3,subject : 4 color printing special request add...,1
4,"subject : money , get software cd ! software c...",1


In [30]:
def preprocess_text(text):
    text = re.sub('<.*?>', '', text)
    text = re.sub('[\s]+', ' ', text)
    text = text.lower()
    return text


In [31]:
data['text'] = data['text'].apply(preprocess_text)

In [32]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def tokenize_and_preprocess(text):
    tokens = nltk.word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

data['text'] = data['text'].apply(tokenize_and_preprocess)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\faru0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
data['text'] = data['text'].apply(lambda x: ' '.join(x))

In [34]:
vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(data['text'])

In [35]:
X = features.toarray()
y = np.where(data['spam'] == 'spam', 1, 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [39]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [40]:
y_pred = classifier.predict(X_test)
y_pred_binary = np.where(y_pred > 0.5, 1, 0)

In [41]:
print(classification_report(y_test, y_pred_binary))
print(confusion_matrix(y_test, y_pred_binary))
print(f'Accuracy: {accuracy_score(y_test, y_pred_binary)}')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1146

    accuracy                           1.00      1146
   macro avg       1.00      1.00      1.00      1146
weighted avg       1.00      1.00      1.00      1146

[[1146]]
Accuracy: 1.0
