#### Importing Required Libraries

In [33]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


#### Downloading Required Data for NLTK

In [34]:
# Make sure the English stop-word corpus used during preprocessing is available
# locally; NLTK reports the package as up to date when it is already present.
required_corpus = 'stopwords'
nltk.download(required_corpus)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Loading the Dataset

In [38]:
# Read the labelled e-mail dataset; the path is relative to the notebook's
# working directory.
DATA_PATH = 'email.csv'
messages = pd.read_csv(DATA_PATH)

In [42]:
# Preview the first five rows to confirm the Category / Message columns loaded.
messages.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Preparing Tools for Preprocessing

In [39]:
# Accumulators filled by the preprocessing cell:
#   corpus        -> cleaned text, one string per retained message
#   valid_indices -> row positions of the messages that were retained
corpus, valid_indices = [], []

# Word normalisers. The preprocessing loop uses the stemmer; the lemmatizer is
# not referenced by any cell visible in this notebook.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


#### Preprocessing Each Email

In [40]:
# Build the cleaned corpus: strip non-letters, lowercase, drop English stop
# words and stem what remains. Messages that end up empty are dropped, and the
# row position of every kept message is recorded so labels can be aligned later.

# Start from empty lists so re-running this cell does not append duplicates.
corpus = []
valid_indices = []

# Load the stop-word list once; a set gives constant-time membership checks
# instead of rebuilding and scanning the list for every single token.
english_stopwords = set(stopwords.words('english'))

# enumerate() yields positional row numbers, matching the later `.iloc` lookup
# regardless of the DataFrame's index labels.
for i, text in enumerate(messages['Message']):
    if not isinstance(text, str):
        # Missing / non-text cells (e.g. NaN) cannot be cleaned; skip the row.
        continue
    review = re.sub('[^a-zA-Z]', ' ', text)  # Remove everything except letters
    review = review.lower().split()  # Lowercase, then break text into words
    review = [stemmer.stem(word) for word in review if word not in english_stopwords]
    if review:  # Keep only non-empty results
        valid_indices.append(i)  # Row position of the retained message
        corpus.append(' '.join(review))  # Cleaned text for the vectorizer


#### Filtering the Dataset

In [41]:
# Keep only the rows whose cleaned text was non-empty (selected by position),
# with every column retained.
messages_filtered = messages.iloc[valid_indices, :]


#### Preparing the Target Labels

In [53]:
# Binary target vector: 1 for spam, 0 otherwise. Comparing against the label
# explicitly avoids depending on the alphabetical column order produced by
# get_dummies and yields a plain integer array on every pandas version.
y = (messages_filtered['Category'] == 'spam').astype(int).values


#### Converting Text to Numbers

In [54]:
# Bag-of-words features capped at 5000 vocabulary entries.
cv = CountVectorizer(max_features=5000)
term_counts = cv.fit_transform(corpus)  # sparse document-term matrix
X = term_counts.toarray()  # dense array handed to the split / model cells


#### Splitting Data into Training and Testing

In [55]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)


#### Training the Model

In [56]:
# Multinomial Naive Bayes on the word-count features, with additive smoothing
# of 0.8.
smoothing = 0.8
mnb = MultinomialNB(alpha=smoothing)
mnb.fit(X=X_train, y=y_train)


#### Making Predictions and Checking Accuracy

In [57]:
# Score the held-out set and report the fraction of correctly classified messages.
y_pred_mnb = mnb.predict(X_test)
mnb_acc = accuracy_score(y_true=y_test, y_pred=y_pred_mnb)
print("MNB Accuracy:", mnb_acc)


MNB Accuracy: 0.9829290206648698


#### Testing with a Sample Message

In [58]:
# Classify a hand-written sample message with the trained model.
message = ('Congratulations! You have been selected to win a brand-new iPhone 15. '
           'Click the link below to claim your prize now: www.fakeprizeexample.com '
           'This is a limited-time offer, so act fast. Offer expires in 24 hours.')

# The vectorizer's vocabulary was learned from cleaned text (letters only,
# lowercased, stop words removed, stemmed). Apply the same steps here so the
# sample's tokens can actually match the learned features.
english_stopwords = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
cleaned_message = ' '.join(
    stemmer.stem(word) for word in tokens if word not in english_stopwords
)

data = [cleaned_message]
vect = cv.transform(data).toarray()
my_prediction = mnb.predict(vect)


#### Printing the Prediction

In [59]:
# Label 0 is ham; any other predicted label is reported as spam.
verdict = "It's a ham" if my_prediction[0] == 0 else "It's a spam"
print(verdict)


It's a spam
