In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Location of the spam dataset workbook. This is a machine-specific absolute
# path, so it has to be adjusted when the notebook runs anywhere else.
DATA_PATH = r"C:\Users\Poojitha\Downloads\Spam Email Detection.xlsx"

# Read the workbook into a DataFrame
df = pd.read_excel(DATA_PATH)

In [3]:
# Give the generic spreadsheet headers descriptive names:
# 'v1' holds the class label and 'v2' holds the message text.
column_aliases = {'v1': 'label', 'v2': 'text'}
df = df.rename(columns=column_aliases)

In [4]:
# Preview the first rows as plain text to confirm the rename took effect
# and to see which extra columns came along from the spreadsheet.
print(df.head())

  label                                               text Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [5]:
# Print the (n_rows, n_columns) tuple of the loaded frame, before any cleaning
print(df.shape)

(5572, 5)


In [6]:
# List the column labels; besides 'label' and 'text' this reveals the
# leftover 'Unnamed: N' columns that are dropped further down.
print(df.columns)

Index(['label', 'text', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [7]:
# Drop exact duplicate rows. A row counts as a duplicate only when every
# column matches, and the 'Unnamed: N' columns are still present at this point.
df = df.drop_duplicates()

# Row/column counts after de-duplication
print(df.shape)

(5163, 5)


In [8]:
# Count missing values per column to see which columns carry usable data
print(df.isnull().sum())

label            0
text             0
Unnamed: 2    5120
Unnamed: 3    5153
Unnamed: 4    5158
dtype: int64


In [9]:
# Keep only the label and the message text; the 'Unnamed: N' columns are
# almost entirely missing (see the null counts above) and are discarded here.
df = df[['label', 'text']]

In [10]:
# Fetch NLTK's stopword corpus, which stopwords.words() in the text-processing
# function needs. The call is the cell's last expression, so its return value
# (True in the recorded run) is echoed as the cell result.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Poojitha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Text-cleaning analyzer: used for the preview cell and passed to
# CountVectorizer(analyzer=process), so it runs once per message.

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use. Loading is deferred so that defining this cell does not
# require the stopword corpus to be available yet.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def process(text):
    """Split a message into tokens with punctuation and stopwords removed.

    Steps:
      1. Delete every character listed in ``string.punctuation``.
      2. Split the remainder on whitespace.
      3. Drop tokens whose lowercase form is an English stopword.

    Surviving tokens keep their original casing ("Free" and "free" stay
    distinct). Because punctuation is stripped before the stopword lookup,
    a contraction such as "don't" is looked up as "dont".

    Parameters
    ----------
    text : object
        The message to tokenize. Anything that is not a ``str`` (for example
        a NaN cell) is treated as an empty message.

    Returns
    -------
    list of str
        The remaining tokens in their original order; ``[]`` for non-string
        input.
    """
    if not isinstance(text, str):
        return []

    # The original evaluated stopwords.words('english') once per token and
    # searched the resulting list; a cached frozenset gives the same answers
    # with a single load and constant-time membership tests.
    stop_words = _english_stopwords()

    nopunc = text.translate(_PUNCTUATION_TABLE)
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [12]:
# Sanity-check the text processing function on the first few messages only.
# The result is printed, not stored, so df itself is left unchanged.
print(df['text'].head().apply(process))

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: text, dtype: object


In [13]:
# Convert text data into a matrix of token counts, using process() as the
# analyzer so every message is tokenized with the cleaning rules defined above.
# NOTE(review): the vocabulary is fitted on the full dataset before the split
# below, so tokens that occur only in held-out rows still become feature
# columns. Consider fitting on the training rows only — confirm with the author.
vectorizer = CountVectorizer(analyzer=process)
message = vectorizer.fit_transform(df['text'])

# Split the data into training and testing sets: 20% of the rows are held out
# and random_state=0 fixes the shuffle so the split is repeatable.
# NOTE(review): no stratify= is passed although the classes look imbalanced
# in the training report — confirm whether a stratified split is wanted.
xtrain, xtest, ytrain, ytest = train_test_split(message, df['label'], test_size=0.20, random_state=0)
# Shape of the full count matrix built from every row of df['text']
print(message.shape)

(5163, 11300)


In [14]:
# Build the Multinomial Naive Bayes model, then fit it on the training split
classifier = MultinomialNB()
classifier.fit(xtrain, ytrain)

# Score the model on the same rows it was trained on. These figures measure
# fit to the training data, not performance on unseen messages.
pred_train = classifier.predict(xtrain)
train_report = classification_report(ytrain, pred_train)
train_confusion = confusion_matrix(ytrain, pred_train)
train_accuracy = accuracy_score(ytrain, pred_train)

print(train_report)
print()
print("Confusion Matrix (Training): \n", train_confusion)
print("Accuracy (Training): \n", train_accuracy)

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3616
        spam       0.98      0.98      0.98       514

    accuracy                           1.00      4130
   macro avg       0.99      0.99      0.99      4130
weighted avg       1.00      1.00      1.00      4130


Confusion Matrix (Training): 
 [[3607    9]
 [  10  504]]
Accuracy (Training): 
 0.9953995157384988
