## Import Libraries

In [66]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

[nltk_data] Downloading package stopwords to C:\Users\IP
[nltk_data]     G3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\IP
[nltk_data]     G3\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Dataset

In [4]:
data = pd.read_csv(r"C:\Users\IP G3\Downloads\email.csv")

## Data Cleaning & Exploration

In [6]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
print("Dataset Shape:", data.shape)

Dataset Shape: (5573, 2)


In [10]:
print("\n Column Names:")
print(data.columns)


 Column Names:
Index(['Category', 'Message'], dtype='object')


In [12]:
print("=== Missing Values===")
print(data.isnull().sum())

=== Missing Values===
Category    0
Message     0
dtype: int64


In [14]:
def clean_text(text):
    # convert to lowercase
    text = text.lower()

    # Remove Special Characters, numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove Extra Whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize and Remove Stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]

    # Apply Stemming
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

In [16]:
data['Cleaned_Message'] = data['Message'].apply(clean_text)

In [17]:
print(data['Cleaned_Message'].head())

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts st ...
3                  u dun say early hor u c already say
4             nah dont think go usf life around though
Name: Cleaned_Message, dtype: object


In [20]:
print(data['Message'].head())

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object


## Model Training

In [72]:
X_train_text, X_test_text, Y_train, Y_test = train_test_split(data['Cleaned_Message'], data['Category'], random_state = 42, test_size = 0.2)

In [74]:
Vectorizer = TfidfVectorizer()

In [80]:
X_train = Vectorizer.fit_transform(X_train_text)
X_test = Vectorizer.transform(X_test_text)

In [82]:
X_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [84]:
model = MultinomialNB()
model.fit(X_train, Y_train)

In [86]:
y_pred = model.predict(X_test)

In [88]:
accuracy_score(Y_test, y_pred)

0.9650224215246637

In [90]:
confusion_matrix(Y_test, y_pred)

array([[957,   1],
       [ 38, 119]], dtype=int64)

In [92]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       958
        spam       0.99      0.76      0.86       157

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

