# **Spam Ham Classification using NLP**

In [2]:
# Importing the libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import warnings
warnings.filterwarnings('ignore')

### **1. Exploring data**

In [4]:
emails = pd.read_csv("data/email_classification.csv")
emails

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham
...,...,...
174,We're pleased to inform you that your refund h...,ham
175,Get rich quick! Invest in our revolutionary ne...,spam
176,Your free trial period is ending soon. Upgrade...,ham
177,Your order is on its way! Track your shipment ...,ham


In [5]:
emails.shape

(179, 2)

In [6]:
emails['email'].loc[4]

'Your opinion matters! Take our survey and help us enhance your experience.'

In [7]:
emails['label'].value_counts()

label
ham     100
spam     79
Name: count, dtype: int64

### **2. Removing Stopwords and Moving forward with Lemmatization**

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
lemmatizer = WordNetLemmatizer()

In [11]:
corpus_emails = []

for i in range(len(emails)):
  email = re.sub('[^a-zA-Z]', ' ', emails['email'][i])
  email = email.lower()
  email = email.split()

  email = [lemmatizer.lemmatize(word) for word in email if not word in stopwords.words('english')]
  email = ' '.join(email)
  corpus_emails.append(email)

In [12]:
corpus_emails[0]

'upgrade premium plan exclusive access premium content feature'

### **3. Implementing Bag of Words**

In [13]:
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2, 2))
X = cv.fit_transform(corpus_emails).toarray()

In [14]:
X.shape

(179, 693)

In [15]:
y = pd.get_dummies(emails['label'])
y = y.iloc[:, 1].values

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
spam_detect_model_bg = MultinomialNB().fit(X_train, y_train)

In [18]:
y_pred = spam_detect_model_bg.predict(X_test)

In [19]:
score_bg = accuracy_score(y_test, y_pred)
print(score_bg)

0.75


In [20]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       False       0.55      1.00      0.71        11
        True       1.00      0.64      0.78        25

    accuracy                           0.75        36
   macro avg       0.78      0.82      0.75        36
weighted avg       0.86      0.75      0.76        36



### **4. Implementing TF-IDF Model**

In [21]:
tv = TfidfVectorizer(max_features=2500, ngram_range=(2, 2))
X = tv.fit_transform(corpus_emails).toarray()

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [23]:
spam_detect_model_tfidf = MultinomialNB().fit(X_train, y_train)

In [24]:
y_pred = spam_detect_model_tfidf.predict(X_test)

In [25]:
score_tfidf = accuracy_score(y_test, y_pred)
print(score_tfidf)

0.9166666666666666


In [26]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       False       0.95      0.90      0.93        21
        True       0.88      0.93      0.90        15

    accuracy                           0.92        36
   macro avg       0.91      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36

