# Ham or Spam?

## Imports

In [1]:
import nltk
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
import numpy as np

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Buga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Buga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Buga\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Buga\AppData\Roaming\nltk_data...


True

In [3]:
df = pd.read_csv('https://wagon-public-datasets.s3.amazonaws.com/05-Machine-Learning/10-Natural-Language-Processing/ham_spam_emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


## Cleaning the (text) dataset

### Remove Punctuation

In [4]:
string.punctuation

for punctuation in string.punctuation:
    remove_punctuation = lambda text : text.replace(punctuation, "")
    df['clean_text'] = df['text'].apply(remove_punctuation)
    
df

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,Subject: naturally irresistible your corporate...
1,Subject: the stock trading gunslinger fanny i...,1,Subject: the stock trading gunslinger fanny i...
2,Subject: unbelievable new homes made easy im ...,1,Subject: unbelievable new homes made easy im ...
3,Subject: 4 color printing special request add...,1,Subject: 4 color printing special request add...
4,"Subject: do not have money , get software cds ...",1,"Subject: do not have money , get software cds ..."
...,...,...,...
5723,Subject: re : research and development charges...,0,Subject: re : research and development charges...
5724,"Subject: re : receipts from visit jim , than...",0,"Subject: re : receipts from visit jim , than..."
5725,Subject: re : enron case study update wow ! a...,0,Subject: re : enron case study update wow ! a...
5726,"Subject: re : interest david , please , call...",0,"Subject: re : interest david , please , call..."


### Lower Case

In [5]:
lower_case = lambda text : text.lower()
df['clean_text'] = df['clean_text'].apply(lower_case)
df

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject: naturally irresistible your corporate...
1,Subject: the stock trading gunslinger fanny i...,1,subject: the stock trading gunslinger fanny i...
2,Subject: unbelievable new homes made easy im ...,1,subject: unbelievable new homes made easy im ...
3,Subject: 4 color printing special request add...,1,subject: 4 color printing special request add...
4,"Subject: do not have money , get software cds ...",1,"subject: do not have money , get software cds ..."
...,...,...,...
5723,Subject: re : research and development charges...,0,subject: re : research and development charges...
5724,"Subject: re : receipts from visit jim , than...",0,"subject: re : receipts from visit jim , than..."
5725,Subject: re : enron case study update wow ! a...,0,subject: re : enron case study update wow ! a...
5726,"Subject: re : interest david , please , call...",0,"subject: re : interest david , please , call..."


### Remove Numbers

In [6]:
remove_numbers = lambda text : ''.join(char for char in text if not char.isdigit())
df['clean_text'] = df['clean_text'].apply(remove_numbers)
df

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject: naturally irresistible your corporate...
1,Subject: the stock trading gunslinger fanny i...,1,subject: the stock trading gunslinger fanny i...
2,Subject: unbelievable new homes made easy im ...,1,subject: unbelievable new homes made easy im ...
3,Subject: 4 color printing special request add...,1,subject: color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,"subject: do not have money , get software cds ..."
...,...,...,...
5723,Subject: re : research and development charges...,0,subject: re : research and development charges...
5724,"Subject: re : receipts from visit jim , than...",0,"subject: re : receipts from visit jim , than..."
5725,Subject: re : enron case study update wow ! a...,0,subject: re : enron case study update wow ! a...
5726,"Subject: re : interest david , please , call...",0,"subject: re : interest david , please , call..."


### Remove StopWords

In [7]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words]
    return ' '.join(filtered_words)

df['clean_text'] = df['clean_text'].apply(remove_stopwords)
df

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject : naturally irresistible your corporat...
1,Subject: the stock trading gunslinger fanny i...,1,subject : the stock trading gunslinger fanny i...
2,Subject: unbelievable new homes made easy im ...,1,subject : unbelievable new homes made easy im ...
3,Subject: 4 color printing special request add...,1,subject : color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,"subject : do not have money , get software cds..."
...,...,...,...
5723,Subject: re : research and development charges...,0,subject : re : research and development charge...
5724,"Subject: re : receipts from visit jim , than...",0,"subject : re : receipts from visit jim , thank..."
5725,Subject: re : enron case study update wow ! a...,0,subject : re : enron case study update wow ! a...
5726,"Subject: re : interest david , please , call...",0,"subject : re : interest david , please , call ..."


### Lemmatize

In [8]:
def leammatize_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(lemmatized_words)

df['clean_text'] = df['clean_text'].apply(leammatize_text)
df

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject : naturally irresistible your corporat...
1,Subject: the stock trading gunslinger fanny i...,1,subject : the stock trading gunslinger fanny i...
2,Subject: unbelievable new homes made easy im ...,1,subject : unbelievable new home made easy im w...
3,Subject: 4 color printing special request add...,1,subject : color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,"subject : do not have money , get software cd ..."
...,...,...,...
5723,Subject: re : research and development charges...,0,subject : re : research and development charge...
5724,"Subject: re : receipts from visit jim , than...",0,"subject : re : receipt from visit jim , thanks..."
5725,Subject: re : enron case study update wow ! a...,0,subject : re : enron case study update wow ! a...
5726,"Subject: re : interest david , please , call...",0,"subject : re : interest david , please , call ..."


## Bag-of-words Modelling

### Digitizing the textual data into numbers

In [9]:
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['clean_text'])
X_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Multinomial Naive Bayes Modelling

In [10]:
model = MultinomialNB()

scores = cross_val_score(
    model,
    X_bow,
    df['spam'],
    cv=5,
    scoring='accuracy').mean()
np.round(scores, 2)

0.99