# Spam or Ham

## Final Project - Intro to Deep Learning
## BGU Winter 2021

#NLTK's Naive Bayes Classifier

In [14]:
import os
from os import walk
from string import punctuation
from random import shuffle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import sklearn as sk

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

path_route = os.walk('/content/gdrive/My Drive/deep_learning/FinalProject/data')

Mounted at /content/gdrive


In [5]:
hamData, spamData = [], []

for root, directory, file in path_route:
  if 'ham' in str(file):
        for obj in file:
            with open(root + '/' + obj, encoding='latin1') as ip:
                hamData.append(" ".join(ip.readlines()))
  elif 'spam' in str(file):
        for obj in file:
            with open(root + '/' + obj, encoding='latin1') as ip:
                spamData.append(" ".join(ip.readlines()))

In [19]:




# remove all redundent data

allHamData = list(set(hamData))
allSpamData = list(set(spamData))

# storing it in a dataframe

hamPlusSpamData = allHamData + allSpamData
labels = ["ham"]*len(allHamData) + ["spam"]*len(allSpamData)

raw_df = pd.DataFrame({"email": hamPlusSpamData, 
                       "label": labels})

# checking how it looks

raw_df.sample(5)



'spam'

In [20]:
# creating a preprocessing function
# to tokenize and lemmatize the data using NLTK library

def preprocess(data):
    # tokenization
    tokens = nltk.word_tokenize(data)
    tokens = [w.lower() for w in tokens if w.isalpha()]

    # finding uncommon words
    cnt = Counter(tokens)
    uncommons = cnt.most_common()[:-int(len(cnt)*0.1):-1]
    
    # listing stopwords from NLTK
    stops = set(nltk.corpus.stopwords.words('english'))

    # removing stop words and uncommon words
    tokens = [w for w in tokens if (w not in stops and w not in uncommons)]

    # lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w, pos='a') for w in tokens]

    return tokens


# pre-processing the emails
# using word_tokenize() and WordNetLemmatizer()

nltk_processed_df = pd.DataFrame()
nltk_processed_df['email'] = [preprocess(e) for e in raw_df.email]

# label encoding the labels

label_encoder = sk.preprocessing.LabelEncoder()
nltk_processed_df['label'] = label_encoder.fit_transform(raw_df.label)

# checking how the processed data looks like

nltk_processed_df.sample(5)


1

In [26]:
raw_df.label[25526]

'spam'

In [None]:

raw_df.email[25526]


In [23]:

nltk_processed_df.label[25526]


1

In [None]:
nltk_processed_df.email[25526]

In [27]:
# converting categorical email data to numerical data using Counters

X, y = nltk_processed_df.email, nltk_processed_df.label
X_featurized = [Counter(i) for i in X]

In [28]:
# getting the data ready for NaiveBayesClassifier 
# randomizing using shuffle
# manually splitting into test and train data

allDataProcessed = [(X_featurized[i], y[i]) for i in range(len(X))]

shuffle(allDataProcessed)

trainData, testData = allDataProcessed[:int(len(allDataProcessed)*0.7)], allDataProcessed[int(len(allDataProcessed)*0.7):]

In [29]:
# Training the model

model_nltkNaiveBayes = nltk.classify.NaiveBayesClassifier.train(trainData)

In [30]:
# Testing the model

testing_accuracy = nltk.classify.accuracy(model_nltkNaiveBayes, testData)
print("Accuracy with NLTK's Naive Bayes classifier is:", testing_accuracy)

Accuracy with NLTK's Naive Bayes classifier is: 0.9889605421357526


#Scikit-learn's Multinomial Naive Bayes Classifier

In [31]:
# Vectorize the features using CountVectorizer

cv_vec = sk.feature_extraction.text.CountVectorizer(tokenizer = nltk.word_tokenize, 
                                                    stop_words = nltk.corpus.stopwords.words("english"))

cv_X = cv_vec.fit_transform(raw_df.email)

  % sorted(inconsistent)


In [32]:
# Vectorize the features using TfidfVectorizer

tfidf_vec = sk.feature_extraction.text.TfidfVectorizer(tokenizer = nltk.word_tokenize, 
                                                    stop_words = nltk.corpus.stopwords.words("english"))

tdidf_X = cv_vec.fit_transform(raw_df.email)

In [33]:
# label encode the labels using LabelEncoder

label_encoder = sk.preprocessing.LabelEncoder()
y = label_encoder.fit_transform(raw_df.label)

In [34]:
# loading the MultinomialNB model

from sklearn.naive_bayes import MultinomialNB
model_sklearn_mnb = MultinomialNB()

In [35]:
# getting cross validation score on count-vectorized features
# getting cross validation score on tfidf processed features

cv_score = sk.model_selection.cross_validate(model_sklearn_mnb, cv_X, y)

tfidf_score = sk.model_selection.cross_validate(model_sklearn_mnb, tdidf_X, y)

In [36]:
# checking the scores by putting them into a dataframe first

sklearn_scores = pd.DataFrame([cv_score, tfidf_score], index=['CountVetorizer', 'TfidfVectorizer'])
sklearn_scores = sklearn_scores.applymap(lambda x: x.mean())

sklearn_scores

Unnamed: 0,fit_time,score_time,test_score
CountVetorizer,0.050531,0.009494,0.983931
TfidfVectorizer,0.048479,0.009019,0.983931
