<a href="https://colab.research.google.com/github/Denis04-M/email_classification/blob/main/spam_email_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import glob

filepath1 = '/content/drive/MyDrive/data/enron1/ham/'
filepath2 = '/content/drive/MyDrive/data/enron1/spam/'

emails, labels = [], []

# appending valid emails into emails[]
for filename in glob.glob(os.path.join(filepath1, '*.txt')):
  with open(filename, 'r', encoding='ISO-8859-1') as email_file:
    emails.append(email_file.read())
    labels.append(1) # labeling each valid email as 1

# appending spam emails into emils[]
for filename in glob.glob(os.path.join(filepath2, '*.txt')):
  with open(filename, 'r', encoding='ISO-8859-1') as email_file:
    emails.append(email_file.read())
    labels.append(0) # labeling each spam email as 0

print(len(emails))
print(len(labels))

5172
5172


In [None]:
from nltk.corpus import names
import nltk
from  nltk.stem import WordNetLemmatizer

# nltk.download('names')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

people_names = set(names.words())
lemmatizer = WordNetLemmatizer()

# function to remove numbers and special characters
def letters_only(astr):
  return astr.isalpha()

# cleaning the data (removin)
def cleanText(docs):
  cleaned_docs = []
  for doc in docs:
    cleaned_docs.append(" ".join([lemmatizer.lemmatize(word.lower())
      for word in doc.split()
        if letters_only(word) and word not in people_names]))
  return cleaned_docs

cleaned_emails = cleanText(emails)

print(cleaned_emails[0])

tried to get fancy with your address and it came back to me forwarded by lauri a allen hol aepin on pm to daren farmer enron com cc subject daren your rate for meter highlander central point for year starting delivered to equistar channelview is mm that price expires in week on november let me know if you need me to refresh after that time thanks


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

cv = CountVectorizer(stop_words = 'english', max_features=500)

term_docs = cv.fit_transform(cleaned_emails)

def get_label_index(labels):
  label_index = defaultdict(list)
  for index, label in enumerate(labels):
    label_index[label].append(index)
  return label_index

label_index = get_label_index(labels)

def get_prior(label_index):
  prior = {label: len(index) for label, index in label_index.items()}
  total_count = sum(prior.values())
  for label in prior:
    prior[label] /= float(total_count)
  return prior

prior = get_prior(label_index)
print(prior)

{1: 0.7099767981438515, 0: 0.2900232018561485}


In [None]:
import numpy as np

def get_likelihood(term_document_matrix, label_index, smoothing = 0):
  likelihood = {}
  for label, index in label_index.items():
    likelihood[label] = term_document_matrix[index, :].sum(axis=0) + smoothing
    likelihood[label] = np.asarray(likelihood[label])[0]
    total_count = likelihood[label].sum()
    likelihood[label] = likelihood[label] / float(total_count)
  return likelihood

smoothing = 1
likelihood = get_likelihood(term_docs, label_index, smoothing)

print(likelihood[1][:5])

[0.00108581 0.00095774 0.00087978 0.00084637 0.00010023]


In [None]:
def get_posterior(term_document_matrix, prior, likelihood):
  num_docs = term_document_matrix.shape[0]
  posteriors = []
  for i in range(num_docs):
    posterior = {key: np.log(prior_label)
      for key, prior_label in prior.items()}
    for label, likelihood_label in likelihood.items():
      term_document_vector = term_document_matrix.getrow(i)
    counts = term_document_vector.data
    indices = term_document_vector.indices
    for count, index in zip(counts, indices):
      posterior[label] += np.log(likelihood_label[index]) * count
      min_log_posterior = min(posterior.values())
    for label in posterior:
      try:
        posterior[label] = np.exp(posterior[label] - min_log_posterior)
      
      except:
        posterior[label] = float('inf')
    sum_posterior = sum(posterior.values())
    for label in posterior:
      if posterior[label] == float('inf'):
        posterior[label] = 1.0
      else:
        posterior[label] /= sum_posterior
      posteriors.append(posterior.copy())
  return posteriors

In [None]:
e_mails_test = ['''Subject: flat screens
               Hello,
               please call or contact regarding the other flat screens requested .
               trisha tlapek - eb 3132 b
               michael sergeev - eb 3132 a
               also the sun blocker that was taken away from eb 3131 a .
               trisha should two monitors also michael .
               thanks
               kevin moore''',
                '''Subject: having problems in bed ? we can help !
                cialis allows men to enjoy a fully normal sex life without
                having to plan the sexual act .
                if we let things terrify us, life will not be worth living
                brevity is the soul of lingerie .
                suspicion always haunts the guilty mind .''']

cleaned_test = cleanText(e_mails_test)
term_docs_test = cv.transform(cleaned_test)
posterior = get_posterior(term_docs_test, prior, likelihood)
print(posterior)

[{1: 1.0, 0: 1.0}, {1: 1.0, 0: 1.859490872529796e-18}, {1: 1.0, 0: 1.0}, {1: 1.0, 0: 5.617000906353728e-25}]


In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cleaned_emails, labels, random_state = 42)
term_docs_train = cv.fit_transform(X_train)
label_index = get_label_index(y_train)
prior = get_prior(label_index)
likelihood = get_likelihood(term_docs_train, label_index, smoothing)
term_docs_test = cv.transform(X_test)
posterior = get_posterior(term_docs_test, prior, likelihood)
correct = 0.0
for pred, actual in zip(posterior, y_test):
  if actual == 1:
    if pred[1] >= 0.5:
      correct += 1
  elif pred[0] > 0.5:
    correct += 1

print('The accuracy on {0} testing samples is:{1:.1f}%'.format(len(y_test), correct/len(y_test)*100))

The accuracy on 1293 testing samples is:64.0%


  app.launch_new_instance()


In [18]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=1.0, fit_prior=True)
clf.fit(term_docs_train, y_train)
prediction_prob = clf.predict_proba(term_docs_test)
prediction_prob[0:10]

array([[1.46877274e-007, 9.99999853e-001],
       [1.57257306e-019, 1.00000000e+000],
       [3.17877195e-017, 1.00000000e+000],
       [5.00804442e-020, 1.00000000e+000],
       [1.85530452e-021, 1.00000000e+000],
       [4.61941897e-039, 1.00000000e+000],
       [6.10560689e-106, 1.00000000e+000],
       [1.00000000e+000, 4.00769888e-012],
       [1.22570208e-025, 1.00000000e+000],
       [4.12623637e-014, 1.00000000e+000]])

In [19]:
prediction = clf.predict(term_docs_test)
prediction[:10]

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1])

In [21]:
accuracy = clf.score(term_docs_test, y_test)
print('The accuracy using MultinomialNB is:{0:.1f}%'.format(accuracy*100))

The accuracy using MultinomialNB is:91.0%
