In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
#Read file and paths
def getPaths(path):
    paths = []
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)
            paths.append(path)
    return paths

def readFiles(path, classification):
    rows = []
    for paths in getPaths(path):
        f = open(paths, 'r', encoding='latin1')
        emails = f.read()
        rows.append({'emails': emails, 'label': classification})
    return pd.DataFrame(rows)

In [3]:
#Read Datasets and classification each dataset with their classification (spam or ham)
data = pd.DataFrame({'emails': [], 'label': []})
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron1\\ham', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron1\\spam', 'spam'))

data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron2\\ham', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron2\\spam', 'spam'))

data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron3\\ham', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron3\\spam', 'spam'))

data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron4\\ham', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron4\\spam', 'spam'))

data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron5\\ham', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron5\\spam', 'spam'))

data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron6\\ham', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\enron6\\spam', 'spam'))

data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\emails\\ham', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\emails\\spam', 'spam'))

data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\spam_assasin\\easy_ham_2', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\spam_assasin\\hard_ham', 'ham'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\spam_assasin\\spam', 'spam'))
data = data.append(readFiles('C:\\Users\\USER\\Documents\\proj\\spam_assasin\\spam_2', 'spam'))

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40266 entries, 0 to 1396
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   emails  40266 non-null  object
 1   label   40266 non-null  object
dtypes: object(2)
memory usage: 943.7+ KB


In [5]:
data.describe()

Unnamed: 0,emails,label
count,40266,40266
unique,37034,2
top,Subject: \n,ham
freq,51,20697


In [6]:
data.columns

Index(['emails', 'label'], dtype='object')

In [7]:
#Removing symbols so the dataset can be split to train and test
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–","$"]

for char in spec_chars:
    data['emails'] = data['emails'].str.replace(char, ' ')
    
#Removing extra whiteSpaces
data['emails'] = data['emails'].str.split().str.join(" ")
    
X = data['emails']
y = data['label']

In [8]:
#Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.describe()

count       26978
unique      25256
top       Subject
freq           32
Name: emails, dtype: object

In [9]:
#vectorizers break the words and CountVectorizers counts the occurance of each words
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(X_test)

classifier = MultinomialNB()
target = y_test
classifier.fit(counts, target)

#The accuracy of the classifier
print("The Accuracy is: ",classifier.score(counts, target))

The Accuracy is:  0.9844972907886815


In [10]:
example = ['Click here to start making free money!!', "Dear Students PFA..The B.Tech IT Minor Project Final review is scheduled on 7/5/2016 and 9/5/2016.While coming for final review, A) Students should complete the 100% project and need to show the full project demo. B) Each students in a every group have to submit their report and one report + CD(Documentation+ Coding) should be submitted to the department from each group. ( report format attached with this mail ),C) Students those who are not present for a final review at correct time are considered as arrear in minor project.D) 28/4/16 is the last date for the submission of Minor Project Report to get signature from the hod."]
example_count = vectorizer.transform(example)
classifier.predict(example_count)

array(['spam', 'ham'], dtype='<U4')

In [11]:
# plt.bar(data['emails'], height = 4)
# plt.show()

The GUI 

In [12]:
import tkinter as tk
from PIL import Image, ImageTk

In [13]:
window = tk.Tk()
window.title('Email Classifier')
window.geometry('600x180')

def popup(classification):
    popup = tk.Tk()
    popup.title("Email class")
    popup.geometry("300x90")
    email_class = classification
    alert_color = ""
    text_label = ""
    class_img = ""
    if (email_class == "spam"):
        alert_color = "red"
        text_label = "That looks like a Spam!"
        class_img="spamImg.png"
    if (email_class == "ham"):
        alert_color = "green"
        text_label = "That looks Safe!"
        
#     img = ImageTk.PhotoImage(Image.open("spamImg.png"))
#     panel = tk.Label(popup, image = img)
#     panel.pack(side = "bottom", fill = "both", expand = "yes")
    label = tk.Label(popup, text = text_label, fg = alert_color, font=(('Corbel'),15))
    label.pack(padx = 10, pady=10)
    popup.mainloop()
    
def classify():
    text = input_email.get('1.0', 'end')
    result = [text]
    result_count = vectorizer.transform(result)
    popup(classifier.predict(result_count)[0])
    
text = tk.Label(text="Enter emails to classify them!", font=(('Corbel'),12))
input_email = tk.Text(height=5)
input_email.insert(0.0, 'The email goes here!')
classify_button = tk.Button(text="Classify", font=(('Corbel'),12), command=classify)

text.pack(pady=5)
input_email.pack(pady=5, padx=6, ipadx=3)
classify_button.pack(ipadx=2, ipady=1, pady=3)

In [None]:
window.mainloop()