#This Section Will clean and stem the works of an email input

In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
import csv
import re
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
porter = PorterStemmer()
lancaster=LancasterStemmer()

def clean_email(email):
    hdrstart = email.find('\n\n')
    email = email[hdrstart:]
    email = email.lower()

    rx = re.compile('<[^<>]+>|\n')
    email= rx.sub(' ', email)

    rx = re.compile('[0-9]+')
    email = rx.sub('number ', email)

    rx = re.compile('(http|https)://[^\s]*')
    email = rx.sub('httpaddr ', email)

    rx = re.compile('[^\s]+@[^\s]+')
    email = rx.sub('emailaddr ', email)

    rx = re.compile('[$]+')
    email = rx.sub('dollar ', email)

    rx = re.compile('[^a-zA-Z0-9 ]')
    email = rx.sub('', email)
    #Tokenise Email
    token_words=word_tokenize(email)
    stem_sentence = []
    
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def vectorise_email(email):
    d = {}
    with open('vocab.txt') as f:
        for line in f:
            (key, val) = line.split()
            d[key] = int(val)

    vector = np.zeros([1, len(d)], dtype=int)

    email = clean_email(email)

    for word in email.split():
        if word in d:
            vector[0, d[word]] = 1
    return vector

def create_vocab(path, length=0):
    vocab = {}
    file_list = os.listdir(path)
    for file in file_list:
        f = open(path+'/'+file, 'r')
        try:
            email = f.read()
            cleaned_email = clean_email(email)
            for word in cleaned_email.split():
                if word in vocab:
                    vocab[word] += 1
                else:
                    vocab[word] = 1
            f.close()
        except UnicodeDecodeError:
            f.close()

    f = open('vocab.txt', 'w')
    for i in range(length):
        highest_occurance = max(vocab, key=vocab.get)
        f.write(highest_occurance+' '+str(i)+'\n')
        del vocab[highest_occurance]
    f.close()


## Create Data from emails

In [4]:
def create_data_file(paths):
    spam_paths = paths['spam']
    ham_paths = paths['ham']
    data_csv = open('vectorized_data.csv', 'w')
    writer = csv.writer(data_csv, delimiter=',')
    for spam_path in spam_paths:
        files = os.listdir(spam_path)
        for file in files:
            f = open(spam_path+'/'+file, 'r')
            try:
                email = f.read()
                data = vectorise_email(email)
                data = np.append(data, 1)
                writer.writerow(data)
                f.close()
            except UnicodeDecodeError:
                f.close()
    for ham_path in ham_paths:
        files = os.listdir(ham_path)
        for file in files:
            f = open(ham_path+'/'+file, 'r')
            try:
                email = f.read()
                data = vectorise_email(email)
                data = np.append(data, 0)
                writer.writerow(data)
                f.close()
            except UnicodeDecodeError:
                f.close()
    data_csv.close()

## Function to find model hyper paramaters (NOT FINISHED)

In [None]:
def find_model_svm():
    data = pd.read_csv('vectorized_data.csv', header=None)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

    svc = SVC(kernel='linear')
    svc.fit(X_train, y_train)

    y_pred = svc.predict(X_val)
    print("Validation Data Set")
    print(confusion_matrix(y_val,y_pred))
    print(classification_report(y_val,y_pred))
    print('\n\n\n')

    y_pred_test = svc.predict(X_test)
    print("Testset Data Set")
    print(confusion_matrix(y_test,y_pred_test))
    print(classification_report(y_test,y_pred_test))

    return svc

In [13]:

def predict_spam(svc, email):
    email = clean_email(email)
    email = vectorise_email(email)
    email = pd.DataFrame(email)

    return svc.predict(email)

# Create A vocab List from spam emails and then create a data set from the vocab list

In [12]:
path = 'Spam Data/spam' # Paths to spam emails
create_vocab(path, 2000) # creates vocab list

# Adding paths that have data we are using for the data set
options = {'spam': ['Spam Data/spam'],
           'ham': ['Spam Data/easy_ham_2',
                   'Spam Data/hard_ham']}
create_data_file(options)

In [14]:
svc = find_model_svm()

Validation Data Set
[[289  10]
 [ 11 245]]
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       299
           1       0.96      0.96      0.96       256

    accuracy                           0.96       555
   macro avg       0.96      0.96      0.96       555
weighted avg       0.96      0.96      0.96       555





Testset Data Set
[[297   7]
 [ 11 240]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       304
           1       0.97      0.96      0.96       251

    accuracy                           0.97       555
   macro avg       0.97      0.97      0.97       555
weighted avg       0.97      0.97      0.97       555



In [15]:
data = pd.read_csv('vectorized_data.csv', header=None)
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [19]:
email="""From social-admin@linux.ie  Sat Jul 20 00:53:54 2002
Return-Path: <social-admin@linux.ie>
Delivered-To: yyyy@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 08692440C8
	for <jm@localhost>; Fri, 19 Jul 2002 19:53:54 -0400 (EDT)
Received: from dogma.slashnull.org [212.17.35.15]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Sat, 20 Jul 2002 00:53:54 +0100 (IST)
Received: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g6JNojJ15484 for
    <jm+ilug-social@jmason.org>; Sat, 20 Jul 2002 00:50:45 +0100
Received: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org
    (8.9.3/8.9.3) with ESMTP id AAA07367; Sat, 20 Jul 2002 00:49:55 +0100
Received: from ie.suberic.net (owsla.ie.suberic.net [62.17.162.83]) by
    lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id AAA07330 for <social@linux.ie>;
    Sat, 20 Jul 2002 00:49:49 +0100
X-Authentication-Warning: lugh.tuatha.org: Host owsla.ie.suberic.net
    [62.17.162.83] claimed to be ie.suberic.net
Received: from owsla.ie.suberic.net (owsla [127.0.0.1]) by ie.suberic.net
    (8.11.6/8.11.6) with ESMTP id g6JNnm914504 for <social@linux.ie>;
    Sat, 20 Jul 2002 00:49:48 +0100
Date: Sat, 20 Jul 2002 00:49:47 +0100
To: ilug social <social@linux.ie>
Message-Id: <20020720004947.A14480@ie.suberic.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
User-Agent: Mutt/1.2.5.1i
X-Operating-System: Linux 2.4.18-5 i686
X-GPG-Fingerprint: 9C1D 16F4 11F1 6BD2 933C  048D ACC7 9840 89D0 7646
From: kevin lyda <kevin+dated+1027554588.4a2cc4@ie.suberic.net>
X-Delivery-Agent: TMDA/0.57
Subject: [ILUG-Social] geek cuisine...
Sender: social-admin@linux.ie
Errors-To: social-admin@linux.ie
X-Mailman-Version: 1.1
Precedence: bulk
List-Id: Irish Linux Users' Group social events <social.linux.ie>
X-Beenthere: social@linux.ie

hm, looks good:
http://books.slashdot.org/article.pl?sid=02/07/19/1411209&mode=thread&tid=134

kevin

--
kevin@suberic.net     that a believer is happier than a skeptic is no more to
fork()'ed on 37058400    the point than the fact that a drunken man is happier
meatspace place: home       than a sober one. the happiness of credulity is a
http://ie.suberic.net/~kevin   cheap & dangerous quality -- g.b. shaw

--
Irish Linux Users' Group Social Events: social@linux.ie
http://www.linux.ie/mailman/listinfo/social for (un)subscription information.
List maintainer: listmaster@linux.ie


"""

In [20]:
email = clean_email(email)
email = vectorise_email(email)
email = pd.DataFrame(email)

In [21]:
email

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
svc.class_weight()

False