In [30]:
import os
import pandas as pd
import math
import numpy as np
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
import xml.etree.ElementTree
import lxml.html
import lxml.html.clean
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import mailparser as mp
import re
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

#to clean mails
def dividing_mail(mail):
    """Split a parsed mail into its three parts.

    Parameters
    ----------
    mail : object
        Any object exposing ``body``, ``headers`` and ``subject`` attributes.

    Returns
    -------
    tuple
        ``(body, headers, subject)`` exactly as read from ``mail``.
    """
    return mail.body, mail.headers, mail.subject

def counting_urls(text):
    """Count the http/https URLs that occur in ``text``.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    int
        Number of non-overlapping matches of the URL pattern.
    """
    # Raw string literal: the pattern contains backslash-escaped parentheses,
    # which are invalid escapes in a normal string literal and trigger a
    # DeprecationWarning (SyntaxWarning from Python 3.12). The regex itself is
    # unchanged.
    # NOTE: inside the third character class, `$-_` is a range (from '$' to
    # '_'), so it also covers digits, upper-case letters and punctuation such
    # as '/', ':' and '?'.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return len(re.findall(url_pattern, text))

def cleaning_htmlent(text):
    """Strip HTML markup from ``text`` and return its plain-text content.

    Parameters
    ----------
    text : str
        Raw (possibly HTML) text.

    Returns
    -------
    str
        Text content of the cleaned document, or ``''`` when ``text`` is
        empty or whitespace-only.
    """
    # Always return a string. The previous empty-input result was a list,
    # which the tokenizer applied to this function's output cannot consume.
    # Whitespace-only input is treated the same way because there is no
    # document for lxml to build from it.
    if not text or not text.strip():
        return ''

    doc = lxml.html.fromstring(text)
    # style=True makes the cleaner drop style content as well.
    cleaner = lxml.html.clean.Cleaner(style=True)
    doc = cleaner.clean_html(doc)
    return doc.text_content()
   
    
def removing_alphas(doc):
    """Remove every token that is not purely alphabetic from ``doc``.

    Despite its name, this keeps the alphabetic tokens and drops the rest.
    The list is modified in place and the same list object is returned.

    Parameters
    ----------
    doc : list of str
        Tokens to filter.

    Returns
    -------
    list of str
        ``doc`` itself, now holding only tokens for which ``str.isalpha()``
        is true.
    """
    # Slice assignment rewrites the contents without rebinding the list, so
    # callers holding a reference to ``doc`` see the filtered result.
    doc[:] = [token for token in doc if token.isalpha()]
    return doc
    
def keep_only_important_words(data_txt):
    """Keep only the tokens of ``data_txt`` tagged as NN, JJ or VBN.

    The text is tokenized with ``TreebankWordTokenizer`` and tagged with
    ``pos_tag``; tokens carrying any other part-of-speech tag are discarded.

    Parameters
    ----------
    data_txt : str
        Text to tokenize and tag.

    Returns
    -------
    list of str
        Retained tokens, in their original order.
    """
    kept_tags = ['NN', 'JJ', 'VBN']
    tagged_tokens = pos_tag(TreebankWordTokenizer().tokenize(data_txt))
    return [token for token, tag in tagged_tokens if tag in kept_tags]

    
#getting most frequent words from body and subject lists
def extracting_feats(body_list, sub_list, arr):
    """Return the most frequent words of the body and subject collections.

    Parameters
    ----------
    body_list : list of list of str
        One word list per mail body.
    sub_list : list of list of str
        One word list per mail subject.
    arr : sequence of int
        ``arr[0]`` is how many top body words to keep, ``arr[1]`` how many
        top subject words.

    Returns
    -------
    list of list of str
        ``[top_body_words, top_subject_words]``, each ordered from most to
        least frequent.
    """
    top_words = []
    for position, documents in enumerate((body_list, sub_list)):
        # Count words over all documents of this collection at once.
        frequencies = Counter(word for document in documents for word in document)
        ranked = frequencies.most_common(arr[position])
        top_words.append([word for word, _ in ranked])
    return top_words
      

            
     

In [31]:
#transformer that applies the different cleaning functions to a mail and returns its lists of words
class preparing_data(BaseEstimator, TransformerMixin):
    """Turn one parsed mail into stemmed subject and body word lists.

    ``transform`` works on a single mail object, not on a collection. It
    returns ``(s_c, b_c)``:

    * ``s_c`` - stemmed subject words;
    * ``b_c`` - stemmed body words followed by three trailing entries:
      the URL count (int), the sender name and the sender domain.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the object can be chained."""
        return self

    def transform(self, x, y=None):
        """Clean ``x`` and return ``(subject_stems, body_stems_plus_metadata)``.

        Parameters
        ----------
        x : mail object
            Must expose ``body``, ``headers``, ``subject`` and ``from_``
            (a list whose first item is a ``(name, address)`` pair).
        y : ignored

        Returns
        -------
        tuple of (list, list)
            See the class docstring for the layout of both lists.
        """
        m_body, m_headers, sub = dividing_mail(x)

        if m_body == '':
            # Mail without a body: no words at all. The URL count is the int 0
            # (it used to be the string '0', which forced the numeric feature
            # matrix built later to a string dtype), and the subject list is
            # empty instead of holding three '' placeholders that would be
            # counted as a frequent subject "word".
            return [], [0, '', '']

        m_body = m_body.lower()
        # URLs are counted before the HTML is stripped.
        urls = counting_urls(m_body)
        m_body = cleaning_htmlent(m_body)

        # Guard against mails that carry no From entry at all.
        sender = x.from_[0] if x.from_ else ('', '')
        name_sender, dom_sender = sender[0], sender[1].split('@')[-1]

        sub_list = removing_alphas(keep_only_important_words(sub))
        body_list = removing_alphas(keep_only_important_words(m_body))

        ps = PorterStemmer()
        s_c = [ps.stem(w) for w in sub_list]
        b_c = [ps.stem(w) for w in body_list]
        # The last three entries are metadata; callers slice them off.
        b_c.extend([urls, name_sender, dom_sender])
        return s_c, b_c
        




#Giving filepaths for the data
# Dataset root plus the spam and ham sub-directories that are read below.
dat_path = "datasets/spamham"
spam_path = dat_path + '/easy_spam'
ham_path = dat_path + '/easy_ham'

# Number of files sitting directly in each directory: the first os.walk()
# entry is (dirpath, dirnames, filenames), index 2 is the filename list.
len_spam = len(next(os.walk(spam_path))[2])
len_ham = len(next(os.walk(ham_path))[2])

# (directory, file count) pairs in processing order: spam first, then ham.
ar_dict = [(spam_path, len_spam), (ham_path, len_ham)] 
# Training sizes: the first 80% (rounded up) of each class.
sl = math.ceil(0.8 * len_spam)
hl = math.ceil(0.8 * len_ham)

# Labels follow the spam-then-ham ordering: 0 = spam, 1 = ham.
y_train = [0] * sl + [1] * hl
y_test = [0] * (len_spam - sl) + [1] * (len_ham - hl)

# Per-mail results, appended in reading order (all spam, then all ham).
n_list = []
sub_list = []
body_list = []

#Reading emails and transforming them in a list of words and combining them for all mails
for sp, num in ar_dict:
    print('preparing data for ', sp)
    for file_num in range(num):
        #Reading an email and getting the content
        # Files are expected to be named mail_0 ... mail_<num-1> in each directory.
        data_mail = sp + '/mail_' + str(file_num)
        
        mail = mp.parse_from_file(data_mail)
        obj = preparing_data()
        s, b = obj.fit_transform(mail)
        # The last three entries of b are metadata (URL count, sender name,
        # sender domain); everything before them is the body word list.
        n_list.append(b[-3:])
        sub_list.append(s)
        body_list.append(b[:-3])
        
#dividing training and testing for all mails
# Each list holds len_spam spam entries followed by the ham entries, so the
# training part is the first sl spam mails plus the first hl ham mails.
train_body = body_list[:sl] + body_list[len_spam: len_spam + hl]
test_body = body_list[sl:len_spam] + body_list[len_spam + hl:]
train_sub = sub_list[:sl] + sub_list[len_spam:len_spam + hl]
test_sub = sub_list[sl:len_spam] + sub_list[len_spam + hl:]
n_list_train = n_list[:sl] + n_list[len_spam: len_spam + hl]
n_list_test = n_list[sl:len_spam] + n_list[len_spam + hl:]

#getting all important features to use for classification
print('Making features to use ... ')
# Vocabulary comes from training mails only: top 200 body / 50 subject words
# of the spam part, top 300 body / 50 subject words of the ham part.
featurevectors = extracting_feats(train_body[:sl], train_sub[:sl], [200 ,50])
featurevectorh = extracting_feats(train_body[sl:], train_sub[sl:], [300 ,50])
print('Done')




preparing data for  datasets/spamham/easy_spam
preparing data for  datasets/spamham/easy_ham


More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO ?192.168.0.100? salimma1@212.18.241.211 with plain by smtp.mail.vip.sc5.yahoo.com with SMTP; 10 Oct 2002 10:30:25 -0000
More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from p977.as2.cra.dublin.eircom.net HELO mfrenchw2k mfrench42@159.134.179.209 with login by smtp.mail.vip.sc5.yahoo.com with SMTP; 22 Aug 2002 22:02:25 -0000
More than one match found for (?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from chassid 4.65.20.230 by out001.verizon.net InterMail vM.5.01.05.09 201-253-122-126-109-20020611 with ESMTP id <20021005011206.OGMC3265.out001.verizon.net@chassid> for <rpm-list@freshrpms.net>; Fri, 4 Oct 2002 20:12:06 -0500
More than one match fo

Making features to use ... 
Done


In [32]:
#making sure same columns are not repeated
# set() removes words selected for both spam and ham; sorted() then fixes the
# order. Iterating a bare set of strings yields an order that can change from
# one kernel session to the next (string hashing is randomised), which made
# the column layout - and therefore the fitted model - non-reproducible.
bodycols = sorted(set(featurevectors[0] + featurevectorh[0]))
subcols = sorted(set(featurevectors[1] + featurevectorh[1]))
# NOTE(review): a word present in both bodycols and subcols appears twice in
# colnames (once per section) - confirm duplicate column labels are acceptable.
colnames = bodycols + subcols + ['URL_num']
numtoreshape = len(colnames)
# Number of training mails (ham part + spam part).
totaldata = hl + sl

#making feature vectors for all mails (training and testing) and making dataframes
def feature_vectors(d_length, blist, slist, nlist):
    """Build the word-count feature matrix for the first ``d_length`` mails.

    Column layout follows the module-level ``colnames``: one count per word
    in ``bodycols``, one count per word in ``subcols``, then the URL count.

    Parameters
    ----------
    d_length : int
        Number of mails to convert.
    blist : list of list of str
        Body word list of each mail.
    slist : list of list of str
        Subject word list of each mail.
    nlist : list of list
        Per-mail metadata; element 0 is the URL count.

    Returns
    -------
    pandas.DataFrame
        Integer frame with ``d_length`` rows and ``numtoreshape`` columns.
    """
    rows = []
    for num in range(d_length):
        # One counting pass per mail instead of a membership test plus
        # list.count() for every feature; missing words count as 0.
        body_counts = Counter(blist[num])
        sub_counts = Counter(slist[num])

        row = [body_counts[feature] for feature in bodycols]
        row.extend(sub_counts[feature] for feature in subcols)
        # The URL count can arrive as the string '0' for mails without a
        # body; int() keeps the matrix numeric instead of letting numpy
        # turn every value into a string.
        row.append(int(nlist[num][0]))
        rows.append(row)

    # reshape keeps the (0, n_columns) shape when there are no rows.
    data = np.array(rows, dtype=int).reshape(-1, numtoreshape)
    return pd.DataFrame(data=data, columns=colnames)
        
# Training frame: totaldata rows built from the training word lists.
df = feature_vectors(totaldata, train_body, train_sub, n_list_train)       
# Test frame: same columns, one row per held-out mail.
df_test = feature_vectors(len(test_body), test_body, test_sub, n_list_test)

# Both frames must report the same number of columns.
print('shape of training dataframe: ', df.shape)
print('shape of testing dataframe: ', df_test.shape)


shape of training dataframe:  (2441, 485)
shape of testing dataframe:  (610, 485)


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


clf = RandomForestClassifier(random_state=42)

# Distributions sampled by the randomised search.
params = {
    'n_estimators': randint(10, 50),
    'max_features': randint(1, numtoreshape),
}

# random_state makes the 20 sampled parameter combinations reproducible;
# without it every run of this cell could select a different "best" model.
newgridsearch = RandomizedSearchCV(
    clf,
    param_distributions=params,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
r_grid_search = newgridsearch.fit(df, y_train)

final_model = r_grid_search.best_estimator_
# These are 3-fold cross-validated accuracies on the training frame, not
# accuracies of a model scored on the data it was fitted on.
frst_scrs = cross_val_score(final_model, df, y_train, cv=3, scoring="accuracy")
print('Training scores for best estimator ')
frst_scrs


Training scores for best estimator 


array([0.99018405, 0.9901599 , 0.9704797 ])

In [35]:
# Predict labels for the held-out test frame with the selected model.
predictions = final_model.predict(df_test)
# Accuracy = share of positions where prediction and true label agree.
accuracy = sum(1 for x,y in zip(predictions,y_test) if x == y) / len(predictions)
print('Testing scores for best estimator ')
# Bare last expression so the notebook displays the value.
accuracy

Testing scores for best estimator 


0.9868852459016394

In [None]:
from sklearn.metrics import precision_score, recall_score
# Call the metric functions instead of rebinding `precision_score` to a tuple
# (which shadowed the imported function), and use `predictions`, the name the
# test-set predictions are stored under - `y_pred` is never defined.
# With the labels used in this notebook (0 = spam, 1 = ham) the default
# positive class is ham; pass pos_label=0 to score spam detection instead.
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
precision, recall