In [49]:
import os
import pandas as pd
import math
import numpy as np
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
import xml.etree.ElementTree
import lxml.html
import lxml.html.clean
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import mailparser as mp
import re
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

#-------------------to clean mails----------------------------
def dividing_mail(mail):
    """Split a parsed mail object into its three parts.

    Parameters
    ----------
    mail : object exposing ``body``, ``headers`` and ``subject`` attributes.

    Returns
    -------
    tuple
        ``(body, headers, subject)`` exactly as stored on ``mail``.
    """
    return mail.body, mail.headers, mail.subject

def counting_urls(text):
    """Count the http/https URLs that occur in ``text``.

    Parameters
    ----------
    text : str
        Text to scan (e.g. a mail body).

    Returns
    -------
    int
        Number of non-overlapping URL matches; 0 when there are none.
    """
    # Raw string: the pattern contains ``\(`` and ``\)``, which are invalid escape
    # sequences in a normal string literal and make Python emit a warning at compile time.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return len(re.findall(url_pattern, text))

def cleaning_htmlent(text):
    """Strip HTML markup (including style content) from ``text``.

    Parameters
    ----------
    text : str
        Raw mail body, possibly containing HTML.

    Returns
    -------
    str
        The text content of the cleaned document. An empty string is returned for
        empty or whitespace-only input, so the result is always a ``str`` that can
        be handed straight to a tokenizer.
    """
    # lxml cannot build a document from empty / whitespace-only input, so there is
    # nothing to parse; return an empty string rather than a list to keep the
    # return type consistent with the non-empty case.
    if not text or not text.strip():
        return ''

    doc = lxml.html.fromstring(text)
    cleaner = lxml.html.clean.Cleaner(style=True)
    return cleaner.clean_html(doc).text_content()
   
    
def removing_alphas(doc):
    """Remove every token that is not purely alphabetic from ``doc``.

    Despite the name, alphabetic tokens are the ones that are *kept*.
    The list is modified in place and the same list object is returned.

    Parameters
    ----------
    doc : list of str
        Tokens to filter.

    Returns
    -------
    list of str
        ``doc`` itself, now holding only tokens for which ``str.isalpha()`` is true.
    """
    # Slice assignment keeps the caller's list object while replacing its contents.
    doc[:] = [token for token in doc if token.isalpha()]
    return doc
    
def keep_only_important_words(data_txt):
    """Tokenize ``data_txt`` and keep only nouns, adjectives and past participles.

    The text is split with ``TreebankWordTokenizer`` and tagged with ``pos_tag``;
    a token is kept when its tag is exactly ``'NN'``, ``'JJ'`` or ``'VBN'``.

    Parameters
    ----------
    data_txt : str
        Text to tokenize.

    Returns
    -------
    list of str
        The retained tokens, in their original order.
    """
    wanted_tags = ['NN', 'JJ', 'VBN']
    tagged_tokens = pos_tag(TreebankWordTokenizer().tokenize(data_txt))
    return [token for token, tag in tagged_tokens if tag in wanted_tags]

    
#-----------getting most frequent words from body and subject lists to make features---------
def extracting_feats(body_list, sub_list, arr):
    """Return the most frequent words of the body lists and of the subject lists.

    Parameters
    ----------
    body_list : list of list of str
        One token list per mail body.
    sub_list : list of list of str
        One token list per mail subject.
    arr : sequence of int
        ``arr[0]`` is how many body words to keep, ``arr[1]`` how many subject words.

    Returns
    -------
    list
        ``[body_words, subject_words]``, each a list of words ordered from most
        to least frequent.
    """
    top_words = []
    for position, token_lists in enumerate((body_list, sub_list)):
        # Count every token across all mails of this group.
        counts = Counter(token for tokens in token_lists for token in tokens)
        top_words.append([word for word, _ in counts.most_common(arr[position])])
    return top_words
      


#---------building the feature-vector DataFrame for all mails, training and testing (used by the making_fvectors transformer)--------
def feature_vectors(blist, slist, nlist, colnames, bodycols, subcols):
    """Build the per-mail feature matrix as a DataFrame.

    Each row holds, in order: the count of every ``bodycols`` word in the mail
    body, the count of every ``subcols`` word in the mail subject, and the mail's
    URL count.

    Parameters
    ----------
    blist : list of list of str
        Body tokens, one list per mail.
    slist : list of list of str
        Subject tokens, one list per mail.
    nlist : list of sequence
        Extra per-mail values; only the first entry (the URL count) is used.
    colnames : list of str
        Column names for the resulting DataFrame.
    bodycols, subcols : iterable of str
        Words counted in the body / subject. They are iterated in their own
        order, which is expected to match the layout of ``colnames``.

    Returns
    -------
    pandas.DataFrame
        One row per mail, ``len(colnames)`` columns.
    """
    # Freeze the iteration order once so every row uses the same column layout.
    bodycols = list(bodycols)
    subcols = list(subcols)

    rows = []
    for num, body_tokens in enumerate(blist):
        # One pass per mail instead of one list.count() scan per feature;
        # a Counter yields 0 for absent words, so no membership test is needed.
        body_counts = Counter(body_tokens)
        sub_counts = Counter(slist[num])

        row = [body_counts[feature] for feature in bodycols]
        row.extend(sub_counts[feature] for feature in subcols)
        # The URL count may arrive as a string (e.g. '0'); a single string in the
        # data would turn the whole numpy array into a string array.
        row.append(int(nlist[num][0]))
        rows.append(row)

    # reshaping data to convert it in dataframe
    data = np.reshape(np.asarray(rows), (-1, len(colnames)))
    return pd.DataFrame(data = data, columns = colnames)

           
     

In [50]:
#-------------All the transformers-------------------------------


#takes a list of mail objects and returns, for each mail, its lists of body and subject words
class preparing_data(BaseEstimator, TransformerMixin):
    """Turn parsed mail objects into stemmed token lists.

    For each mail ``transform`` produces ``[body_tokens, subject_tokens]``.
    ``body_tokens`` always ends with three extra entries appended after the
    stemmed words: the URL count, the sender name and the sender domain.

    Parameters
    ----------
    convertlower : bool, default True
        Lower-case the body before any other processing.
    counturl : bool, default True
        Count URLs in the body; when False the URL count is recorded as 0.
    senderinfo : bool, default True
        Record the sender's name and mail domain; when False (or when the mail
        has no sender) both are recorded as empty strings.
    """

    def __init__(self, convertlower = True, counturl = True, senderinfo = True):
        self.convertlower = convertlower
        self.counturl = counturl
        self.senderinfo = senderinfo

    def fit(self, x, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, x, y=None):
        """Return one ``[body_tokens, subject_tokens]`` pair per mail in ``x``."""
        print('Preparing data...')
        stemmer = PorterStemmer()  # stateless across mails, so build it once
        list_words = []
        for mail in x:
            m_body, m_headers, sub = dividing_mail(mail)
            if m_body != '':
                if self.convertlower:
                    m_body = m_body.lower()

                # Disabling URL counting must not abort the whole transform.
                urls = counting_urls(m_body) if self.counturl else 0

                m_body = cleaning_htmlent(m_body)

                # Defaults keep the three trailing entries present even when the
                # sender is not requested or the mail carries no From address.
                name_sender, dom_sender = '', ''
                if self.senderinfo and mail.from_:
                    sender = mail.from_[0]
                    name_sender, dom_sender = sender[0], sender[1].split('@')[-1]

                sub_list = removing_alphas(keep_only_important_words(sub))
                body_list = removing_alphas(keep_only_important_words(m_body))

                s_c = [stemmer.stem(w) for w in sub_list]
                b_c = [stemmer.stem(w) for w in body_list]
                b_c.extend([urls, name_sender, dom_sender])
            else:
                # Mail without a body: no words at all. Only the body list carries
                # the three trailing entries, so the subject list stays empty and
                # the URL count is a real integer.
                s_c = []
                b_c = [0, '', '']
            list_words.append([b_c, s_c])
        return list_words



    
#transformer for getting the features from the lists of words
class making_features(BaseEstimator, TransformerMixin):
    """Select the vocabulary (feature words) from the prepared token lists.

    ``transform`` treats its input as training data when it holds more than
    ``train_size_threshold`` mails: the first ``n_spam`` mails are taken as spam,
    the rest as ham, the most frequent body/subject words of each group are
    collected, and the resulting columns are stored on the instance. Smaller
    inputs are treated as test data and reuse the stored columns.

    Parameters
    ----------
    n_f_bodyham, n_f_bodyspam : int
        Number of most frequent body words taken from ham / spam mails.
    n_f_subham, n_f_subspam : int
        Number of most frequent subject words taken from ham / spam mails.
    cols : list
        Initial value for the stored column names, body words and subject words.
    n_spam : int or None, default None
        Number of leading spam mails in the training input. When None, the
        notebook-level variable ``sl`` is used.
    train_size_threshold : int, default 2400
        Inputs with more mails than this are treated as training data.
    """

    def __init__(self, n_f_bodyham = 300 , n_f_bodyspam = 200, n_f_subham = 50, n_f_subspam = 50, cols =[],
                 n_spam = None, train_size_threshold = 2400):
        self.n_f_bodyham = n_f_bodyham
        self.n_f_bodyspam = n_f_bodyspam
        self.n_f_subham = n_f_subham
        self.n_f_subspam = n_f_subspam
        self.cols = cols
        self.bodycols = cols
        self.subcols = cols
        self.n_spam = n_spam
        self.train_size_threshold = train_size_threshold

    def fit(self, x, y=None):
        """Nothing is learned here; the vocabulary is built inside ``transform``."""
        return self

    def transform(self, x, y=None):
        """Return ``[b_lists, s_lists, n_lists, bodycols, subcols, colnames]``."""
        # The last three entries of every body list are the URL count, sender
        # name and sender domain; split them off from the actual words.
        b_lists = [onelist[0][:-3] for onelist in x]
        s_lists = [onelist[1] for onelist in x]
        n_lists = [onelist[0][-3:] for onelist in x]

        if len(x) > self.train_size_threshold:
            print('making features, inside training')
            # Fall back to the notebook global only when no split was configured.
            n_spam = sl if self.n_spam is None else self.n_spam

            spam_feats = extracting_feats(b_lists[:n_spam], s_lists[:n_spam],
                                          [self.n_f_bodyspam, self.n_f_subspam])
            ham_feats = extracting_feats(b_lists[n_spam:], s_lists[n_spam:],
                                         [self.n_f_bodyham, self.n_f_subham])

            # Order-preserving de-duplication: a set of strings iterates in an
            # order that changes between interpreter runs, which would shuffle
            # the feature columns from one kernel session to the next.
            bodycols = list(dict.fromkeys(spam_feats[0] + ham_feats[0]))
            subcols = list(dict.fromkeys(spam_feats[1] + ham_feats[1]))
            colnames = bodycols + subcols + ['URL_num']

            self.cols = colnames
            self.bodycols = bodycols
            self.subcols = subcols

            return [b_lists, s_lists, n_lists, bodycols, subcols, colnames]

        print('making features, inside testing')
        return [b_lists, s_lists, n_lists, self.bodycols, self.subcols, self.cols]
        
        
#Makes the featurevectors and dataframe by using the function feature_vectors        
class making_fvectors(BaseEstimator, TransformerMixin):
    """Final pipeline step: turn token lists plus vocabulary into a DataFrame.

    ``transform`` receives a sequence whose items 0-2 are the body, subject and
    extra-value lists, items 3 and 4 the body and subject feature words, and
    whose last item is the list of column names; it forwards them to
    ``feature_vectors``.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, x, y=None):
        """Return the feature DataFrame built by ``feature_vectors``."""
        print('preparing feature vectors... ')
        body_tokens, subject_tokens, extras = x[0], x[1], x[2]
        body_words, subject_words = x[3], x[4]
        column_names = x[-1]
        return feature_vectors(body_tokens, subject_tokens, extras,
                               column_names, body_words, subject_words)


 

In [51]:
# Locations of the spam and ham mail folders
dat_path = "datasets/spamham"
spam_path = dat_path + '/easy_spam'
ham_path = dat_path + '/easy_ham'

# Number of files directly inside each folder (third item yielded by os.walk)
len_spam, len_ham = (len(next(os.walk(folder))[2]) for folder in (spam_path, ham_path))

ar_dict = [(spam_path, len_spam), (ham_path, len_ham)]

# 80 percent of each class (rounded up) is used for training
sl, hl = math.ceil(0.8 * len_spam), math.ceil(0.8 * len_ham)

# Labels: 0 for mails read from the spam folder, 1 for mails read from the ham folder
y_train = sl * [0] + hl * [1]
y_test = (len_spam - sl) * [0] + (len_ham - hl) * [1]

# Parse every mail file: all spam mails first, then all ham mails.
# Files are expected to be named mail_0, mail_1, ... inside each folder.
list_mails = []
for sp, num in ar_dict:
    print('preparing data for ', sp)
    for file_num in range(num):
        list_mails.append(mp.parse_from_file(sp + '/mail_' + str(file_num)))

# Split each class separately: the first 80 percent trains, the remaining 20 percent tests
spam_mails, ham_mails = list_mails[:len_spam], list_mails[len_spam:]
train_mails = spam_mails[:sl] + ham_mails[:hl]
test_mails = spam_mails[sl:] + ham_mails[hl:]


preparing data for  datasets/spamham/easy_spam
preparing data for  datasets/spamham/easy_ham


More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO ?192.168.0.100? salimma1@212.18.241.211 with plain by smtp.mail.vip.sc5.yahoo.com with SMTP; 10 Oct 2002 10:30:25 -0000
More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from p977.as2.cra.dublin.eircom.net HELO mfrenchw2k mfrench42@159.134.179.209 with login by smtp.mail.vip.sc5.yahoo.com with SMTP; 22 Aug 2002 22:02:25 -0000
More than one match found for (?:id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+with(?! cipher)|\s+for|\s+via|;)) in from chassid 4.65.20.230 by out001.verizon.net InterMail vM.5.01.05.09 201-253-122-126-109-20020611 with ESMTP id <20021005011206.OGMC3265.out001.verizon.net@chassid> for <rpm-list@freshrpms.net>; Fri, 4 Oct 2002 20:12:06 -0500
More than one match fo

In [52]:
# Three-stage pipeline: parsed mails -> token lists -> vocabulary -> feature DataFrame
MyPipeline = Pipeline([
    ('mailstolistwords', preparing_data()),
    ('making_features', making_features()),
    ('making_feature_vectors', making_fvectors())
])

# The feature columns are chosen while processing the training mails
df_train = MyPipeline.fit_transform(train_mails)
# Test mails only go through the already-fitted pipeline; never call fit on test data
df_test = MyPipeline.transform(test_mails)

Preparing data...
making features, inside training
preparing feature vectors... 
Preparing data...
making features, inside testing
preparing feature vectors... 


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


clf = RandomForestClassifier(random_state=42)

# Hyper-parameter distributions sampled by the randomized search
# (randint's upper bound is exclusive)
params = {
    'n_estimators': randint(10, 50),
     'max_features': randint(1, df_train.shape[1]),
}

# random_state fixes which 20 parameter combinations are drawn, so re-running
# the notebook selects the same model.
newgridsearch = RandomizedSearchCV(clf, param_distributions = params, n_iter=20, cv = 5,
                                   scoring='accuracy', random_state=42)
r_grid_search = newgridsearch.fit(df_train, y_train)

# Best estimator found by the search, re-scored with 3-fold cross-validation on the training set
final_model = r_grid_search.best_estimator_
frst_scrs = cross_val_score(final_model, df_train, y_train, cv = 3, scoring = "accuracy")
print('Training scores for best estimator ')
frst_scrs


In [47]:
# Accuracy on the held-out test mails: share of predictions equal to the true label
predictions = final_model.predict(df_test)
n_correct = sum(1 for predicted, actual in zip(predictions, y_test) if predicted == actual)
accuracy = n_correct / len(predictions)
print('Testing scores for best estimator ')
accuracy

Training scores for best estimator 


array([0.98773006, 0.9803198 , 0.97416974])

In [48]:
from sklearn.metrics import precision_score, recall_score

# Both metrics are computed with scikit-learn's default positive label.
# NOTE(review): with labels 0 = spam and 1 = ham, the defaults presumably score
# the ham class - confirm this is the intended positive class.
ps = precision_score(y_test, predictions)
rs = recall_score(y_test, predictions)
for caption, score in (('Precision score: ', ps), ('Recall score: ', rs)):
    print(caption, score)

Testing scores for best estimator 


0.978688524590164