This is a Assignment project by Sajal Tiwari for BizAnalytix Internship through Internshala

# Installing and Importing Libraries

In [1]:
from os import makedirs, path, remove, rename, rmdir
from tarfile import open as open_tar
from urllib import request, parse
from glob import glob
from os import path
from sklearn.model_selection import StratifiedShuffleSplit
from re import sub
from email import message_from_file
import numpy as np
from collections import defaultdict
%pip install joblib
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from functools import partial
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier



# Fetching data and structuring DIR || Loading Email

In [2]:
def download_corpus(data_dir:str='data'):
    
    """
        Function to fetch data and arrange it in proper manner
        The Dir Structure should look like
        
                Data
                |-- downloads
                |   |--20021010_easy_ham.tar.bz2
                |   |--20021010_spam.tar.bz2
                |
                |-- ham
                |-- spam
    """

    base_URL = "https://spamassassin.apache.org"
    corpus_PATH = "old/publiccorpus"

    files = {
        '20021010_easy_ham.tar.bz2': 'ham',
        '20021010_spam.tar.bz2': 'spam',
    }

    downloads_dir = path.join(data_dir, 'downloads')
    ham_dir = path.join(data_dir, 'ham')
    spam_dir = path.join(data_dir, 'spam')

    makedirs(downloads_dir, exist_ok=True)
    makedirs(ham_dir, exist_ok=True)
    makedirs(spam_dir, exist_ok=True)

    for file, spam_or_ham in files.items():
        # download file
        url = parse.urljoin(base_URL, f'{corpus_PATH}/{file}')
        tar_filename = path.join(downloads_dir, file)
        request.urlretrieve(url, tar_filename)
        
        # list e-mails in compressed file
        emails = []
        with open_tar(tar_filename) as tar:
            tar.extractall(path=downloads_dir)
            for tarinfo in tar:
                if len(tarinfo.name.split('/')) > 1:
                    emails.append(tarinfo.name)
        
        # move e-mails to ham or spam dir
        for email in emails:
            directory, filename = email.split('/')
            directory = path.join(downloads_dir, directory)
            rename(path.join(directory, filename),
                   path.join(data_dir, spam_or_ham, filename))
        rmdir(directory)


In [3]:
download_corpus()

In [4]:
ham_dir = path.join('data', 'ham')
spam_dir = path.join('data', 'spam')

print('Total Hams  =======>', len(glob(f'{ham_dir}/*')))  
print('Total Spams =======>', len(glob(f'{spam_dir}/*')))  



## Loading mail as numpy array

In [5]:
class SimpleEmail:
    '''
    Structure the mail into simple string clean format
    '''
    def __init__(self, subject: str, body: str):
        self.subject = subject
        self.body = body

    @property
    def clean(self):
        sanitizer = '[^A-Za-z]+'
        clean = sub(sanitizer, ' ', f'{self.subject} {self.body}')
        clean = clean.lower()
        return sub('\s+', ' ', clean)

    def __str__(self):
        subject = f'subject: {self.subject}'
        body_first_line = self.body.split('\n')[0]
        body = f'body: {body_first_line}...'
        return f'{subject}\n{body}'
    def __repr__(self):
        return self.__str__()

class EmailIterator:
    def __init__(self, directory: str):
        self._files = glob(f'{directory}/*')
        self._pos = 0
    
    def __iter__(self):
        self._pos = -1
        return self
    
    def __next__(self):
        if self._pos < len(self._files) - 1:
            self._pos += 1
            return self.parse_email(self._files[self._pos])
        raise StopIteration()
        
    @staticmethod
    def parse_email(filename: str) -> SimpleEmail:
        with open(filename,
                  encoding='utf-8',
                  errors='replace') as fp:
            message = message_from_file(fp)
        
        subject = None
        for item in message.raw_items():
            if item[0] == 'Subject':
                subject = item[1]
        
        if message.is_multipart():
            body = []
            for b in message.get_payload():
                body.append(str(b))
            body = '\n'.join(body)
        else:
            body = message.get_payload()
        
        return SimpleEmail(subject, body)

In [6]:
"""
Data Preprocessing
Converting all the emails to numpy array 
"""
ham_emails = EmailIterator('data/ham')
spam_emails = EmailIterator('data/spam')
hams = np.array([email.clean for email in ham_emails])
spams = np.array([email.clean for email in spam_emails])

In [7]:
print("Length of Hams is -- ",len(hams))
print("Example of Hams is -- ",hams[1])

Length of Hams is --  2551
Example of Hams is --   razor users viewing my trust rating hey folks i know this question gets asked a lot but i haven t seen an answer lately any idea when a user will be able to view his own trust rating what are the plans for this will it be built into the client web based emailed to you nightly if you could share your brainstorming on the subject i would greatly appreciate it this sf net email is sponsored by osdn tired of that same old cell phone get a new here for free https www inphonic com r asp r sourceforge refcode vs razor users mailing list razor users lists sourceforge net https lists sourceforge net lists listinfo razor users 


In [8]:
print("Length of Spams is -- ",len(spams))
print("Example of Spams is -- ",spams[1])

Length of Spams is --  501
Example of Spams is --  shape up for summer now as seen on nbc cbs cnn and even oprah the health discovery that actually reverses aging while burning fat without dieting or exercise this proven discovery has even been reported on by the new england journal of medicine forget aging and dieting forever and it s guaranteed click here http web kuhleersparnis ch hgh index html would you like to lose weight while you sleep no dieting no hunger pains no cravings no strenuous exercise change your life forever guaranteed body fat loss improvement wrinkle reduction improvement energy level improvement muscle strength improvement sexual potency improvement emotional stability improvement memory improvement you are receiving this email as a subscriber to the opt in america mailing list to unsubscribe from future offers just click here mailto affiliateoptout btamail net cn subject off 


# Data Distribution

In [9]:
"""
For Even distribution of data into training and test set 
Stratified Shuffel Split is used for even distribution of data into train and test set

"""
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)


In [10]:
emails = np.concatenate((hams, spams))

In [11]:
labels = np.concatenate((np.zeros(hams.size), np.ones(spams.size)))

In [12]:
for train_index, test_index in split.split(emails, labels):
    emails_train, labels_train = \
        emails[train_index], labels[train_index]
    emails_test, labels_test = \
        emails[test_index], labels[test_index]

### This will change the email string into sparse vector that indicates the presence or absence of each possible word.

In [13]:
dictionary = defaultdict(int)
for email in emails_train:
    for word in email.split(' '):
        dictionary[word] += 1

In [14]:
top = 1000
descending_dictionary = sorted(dictionary.items(),
                               key=lambda v: v[1],
                               reverse=True)
dictionary = [
    word for (word, occur) in descending_dictionary
    if len(word) > 1
][:top]

In [15]:
def encode_email(email: SimpleEmail,
                 dictionary_: list,
                 binary: bool = False) -> np.array:

                 '''Encoding mail to vectors based on occurance and non occurance'''
    encoded = np.zeros(dictionary_.size)
    words = email.split(' ')
    
    for word in words:
        index = np.where(dictionary_ == word)[0]
        if index.size == 1:  # we ignore unknown words
            if binary:
                encoded[index[0]] = 1
            else:
                encoded[index[0]] += 1
    return encoded

In [16]:
dictionary = np.array(dictionary)
_encode_email = partial(encode_email, dictionary_=dictionary)
encoded_train = np.array(list(map(_encode_email, emails_train)))
encoded_test = np.array(list(map(_encode_email, emails_test)))

In [17]:
## Deleting non required variables to prevent RAM crash due to exceeding limits
del ham_emails,spam_emails,hams,spams,emails,labels,dictionary,descending_dictionary

# Training Model

In [18]:
knn_clf = KNeighborsClassifier()

In [19]:
labels_pred = cross_val_predict(knn_clf,
                                encoded_train,
                                labels_train,
                                cv=5)

In [20]:
print('accuracy:', accuracy_score(labels_train, labels_pred))

accuracy: 0.956984842277755


In [21]:
print('precision:', precision_score(labels_train, labels_pred))

precision: 0.9539877300613497


In [22]:
print('recall:', recall_score(labels_train, labels_pred))

recall: 0.7755610972568578


In [23]:
print('f1:', f1_score(labels_train, labels_pred))

f1: 0.8555708390646493


### Using GridSearchCV to get optimal params

In [24]:
params_grid = [{
    'n_neighbors': [2, 5, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['minkowski'],
    'metric_params': [{'p': 2}, {'p': 3}, {'p': 4}]
}]

In [25]:
search = GridSearchCV(knn_clf, params_grid, n_jobs=6,
                      scoring='recall', cv=5, verbose=10)

In [26]:
search.fit(encoded_train, labels_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


  return self._fit(X, y)


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=6,
             param_grid=[{'algorithm': ['auto', 'ball_tree', 'kd_tree',
                                        'brute'],
                          'metric': ['minkowski'],
                          'metric_params': [{'p': 2}, {'p': 3}, {'p': 4}],
                          'n_neighbors': [2, 5, 10],
                          'weights': ['uniform', 'distance']}],
             scoring='recall', verbose=10)

In [27]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",search.best_estimator_)
print("\n The best score across ALL searched params:\n",search.best_score_)
print("\n The best parameters across ALL searched params:\n",search.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 KNeighborsClassifier(metric_params={'p': 2}, n_neighbors=2, weights='distance')

 The best score across ALL searched params:
 0.9075925925925926

 The best parameters across ALL searched params:
 {'algorithm': 'auto', 'metric': 'minkowski', 'metric_params': {'p': 2}, 'n_neighbors': 2, 'weights': 'distance'}


In [28]:
labels_pred = cross_val_predict(search.best_estimator_,
                                encoded_train,
                                labels_train,
                                cv=5)
print('Accuracy for best params is ======>>', accuracy_score(labels_train, labels_pred))
print('Precision for best params is =====>>', precision_score(labels_train, labels_pred))
print('Recall for best params is ========>>', recall_score(labels_train, labels_pred))
print('f1 Score for best params is ======>>', f1_score(labels_train, labels_pred))


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Precision for best params is =====>> 0.9357326478149101


In [31]:
knn_clf = KNeighborsClassifier(algorithm='auto',p=2,
                               n_neighbors=2,
                               weights='distance')
knn_clf.fit(encoded_train, labels_train)
labels_pred = knn_clf.predict(encoded_test)
print('accuracy:', accuracy_score(labels_test, labels_pred))
# accuracy: 0.982896846606093
print('precision:', precision_score(labels_test, labels_pred))
# precision: 0.9666666666666667
print('recall:', recall_score(labels_test, labels_pred))
# recall: 0.9666666666666667
print('f1:', f1_score(labels_test, labels_pred))
# f1: 0.9666666666666667

accuracy: 0.9754500818330606
precision: 0.9473684210526315
recall: 0.9
f1: 0.9230769230769231


# Building Pipeline and automating all process of Encoding and Training

In [32]:
class MessageEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, binary: bool = False, top: int = 1000):
        self.dictionary_ = None
        self.binary = binary
        self.top = top
    def fit(self, X, y=None):
        dictionary = defaultdict(int)
        
        for email in X:
            for word in email.split(' '):
                dictionary[word] += 1
        
        descending_dictionary = sorted(dictionary.items(),
                                       key=lambda v: v[1],
                                       reverse=True)
        
        self.dictionary = np.array([
            word for (word, occur) in descending_dictionary
            if len(word) > 1
        ][:self.top])
        
        return self
    def transform(self, X):
        return np.array(list(map(self.encode_message, X)))
    def encode_message(self, message: str):
        encoded = np.zeros(self.dictionary.size)
        words = message.split(' ')
        for word in words:
            index = np.where(self.dictionary == word)[0]
            if index.size == 1:  # we ignore unknown words
                if self.binary:
                    encoded[index[0]] = 1
                else:
                    encoded[index[0]] += 1
        return encoded

In [33]:
encoder = MessageEncoder()
encoder.fit(emails_train)
encoded_emails_train = encoder.transform(emails_train)

In [34]:
pipeline = Pipeline([
    ('encode_messages', MessageEncoder()),
    ('knn_clf', KNeighborsClassifier()),
])

In [35]:
# Here knn_clf__algorithm = 'auto' and knn_clf__weights = distance are used to train model due to shortage of RAM
# the model can be trained using below params if RAM availablity is High
params_grid = [{
    'encode_messages__binary': [True, False],
    'encode_messages__top': [500, 1000],
    'knn_clf__n_neighbors': [2, 5, 10],
    'knn_clf__weights': ['uniform', 'distance'],
    'knn_clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}]

pipe_search = GridSearchCV(pipeline, params_grid, n_jobs=1,
                           scoring='recall', cv=5, verbose=2)
pipe_search.fit(emails_train, labels_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END encode_messages__binary=True, encode_messages__top=500, knn_clf__algorithm=auto, knn_clf__n_neighbors=2, knn_clf__weights=distance; total time=  11.3s
[CV] END encode_messages__binary=True, encode_messages__top=500, knn_clf__algorithm=auto, knn_clf__n_neighbors=2, knn_clf__weights=distance; total time=  10.8s
[CV] END encode_messages__binary=True, encode_messages__top=500, knn_clf__algorithm=auto, knn_clf__n_neighbors=2, knn_clf__weights=distance; total time=  10.8s
[CV] END encode_messages__binary=True, encode_messages__top=500, knn_clf__algorithm=auto, knn_clf__n_neighbors=2, knn_clf__weights=distance; total time=  10.9s
[CV] END encode_messages__binary=True, encode_messages__top=500, knn_clf__algorithm=auto, knn_clf__n_neighbors=2, knn_clf__weights=distance; total time=  10.9s
[CV] END encode_messages__binary=True, encode_messages__top=500, knn_clf__algorithm=auto, knn_clf__n_neighbors=5, knn_clf__weights=distance

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('encode_messages', MessageEncoder()),
                                       ('knn_clf', KNeighborsClassifier())]),
             n_jobs=1,
             param_grid=[{'encode_messages__binary': [True, False],
                          'encode_messages__top': [500, 1000],
                          'knn_clf__algorithm': ['auto'],
                          'knn_clf__n_neighbors': [2, 5, 10],
                          'knn_clf__weights': ['distance']}],
             scoring='recall', verbose=2)

# Prediction

In [36]:
labels_test_pred = pipe_search.best_estimator_.predict(emails_test)
print(classification_report(labels_test,
                            labels_pred,
                            target_names=['ham', 'spam'],
                            digits=4))

              precision    recall  f1-score   support

         ham     0.9806    0.9902    0.9854       511
        spam     0.9474    0.9000    0.9231       100

    accuracy                         0.9755       611
   macro avg     0.9640    0.9451    0.9542       611
weighted avg     0.9752    0.9755    0.9752       611



# Exporting Trained Model 

In [37]:
import joblib
joblib.dump(pipe_search, 'model_file_name.pkl')

['model_file_name.pkl']

In [38]:
from google.colab import files
files.download("model_file_name.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>