In [283]:
from __future__ import unicode_literals
import os
import codecs
import pandas as pd
import numpy as np
from email.parser import Parser

import regex as re

## I - Data Extraction

In [3]:
# fpath = '20_newsgroups/alt.athesim_comp'
fpath = '20_newsgroups'
messagefolder =[]

for root, dirs, files in os.walk(fpath, topdown=False):
    for name in files:
        filename = os.path.join(root, name)
        messagecat = os.path.basename(os.path.dirname(filename))

        with codecs.open(filename, 'r', encoding='utf_8', errors='ignore') as f:
            messagefolder.append({"Category":messagecat, "Body":f.read()})
        
            
messagefolder = pd.DataFrame(messagefolder)
print("Number of documents: {}".format(len(messagefolder)))
print(messagefolder.tail())

Number of documents: 19997
                                                    Body   Category
19992  Xref: cantaloupe.srv.cs.cmu.edu alt.sci.planet...  sci.space
19993  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  sci.space
19994  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  sci.space
19995  Xref: cantaloupe.srv.cs.cmu.edu sci.space:6078...  sci.space
19996  Newsgroups: sci.space\nPath: cantaloupe.srv.cs...  sci.space


In [8]:
def message_parser(messagefile, cat):
    headers = Parser().parsestr(messagefile,headersonly=True)

    # Get email's body
    if headers.is_multipart():
        for part in headers.get_payload():
            txt = part.get_payload()
    else:
        txt = headers.get_payload()
        
    return {"Category": cat,
                       'Path':headers['Path'],
                       'From': headers['From'],
                      'Newsgroups': headers['Newsgroups'],
                      'Subject': headers['Subject'],
                      'Date': headers['Date'],
                      'Organization':headers['Organization'],
                      'NNTP-Posting-Host':headers['NNTP-Posting-Host'],
                      'References': headers['References'],
                      'Message-ID': headers['Message-ID'],
                      'Sender': headers['Sender'],
                      'Body': txt.replace('\n', '').strip(),
                      'Lines': headers['Lines']}

In [72]:
msg_full = messagefolder.apply(lambda x: pd.Series(message_parser(x['Body'],x['Category'])), axis=1)

### Split data into X and Y

In [159]:
msg = msg_full.ix[:,msg_full.columns != 'Category']
y = msg_full['Category']

### Cleaning of text data

In [253]:
import spacy
import string
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 

nlp = spacy.load('en')
punc = string.punctuation

stopw = [x for x in stopwords]
stopw.extend(['what','when','who','why', 'X', 'article', 'thing', 'way', '-PRON-'])

def clean_text(msg):
    try:
        # Removes emails
        msg = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', msg)
        
        # Removes URLs
        msg = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', msg)
        
        # Removes ASCII
        msg = re.sub(r'[^\x00-\x7F]+',' ', msg)
        
        # Removes newline characters
        msg = ' '.join(msg.split('\n'))
        
        # Convert into spacy token
        msg = nlp(msg)
        
        #Lemmatize, remove stopwords and punctuations
        tokens = [str(token.lemma_) for token in msg]
        tokens = [tok for tok in tokens if (tok not in stopw and tok not in punc)] 
        
        return ' '.join(tokens)
    except:
        print('error')
        return None

In [255]:
msg = msg.assign(clean_body = msg['Body'].apply(clean_text))

In [251]:
msg['Body'][0]

u'misc.entrepreneurs,misc.wanted,pnw.forsale,uw.pc.ibm,seattle.forsale,uw..forsale,misc.forsale,misc.forsale.computers.d,misc.forsale.computers.pc-clone,misc.forsale.coomputers.other,Distribution: worldFollowup-To: From:yuri@atmos.washington.eduReply-To: yuri@atmos.washington.eduOrganization: Subject: 100 simms and 100 sipps  1MB neededKeywords: \t\tI need  100 simms and 100 sipps 1MB, but price should be around $17-20/piece.I am waiting for an offer.\tYuri Yulaev\t6553, 38th ave NE\tSeattle WA 98115\t(206) 524-2806,524-9547 (home)\t(206) 685-3793 (work)\t(206) 524-7218 (FAX)INTERNET: yuri@atmos.washington.eduUUCP:\t  uw-beaver!atmos.washington.edu!yuri'

In [254]:
clean_text(msg['Body'][0])

u'misc.entrepreneurs,misc.wanted,pnw.forsale,uw.pc.ibm,seattle.forsale,uw..forsale,misc.forsale,misc.forsale.computers.d,misc.forsale.computers.pc-clone,misc.forsale.coomputers.other,distribution worldfollowup   subject 100 simms 100 sipp   1 mb neededkeyword \t\t ne   100 simms 100 sipp 1 mb price 17 20/piece wait offer \t yuri yulaev \t 6553 38th ave ne \t seattle wa 98115 \t 206 524 2806,524 9547 home \t 206 685 3793 work \t 206 524 7218 fax)internet   \t   uw beaver!atmos.washington.edu!yuri'

### Feature Engineering
- Categorize emails into real and fake senders based on the mapping of `Path` and `From`

In [256]:
# feature engineering on Path and From
msg = msg.assign(path_token = msg['Path'].apply(lambda x: x.split('!')[-1]))
msg = msg.assign(sender = msg['From'].apply(lambda x: x.split('@')[0]))

# create new column that indicates if email is legit
msg['true_email'] = np.where((msg['path_token'] == msg['sender']), 1, 0) 

In [183]:
print('Total no. of emails: {}'.format(len(train)))
print('No. of emails with legit senders: {}'.format(sum(train['true_email'])))
print('No. of emails with false senders: {}'.format(len(train)-sum(train['true_email'])))

Total no. of emails: 14997
No. of emails with legit senders: 11669
No. of emails with false senders: 3328


### Split data into training and testing

In [257]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(msg, y, random_state=1)

### Feature Union
- Combining features of different data types into a single feature matrix using `FeatureUnion` allows us to do gridsearch easily.
- Using `FunctionTransformer`, `FeatureUnion`, `Pipeline`.

In [146]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()

In [269]:
def get_true_email(df):
    return df.loc[:, ['true_email','Lines']]

In [270]:
# create a stateless transformer from get_true_email function
get_true_email_ft = FunctionTransformer(get_true_email, validate=False)
get_true_email_ft.transform(x_train).shape

(14997, 2)

In [140]:
def get_text(df):
    return df['clean_body']

In [206]:
# create stateless transformer from get_text function
get_text_ft = FunctionTransformer(get_text, validate=False)
get_text_ft.transform(x_train).shape

(14997,)

#### Parameters for gridsearch of nested Pipeline

In [151]:
from sklearn.model_selection import GridSearchCV

In [264]:
pipe = Pipeline([('featureunion', FeatureUnion([
    ('pipeline', Pipeline([
        ('functiontransformer', get_text_ft),
        ('tfidfvectorizer', vect)])),
     ('functiontransformer', get_true_email_ft)
])),
    ('multinomialnb', nb)])


In [279]:
param_grid ={
    'featureunion__pipeline__tfidfvectorizer__token_pattern':[r"\b\w\w+\b", r"'([a-z ]+)'"],
    'featureunion__pipeline__tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)],
    'featureunion__pipeline__tfidfvectorizer__norm':['l1','l2'],
    'multinomialnb__alpha' : [0.1,0.5, 1]}

In [280]:
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')

In [259]:
y_train.shape

(14997,)

In [260]:
x_train.shape

(14997, 16)

In [281]:
grid.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[(u'featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[(u'pipeline', Pipeline(steps=[(u'functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function get_text at 0x7fe1a7195aa0>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, va...rmer_weights=None)), (u'multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={u'featureunion__pipeline__tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], u'multinomialnb__alpha': [0.1, 0.5, 1], u'featureunion__pipeline__tfidfvectorizer__token_pattern': [u'\\b\\w\\w+\\b', u"'([a-z ]+)'"], u'featureunion__pipeline__tfidfvectorizer__norm': [u'l1', u'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=u'accuracy', verbose=0)

In [282]:
print(grid.best_score_)
print(grid.best_params_)

0.865973194639
{u'featureunion__pipeline__tfidfvectorizer__ngram_range': (1, 2), u'multinomialnb__alpha': 0.1, u'featureunion__pipeline__tfidfvectorizer__token_pattern': u'\\b\\w\\w+\\b', u'featureunion__pipeline__tfidfvectorizer__norm': u'l2'}
