# IMPORTS

In [4]:
from os import listdir
from os.path import isfile, join, isdir
import re
import nltk
from collections import defaultdict
import datetime
import operator

import sys
import pickle
sys.path.append("../tools/")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.stats import pearsonr
import math

import mpld3
from mpld3 import plugins

%pylab inline

mpld3.enable_notebook()

Populating the interactive namespace from numpy and matplotlib


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation

from time import time

# Regular Expressions

In [5]:
# All patterns below are compiled once and meant to be applied to the raw text
# of a single e-mail file (headers followed by the body).

# --- Header fields -----------------------------------------------------------
# Group 1: the identifier between the angle brackets of the Message-ID line.
msg_id_regex = re.compile(r'^Message\-ID:\s+<(.*)>$', re.MULTILINE)
# Group 1: everything after the Message-ID line up to the last "X-From" (greedy).
header_regex = re.compile(r'Message-ID:.*((.|\n)*)X-From', re.MULTILINE)
# Group 1: the remainder of the Subject line (leading space included).
subject_regex = re.compile(r'^Subject:(.*)\n', re.MULTILINE)

# --- Sender and recipients ---------------------------------------------------
# Group 1: the address on a "From:" line that ends with something@domain.tld.
mail_sender_regex =  re.compile(r'^From:\s+(.*@[\w\-\.]+\.\w+)$', re.MULTILINE)
# Group 1: text after "To:" up to the first following "Subject" (lazy, may span lines).
to_recipient_regex = re.compile(r'^To:\s((.|\n)*?)Subject', re.MULTILINE)
# Group 1: text after "Cc:"/"cc:" up to the last "Mime" (greedy, may span lines).
cc_recipient_regex = re.compile(r'^[cC]{2}:\s((.|\n)*)Mime', re.MULTILINE)
# Group 1: text after "Bcc:" up to the last "X-From" (greedy, may span lines).
bcc_recipient_regex = re.compile(r'^Bcc:\s((.|\n)*)X-From', re.MULTILINE)

# --- Message body ------------------------------------------------------------
# A newline, at most one whitespace character, then three or more dashes.
# build_corpus uses a match as the sign that forwarded/quoted text follows.
forwarded_regex = re.compile(r'(\n\s?--{2,})', re.MULTILINE)
# Group 1: everything after the X-FileName header line (the whole body).
regular_body_regex = re.compile(r'^X-FileName:.*((.|\n)*)', re.MULTILINE)
# Group 1: the body up to (not including) the first dashed separator line.
forwarded_body_regex = re.compile(r'^X-FileName:.*((.|\n)+?)(?=\n\s?--{2,})', re.MULTILINE)

# --- Date --------------------------------------------------------------------
# Group 1: the full value of the "Date:" header line.
date_regex = re.compile(r'^Date:\s+(.*)$', re.MULTILINE)
# "<day> <3-letter month> <4-digit year>", e.g. "14 May 2001".
date_extract_regex = re.compile(r'\d{1,2}\s\w{3}\s\d{4}')
# Group 1: the 2-3 letter day name at the start of the string, before the comma.
day_of_the_week_extract_regex = re.compile(r'^(\w{2,3}),')
# Group 1: the hh:mm:ss time.
time_extract_regex = re.compile(r'(\d{2}:\d{2}:\d{2})')
# A sign followed by four digits, e.g. "-0700".
time_zone_extract_regex = re.compile(r'[+-]{1}\d{4}')
# The three patterns below are anchored/spaced for a short date string;
# presumably they are applied to the text matched by date_extract_regex - TODO confirm.
day_extract_regex = re.compile(r'^\d{1,2}')
month_extract_regex = re.compile(r'\s(\w{2,3})\s')
year_extract_regex = re.compile(r'\d{4}$')

# BUILDING CORPUS

## The Parser Function

This function walks through each user's email folders and picks out the sent-mail folders. We work only with the sent emails to try to tell the POIs apart from the non-POIs.

The function's `number_of_users` argument limits how many user folders are processed. It was introduced for debugging purposes only, when I wanted to parse just a handful of folders to check that the function worked correctly. For the final run, I simply pass a number larger than the total number of folders.

The function opens each email, extracts its body (keeping only what the user wrote, i.e. discarding any forwarded part) and appends it to a corpus list. It also builds a label list that associates each entry of the corpus list with its user. The return value is a list of two elements: the corpus and the labels.

In [6]:
def _extract_body(msg):
    """Return the body of a raw e-mail, or None when no body can be located.

    The body is the text following the ``X-FileName:`` header line. When the
    message contains a dashed separator line (``forwarded_regex``), only the
    text before the first separator is returned.
    """
    if forwarded_regex.search(msg):
        body_regex = forwarded_body_regex
    else:
        body_regex = regular_body_regex
    match = body_regex.search(msg)
    return match.group(1) if match else None


def build_corpus(number_of_users, emails_path=None):
    """Build a corpus of sent e-mail bodies, labelled by the mailbox they came from.

    Walks the per-user folders found under ``emails_path``. Inside each one,
    every file located in a sent-mail folder ("_sent_mail", "sent", "_sent",
    "sent_items") is read and its body extracted with the module-level
    ``forwarded_regex`` / ``forwarded_body_regex`` / ``regular_body_regex``.

    Args:
        number_of_users: Maximum number of user folders to process. Pass a
            value larger than the number of folders to process all of them.
        emails_path: Root mail directory holding one folder per user.
            Defaults to the original hard-coded location.

    Returns:
        A two-element list ``[corpus, labels]``. ``corpus[i]`` is the body of
        one message (``""`` when no body could be extracted) and ``labels[i]``
        is the name of the user folder that message was read from.
    """
    if emails_path is None:
        emails_path = "C:\\Users\\e_tak_000\\Documents\\GitHub\\ud120-projects\\enron_mail_20150507\\enron_mail_20150507\\maildir"
    sent_folders_set = ["_sent_mail", "sent", "_sent", "sent_items"]
    corpus = []
    labels = []

    # listdir() returns entries in arbitrary order; sorting keeps the corpus
    # (and therefore any seeded split made from it) reproducible.
    users_folder = sorted(f for f in listdir(emails_path) if isdir(join(emails_path, f)))

    # Loop over each e-mail owner, stopping after `number_of_users` folders.
    for counter, user_root_folder in enumerate(users_folder, 1):
        if counter > number_of_users:
            break

        print("%s %s" % (counter, user_root_folder))
        user_full_path = join(emails_path, user_root_folder)
        inner_folders = sorted(f for f in listdir(user_full_path) if isdir(join(user_full_path, f)))

        for current_folder in inner_folders:
            # Only the sent-mail folders are of interest.
            if current_folder not in sent_folders_set:
                continue

            folder_full_path = join(user_full_path, current_folder)
            current_folder_files = sorted(
                f for f in listdir(folder_full_path) if isfile(join(folder_full_path, f))
            )

            for current_file in current_folder_files:
                # `with` guarantees the handle is closed even if read() fails.
                with open(join(folder_full_path, current_file), 'r') as mail_file:
                    msg = mail_file.read()

                msg_body = _extract_body(msg)
                if msg_body is None:
                    print("Empty Message In  %s" % current_file)
                    # Keep the entry so corpus and labels stay aligned, but
                    # store a string: every corpus item must be text.
                    msg_body = ""

                corpus.append(msg_body)
                labels.append(user_root_folder)

    return [corpus, labels]

## Build the Corpus from the Email Data

In [7]:
# Parse every mailbox and time the run. 200 is deliberately larger than the
# number of user folders, so no folder is left out.
t0 = time()
training, labels = build_corpus(200)
elapsed_seconds = time() - t0
print("Corpus built in %0.1fs" % elapsed_seconds)

1 allen-p
2 arnold-j
3 arora-h
4 badeer-r
5 bailey-s
6 bass-e
7 baughman-d
8 beck-s
9 benson-r
10 blair-l
11 brawner-s
12 buy-r
13 campbell-l
14 carson-m
15 cash-m
16 causholli-m
17 corman-s
18 crandell-s
19 cuilla-m
20 dasovich-j
21 davis-d
22 dean-c
23 delainey-d
24 derrick-j
25 dickson-s
26 donoho-l
27 donohoe-t
28 dorland-c
29 ermis-f
30 farmer-d
31 fischer-m
32 forney-j
33 fossum-d
34 gang-l
35 gay-r
36 geaccone-t
37 germany-c
38 gilbertsmith-d
39 giron-d
40 griffith-j
41 grigsby-m
42 guzman-m
43 haedicke-m
44 hain-m
45 harris-s
46 hayslett-r
47 heard-m
48 hendrickson-s
49 hernandez-j
50 hodge-j
51 holst-k
52 horton-s
53 hyatt-k
54 hyvl-d
55 jones-t
56 kaminski-v
57 kean-s
58 keavey-p
59 keiser-k
60 king-j
61 kitchen-l
62 kuykendall-t
63 lavorato-j
64 lay-k
65 lenhart-m
66 lewis-a
67 linder-e
68 lokay-m
69 lokey-t
70 love-p
71 lucci-p
72 maggi-m
73 mann-k
74 martin-t
75 may-l
76 mccarty-d
77 mcconnell-m
78 mckay-b
79 mckay-j
80 mclaughlin-e
81 merriss-s
82 meyers-a
83 mims-thursto

In [8]:
# Just in case, keep an original copy of the corpus aside. The lists are
# copied rather than merely referenced, so that any later in-place change to
# `training` or `labels` cannot alter the backup.
backup_corpus_data = [list(training), list(labels)]

## Transform Labels into Booleans

Each label is a boolean: `True` if the email was sent by a POI and `False` otherwise.

In [9]:
# Mailbox folder names of the users treated as POIs.
poi_list = ['lay-k', 'skilling-j', 'forney-j', 'delainey-d']

# One boolean per corpus entry: True when the e-mail comes from a POI mailbox.
poi_labels = [user_label in poi_list for user_label in labels]

In [10]:
# Convert the list of booleans into a numpy array so that it can be indexed
# with the integer index arrays produced by the cross-validation splitter.
poi_labels_array = np.asarray(poi_labels)

In [12]:
# Sanity check: there should be exactly one label per e-mail in the corpus.
poi_labels_array.shape

(126462L,)

# NATURAL LANGUAGE PROCESSING

## Stop Words

In [13]:
# Additional stop words, kept as a hook for future work: adding words here
# makes it possible to see how removing them affects the model.
# For now this collection is empty.
my_additional_stop_words = ()

# Full set of stop words to be filtered out of the corpus
stop_words = ENGLISH_STOP_WORDS.union(my_additional_stop_words)

## TF-IDF

### Build the Term Frequency Inverse Document Frequency

In [14]:
# TF-IDF vectorizer that ignores the words in `stop_words`. min_df=1 keeps
# every term that occurs in at least one document (no rare-term pruning).
tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words=stop_words)

### Transform the Emails Body List into TF-IDF

In [15]:
# Learn the vocabulary from the e-mail bodies and turn them into a TF-IDF
# matrix, timing the operation.
t0 = time()
X = tfidf_vectorizer.fit_transform(training)
elapsed_seconds = time() - t0
print("Corpus transformed in %0.1fs" % elapsed_seconds)

Corpus transformed in 11.9s


In [16]:
# (number of e-mails, number of distinct terms kept in the vocabulary)
X.shape

(126462, 94185)

In [17]:
# Must equal the number of rows of X: one label per vectorized e-mail.
len(poi_labels_array)

126462

## ESTIMATOR

### Stratify-Split the Data into Training and Testing Sets

In [18]:
# Hold out 20% of the e-mails for the final evaluation. The split is seeded
# so the results can be reproduced, and stratified on the POI flag so both
# parts receive POI and non-POI e-mails in the same proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    poi_labels_array,
    test_size=0.2,
    random_state=42,
    stratify=poi_labels_array,
)

### Build Model

#### CrossValidation

In [19]:
# Multinomial Naive Bayes classifier with its default parameters.
clf = MultinomialNB()

In [20]:
# 10-fold stratified splitter built from the training labels (legacy
# sklearn.cross_validation API); iterating over it yields (train, test)
# index arrays into X_train / y_train.
stratified_k_fold = cross_validation.StratifiedKFold(y_train, n_folds= 10)

In [21]:
# Score the classifier on each fold: fit on the fold's training indices and
# record the score obtained on the fold's held-out indices.
result = []
for train, test in stratified_k_fold:
    clf.fit(X_train[train], y_train[train])
    fold_score = clf.score(X_train[test], y_train[test])
    result.append(fold_score)

In [22]:
# Per-fold scores of the 10-fold cross-validation.
result

[0.97262304803320809,
 0.97311721684127295,
 0.97311721684127295,
 0.97331224671345262,
 0.97321340318276173,
 0.97291687259068893,
 0.97281534203242392,
 0.97321075523922496,
 0.9733096085409253,
 0.97301304863582438]

In [23]:
# Refit on the whole training set: after the cross-validation loop, clf is
# only fitted on the training part of the last fold.
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Evaluation

In [24]:
# Score of the final model on the held-out 20% test split.
clf.score(X_test, y_test) 

0.97307555450124539

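I am extremely impressed, to say the least, by the results. One caveat before drawing conclusions: `score` reports plain accuracy, and only a few users are labelled as POIs, so the two classes are probably heavily imbalanced. The accuracy should therefore be compared with that of a baseline that always predicts "non-POI", and complemented with precision and recall for the POI class.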