In [2]:
%pylab inline

from __future__ import print_function
from __future__ import division

import sklearn
import numpy
import scipy
import pandas
import matplotlib
import seaborn

import json
import re

from matplotlib import pyplot
from collections import Counter



Populating the interactive namespace from numpy and matplotlib


In [3]:
# Default figure size (width, height) applied to the plots drawn in this notebook.
matplotlib.rcParams['figure.figsize'] = (15, 8)

# Getting messages

In [115]:
def getlinesep(mail):
    """Return the line separator used by ``mail``.

    A candidate separator qualifies only when it appears doubled in the text
    (i.e. the text contains a blank line). CRLF is tried before bare LF.

    Raises:
        StopIteration: if neither candidate occurs doubled in ``mail``.
    """
    for candidate in ('\r\n', '\n'):
        if candidate * 2 in mail:
            return candidate
    # No blank line found: signal it with the same exception type as before.
    raise StopIteration

def parse(mail):
    """Split a raw message into a list of ``(name, value)`` pairs.

    The text before the first blank line is treated as the header and the
    rest as the body. Header lines are split on their first colon; the value
    keeps everything after that colon unchanged. Two extra pairs are appended
    at the end: ``('body', <body text>)`` and ``('linesep', <separator>)``.

    Raises:
        StopIteration: propagated from ``getlinesep`` when no blank line exists.
        ValueError: when a header line contains no colon.
    """
    sep = getlinesep(mail)
    raw_header, body_text = mail.split(sep * 2, 1)

    # Join continuation lines (separator followed by a tab or a space)
    # onto the preceding header line with a single space.
    for lead in ('\t', ' '):
        raw_header = raw_header.replace(sep + lead, ' ')

    fields = []
    for line in raw_header.split(sep):
        # Unpacking fails with ValueError if the line has no colon.
        name, value = line.split(':', 1)
        fields.append((name, value))

    fields.append(('body', body_text))
    fields.append(('linesep', sep))
    return fields

def try_parse(mail):
    """Parse ``mail`` without raising on the two expected failure modes.

    Returns whatever ``parse(mail)`` returns on success. On failure returns
    a dict ``{'unparseable': True, 'reason': ...}`` where the reason is
    ``'badheader'`` for a ValueError and ``'nosep'`` for a StopIteration.
    """
    try:
        parsed = parse(mail)
    except (ValueError, StopIteration) as err:
        reason = 'badheader' if isinstance(err, ValueError) else 'nosep'
        return {'unparseable': True, 'reason': reason}
    return parsed

## End test

In [None]:
# Load the raw ham messages; the `with` block closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open('data/ham_train.json') as ham_file:
    ham_json = json.load(ham_file)

# Keep the parsed messages in a list so `ham` can be iterated repeatedly
# (the count below and the later plots) even where `map` is lazy.
ham = list(map(try_parse, ham_json))

# Failed parses are dicts carrying an 'unparseable' key; successful parses
# are lists of pairs, for which the membership test is False.
print('{}/{} unparseable messages'.format(
    len([x for x in ham if 'unparseable' in x]),
    len(ham_json)))

In [None]:
# Load the raw spam messages; the `with` block closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open('data/spam_train.json') as spam_file:
    spam_json = json.load(spam_file)

# Keep the parsed messages in a list so `spam` can be iterated repeatedly
# (the count below and the later plots) even where `map` is lazy.
spam = list(map(try_parse, spam_json))

# Both counts must be passed to format(): the template has two placeholders.
# Failed parses are dicts carrying an 'unparseable' key; successful parses
# are lists of pairs, for which the membership test is False.
print('{}/{} unparseable messages'.format(
    len([x for x in spam if 'unparseable' in x]),
    len(spam_json)))

In [None]:
def key_barplot(p, limit = 0):
    """Draw a bar plot of how many messages in ``p`` contain each key.

    Args:
        p: iterable of parsed messages. An item may be a mapping (its keys
            are counted) or a sequence of ``(key, value)`` pairs (the first
            element of each pair is counted).
        limit: only keys found in more than ``limit`` messages are plotted.

    Every key is counted at most once per message. Bars are ordered by
    ascending count.
    """
    from operator import itemgetter

    counts = Counter()
    for msg in p:
        if hasattr(msg, 'keys'):
            names = msg.keys()
        else:
            # List of (key, value) pairs: collapse repeated keys so the
            # count stays "messages containing the key".
            names = set(pair[0] for pair in msg)
        # In-place update avoids building a new Counter per message.
        counts.update(names)

    vals = sorted(((k, c) for k, c in counts.items() if c > limit), key = itemgetter(1))
    pyplot.xticks(rotation = 90)
    # Concrete lists passed by keyword, rather than positional map() results.
    seaborn.barplot(x = [k for k, _ in vals], y = [c for _, c in vals])

In [None]:
key_barplot(ham, limit = 5)

In [None]:
key_barplot(spam, limit = 0)

# Testing

In [43]:
import re

In [1]:
def parse(t):
    """Exploratory parser: split ``t`` on CRLF and each line on its LAST colon.

    Returns a list of tuples, one per CRLF-separated line. A line without a
    colon becomes a 1-tuple. Because ``rsplit`` is used, a line containing
    several colons keeps all but the last one in the first element.
    """
    # Alternative considered:
    # tuple(x.rsplit(':', 1)) if re.match(r'\S*:', x) else ('message', x)
    pieces = []
    for line in t.split('\r\n'):
        pieces.append(tuple(line.rsplit(':', 1)))
    return pieces

In [51]:
# Try the CRLF/last-colon parse on the first ham message. In the displayed
# result, values that themselves contain colons (date, subject) are split in
# the wrong place.
parse(ham_json[0])

[(u'message-id', u' <13247446.1075856567056.javamail.evans#####>'),
 (u'date: mon, 26 jun 2000 14:22', u'00 -0700 (pdt)'),
 (u'from', u' vkaminski#####'),
 (u'to', u' chris#####'),
 (u'subject: re', u' eprm articles'),
 (u'cc', u' vkaminski#####, vkamins#####'),
 (u'mime-version', u' 1.0'),
 (u'content-type', u' text/plain; charset=us-ascii'),
 (u'content-transfer-encoding', u' 7bit'),
 (u'bcc', u' vkaminski#####, vkamins#####'),
 (u'x-from', u' vkaminski#####'),
 (u'x-to', u' chris#####'),
 (u'x-cc', u' vkaminski#####, vkamins#####'),
 (u'x-bcc', u' '),
 (u'x-origin', u' kaminski-v'),
 (u'x-filename', u' vkamins.nsf'),
 (u'',),
 (u"chris,\n\nthanks for the invitation. the evening of july the 18th is fine  with me.\n\nthe list looks fine and it can be easily expanded, if the first set of\narticles is well received. i shall prepare a list of topics that occupy us\nevery day and that we could write about without revealing the details of our\nproprietary research.\n\nplease, feel free to 

In [53]:
# Inspect the raw CRLF-separated lines of the first ham message. In the
# displayed result the header lines are separate items while the body stays
# one item containing bare '\n' characters.
ham_json[0].split('\r\n')

[u'message-id: <13247446.1075856567056.javamail.evans#####>',
 u'date: mon, 26 jun 2000 14:22:00 -0700 (pdt)',
 u'from: vkaminski#####',
 u'to: chris#####',
 u'subject: re: eprm articles',
 u'cc: vkaminski#####, vkamins#####',
 u'mime-version: 1.0',
 u'content-type: text/plain; charset=us-ascii',
 u'content-transfer-encoding: 7bit',
 u'bcc: vkaminski#####, vkamins#####',
 u'x-from: vkaminski#####',
 u'x-to: chris#####',
 u'x-cc: vkaminski#####, vkamins#####',
 u'x-bcc: ',
 u'x-origin: kaminski-v',
 u'x-filename: vkamins.nsf',
 u'',
 u"chris,\n\nthanks for the invitation. the evening of july the 18th is fine  with me.\n\nthe list looks fine and it can be easily expanded, if the first set of\narticles is well received. i shall prepare a list of topics that occupy us\nevery day and that we could write about without revealing the details of our\nproprietary research.\n\nplease, feel free to send the message from lacima. i think that it's better\nfor us to sign the articles with our names, 

In [55]:
# Normalise CRLF line endings to bare LF in the first ham message.
# The replacement must be '\n': a lone backslash leaves the string literal
# unterminated, and the displayed result shows '\n' where '\r\n' used to be.
ham_json[0].replace('\r\n', '\n')

u"message-id: <13247446.1075856567056.javamail.evans#####>\ndate: mon, 26 jun 2000 14:22:00 -0700 (pdt)\nfrom: vkaminski#####\nto: chris#####\nsubject: re: eprm articles\ncc: vkaminski#####, vkamins#####\nmime-version: 1.0\ncontent-type: text/plain; charset=us-ascii\ncontent-transfer-encoding: 7bit\nbcc: vkaminski#####, vkamins#####\nx-from: vkaminski#####\nx-to: chris#####\nx-cc: vkaminski#####, vkamins#####\nx-bcc: \nx-origin: kaminski-v\nx-filename: vkamins.nsf\n\nchris,\n\nthanks for the invitation. the evening of july the 18th is fine  with me.\n\nthe list looks fine and it can be easily expanded, if the first set of\narticles is well received. i shall prepare a list of topics that occupy us\nevery day and that we could write about without revealing the details of our\nproprietary research.\n\nplease, feel free to send the message from lacima. i think that it's better\nfor us to sign the articles with our names, giving our respective\naffiliations. in this way, ##### gets the cred

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

&nbsp;

