In [1]:
import mailbox

In [None]:
# Open the Enron mailbox file; `mbox` is used by every later cell.
mbox = mailbox.mbox('enron.mbox')
# Message keys; later cells slice this (e.g. msgs[10:20]) to pick a range.
msgs=mbox.keys()

In [None]:
# Preview the messages at positions 10-19 of the key list:
# one header line (key / subject / sender / date) followed by the raw payload.
for msg_key in msgs[10:20]:
    message = mbox.get(msg_key)
    print(msg_key,message['Subject'],message['From'],message['Date'],sep = " / ")
    print("======================")
    # Payload is printed exactly as stored, before any cleanup.
    print(message.get_payload())
    # Two banner lines separate one message from the next.
    print("**********************************")
    print("**********************************")

In [None]:
# Exploration: print the mailbox object itself.
print(mbox)

In [None]:
# Exploration: show the built-in help text for the mailbox object.
help(mbox)

In [None]:
import re
def cleanup_email(msgbody):
    """Strip quoted replies, links and formatting debris from an email body.

    The rules run in a fixed order. Several of them are deliberately
    aggressive: once a marker is found (reply separator, large gap, deep
    indentation, pasted header) everything from the marker to the end of the
    text is discarded.

    Args:
        msgbody: The message body as a single string.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # find 'Original Message...' and variants (also '--' and '~~' rulers)
    regex_replies = re.compile(r'(\-+Original Message|\-\-+|~~+).*', re.DOTALL)
    msgbody = re.sub(regex_replies, '', msgbody)
    msgbody = re.sub(r'=\d\d', ' ', msgbody) # remove funny email formatting issues
    msgbody = re.sub(r'\s*>.*', '', msgbody) # remove quotes
    msgbody = re.sub(r'https?://.*?\s', '', msgbody) # remove links
    bigspace = re.compile(r'\n\n\n\n\n+.*', re.DOTALL) # find large gaps
    msgbody = re.sub(bigspace, '', msgbody)
    bigindent = re.compile(r'(\t| {4,}).*', re.DOTALL) # find big indentations (i.e., a quoted document)
    msgbody = re.sub(bigindent, '', msgbody)
    emailpaste = re.compile(r'(From|Subject|To): .*', re.DOTALL) # find pasted emails
    msgbody = re.sub(emailpaste, '', msgbody)
    # The three replacements below must be raw strings: in a normal string
    # literal '\1' is the control character chr(1), not a backreference, so the
    # captured text would be thrown away instead of being put back.
    msgbody = re.sub(r'=(\s*)\n', r'\1', msgbody) # fix broken newlines
    msgbody = re.sub(r' ,([stm])', r"'\1", msgbody) # fix funny apostrophe 's and 't and 'm
    msgbody = re.sub(r'([\?\.])\?', r'\1', msgbody) # fix funny extra question marks
    msgbody = re.sub(r'\x01', ' ', msgbody) # fix odd spaces (stray chr(1) in the input)
    return msgbody.strip()

In [None]:
# Eyeball the effect of cleanup_email() on the messages at positions 326-499.
for msg_key in msgs[326:500]:
    message = mbox.get(msg_key)
    # Unlike the earlier preview, the body is cleaned before printing.
    msgbody = cleanup_email(message.get_payload())
    print(msg_key, message['Subject'], message['From'],message['To'], message['Date'],sep=" / ")
    print("============")
    print(msgbody)
    print("*****")
    print("********")

In [None]:
def check_good_tofrom(msg):
    """Decide whether a message looks like person-to-person mail sent from Enron.

    A message passes when all of the following hold:
      * both the From and To headers are present,
      * neither header matches the bulk-sender keywords
        (admin, newsletter, list, announce, all./all_, everyone./everyone_),
      * the From header contains an @enron.com address,
      * the To header splits into at most three whitespace-separated parts.

    Args:
        msg: Any object supporting msg['From'] and msg['To'] lookups that
            yield a string or None.

    Returns:
        True if the message passes every check, otherwise False.
    """
    sender = msg['From']
    recipients = msg['To']
    # A missing header cannot be vetted. Checking this first also avoids
    # handing None to re.match, which raises TypeError.
    if sender is None or recipients is None:
        return False

    bulk_pattern = r'.*(admin|newsletter|list|announce|all[\._]|everyone[\.\_]).*'
    if re.match(bulk_pattern, sender, re.IGNORECASE) is not None:
        return False
    if re.match(bulk_pattern, recipients, re.IGNORECASE) is not None:
        return False
    if re.match(r'.*@enron\.com', sender, re.IGNORECASE) is None:
        return False
    return len(recipients.split()) <= 3

In [None]:
# Combine both filters on the messages at positions 15000-15499:
# keep only mail that passes check_good_tofrom() and still has text after cleanup.
for msg_key in msgs[15000:15500]:
    message = mbox.get(msg_key)
    if check_good_tofrom(message):
        msgbody = cleanup_email(message.get_payload())
        # Skip messages whose body was cleaned away entirely.
        if len(msgbody)>0:
                print(msg_key, message['Subject'], message['From'],message['To'], message['Date'],sep=" / ")
                print("============")
                print(msgbody)
                print("*****")
                print("********")

In [None]:
import spacy
# Load the English pipeline used by every later cell.
# The bare 'en' name is a shortcut link that spaCy v3 no longer supports, so
# try the installed package name first and fall back to the old shortcut for
# environments that only have the link.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')

In [None]:
# Inspect what the pipeline assigns to each token of a short sentence:
# text / lemma / coarse POS / fine tag / is_alpha / is_stop.
for token in nlp(u"I want to buy stock."):
    print(token.text,token.lemma_,token.pos_,token.tag_,token.is_alpha,token.is_stop,sep = " / ")

In [None]:
# Same per-token dump for a sentence with an embedded clause and a pronoun.
for token in nlp("Bob says he knows about the scam"):
    print(token.text,token.lemma_,token.pos_,token.tag_,token.is_alpha,token.is_stop,sep = " / ")

In [None]:
from spacy import displacy

In [None]:
# Draw the parse of the sentence with displaCy's "dep" style, with jupyter=True.
displacy.render(nlp("I want to buy stock."),style = "dep",jupyter=True)

In [None]:
# Same call with jupyter=False, for comparison with the cell above.
# NOTE(review): presumably this returns the markup as a value instead of
# displaying it inline — confirm against the displaCy docs.
displacy.render(nlp("I want to buy stock."),style = "dep",jupyter=False)

In [None]:
# Same call again, leaving the jupyter argument at its default.
displacy.render(nlp("I want to buy stock."),style = "dep")

In [None]:
# Draw the parse of the second example sentence (embedded clause, pronoun subject).
displacy.render(nlp("Bob said he knows about the scam."),style = "dep",jupyter=True)

In [None]:
from spacy.symbols import nsubj, xcomp, dobj, pobj, prep, attr,VERB, PRON, NOUN,PROPN, PUNCT
def extract_relationships(doc):
    """Collect token tuples describing who-does-what from a parsed text.

    For every token that is the nominal subject of a verb, the tokens to the
    right of that verb are examined:
      * an xcomp with a noun direct object gives (subject, verb, xcomp, object),
      * a preposition with a noun object gives (subject, verb, prep, object),
      * a noun direct object gives (subject, verb, object).

    The function also prints each subject/verb pair and each right-hand token
    it looks at, as a trace.

    Args:
        doc: Iterable of parsed tokens (the notebook passes nlp(...) output).

    Returns:
        List of tuples of tokens, in the order they were found.
    """
    found = []
    for token in doc:
        # Only nominal subjects attached to a verb are of interest.
        if token.dep != nsubj or token.head.pos != VERB:
            continue
        verb = token.head
        print(token, verb)
        for right in verb.rights:
            if right.dep == xcomp:
                found.extend(
                    (token, verb, right, child)
                    for child in right.children
                    if child.dep == dobj and child.pos == NOUN
                )
            elif right.dep == prep:
                found.extend(
                    (token, verb, right, child)
                    for child in right.children
                    if child.dep == pobj and child.pos == NOUN
                )
            elif right.dep == dobj and right.pos == NOUN:
                found.append((token, verb, right))
            print(right, right.dep_, right.pos_)
    return found

In [None]:
# Try the extractor on a sentence with an infinitive complement ("want to buy").
extract_relationships(nlp("I want to buy stock."))

In [None]:
# Try the extractor on a sentence with a pronoun subject in the embedded clause.
extract_relationships(nlp("Bob said he knows about the scam."))

In [None]:
# Try the extractor on a plain subject-verb-object sentence.
extract_relationships(nlp("I bought stock."))

In [None]:
# Try the extractor on a sentence with two named subjects.
extract_relationships(nlp("Jane said Bob knows about the scam."))

In [None]:
# Simple heuristic for anaphora (pronoun) resolution.
def find_referent(doc, pronoun, msgfrom, msgto):
    """Guess who a pronoun refers to.

    First-person forms map to the sender and second-person forms to the
    recipient. Any other pronoun is resolved by climbing the head links to the
    top of the tree and taking the text of the first child there whose
    dependency is nsubj.

    Args:
        doc: Accepted for symmetry with the callers; not read here.
        pronoun: Token to resolve; needs .text, .head and (at the root) .children.
        msgfrom: Value returned for 'i', 'myself', 'me', 'we'.
        msgto: Value returned for 'you', 'your'.

    Returns:
        msgfrom, msgto, the text of the root's subject, or None if the root
        has no nsubj child.
    """
    word = pronoun.text.lower()
    if word in ['i','myself','me','we']:
        return msgfrom
    if word in ['you','your']:
        return msgto

    # Climb until a token is its own head, i.e. the top of the tree.
    root = pronoun
    while root.head != root:
        root = root.head
    return next((child.text for child in root.children if child.dep == nsubj), None)

In [None]:
# Try the pronoun resolver on the relationships found in one example sentence.
doc = nlp('Bob said he knows about the scam.')
rels = extract_relationships(doc)
for rel in rels:
    # Each relationship is a tuple of tokens; resolve the ones that are pronouns.
    for w in rel:
        if w.pos == PRON:
            # 'meme' / 'youyou' are placeholder sender and recipient values.
            print(find_referent(doc, w, 'meme','youyou'))

In [None]:
def extract_relationships2(doc,msgfrom,msgto):
    """Extract (subject, verb lemma, object phrase) string triples, with trace output.

    This is the tracing draft of the extractor; a later cell redefines the
    same name without the print statements.

    Only nominal subjects of verbs are considered, and only when the subject
    is a pronoun or a proper noun. Pronouns are resolved with find_referent();
    when that returns None the pronoun's own text is used. Subjects that end
    up as 'they' or 'it' are skipped.

    Args:
        doc: Iterable of parsed tokens (the notebook passes nlp(...) output).
        msgfrom: Forwarded to find_referent as the sender.
        msgto: Forwarded to find_referent as the recipient.

    Returns:
        List of 3-tuples of strings: (subject, verb lemma, object phrase).
    """
    relationships = []
    for possible_subject in doc:
        if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
            subj = possible_subject
            verb = possible_subject.head
            # Other kinds of subject are skipped, matching the later
            # redefinition. Letting them through would leave `subj` as a token
            # rather than a string and break the .lower() call below.
            if subj.pos != PRON and subj.pos != PROPN:
                continue
            if subj.pos == PRON:
                ref = find_referent(doc,subj,msgfrom,msgto)
                subj = ref if ref is not None else subj.text
            else:
                subj = subj.text
            # ignore worthless subjects
            if subj.lower() in ['they','it']:
                continue
            print(subj, verb)

            for vr in verb.rights:
                # Compare the integer attributes (.dep / .pos) with the
                # imported symbols; the string forms (.dep_ / .pos_) never
                # equal them, so no branch could fire.
                if vr.dep == xcomp:
                    for vc in vr.children:
                        if vc.dep == dobj and vc.pos == NOUN:
                            # keep the two lemmas in the order given by idx
                            if vr.idx < vc.idx:
                                relationships.append((subj,verb.lemma_,vr.lemma_+" "+vc.lemma_))
                            else:
                                relationships.append((subj,verb.lemma_,vc.lemma_+" "+vr.lemma_))
                elif vr.dep == prep:
                    for vc in vr.children:
                        if vc.dep == pobj and vc.pos == NOUN:
                            relationships.append((subj,verb.lemma_,vc.lemma_))
                elif vr.dep == dobj and (vr.pos == NOUN or vr.pos == PROPN):
                    has_compound = False
                    for vc in vr.children:
                        if vc.dep_ == 'compound' and vc.pos == NOUN:
                            has_compound = True
                            if vr.idx < vc.idx:
                                relationships.append((subj,verb.lemma_,vr.lemma_+" "+vc.lemma_))
                            else:
                                relationships.append((subj,verb.lemma_,vc.lemma_+" "+vr.lemma_))
                    # Decided once, after all children were seen, so an object
                    # without children is still recorded and one with
                    # non-compound children is recorded exactly once.
                    if not has_compound:
                        relationships.append((subj,verb.lemma_,vr.lemma_))
                elif vr.dep == attr:
                    relationships.append((subj,verb.lemma_,vr.lemma_))
                print(vr.dep_,vr.pos_,vr.idx,vr.lemma_,relationships)
    return relationships

In [None]:
def extract_relationships2(doc, msgfrom, msgto):
    """Collect (subject, verb lemma, object phrase) string triples from a parsed text.

    Only nominal subjects of verbs are considered, and only when the subject
    is a pronoun or a proper noun. Pronouns go through find_referent(); when
    that returns None the pronoun's own text is used. Subjects that end up as
    'they' or 'it' are dropped.

    For each token to the right of the verb:
      * xcomp with a noun direct object  -> the two lemmas joined by a space,
      * preposition with a noun object   -> the object's lemma,
      * noun / proper-noun direct object -> one entry per noun compound child
        (compound and object lemmas joined), or the object's lemma alone when
        it has no such child,
      * attr                             -> its lemma.

    Args:
        doc: Iterable of parsed tokens (the notebook passes nlp(...) output).
        msgfrom: Forwarded to find_referent as the sender.
        msgto: Forwarded to find_referent as the recipient.

    Returns:
        List of 3-tuples of strings, in the order they were found.
    """
    def lemmas_by_idx(first, second):
        # Join two lemmas with the lower-idx token first.
        if first.idx < second.idx:
            return first.lemma_ + " " + second.lemma_
        return second.lemma_ + " " + first.lemma_

    triples = []
    for token in doc:
        if token.dep != nsubj or token.head.pos != VERB:
            continue
        if token.pos != PRON and token.pos != PROPN:
            continue

        actor = token.text
        if token.pos == PRON:
            resolved = find_referent(doc, token, msgfrom, msgto)
            if resolved is not None:
                actor = resolved

        # ignore worthless subjects
        if actor.lower() in ['they', 'it']:
            continue

        verb = token.head
        action = verb.lemma_
        for right in verb.rights:
            if right.dep == xcomp:
                for child in right.children:
                    if child.dep == dobj and child.pos == NOUN:
                        triples.append((actor, action, lemmas_by_idx(right, child)))
            elif right.dep == prep:
                for child in right.children:
                    if child.dep == pobj and child.pos == NOUN:
                        triples.append((actor, action, child.lemma_))
            elif right.dep == dobj and (right.pos == NOUN or right.pos == PROPN):
                compounds = [
                    child for child in right.children
                    if child.dep_ == 'compound' and child.pos == NOUN
                ]
                for child in compounds:
                    triples.append((actor, action, lemmas_by_idx(right, child)))
                if not compounds:
                    triples.append((actor, action, right.lemma_))
            elif right.dep == attr:
                triples.append((actor, action, right.lemma_))
    return triples



In [None]:
# Run the extractor on four example sentences and print one result list per line.
# 'me@example' / 'you@example' stand in for the sender and recipient addresses.
print(extract_relationships2(nlp('I want to buy stock.'),'me@example','you@example')
,extract_relationships2(nlp('Jane said Bob knows about the scam'),'me@example','you@example')
,extract_relationships2(nlp('Bob knows about the scam.'),'me@example','you@example')
,extract_relationships2(nlp('You bought stock.'),'me@example','you@example'),sep ="\n")

In [None]:
def extract_email_relationships(mbox, msg_key):
    """Extract relationship triples from one message of a mailbox.

    The body is cleaned with cleanup_email(), parsed with the notebook's `nlp`
    pipeline and passed to extract_relationships2(). The From header is used
    as the sender and the first ', '-separated entry of the To header as the
    recipient.

    Extraction is best-effort: a message that is missing, lacks a From or To
    header, or fails anywhere during processing yields an empty list.

    Args:
        mbox: Mailbox-like object with a get(key) method.
        msg_key: Key of the message to process.

    Returns:
        List of relationship triples; empty when nothing could be extracted.
    """
    message = mbox.get(msg_key)
    if message is None or message['From'] is None or message['To'] is None:
        return []
    try:
        payload = message.get_payload()
        msgnlp = nlp(cleanup_email(payload))
        return extract_relationships2(msgnlp,message['From'],message['To'].split(', ')[0])
    except Exception:
        # Skip messages that cannot be processed, but let KeyboardInterrupt and
        # SystemExit through so a long run over the corpus can still be stopped.
        return []

In [None]:
# Spot-check the per-message extractor on the message with key 15000.
extract_email_relationships(mbox, 15000)

In [None]:
import rdflib
from rdflib import Graph, Literal, RDF
from rdflib.namespace import FOAF

In [None]:
# Scratch graph for trying out rdflib; a later cell rebinds `g` to the real graph.
g=Graph()

In [None]:
# Add two hand-written triples to the scratch graph: a type statement and one relationship.
# NOTE(review): subject and predicate are Literals here (and in the generated
# graph); RDF normally expects IRIs in those positions — confirm this is intended.
g.add((Literal('mark.greenberg@enron.com'),RDF.type, FOAF.Person))
g.add((Literal('mark.greenberg@enron.com'),Literal('requested'),Literal('copy')))

In [None]:
# Iterating a graph yields its (subject, predicate, object) triples.
for s, p, o in g:
    print(s, p, o)

In [None]:
# List every subject that has the triple (subject, RDF.type, FOAF.Person).
for person in g.subjects(RDF.type, FOAF.Person):
    print(person)

In [None]:
# Run a SPARQL SELECT with ?p supplied through initBindings as the literal
# 'requested', so the predicate is passed as a value rather than pasted into the query text.
qres = g.query('SELECT ?s ?o where { ?s ?p ?o . }',initBindings = {'p':Literal('requested')})

# Each row carries the two selected values (?s, ?o), which fill the two %s slots.
for row in qres:
    print("%s -> %s" % row)

In [None]:
def create_graph_from_email_relationships(mbox, msg_keys=None):
    """Build an in-memory graph of the relationships found in a set of messages.

    Every (subject, predicate, object) triple returned by
    extract_email_relationships() is wrapped in Literals and added to the
    graph. Two lookup tables record where each triple came from.

    Args:
        mbox: Mailbox to read messages from.
        msg_keys: Keys of the messages to process. Defaults to the sample
            window mbox.keys()[15000:15500], taken from the mailbox that was
            passed in rather than from a notebook-global key list.

    Returns:
        Tuple (g, msg_key_idx, msg_key_idx_reverse) where
          g                   is the populated Graph,
          msg_key_idx         maps triple -> list of message keys containing it,
          msg_key_idx_reverse maps message key -> list of its triples.
    """
    if msg_keys is None:
        msg_keys = mbox.keys()[15000:15500]

    g = Graph()
    msg_key_idx = {}
    msg_key_idx_reverse = {}
    for msg_key in msg_keys:
        for (s, p, o) in extract_email_relationships(mbox, msg_key):
            r = (Literal(s), Literal(p), Literal(o))
            g.add(r)
            # remember which message(s) had this relationship, and vice versa
            msg_key_idx.setdefault(r, []).append(msg_key)
            msg_key_idx_reverse.setdefault(msg_key, []).append(r)
    return (g, msg_key_idx, msg_key_idx_reverse)
                
                

In [None]:
# Build the graph and both lookup tables; this rebinds `g`, replacing the scratch graph.
(g, msg_key_idx,msg_key_idx_reverse) = create_graph_from_email_relationships(mbox)

In [None]:
# Dump every triple in the generated graph, one tuple per line.
for s, p, o in g:
    print((s,p,o))

In [None]:
# Collect the distinct predicates that occur in the graph.
predicates = set()
for s, p, o in g:
    predicates.add(p)

In [None]:
# Display the set of predicates collected in the previous cell.
predicates

In [None]:
# Look up every subject/object pair whose predicate is the literal 'remove'.
qres = g.query('SELECT ?s ?o where { ?s ?p ?o . }',initBindings = {'p':Literal('remove')})

# Each row carries the two selected values (?s, ?o), which fill the two %s slots.
for row in qres:
    print("%s -> %s"%row)

In [None]:
# Display the triple -> message-keys lookup table.
msg_key_idx

In [None]:
# Display the message-key -> triples lookup table.
msg_key_idx_reverse

In [None]:
def query_relationships(predicate, g, msg_key_idx, msg_key_idx_reverse):
    """Print every relationship in the graph that uses the given predicate.

    The predicate text is run through the `nlp` pipeline and the lemma of its
    first token is used as the value to search for, so the caller may pass an
    inflected form. One line is printed per match, including the message keys
    recorded for that triple.

    Args:
        predicate: Verb to search for, as text.
        g: Graph to query.
        msg_key_idx: Mapping of triple -> list of message keys.
        msg_key_idx_reverse: Accepted for symmetry with the builder's return
            value; not read here.

    Returns:
        None. Output goes to stdout.
    """
    first_token = nlp(predicate)[0]
    p = Literal(first_token.lemma_)
    sparql = 'SELECT ?s ?o WHERE { ?s ?p ?o . }'

    for row in g.query(sparql, initBindings = {'p' : p}):
        subject, obj = row[0], row[1]
        # The index is keyed by the full (subject, predicate, object) triple.
        keys = msg_key_idx[(subject, p, obj)]
        print("%s\t*%s*\t%s -- msg_keys: %s" % (subject, p, obj, keys))

In [None]:
def create_graph_from_email_relationships(mbox):
    """Build a store-backed graph of the relationships found in every message.

    Unlike the earlier in-memory version, the graph is created with the
    'Sleepycat' store and opened on 'enron_relationships.rdf', and all keys of
    the mailbox are processed. A progress line is printed every 10000 messages.

    Args:
        mbox: Mailbox to read messages from.

    Returns:
        Tuple (g, msg_key_idx, msg_key_idx_reverse) where
          g                   is the populated, still-open Graph,
          msg_key_idx         maps triple -> list of message keys containing it,
          msg_key_idx_reverse maps message key -> list of its triples.
    """
    # needs python lib bsddb3 (installing bsddb3 has issues)
    # NOTE(review): newer rdflib releases reportedly ship this store under the
    # name 'BerkeleyDB' — confirm against the installed rdflib version.
    g = Graph('Sleepycat', identifier='enron_relationships')
    g.open('enron_relationships.rdf', create = True)
    msg_key_idx = {}
    msg_key_idx_reverse = {}

    msgs = mbox.keys()
    msg_count = len(msgs)
    # no limit now, do all messages; count from 1 for the progress report
    for i, msg_key in enumerate(msgs, start=1):
        if i % 10000 == 0:
            print("Message %d of %d" % (i, msg_count))

        for (s, p, o) in extract_email_relationships(mbox, msg_key):
            r = (Literal(s), Literal(p), Literal(o))
            g.add(r)
            # remember which message(s) had this relationship
            msg_key_idx.setdefault(r, []).append(msg_key)
            # remember the relationships this message had
            msg_key_idx_reverse.setdefault(msg_key, []).append(r)

    return (g, msg_key_idx, msg_key_idx_reverse)

# Build the graph over the whole mailbox; this rebinds `g` and both lookup tables.
(g, msg_key_idx, msg_key_idx_reverse) = create_graph_from_email_relationships(mbox)

# Example lookup; query_relationships searches for the lemma of the word passed in.
query_relationships("removed", g, msg_key_idx, msg_key_idx_reverse)
