# Importing the e-mails

In [None]:
import numpy as np
import pandas as pd
import email

In [None]:
# Load the raw corpus and keep a reproducible random subset of 25,900 rows
# (fixed random_state so the same rows are drawn on every run).
emails_df = pd.read_csv("emails.csv").sample(n=25900, random_state=123)
def get_text_from_email(msg):
    '''Return the concatenated payloads of all text/plain parts of msg.

    The message tree is traversed with msg.walk(); parts whose content
    type is not text/plain are skipped. An empty string is returned when
    no part matches.
    '''
    return ''.join(
        part.get_payload()
        for part in msg.walk()
        if part.get_content_type() == 'text/plain'
    )

def split_email_addresses(line):
    '''Split a comma-separated address string into a frozenset.

    Each address is stripped of surrounding whitespace. A falsy input
    (None or an empty string) yields None instead of a set.
    '''
    if not line:
        return None
    return frozenset(address.strip() for address in line.split(','))
# Parse each raw message string into an email message object
messages = [email.message_from_string(raw) for raw in emails_df['message']]
emails_df.drop(columns='message', inplace=True)

# Add one column per header field; the header names of the first message
# are used as the column set for every row
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [doc[key] for doc in messages]

# Plain-text body of every message
emails_df['content'] = [get_text_from_email(doc) for doc in messages]

# Turn the comma-separated sender / recipient strings into sets of addresses
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)

# The first path component of 'file' becomes the 'user' column
emails_df['user'] = emails_df['file'].map(lambda path: path.split('/')[0])

# The parsed message objects are no longer needed
del messages

# Cut the sampled frame into one 25,000-row set and three 300-row sets,
# writing each to its own CSV file
set_1 = emails_df[:25000]
set_1.to_csv("emails_25000.csv")

set_2 = emails_df[25000:25300]
set_2.to_csv("emails_300_set_1.csv")

set_3 = emails_df[25300:25600]
set_3.to_csv("emails_300_set_2.csv")

set_4 = emails_df[25600:]
set_4.to_csv("emails_300_set_3.csv")

# Prototype e-mail importance ranker

In [1]:
import spacy
import pandas as pd
import sklearn.feature_extraction
import numpy as np

In [2]:
%%time
# Load the spaCy "en_core_web_md" pipeline; `nlp` is the callable used by the
# later cells to turn text into spaCy documents.
nlp = spacy.load("en_core_web_md")

Wall time: 7.42 s


In [3]:
%%time
stop_words = sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS

def clean(doc: str) -> str:
    """Normalise text: lower-case it, keep only letters and whitespace,
    and drop every word found in the module-level ``stop_words``.

    The remaining words are returned joined by single spaces.
    """
    lowered = doc.lower()
    letters_only = "".join(
        filter(lambda ch: ch.isalpha() or ch.isspace(), lowered)
    )
    kept_words = (
        token for token in letters_only.split() if token not in stop_words
    )
    return " ".join(kept_words)

# Read the labelled sheet (column names come from the sheet's second row,
# header=1) and keep only the text and label columns.
frame = pd.read_excel("Data/emails_300_set_3.xlsx", header=1)[["Content", "Important"]]

# Reference documents: the cleaned, spaCy-parsed text of every row labelled
# as important.
important_mask = frame["Important"] == True
important = [nlp(clean(text)) for text in frame.loc[important_mask, "Content"]]

Wall time: 757 ms


In [4]:
def score(doc: str) -> float:
    """Return the mean spaCy similarity between ``doc`` and every document
    in the module-level ``important`` list.

    NOTE(review): the reference documents in ``important`` are built from
    clean()-ed text, whereas ``doc`` is parsed as-is here. Confirm whether
    the asymmetry is intended.
    """
    parsed = nlp(doc)
    similarities = [parsed.similarity(reference) for reference in important]
    return np.mean(similarities)

In [5]:
%%time
frame = frame.assign(Score=[score(email) for email in frame["Content"]])

Wall time: 5.92 s


In [6]:
# Show the first ten rows (content, label and computed score) for a quick visual check.
frame.head(10)

Unnamed: 0,Content,Important,Score
0,Dear Louise and Greg:\r\r\n\r\r\nFortune magaz...,False,0.765342
1,"Hey guys,\r\r\n\r\r\nJust wanted to make known...",True,0.756735
2,"Dear Ken,\r\r\n\r\r\nI hope you are faring oka...",True,0.756855
3,\r\r\n<html>\r\r\n<head>\r\r\n<title>SmartRemi...,False,0.459179
4,There will be a Power Group Meeting this morni...,True,0.702207
5,---------------------- Forwarded by Kay Mann/C...,True,0.795505
6,---------------------- Forwarded by Tana Jones...,False,0.456868
7,\r\r\nAs things begin to calm down somewhat af...,True,0.753594
8,Have you guys looked into this or shall I? Le...,False,0.752637
9,Please note the following changes:\r\r\nAmeren...,False,0.643995
