# Enron emails

## Import files

In [80]:
# from kaggle.api.kaggle_api_extended import KaggleApi
# api = KaggleApi()
# api.authenticate()
# api.dataset_download_files("wcukierski/enron-email-dataset")

In [81]:
import pandas as pd
import numpy as np
import pymongo

In [82]:
emails_df = pd.read_csv("emails.csv", nrows=78000)

In [83]:
print(emails_df.shape)
emails_df.head(3)

(78000, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...


In [84]:
print(emails_df['message'][3])

Message-ID: <13505866.1075863688222.JavaMail.evans@thyme>
Date: Mon, 23 Oct 2000 06:13:00 -0700 (PDT)
From: phillip.allen@enron.com
To: randall.gay@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Randall L Gay
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail
X-Origin: Allen-P
X-FileName: pallen.nsf

Randy,

 Can you send me a schedule of the salary and level of everyone in the 
scheduling group.  Plus your thoughts on any changes that need to be made.  
(Patti S for example)

Phillip


### A few figures

In [85]:
print("Number of 'na' values' in dataset messages:")
print(pd.isna(emails_df["message"]).sum())

print("Number of duplicates in dataset messages:")
print(emails_df.duplicated(subset="message").sum())

print("Average email length (medata included)")
print(emails_df["message"].apply(len).mean())
print("Median email length (medata included)")
print(emails_df["message"].apply(len).median())


Number of 'na' values' in dataset messages:
0
Number of duplicates in dataset messages:
0
Average email length (medata included)
3501.264205128205
Median email length (medata included)
1632.0


## Injection into MongoDB

In [86]:
client = pymongo.MongoClient(host="127.0.0.1",
                     port=27017)

db = client["simplon"]
col = db["raw_mail"]
col.drop() #in case we run the cell several times

In [87]:
# make a Python list with all items
mongo_docs = [{"filename": emails_df["file"].iloc[row],
               "message": emails_df["message"].iloc[row]} for row in range(emails_df.shape[0])]

# mongo_docs[0]

In [88]:
# insert the list of documents into the Mongo DB
result = col.insert_many(mongo_docs)
print(len(result.inserted_ids), "inserted documents.")
print("An example:", col.find_one())

78000 inserted documents.
An example: {'_id': ObjectId('5e8595942633d035f648cbac'), 'filename': 'allen-p/_sent_mail/1.', 'message': "Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "}


## A little bit of cleaning

(We do the cleaning on the dataframe)

### Pure Python - didn't work

In [89]:
def get_text(row_idx):
    '''Returns the content from an email string object'''
    filename = emails_df["file"].iloc[row_idx]
    msg = emails_df["message"].iloc[row_idx]
    msg_list = msg.split("\n")
    dic = {"filename": filename}
    
    for row in range(len(msg_list)):
        str_row = msg_list[row]

        if str_row != "":
            sep = str_row.find(": ")
            k = str_row[:sep]
            v = str_row[sep+2 :]
            if v != "":
                dic[k] = v
        else:
            dic["body"] = "\n".join(msg_list[row + 1:-1])
            break
    return dic

# test example
#print(get_text(0).keys())

# Check if all the format is correct
def keys_ok(any_dic):
    for key in any_dic.keys():
        if key.find(".") != -1:
#            print(any_dic)
            return False
    return True

all([keys_ok(get_text(i)) for i in range(emails_df.shape[0])])

False

### Regex cleaning

In [90]:
import re

In [91]:
# rappel
# print(emails_df['message'][101])

In [93]:
def msg_to_dic(cmplte_msg):
    meta_and_body = re.split(r'\n\n', cmplte_msg, maxsplit=1)
    meta = meta_and_body[0]
    body = meta_and_body[1]
    meta_keys = re.findall(r'(.+): .+', meta, flags=re.MULTILINE)
    meta_vals = re.findall(r'.+: (.+)', meta, flags=re.MULTILINE)

    dic = dict(zip(meta_keys, meta_vals))
    dic["body"] = body
    return dic

# msg_to_dic(emails_df['message'][77376])

In [99]:
col2 = db["parsed_mail"]

# Erase the collection's record if needed - we should insert into a blank database
col2.drop()

# create the list of documents to be created
mongo_docs = [{"filename": emails_df["file"][row],
#               "message": msg_to_dic(emails_df["message"][row])} for row in range(len(emails_df))]
               "message": msg_to_dic(emails_df["message"][row])} for row in range(2000)]

In [100]:
# insert the list of documents into the Mongo DB
result = col2.insert_many(mongo_docs)
print(len(result.inserted_ids), "inserted documents.")

2000 inserted documents.


In [101]:
print("An example:", col2.find_one())

An example: {'_id': ObjectId('5e859a132633d035f64a042c'), 'filename': 'allen-p/_sent_mail/1.', 'message': {'Message-ID': '<18782981.1075855378110.JavaMail.evans@thyme>', 'Date': 'Mon, 14 May 2001 16:39:00 -0700 (PDT)', 'From': 'phillip.allen@enron.com', 'To': 'tim.belden@enron.com', 'Mime-Version': '1.0', 'Content-Type': 'text/plain; charset=us-ascii', 'Content-Transfer-Encoding': '7bit', 'X-From': 'Phillip K Allen', 'X-To': 'Tim Belden <Tim Belden/Enron@EnronXGate>', 'X-Folder': "\\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail", 'X-Origin': 'Allen-P', 'X-FileName': 'pallen (Non-Privileged).pst', 'body': 'Here is our forecast\n\n '}}


Seems to be working! :-)

## Querying MongoDB in Python

In [102]:
import pprint

In [105]:
pprint.pprint(col2.find_one({"filename": "allen-p/_sent_mail/1."}))

# to get all
# for mail in col2.find():
#     pprint.pprint(mail)

{'_id': ObjectId('5e859a132633d035f64a042c'),
 'filename': 'allen-p/_sent_mail/1.',
 'message': {'Content-Transfer-Encoding': '7bit',
             'Content-Type': 'text/plain; charset=us-ascii',
             'Date': 'Mon, 14 May 2001 16:39:00 -0700 (PDT)',
             'From': 'phillip.allen@enron.com',
             'Message-ID': '<18782981.1075855378110.JavaMail.evans@thyme>',
             'Mime-Version': '1.0',
             'To': 'tim.belden@enron.com',
             'X-FileName': 'pallen (Non-Privileged).pst',
             'X-Folder': "\\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent "
                         'Mail',
             'X-From': 'Phillip K Allen',
             'X-Origin': 'Allen-P',
             'X-To': 'Tim Belden <Tim Belden/Enron@EnronXGate>',
             'body': 'Here is our forecast\n\n '}}
