In [2]:
import os, sys, email
import json
import time
import datetime

import numpy as np
import pandas as pd

from tqdm import tqdm

In [23]:
TOPIC_KEYWORD = 'email'
POSTS_THRESHOLD = 0 # involve all people
LINKS_THRESHOLD = 20
DEGREE_THRESHOLD = 5
HIGH_CONTRIBUTION_THRESHOLD = 60

# DB_CONNECT_STRING = 'mysql+pymysql://root:Initial0@10.58.78.253:3306/nexus?charset=utf8mb4'

# engine = create_engine(DB_CONNECT_STRING, max_overflow=5)

## 1. Loading dataset and pre-processing

In [24]:
# Read the data into a DataFrame
df_emails = pd.read_csv(filepath_or_buffer='./data/emails.csv')
print(df_emails.shape)
df_emails.head()

(517401, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [25]:
# A single message looks like this
print(df_emails['message'][0])

Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 


## 2. Extract email message

In [26]:
## Helper functions
def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    return ''.join(parts)

def split_email_addresses(line):
    '''To separate multiple email addresses'''
    if line:
        addrs = line.split(',')
        addrs = frozenset(map(lambda x: x.strip(), addrs))
    else:
        addrs = None
    return addrs

In [27]:
# Parse the emails into a list email objects
messages = list(map(email.message_from_string, df_emails['message']))
df_emails.drop('message', axis=1, inplace=True)

# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
    df_emails[key] = [doc[key] for doc in messages]
    
# Parse content from emails
df_emails['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
df_emails['From'] = df_emails['From'].map(split_email_addresses)
df_emails['To'] = df_emails['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
df_emails['user'] = df_emails['file'].map(lambda x:x.split('/')[0])
del messages

df_emails.head()

Unnamed: 0,file,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,allen-p/_sent_mail/1.,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",(phillip.allen@enron.com),(tim.belden@enron.com),,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p
1,allen-p/_sent_mail/10.,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",(phillip.allen@enron.com),(john.lavorato@enron.com),Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p
2,allen-p/_sent_mail/100.,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",(phillip.allen@enron.com),(leah.arsdall@enron.com),Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p
3,allen-p/_sent_mail/1000.,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",(phillip.allen@enron.com),(randall.gay@enron.com),,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,allen-p/_sent_mail/1001.,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",(phillip.allen@enron.com),(greg.piper@enron.com),Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p


In [28]:
print('shape of the dataframe:', df_emails.shape)
# Find number of unique values in each columns
for col in df_emails.columns:
    print(col, df_emails[col].nunique())

shape of the dataframe: (517401, 18)
file 517401
Message-ID 517401
Date 224128
From 20328
To 54748
Subject 159290
Mime-Version 1
Content-Type 2
Content-Transfer-Encoding 3
X-From 27980
X-To 73552
X-cc 33701
X-bcc 132
X-Folder 5335
X-Origin 259
X-FileName 429
content 249025
user 150


In [29]:
# Set index and drop columns with two few values
df_emails = df_emails.set_index('Message-ID').drop(['file', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding'], axis=1)
# Parse datetime
df_emails['Date'] = pd.to_datetime(df_emails['Date'], infer_datetime_format=True)
df_emails.dtypes

Date          datetime64[ns]
From                  object
To                    object
Subject               object
X-From                object
X-To                  object
X-cc                  object
X-bcc                 object
X-Folder              object
X-Origin              object
X-FileName            object
content               object
user                  object
dtype: object

In [30]:
df_emails = df_emails.drop(['content'], axis=1)
df_emails.to_csv(path_or_buf=f'./data/email-preprocessed-without-content.csv', encoding='utf-8')

In [31]:
df_emails.head()

Unnamed: 0_level_0,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,user
Message-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 23:39:00,(phillip.allen@enron.com),(tim.belden@enron.com),,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,allen-p
<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 20:51:00,(phillip.allen@enron.com),(john.lavorato@enron.com),Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,allen-p
<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 10:00:00,(phillip.allen@enron.com),(leah.arsdall@enron.com),Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,allen-p
<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 13:13:00,(phillip.allen@enron.com),(randall.gay@enron.com),,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,allen-p
<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 12:07:00,(phillip.allen@enron.com),(greg.piper@enron.com),Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,allen-p


## 3. Analyze the from-to relationship of users

In [32]:
# ignore the useless columns
df_emails = df_emails.drop(['Subject','X-From','X-To','X-cc','X-bcc','X-Folder','X-Origin','X-FileName'], axis=1)

In [33]:
# # remove From or To empty rows 
# df_emails['From'] = df_emails['From'].fillna('-1')
# df_emails['To'] = df_emails['To'].fillna('-1')

# empty_row_indexs = df_emails[(df_emails['To'] == '-1')].index.tolist()

In [34]:
df_emails.head()

Unnamed: 0_level_0,Date,From,To,user
Message-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 23:39:00,(phillip.allen@enron.com),(tim.belden@enron.com),allen-p
<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 20:51:00,(phillip.allen@enron.com),(john.lavorato@enron.com),allen-p
<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 10:00:00,(phillip.allen@enron.com),(leah.arsdall@enron.com),allen-p
<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 13:13:00,(phillip.allen@enron.com),(randall.gay@enron.com),allen-p
<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 12:07:00,(phillip.allen@enron.com),(greg.piper@enron.com),allen-p


In [35]:
def compare_email_with_user(email, username):
    if email.endswith('@enron.com'):
        displayname = email.split('@')[0].lower().split('.')
        if len(displayname) == 2:
            return username.startswith(displayname[1]) and username.endswith(displayname[0][0])

    return False

In [36]:
dict_email_user_count = {}
for i, row in tqdm(df_emails.iterrows()):
    key = list(row.From)[0]
    if compare_email_with_user(key, row.user):
        if key not in dict_email_user_count:
            dict_email_user_count[key] = (row.user, 1)
        else:
            dict_email_user_count[key] = (row.user, dict_email_user_count[key][1] + 1)

517401it [00:47, 11002.00it/s]


In [37]:
dict_email_user_count

{'phillip.allen@enron.com': ('allen-p', 2125),
 'john.arnold@enron.com': ('arnold-j', 3491),
 'harry.arora@enron.com': ('arora-h', 97),
 'robert.badeer@enron.com': ('badeer-r', 193),
 'susan.bailey@enron.com': ('bailey-s', 143),
 'eric.bass@enron.com': ('bass-e', 5122),
 'don.baughman@enron.com': ('baughman-d', 164),
 'sally.beck@enron.com': ('beck-s', 4234),
 'steve.beck@enron.com': ('beck-s', 9),
 'robert.benson@enron.com': ('benson-r', 41),
 'lynn.blair@enron.com': ('blair-l', 1112),
 'sandra.brawner@enron.com': ('brawner-s', 480),
 'rick.buy@enron.com': ('buy-r', 1008),
 'larry.campbell@enron.com': ('campbell-l', 1368),
 'mike.carson@enron.com': ('carson-m', 719),
 'michelle.cash@enron.com': ('cash-m', 1749),
 'monika.causholli@enron.com': ('causholli-m', 487),
 'shelley.corman@enron.com': ('corman-s', 639),
 'martin.cuilla@enron.com': ('cuilla-m', 247),
 'jeff.dasovich@enron.com': ('dasovich-j', 9423),
 'dana.davis@enron.com': ('davis-d', 886),
 'clint.dean@enron.com': ('dean-c', 

In [38]:
list_users = []
for k, v in dict_email_user_count.items():
    list_users.append(v[0])

list_no_exist_users = []
for i, row in tqdm(df_emails.iterrows()):
    if row.user not in set(list_users):
        list_no_exist_users.append(row.user)
        
print(set(list_no_exist_users))
    

517401it [00:37, 13880.49it/s]

{'whalley-l', 'crandell-s', 'gilbertsmith-d', 'stclair-c', 'rodrique-r', 'phanis-s', 'lucci-p', 'ybarbo-p', 'williams-w3'}





In [39]:
with open("./data/dict_email_user_count.json",'w',encoding='utf-8') as json_file:
    json.dump(dict_email_user_count,json_file,ensure_ascii=False)

## 4. generate the user profile

In [3]:
# Read the data into a DataFrame
df_emails = pd.read_csv(filepath_or_buffer='./data/email-preprocessed-without-content.csv')
print(df_emails.shape)
df_emails.head()

(517401, 13)


Unnamed: 0,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,user
0,<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 23:39:00,frozenset({'phillip.allen@enron.com'}),frozenset({'tim.belden@enron.com'}),,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,allen-p
1,<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 20:51:00,frozenset({'phillip.allen@enron.com'}),frozenset({'john.lavorato@enron.com'}),Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,allen-p
2,<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 10:00:00,frozenset({'phillip.allen@enron.com'}),frozenset({'leah.arsdall@enron.com'}),Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,allen-p
3,<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 13:13:00,frozenset({'phillip.allen@enron.com'}),frozenset({'randall.gay@enron.com'}),,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,allen-p
4,<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 12:07:00,frozenset({'phillip.allen@enron.com'}),frozenset({'greg.piper@enron.com'}),Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,allen-p


In [5]:
dict_profile = {}
for i,row in tqdm(df_emails.iterrows()):
    subject_word_count = len(str(row.Subject).split(' ')) if not pd.isnull(row.Subject) else 0
    
    if row.user in dict_profile:
        dict_profile[row.user] = (dict_profile[row.user][0]+1, dict_profile[row.user][1]+subject_word_count)
    else:
        dict_profile[row.user] = (1, subject_word_count)


0it [00:00, ?it/s][A
689it [00:00, 6827.95it/s][A
1557it [00:00, 7765.62it/s][A
2323it [00:00, 7727.91it/s][A
3199it [00:00, 7974.48it/s][A
4047it [00:00, 8073.46it/s][A
4886it [00:00, 8130.10it/s][A
5690it [00:00, 8118.84it/s][A
6541it [00:00, 8165.26it/s][A
7412it [00:00, 8220.17it/s][A
8270it [00:01, 8259.99it/s][A
9112it [00:01, 8270.32it/s][A
9942it [00:01, 8257.32it/s][A
10765it [00:01, 8244.66it/s][A
11584it [00:01, 8236.68it/s][A
12402it [00:01, 8222.48it/s][A
13216it [00:01, 8166.64it/s][A
14033it [00:01, 8166.54it/s][A
14835it [00:01, 8157.64it/s][A
15636it [00:01, 8149.31it/s][A
16436it [00:02, 8139.85it/s][A
17241it [00:02, 8136.04it/s][A
18081it [00:02, 8147.85it/s][A
18894it [00:02, 8145.73it/s][A
19707it [00:02, 8130.53it/s][A
20510it [00:02, 8124.41it/s][A
21341it [00:02, 8131.60it/s][A
22151it [00:02, 8129.48it/s][A
22961it [00:02, 8128.78it/s][A
23787it [00:02, 8132.13it/s][A
24602it [00:03, 8127.14it/s][A
25412it [00:03, 8120.41it/s][

In [6]:
dict_profile

{'allen-p': (3034, 10225),
 'arnold-j': (4898, 16726),
 'arora-h': (654, 3091),
 'badeer-r': (877, 5555),
 'bailey-s': (478, 2225),
 'bass-e': (7823, 26237),
 'baughman-d': (2760, 13798),
 'beck-s': (11830, 52357),
 'benson-r': (767, 3959),
 'blair-l': (3415, 17791),
 'brawner-s': (1026, 4387),
 'buy-r': (2429, 11528),
 'campbell-l': (6490, 35157),
 'carson-m': (1400, 5391),
 'cash-m': (2969, 13642),
 'causholli-m': (943, 4388),
 'corman-s': (2025, 9338),
 'crandell-s': (519, 2748),
 'cuilla-m': (1029, 5551),
 'dasovich-j': (28234, 148728),
 'davis-d': (2249, 9168),
 'dean-c': (2429, 12914),
 'delainey-d': (3566, 12577),
 'derrick-j': (1766, 8812),
 'dickson-s': (395, 1490),
 'donoho-l': (1045, 5238),
 'donohoe-t': (1015, 5794),
 'dorland-c': (2127, 6367),
 'ermis-f': (1230, 7553),
 'farmer-d': (13032, 63159),
 'fischer-m': (1498, 6732),
 'forney-j': (729, 2835),
 'fossum-d': (4796, 20464),
 'gang-l': (590, 2555),
 'gay-r': (1415, 5639),
 'geaccone-t': (1592, 7637),
 'germany-c': (1243

In [7]:
with open("./data/dict_profile_count_subject.json",'w',encoding='utf-8') as json_file:
    json.dump(dict_profile,json_file,ensure_ascii=False)