### Summary
1. Task: Build a person of interest identifier
2. 先整理:
    * message 斷句 "\n" --> 存成不同的 columns
    * 特別看 subject 內容，找 frequency


### Tips
1. 先用少部分的 data 做測試，然後再擴展到整個 dataset。
   e.g. 先執行 `data = pd.read_csv("file path", chunksize=500)`，再用 `df = next(data)` 取出第一個 chunk。
2. 學習到怎麼處理 text data
    * 把一團 message 分成幾個 columns 儲存
    * 再深度地把那些 columns 的資料分別清理出有用的資料
    * 怎麼寫成一些 functions 去集體處理上述步驟?

In [90]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [91]:
# Read the CSV lazily in 500-row chunks and keep only the first chunk as a small working sample.
chunk = pd.read_csv("/Users/eve/Desktop/Datasets/Eron_Email_Dataset/emails.csv", chunksize = 500)
data = next(chunk)
data.head(3)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...


In [92]:
# Summarize the column dtypes and non-null counts of the sample.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
file       500 non-null object
message    500 non-null object
dtypes: object(2)
memory usage: 7.9+ KB


In [93]:
# Peek at the `file` value of the first row.
data.iloc[0]["file"]

'allen-p/_sent_mail/1.'

In [94]:
# Keep the raw text of the first message so its layout can be inspected.
test = data.iloc[0]['message']
test

"Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "

In [95]:
# Split the raw message on newlines: each header ends up on its own line.
test.split("\n")

['Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>',
 'Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)',
 'From: phillip.allen@enron.com',
 'To: tim.belden@enron.com',
 'Subject: ',
 'Mime-Version: 1.0',
 'Content-Type: text/plain; charset=us-ascii',
 'Content-Transfer-Encoding: 7bit',
 'X-From: Phillip K Allen',
 'X-To: Tim Belden <Tim Belden/Enron@EnronXGate>',
 'X-cc: ',
 'X-bcc: ',
 "X-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail",
 'X-Origin: Allen-P',
 'X-FileName: pallen (Non-Privileged).pst',
 '',
 'Here is our forecast',
 '',
 ' ']

### 取出 message 內容
Q1: 怎麼只取出 message content?

#### My method

In [96]:
# Split every raw message into its individual lines (one list of lines per email).
# Iterating the column directly avoids an index loop with a per-row `data.iloc[i]` lookup.
empty_list = [raw_message.split("\n") for raw_message in data["message"]]

In [97]:
# One row per email and one column per message line; emails with fewer lines
# leave the trailing columns empty.
message = pd.DataFrame(empty_list)
message.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,265,266,267,268,269,270,271,272,273,274
0,Message-ID: <18782981.1075855378110.JavaMail.e...,"Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)",From: phillip.allen@enron.com,To: tim.belden@enron.com,Subject:,Mime-Version: 1.0,Content-Type: text/plain; charset=us-ascii,Content-Transfer-Encoding: 7bit,X-From: Phillip K Allen,X-To: Tim Belden <Tim Belden/Enron@EnronXGate>,...,,,,,,,,,,
1,Message-ID: <15464986.1075855378456.JavaMail.e...,"Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)",From: phillip.allen@enron.com,To: john.lavorato@enron.com,Subject: Re:,Mime-Version: 1.0,Content-Type: text/plain; charset=us-ascii,Content-Transfer-Encoding: 7bit,X-From: Phillip K Allen,X-To: John J Lavorato <John J Lavorato/ENRON@e...,...,,,,,,,,,,
2,Message-ID: <24216240.1075855687451.JavaMail.e...,"Date: Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",From: phillip.allen@enron.com,To: leah.arsdall@enron.com,Subject: Re: test,Mime-Version: 1.0,Content-Type: text/plain; charset=us-ascii,Content-Transfer-Encoding: 7bit,X-From: Phillip K Allen,X-To: Leah Van Arsdall,...,,,,,,,,,,


#### Someone else's method
https://www.kaggle.com/jamestollefson/enron-network-analysis

In [98]:
# Observation: the message body mostly appears at the fifteenth row (after splitting on "\n").
# NOTE: this rebinds `test` to the chunked CSV reader and reloads the first 500 rows into `data`.
test = pd.read_csv("/Users/eve/Desktop/Datasets/Eron_Email_Dataset/emails.csv", chunksize = 500)
data = next(test)
data.head(2)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...


In [99]:
# Drop the messages that do not contain 'Message-ID: ', 'Date: ', 'From: ', 'To: ', 'Subject: '.
def standard_format(df, Series, string, slicer):
    """Drop rows whose message lacks an expected marker on a given line.

    Each message in ``Series`` is split on newlines and the line at position
    ``slicer`` is checked for the substring ``string``. Rows failing the check
    are removed from ``df`` by position. A message with too few lines to have
    a line at ``slicer`` counts as failing the check (instead of raising
    ``IndexError``).

    Args:
        df: DataFrame to filter.
        Series: message strings, positionally aligned with the rows of ``df``.
        string: substring that must occur in the inspected line.
        slicer: position of the line to inspect after splitting on newlines.

    Returns:
        ``df`` without the offending rows. The index is NOT reset, so
        reindex afterwards if a contiguous index is needed.
    """
    rows = []
    for row, message in enumerate(Series):
        message_lines = message.split('\n')
        try:
            inspected_line = message_lines[slicer]
        except IndexError:
            # Message is too short to even have this line: treat as malformed.
            inspected_line = None
        if inspected_line is None or string not in inspected_line:
            rows.append(row)
    # `rows` holds positions, so translate them to index labels before dropping.
    return df.drop(df.index[rows])

In [101]:
# Remember the starting size so the share of discarded emails can be reported.
x = len(data.index)
headers = ['Message-ID: ', 'Date: ', 'From: ', 'To: ', 'Subject: ']
# Header number i is expected on line i of every message; filter once per header.
for i, v in enumerate(headers):
    data = standard_format(data, data.message, v, i)
data = data.reset_index()
removed = x - len(data.index)
removed_pct = np.round((removed / x) * 100, decimals=2)
summary = "Got rid of {} useless emails! That's {}% of the total number of messages in this dataset."
print(summary.format(removed, removed_pct))

Got rid of 11 useless emails! That's 2.2% of the total number of messages in this dataset.


In [102]:
def get_text(Series, row_num_slicer):
    """Return the trailing lines of every message as a Series of lists.

    Each message is split on newlines and its first ``row_num_slicer`` lines
    are discarded; what remains (possibly an empty list) is stored as a list
    of strings.

    The result is built in a single pass rather than by assigning lists one
    element at a time into a pre-allocated Series created without a dtype,
    which depends on version-specific dtype defaults and upcasting.

    Args:
        Series: pandas Series of message strings.
        row_num_slicer: number of leading lines to skip in every message.

    Returns:
        A Series with the same index as ``Series`` whose values are lists of
        the remaining lines.
    """
    result = Series.map(lambda message: message.split('\n')[row_num_slicer:])
    # The original result was unnamed; do not leak the input column's name.
    result.name = None
    return result

# Everything from line position 15 onward is kept as the message body.
data['text'] = get_text(data.message, 15)
data.head(3)

Unnamed: 0,index,file,message,text
0,0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,"[, Here is our forecast, , ]"
1,1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,"[, Traveling to have a business meeting takes ..."
2,2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,"[, test successful. way to go!!!]"


In [103]:
def get_row(Series, row_num):
    """Return one specific line from every message.

    Each message is split on newlines and the line at position ``row_num`` is
    picked. A message that has no such line yields a missing value (``None``)
    instead of raising ``IndexError``.

    The result is created in one pass with an explicit ``object`` dtype
    rather than by filling a Series created without a dtype element by
    element.

    Args:
        Series: pandas Series of message strings.
        row_num: position of the wanted line after splitting on newlines.

    Returns:
        A Series with the same index as ``Series`` holding the selected line
        of each message.
    """
    picked = []
    for message in Series:
        message_lines = message.split('\n')
        try:
            picked.append(message_lines[row_num])
        except IndexError:
            # Message too short to contain the requested line.
            picked.append(None)
    return pd.Series(picked, index=Series.index, dtype=object)

# Line position 1 of each message holds the 'Date: ...' header.
data['date'] = get_row(data.message, 1)
data.head(2)

Unnamed: 0,index,file,message,text,date
0,0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,"[, Here is our forecast, , ]","Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)"
1,1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,"[, Traveling to have a business meeting takes ...","Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)"


In [104]:
# Line positions 2, 3 and 4 hold the 'From: ', 'To: ' and 'Subject: ' headers respectively.
data['senders'] = get_row(data.message, 2)
data['recipients'] = get_row(data.message, 3)
data['subject'] = get_row(data.message, 4)

data.head(2)

Unnamed: 0,index,file,message,text,date,senders,recipients,subject
0,0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,"[, Here is our forecast, , ]","Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)",From: phillip.allen@enron.com,To: tim.belden@enron.com,Subject:
1,1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,"[, Traveling to have a business meeting takes ...","Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)",From: phillip.allen@enron.com,To: john.lavorato@enron.com,Subject: Re:


In [105]:
# Strip the header label, then parse the remaining text into timestamps.
# The date strings embed a UTC offset (e.g. -0700) and may differ between emails, so
# parse with utc=True to normalise every value to UTC instead of relying on
# version-dependent handling of mixed offsets; tz_convert(None) then drops the
# timezone so the column holds naive UTC timestamps.
data.date = data.date.str.replace('Date: ', '')
data.date = pd.to_datetime(data.date, utc=True).dt.tz_convert(None)

data.head(2)

Unnamed: 0,index,file,message,text,date,senders,recipients,subject
0,0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,"[, Here is our forecast, , ]",2001-05-14 23:39:00,From: phillip.allen@enron.com,To: tim.belden@enron.com,Subject:
1,1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,"[, Traveling to have a business meeting takes ...",2001-05-04 20:51:00,From: phillip.allen@enron.com,To: john.lavorato@enron.com,Subject: Re:


In [106]:
# Remove the 'Subject: ' label so that only the subject text itself is kept.
subject_text = data['subject'].str.replace('Subject: ', '')
data['subject'] = subject_text

data.head(2)

Unnamed: 0,index,file,message,text,date,senders,recipients,subject
0,0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,"[, Here is our forecast, , ]",2001-05-14 23:39:00,From: phillip.allen@enron.com,To: tim.belden@enron.com,
1,1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,"[, Traveling to have a business meeting takes ...",2001-05-04 20:51:00,From: phillip.allen@enron.com,To: john.lavorato@enron.com,Re:


In [108]:
import re

def get_address(df, Series, num_cols=1):
    """Extract up to three email addresses from each string in ``Series``.

    Every entry of ``Series`` is scanned once for email addresses. The first
    address always goes to the first result; the second and third addresses
    are only reported when ``num_cols`` asks for them (``>= 2`` and ``>= 3``).
    Entries that are not strings, or that do not contain enough addresses,
    produce a missing value (NaN) in the corresponding result.

    Fixes over the previous implementation:
      * each message is scanned once (no nested re-scan of the whole Series
        for every row);
      * the n-th address is stored whenever at least n addresses were found
        (previously the 2nd needed three matches and the 3rd needed four);
      * rows without any address no longer raise ``IndexError``;
      * results are aligned by position, so ``df`` may have any index.

    Args:
        df: DataFrame whose index is used for the returned Series.
        Series: strings to search; must have as many entries as ``df`` has rows.
        num_cols: how many address columns to fill (1, 2 or 3).

    Returns:
        A tuple ``(result1, result2, result3)`` of object-dtype Series indexed
        like ``df``.
    """
    address = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    missing = float('nan')
    columns = ([], [], [])
    for message in Series:
        found = address.findall(message) if isinstance(message, str) else []
        for position, column in enumerate(columns):
            # The first column is always filled; later ones only on request.
            wanted = position == 0 or num_cols > position
            if wanted and len(found) > position:
                column.append(found[position])
            else:
                column.append(missing)
    result1, result2, result3 = (
        pd.Series(column, index=df.index, dtype=object) for column in columns
    )
    return result1, result2, result3

# Pull up to three recipient addresses out of the 'To: ' line of every email.
data['recipient1'], data['recipient2'], data['recipient3'] = get_address(data, data.recipients, num_cols=3)
# Only the first address of the 'From: ' line is kept; x and y receive the unused
# second and third result Series (note that this rebinds `x`).
data['sender'], x, y = get_address(data, data.senders)

data.head(2)

Unnamed: 0,index,file,message,text,date,senders,recipients,subject,recipient1,recipient2,recipient3,sender
0,0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,"[, Here is our forecast, , ]",2001-05-14 23:39:00,From: phillip.allen@enron.com,To: tim.belden@enron.com,,tim.belden@enron.com,,,phillip.allen@enron.com
1,1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,"[, Traveling to have a business meeting takes ...",2001-05-04 20:51:00,From: phillip.allen@enron.com,To: john.lavorato@enron.com,Re:,john.lavorato@enron.com,,,phillip.allen@enron.com


In [109]:
# Discard the raw and intermediate columns now that the parsed fields exist.
for obsolete_column in ('recipients', 'senders', 'file', 'message'):
    del data[obsolete_column]

# Keep only the parsed fields, in presentation order.
final_columns = ['date', 'sender', 'recipient1', 'recipient2', 'recipient3', 'subject', 'text']
data = data[final_columns]
data.head(2)

Unnamed: 0,date,sender,recipient1,recipient2,recipient3,subject,text
0,2001-05-14 23:39:00,phillip.allen@enron.com,tim.belden@enron.com,,,,"[, Here is our forecast, , ]"
1,2001-05-04 20:51:00,phillip.allen@enron.com,john.lavorato@enron.com,,,Re:,"[, Traveling to have a business meeting takes ..."
