In [1]:
import pandas as pd

In [2]:
enron = pd.read_csv('emails.csv')
enron_subset = enron[:10000]

In [3]:
enron_subset.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [4]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email

In [5]:
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body'),
        'to': map_to_list(emails, 'to'),
        'from_': map_to_list(emails, 'from')
    }

In [6]:
def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

In [7]:
email_df = pd.DataFrame(parse_into_emails(enron_subset.message))
print(email_df.head())

                                                body                       to  \
0                               Here is our forecast     tim.belden@enron.com   
1  Traveling to have a business meeting takes the...  john.lavorato@enron.com   
2                     test successful.  way to go!!!   leah.arsdall@enron.com   
3  Randy,Can you send me a schedule of the salary...    randall.gay@enron.com   
4                                                        greg.piper@enron.com   

                     from_  
0  phillip.allen@enron.com  
1  phillip.allen@enron.com  
2  phillip.allen@enron.com  
3  phillip.allen@enron.com  
4  phillip.allen@enron.com  


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [10]:
dtm = cv.fit_transform(email_df['body'])

In [11]:
dtm

<10000x46835 sparse matrix of type '<class 'numpy.int64'>'
	with 740294 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
LDA = LatentDirichletAllocation(n_components=6,random_state=42)

In [14]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=6, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [15]:
#Get vocabulary of words
len(cv.get_feature_names())

46835

In [16]:
cv.get_feature_names()[30000]

'notguarantee'

In [17]:
import random

In [18]:
for i in range(10):
    random_word_id = random.randint(0,35563)
    print(cv.get_feature_names()[random_word_id])

collar
gooddavid
projectplan
notwalking
realjukeboxfrom
1711
nicolaus
20tracking
attracted
20than


In [19]:
#Get the topics
len(LDA.components_)

6

In [20]:
LDA.components_

array([[2.46677559e+02, 8.19635893e+02, 8.16753436e+00, ...,
        1.67332173e-01, 1.66666670e-01, 1.66667180e-01],
       [6.20972616e+02, 5.69957668e+02, 1.66666819e-01, ...,
        1.66779018e-01, 1.66666669e-01, 3.16322357e+00],
       [1.82203888e+02, 2.08756214e+01, 1.66666937e-01, ...,
        1.66666746e-01, 4.16666665e+00, 1.66667332e-01],
       [1.43787038e+01, 3.62850804e+00, 1.66666851e-01, ...,
        1.66669001e-01, 1.66666670e-01, 1.66667140e-01],
       [3.20549666e+02, 1.90450250e+02, 1.69815410e-01, ...,
        1.67330337e-01, 1.66666669e-01, 1.70107905e-01],
       [1.46217567e+02, 3.26452059e+02, 4.16264963e+00, ...,
        2.16522272e+00, 1.66666668e-01, 1.66666877e-01]])

In [21]:
single_topic = LDA.components_[0]

In [22]:
#return index postions that would sort this array lowest to highest. Helps us find high probability words.
single_topic.argsort()

array([ 5439, 35370, 28768, ..., 38263, 19253, 40631])

In [23]:
# Word most representative of this topic
single_topic[5838]

0.16666666801557883

In [24]:
# Top 10 words for this topic - Index postions sorted from least to greatest
#So TOPTENVALUES wil be the last ten values
single_topic.argsort()[-10:] #<- Grab last ten values of the result of argsort

array([ 1274, 10030, 29363,  3210, 36056, 43176,  6151, 38263, 19253,
       40631])

In [25]:
top_word_indices = single_topic.argsort()[-10:]

In [26]:
for index in top_word_indices:
    print(cv.get_feature_names()[index])

20
br
nbsp
3d
right
tr
align
size
font
td


In [27]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['class', 'phillip', 'table', '000', 'left', '20', 'br', 'nbsp', '3d', 'right', 'tr', 'align', 'size', 'font', 'td']


THE TOP 15 WORDS FOR TOPIC #1
['jones', 'price', 'company', 'california', 'natural', 'year', 'market', 'electricity', 'enron', 'prices', 'state', 'energy', 'said', 'power', 'gas']


THE TOP 15 WORDS FOR TOPIC #2
['hr', 'pdx', 'enron_development', 'john', '01', 'forwarded', 'allen', 'phillip', 'enronxgate', 'na', 'ees', 'corp', 'hou', 'enron', 'ect']


THE TOP 15 WORDS FOR TOPIC #3
['robert', 'larry', 'smith', 'jennifer', 'ebs', 'chris', 'mark', 'jeff', 'mike', 'david', 'ees', 'mail', 'john', 'enron', 'com']


THE TOP 15 WORDS FOR TOPIC #4
['713', '2001', 'receive', 'buy', 'caiso', 'information', 'offer', 'company', 'original', 'mail', 'com', 'email', 'message', 'image', '09']


THE TOP 15 WORDS FOR TOPIC #5
['think', 'stock', 'like', 'energy', 'year', 'business', 'trading', 'week', 'time', 'know', 'market', 'new', 'said', 'company', 'enron

In [36]:
dtm

<10000x46835 sparse matrix of type '<class 'numpy.int64'>'
	with 740294 stored elements in Compressed Sparse Row format>

In [38]:
email_df['body']

0                                    Here is our forecast
1       Traveling to have a business meeting takes the...
2                          test successful.  way to go!!!
3       Randy,Can you send me a schedule of the salary...
4                                                        
5       Greg,How about either next Tuesday or Thursday...
6       Phillip Allen (pallen@enron.com)Mike Grigsby (...
7                                                        
8       I don't think these are required by the ISP2. ...
9       ---------------------- Forwarded by Phillip K ...
10      Mr. Buckner,For delivered gas behind San Diego...
11      Lucy,Open them and save in the rentroll folder...
12      ---------------------- Forwarded by Phillip K ...
13      ---------------------- Forwarded by Phillip K ...
14      Dave,Here are the names of the west desk membe...
15                        Paula,35 million is finePhillip
16      ---------------------- Forwarded by Phillip K ...
17      Tim,mi

In [39]:
topic_results = LDA.transform(dtm)

In [45]:
topic_results

array([[0.08333336, 0.58254894, 0.08380941, 0.08333336, 0.08334494,
        0.08362998],
       [0.15746084, 0.0027566 , 0.06011277, 0.00274561, 0.00275168,
        0.7741725 ],
       [0.04183417, 0.0417765 , 0.04189216, 0.04183605, 0.04183928,
        0.79082184],
       ...,
       [0.02095725, 0.0209905 , 0.02110403, 0.46207704, 0.0210681 ,
        0.45380309],
       [0.04185273, 0.04184527, 0.48176653, 0.04225083, 0.04207714,
        0.35020749],
       [0.00982415, 0.00987073, 0.00983427, 0.00984988, 0.0098495 ,
        0.95077147]])

In [50]:
topic_results[0].round(2) #results show this topic of having highest probability of belonging to topic #1

array([0.08, 0.58, 0.08, 0.08, 0.08, 0.08])

In [58]:
email_df['body'][0]

'Here is our forecast'

In [61]:
email_df['Topic'] = topic_results.argmax(axis=1)

In [62]:
email_df

Unnamed: 0,body,to,from_,Topic
0,Here is our forecast,tim.belden@enron.com,phillip.allen@enron.com,1
1,Traveling to have a business meeting takes the...,john.lavorato@enron.com,phillip.allen@enron.com,5
2,test successful. way to go!!!,leah.arsdall@enron.com,phillip.allen@enron.com,5
3,"Randy,Can you send me a schedule of the salary...",randall.gay@enron.com,phillip.allen@enron.com,5
4,,greg.piper@enron.com,phillip.allen@enron.com,0
5,"Greg,How about either next Tuesday or Thursday...",greg.piper@enron.com,phillip.allen@enron.com,0
6,Phillip Allen (pallen@enron.com)Mike Grigsby (...,"david.l.johnson@enron.com, john.shafer@enron.com",phillip.allen@enron.com,3
7,,joyce.teixeira@enron.com,phillip.allen@enron.com,0
8,I don't think these are required by the ISP2. ...,mark.scott@enron.com,phillip.allen@enron.com,5
9,---------------------- Forwarded by Phillip K ...,"""'Pallen@Enron.com'"" <Pallen@Enron.com>",phillip.allen@enron.com,1
