In [4]:
#Necessary imports
import numpy as np
import pandas as pd
import email
from datetime import datetime
import spacy
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from nltk import sent_tokenize
import networkx as nx
import os.path
from os import path

In [5]:
#Assigning global variables and declaring lists
#Loading Spacy English Model for tokenization and Page Rank
try:
    spacy.load('en_core_web_sm')
except:
    !python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
textrank = spacy.load('en_core_web_sm')
punctuations = string.punctuation
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
#List of Part-of-speech tokens, the corresponding tokens can be disregarded
pos_tokens_unused = ['NUM','SYM','PUNCT','X','SPACE']
#List of commonly used Email abbrevations and phrases that indicates priority
email_abbrevations = ['AEAP', 'ASAP', 'AR', 'Action Required','FYA', 'NYR', 'NYRT','NYRQ', 
                      'Quick','attention','critical','immediate','urgent','priority']
#List of Senders picked from Data Pre-processing notebook
email_from = ['exchangeinfo@nymex.com@ENRON', 'Leslie Hansen', 'Sara Shackleton', 'Mark Taylor','Marcus Nettelton']
#Weights given for abbrevations and senders
abbrevation_priority = 0.8
email_from_priority = 0.5
#Get the GloVe vector word embeddings from Stanford NLP library
def generate_word_embeddings(embeddings_file='glove.6B.100d.txt'):
    """Load GloVe word vectors into a dictionary, fetching them first if needed.

    The existence check is made on the text file that is actually read, so a
    deleted archive no longer triggers a fresh download and a present archive
    with a missing text file is simply extracted again.

    Parameters
    ----------
    embeddings_file : str, optional
        Path of the whitespace-separated embeddings file to read. When it does
        not exist, 'glove.6B.zip' is downloaded (unless already present) and
        extracted into the current working directory.

    Returns
    -------
    dict
        Maps each word (first field of a line) to a float32 numpy array built
        from the remaining fields of that line.
    """
    if not path.exists(embeddings_file):
        #Function-scope imports: only this cold download path needs them.
        import zipfile
        from urllib.request import urlretrieve
        if not path.exists('glove.6B.zip'):
            print('Please wait while the Global Vector Word Embeddings are being downloaded.')
            urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip','glove.6B.zip')
        with zipfile.ZipFile('glove.6B.zip') as archive:
            archive.extractall()
    word_embeddings = {}
    with open(embeddings_file,encoding = 'utf-8') as file:
        for line in file:
            words = line.split()
            #A blank line has no word field; skip it instead of failing on words[0].
            if not words:
                continue
            word_embeddings[words[0]] = np.asarray(words[1:], dtype='float32')
    return word_embeddings
#Build the word -> vector dictionary once at module level; it is passed to the summary generation step later on.
word_embeddings = generate_word_embeddings()

In [3]:
#Read the pre-processed emails, discard the two unused columns and replace missing values with empty strings.
emails_data = (
    pd.read_csv('emails_filtered.csv')
    .drop(columns=['Unnamed: 0','message-id'])
    .fillna(value='',axis=1)
)

In [4]:
#Collect the text of the newest message and of every earlier (original/forwarded) message in a thread
def get_payloads(emails,recieved_email=False):
    """Extract message texts from the segments of one email thread.

    Parameters
    ----------
    emails : list of str
        Thread segments; the first element is the most recent message.
    recieved_email : bool, optional
        When True only the first segment is considered and it is returned
        as a single stripped string.

    Returns
    -------
    str or list
        The stripped first segment when ``recieved_email`` is True. Otherwise
        a list starting with the stripped first segment, followed by the
        payload of each later segment parsed with ``email.message_from_string``;
        payloads equal to None or '' are left out.
    """
    newest_message = emails[0].strip()
    if recieved_email:
        return newest_message
    older_payloads = (
        email.message_from_string(segment.strip()).get_payload()
        for segment in emails[1:]
    )
    return [newest_message] + [body for body in older_payloads if body is not None and body != '']

#Function to strip reply/forward markers from the email subject
def handle_abbrevations(subject_data):
    """Remove 'Re:', 'Fw:' and 'Fwd:' markers (any letter case) from a subject line.

    The marker must start at a word boundary, so a word that merely ends in
    "re:" or "fw:" (for example "Procedure:") is left intact. The pattern
    contains no character classes, so a literal '|' in the subject is never
    treated as part of a marker. Whitespace around a removed marker is kept
    as it is.

    Parameters
    ----------
    subject_data : str
        Raw subject line.

    Returns
    -------
    str
        The subject with every marker occurrence removed.
    """
    return re.sub(r'\b(?:re|fwd?):','',subject_data,flags=re.IGNORECASE)

In [5]:
def customer_tokenizer(sentence):
    """Turn a sentence into a list of cleaned, lower-cased lemmas.

    The stripped sentence is run through the module-level ``nlp`` pipeline.
    A token is kept only when all of the following hold:
      * its part-of-speech tag is not listed in ``pos_tokens_unused``;
      * its normalised text is not found inside ``punctuations``;
      * its normalised text is longer than two characters;
      * its normalised text is not in ``spacy_stopwords``.

    The normalised text is the lower-cased, stripped lemma, except when the
    lemma is the literal "-PRON-", in which case the lower-cased token text
    is used instead.
    """
    kept_tokens = []
    for token in nlp(sentence.strip()):
        # Drop tokens whose part-of-speech tag is on the ignore list.
        if token.pos_ in pos_tokens_unused:
            continue

        if token.lemma_ == "-PRON-":
            normalised = token.lower_
        else:
            normalised = token.lemma_.lower().strip()

        # Skip punctuation, very short tokens and stop words.
        if normalised in punctuations or len(normalised) <= 2 or normalised in spacy_stopwords:
            continue

        kept_tokens.append(normalised)
    return kept_tokens

In [6]:
#Pre-processing emails
#Cut every body at the '-----Original Message-----' marker so the newest message and the
#earlier messages of the thread become separate segments, extract the text of each segment,
#and remove the reply/forward markers from the subject.
emails_data['body_seperated'] = emails_data['body'].str.split('-----Original Message-----')
emails_data['message_payloads'] = emails_data['body_seperated'].apply(get_payloads)
emails_data['subject'] = emails_data['subject'].apply(handle_abbrevations)

In [7]:
#'message' holds sender name, subject and all thread payloads (comma separated) in one string;
#'message_recieved' holds only the newest message of the thread.
emails_data['message'] = (
    emails_data['from_name'] + ' ' + emails_data['subject'] + ' '
    + emails_data['message_payloads'].apply(', '.join)
)
emails_data['message_recieved'] = emails_data['message_payloads'].apply(
    lambda payloads: get_payloads(payloads,True)
)
#Keep one row per distinct received message and renumber the rows.
emails_data = emails_data.drop_duplicates(subset='message_recieved').reset_index(drop=True)

In [8]:
#Handling Out of Office Emails
#An email is treated as an auto-reply when the phrase occurs (in any letter case) in its
#subject or in its received message text.
ooo_pattern = 'out of the office|out of office'
ooo_mask = (
    emails_data['subject'].str.contains(ooo_pattern,case=False,regex=True)
    | emails_data['message_recieved'].str.contains(ooo_pattern,case=False,regex=True)
)
ooo_emails = emails_data[ooo_mask].index
#drop=True discards the old row labels; a plain reset_index() would add them to the frame
#as stray 'index' / 'level_0' columns.
emails_data = emails_data.drop(ooo_emails).reset_index(drop=True)

In [9]:
#TF-IDF weighing for documents
def generate_tfidf(messages,abbrevations,customer_tokenizer,senders=None):
    """Fit a TF-IDF model on the messages and project both query lists into it.

    Parameters
    ----------
    messages : iterable of str
        Documents used to fit the vocabulary and the IDF weights.
    abbrevations : iterable of str
        Priority abbreviations/phrases, transformed with the fitted model.
    customer_tokenizer : callable
        Tokenizer handed to ``TfidfVectorizer``.
    senders : iterable of str, optional
        Sender names to transform. Defaults to the module-level ``email_from``
        list, which is what the function always used.

    Returns
    -------
    tuple
        ``(tfidf, query_abbrevations, query_from)``: the matrices produced by
        the vectorizer for the messages, the abbreviations and the senders.
    """
    if senders is None:
        senders = email_from
    #token_pattern=None states explicitly that the custom tokenizer replaces the default
    #regex tokenization, which avoids scikit-learn's "token_pattern will not be used" warning.
    vectorizer = TfidfVectorizer(stop_words=None,smooth_idf=True,tokenizer=customer_tokenizer,
                                 token_pattern=None,sublinear_tf=True)
    tfidf = vectorizer.fit_transform(messages)
    query_abbrevations = vectorizer.transform(abbrevations)
    query_from = vectorizer.transform(senders)
    return tfidf,query_abbrevations,query_from

In [10]:
#Generate Cosine Similarity between the query and email
def get_cosine_similarity(tfidf,query):
    """Sum, for every email, its cosine similarity to each query row.

    Parameters
    ----------
    tfidf : matrix with one row per email
    query : matrix with one row per query term, same number of columns as ``tfidf``

    Returns
    -------
    dict
        Keys are 'email_<row index>'. Each value is a (1, 1) numpy array holding
        the summed similarity of that email over all query rows, i.e. the shape
        that accumulating single-row ``cosine_similarity`` results produces.
        The dictionary is empty when either matrix has no rows.
    """
    email_count = tfidf.shape[0]
    if email_count == 0 or query.shape[0] == 0:
        return {}
    #A single call yields the full (emails x queries) similarity matrix; summing over the
    #query axis replaces one sklearn call per (email, query) pair.
    totals = np.asarray(cosine_similarity(tfidf,query)).sum(axis=1)
    return {'email_' + str(index): np.array([[totals[index]]]) for index in range(email_count)}

In [11]:
#Generating a list of top ranked emails based on the sender and content priority.
def get_top_emails(similarity_dict_abbrevations,similarity_dict_from):
    """Return the rows of ``emails_data`` for the 20 best combined scores.

    Parameters
    ----------
    similarity_dict_abbrevations, similarity_dict_from : dict
        Map 'email_<row position>' to a weighted similarity score.

    Returns
    -------
    pandas.DataFrame
        Up to 20 rows of the module-level ``emails_data`` frame, ordered from the
        highest to the lowest combined score, with a fresh 0..n-1 index.
    """
    #Counter addition sums the two scores per email and keeps only positive totals.
    final_similarity = Counter(similarity_dict_abbrevations) + Counter(similarity_dict_from)
    #Keys look like 'email_<n>'; <n> is the row position inside emails_data.
    top_positions = [int(key.split('_')[1]) for key, _ in final_similarity.most_common(20)]
    #One positional selection replaces the row-by-row DataFrame.append calls
    #(DataFrame.append no longer exists in pandas 2.x).
    return emails_data.iloc[top_positions].reset_index(drop=True)

In [12]:
#Build the TF-IDF matrix of the received messages together with the two query matrices
tfidf,query_abbrevations,query_from = generate_tfidf(emails_data['message_recieved'],email_abbrevations,customer_tokenizer)

#Similarity of every email to the priority abbreviations/phrases, scaled by the abbreviation weight
similarity_dict_abbrevations = get_cosine_similarity(tfidf,query_abbrevations)
for email_key in list(similarity_dict_abbrevations):
    similarity_dict_abbrevations[email_key] = similarity_dict_abbrevations[email_key] * abbrevation_priority

#Similarity of every email to the selected sender names, scaled by the sender weight
similarity_dict_from = get_cosine_similarity(tfidf,query_from)
for email_key in list(similarity_dict_from):
    similarity_dict_from[email_key] = similarity_dict_from[email_key] * email_from_priority

In [13]:
#Combine the two weighted score dictionaries and pick the best ranked emails
top_emails = get_top_emails(
    similarity_dict_abbrevations=similarity_dict_abbrevations,
    similarity_dict_from=similarity_dict_from,
)

In [14]:
#From the extracted sentences, get the vectors of the sentences
def generate_sentence_vectors(sentences,word_embeddings):
    """Build one vector per sentence by averaging the vectors of its words.

    Parameters
    ----------
    sentences : iterable of str
        Sentences whose words are separated by whitespace.
    word_embeddings : dict
        Maps a word to its numpy vector. Words that are missing contribute a
        zero vector.

    Returns
    -------
    list of numpy.ndarray
        One vector per input sentence, in the same order. A sentence without any
        word (empty or whitespace only) yields a zero vector, so every element
        is an array of the embedding length.

    Notes
    -----
    The vector length is taken from the first entry of ``word_embeddings`` and
    falls back to 100 when the dictionary is empty. The word vectors are summed
    and divided by ``word count + 0.001``.
    """
    embedding_dim = len(next(iter(word_embeddings.values()))) if word_embeddings else 100
    #Read-only stand-in for unknown words; sum() builds new arrays and never mutates it.
    unknown_word = np.zeros((embedding_dim,))
    sentence_vectors = []
    for sentence in sentences:
        words = sentence.split()
        if words:
            vector = sum(word_embeddings.get(word, unknown_word) for word in words)/(len(words)+0.001)
        else:
            #Covers '' and whitespace-only input, which would otherwise produce the scalar 0.0.
            vector = np.zeros((embedding_dim,))
        sentence_vectors.append(vector)
    return sentence_vectors

In [15]:
#Generate the ranked sentences using Text Rank algorithm
def generate_text_rank(sentence_vectors,sentences,top_n=5):
    """Rank sentences with PageRank over their pairwise cosine similarities.

    Parameters
    ----------
    sentence_vectors : sequence of numpy.ndarray
        One vector per sentence, all of the same length.
    sentences : sequence of str
        The sentences, in the same order as ``sentence_vectors``.
    top_n : int, optional
        Number of best ranked sentences to return (default 5).

    Returns
    -------
    list of str
        At most ``top_n`` sentences, highest PageRank score first. Equal scores
        are ordered by the sentence text, descending. An empty input gives an
        empty list.
    """
    sentence_count = len(sentences)
    if sentence_count == 0:
        return []
    #Stack the vectors into a (sentences x dimensions) matrix so that all pairwise
    #similarities come from one call, whatever the embedding length is.
    vector_matrix = np.vstack([np.asarray(vector).reshape(1,-1) for vector in sentence_vectors[:sentence_count]])
    cosine_mat = np.array(cosine_similarity(vector_matrix))
    #A sentence must not vote for itself: clear the diagonal.
    np.fill_diagonal(cosine_mat, 0)
    summary_graph = nx.from_numpy_array(cosine_mat)
    text_rank = nx.pagerank(summary_graph)
    ranked_sentences_with_scores = sorted(((text_rank[i],s) for i,s in enumerate(sentences)), reverse=True)
    return [sentence for _, sentence in ranked_sentences_with_scores[:top_n]]

In [16]:
#Split the payloads into sentences, clean them and join the best ranked ones into a summary.
def generate_summary(data,word_embeddings):
    """Summarise a list of message texts.

    Every text in ``data`` is split into sentences with ``sent_tokenize``; each
    sentence is reduced to its cleaned tokens (``customer_tokenizer``) joined by
    single spaces. The cleaned sentences are vectorised, ranked with
    ``generate_text_rank`` and the ranked sentences are returned as one
    space-separated string.
    """
    cleaned_sentences = [
        ' '.join(customer_tokenizer(line))
        for payload in data
        for line in sent_tokenize(payload)
    ]
    vectors = generate_sentence_vectors(cleaned_sentences,word_embeddings)
    return ' '.join(generate_text_rank(vectors,cleaned_sentences))

In [17]:
#Summarise the payloads of every top ranked email, then print the summaries numbered from 1.
top_emails['email_summary'] = top_emails['message_payloads'].apply(
    lambda payloads: generate_summary(payloads,word_embeddings)
)
print('Summary of Ranked Emails')
divider = '*'*50
for index,summary in top_emails['email_summary'].items():
    print('\nEmail {}\n'.format(index+1))
    print(summary)
    print(divider)

Summary of Ranked Emails

Email 1

request kim detiveaux attach propose form non disclosure agreement comment question mark greenberg senior counsel enclose meet approval execute agreement return fax look forward hear tana thank mark quick response
**************************************************

Email 2

open trade correct tana second user apply company urgent thank camille
**************************************************

Email 3

run mark taylor attorney legal dept attach clean blackline draft reflect change able lawyer need talk mark directly phone look forward hear think fine change tana jones send friday subject aether systems revise nda forward tana jones hou ect |--------+----------------------- tana jones |--------+----------------------- jgatto@aet.hersystems.com teresa smith corp enron@enron mark taylor hou ect@ect subject aether systems revise nda theresa smith forward comment nda
**************************************************

Email 4

want document actually store