In [1]:
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role

# S3 location (bucket and key prefix) used by this notebook.
bucket = 'sagemaker-samuel'
prefix = 'sagemaker/nlp-email'

# Region configured on the default boto3 session; it is interpolated into
# the S3 endpoint URL below together with the bucket name.
region = boto3.Session().region_name
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

# Execution role reported by the SageMaker SDK for this environment.
role = get_execution_role()

In [2]:
import pandas as pd

In [3]:
# Read the email export and discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV; verify).
emails = pd.read_csv('Master_emails.csv').drop(columns=['Unnamed: 0'])

# Preview the first rows.
emails.head()

Unnamed: 0,Content_Type,From,Message,Subject,Tags,UID
0,text/plain,<grangepayments@westernunionspeedpay.com>,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",Grange Payment Confirmation,Finance,31780
1,text/plain,Chase <no.reply.alerts@chase.com>,This is an Alert to help manage your account ...,Your Debit Card Transaction,Finance,31779
2,text/plain,Amazon Web Services <no-reply-aws@amazon.com>,Please let us know if we helped resolve your i...,Resolved 6559329691: Limit Increase: SageMaker,Productivity,31738
3,text/plain,Lambda Labs <noreply@github.com>,Youve been added to the Labs 18 - Tagger team ...,Bernie Durfee added you to the Lambda Labs tea...,Productivity,31693
4,text/plain,Amazon Web Services <no-reply-aws@amazon.com>,"Hello, We haven't heard back from you regard...",Attention required on case 6559329691: Limit I...,Productivity,31684


In [4]:
# Make every message body a plain string so the text-cleaning helpers can
# call str methods on it. Missing bodies are turned into '' first: a bare
# str() cast would turn NaN into the literal text "nan", which is long
# enough to survive the tokenizer's length filter and pollute the vocabulary.
emails['Message'] = emails['Message'].fillna('').astype(str)

In [5]:
# Characters that are replaced by a single space: ASCII punctuation and digits.
# A raw string is used so the backslash is a literal character rather than the
# start of an (invalid) escape sequence.
# NOTE(review): the single quote is handled separately below; the backtick is
# not in this list and is therefore left untouched -- confirm that is intended.
_PUNCT_AND_DIGITS = r'!"#$%&()*+,-./:;<=>?@[\]^_{|}~' + '0123456789'

# One translation table, built once: line breaks, punctuation and digits map
# to a space, and the single quote is deleted. The mappings are disjoint, so a
# single pass gives the same result as applying them one after another.
_CLEAN_TEXT_TABLE = str.maketrans({
    **dict.fromkeys("\n\r" + _PUNCT_AND_DIGITS, " "),
    "'": "",
})


def clean_text(text):
    """Normalise raw text for tokenisation.

    Newlines, carriage returns, digits and the listed punctuation characters
    are each replaced by one space; single quotes are removed entirely (so
    "don't" becomes "dont"). Runs of spaces are not collapsed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    return text.translate(_CLEAN_TEXT_TABLE)

In [53]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.neighbors import NearestNeighbors
# from sklearn.decomposition import PCA
# import gensim
# from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# from gensim import corpora
# from gensim.models.ldamulticore import LdaMulticore

In [58]:
from nltk.tokenize.regexp import regexp_tokenize

def regnltk_tokenize(text):
    """Clean `text` and split it into whitespace-separated tokens.

    The text is first passed through `clean_text`, then split on runs of
    whitespace with NLTK's `regexp_tokenize` (the pattern describes the gaps
    between tokens, not the tokens themselves).

    Args:
        text: the string to tokenize.

    Returns:
        A list of the tokens that are at least 3 characters long.
    """
    cleaned = clean_text(text)
    # Raw string: in a normal literal, backslash-s is an invalid escape
    # sequence, which Python warns about and plans to reject.
    words = regexp_tokenize(cleaned, pattern=r'\s+', gaps=True)
    return [word for word in words if len(word) >= 3]

In [39]:
def tokenize(text):
    """Tokenize `text` with gensim and keep only informative tokens.

    Args:
        text: the string to tokenize.

    Returns:
        A list of the tokens produced by gensim's `simple_preprocess` that
        are not in `STOPWORDS` and are between 3 and 10 characters long
        (inclusive).
    """
    # Imported locally because the notebook-level import of simple_preprocess
    # is commented out, which would otherwise make this function raise
    # NameError on a fresh kernel.
    from gensim.utils import simple_preprocess

    return [
        token
        for token in simple_preprocess(text)
        if token not in STOPWORDS and 3 <= len(token) <= 10
    ]

In [59]:
# gensim's stop-word set extended with extra tokens to ignore for this corpus.
my_stopwords = STOPWORDS | {
    'jacobsohn', 'avraham', 'http', 'https', 'kalman', 'com', 'sdui', 'www',
}

In [60]:
# Build the TF-IDF document-term matrix for the email bodies.
tfidf = TfidfVectorizer(
    tokenizer=regnltk_tokenize,
    # Recent scikit-learn releases validate stop_words as 'english', a list
    # or None, so the frozenset is converted to a (sorted, deterministic) list.
    stop_words=sorted(my_stopwords),
    strip_accents='unicode',
    # A float min_df is a proportion: keep terms found in >= 10% of documents.
    min_df=0.1,
)

# Learn the vocabulary and vectorize in one step (same result as calling
# fit() and then transform() on the same data).
sparse_dtm = tfidf.fit_transform(emails['Message'])

# get_feature_names() was deprecated and later removed from scikit-learn;
# use its replacement when available and fall back on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() yields a regular ndarray (todense() returns the legacy np.matrix).
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=feature_names)

In [51]:
# Widen pandas' console/notebook rendering so wide frames are shown in full:
# allow up to 500 columns and a display width of 1000 characters.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [61]:
# Preview the first rows (at most 20) of the TF-IDF document-term matrix.
dtm.head(20)

Unnamed: 0,account,amazon,avenue,click,completed,contact,dear,details,email,free,gmail,greetings,help,helptab,information,mail,manage,message,new,north,note,optional,pages,payment,payments,questions,received,recipient,seattle,sender,sent,subscription,successfully,support,terry,thank,time,unsubscribe,view,visit
0,0.257907,0.0,0.0,0.0,0.0,0.27789,0.26059,0.0,0.423674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.493194,0.0,0.0,0.15438,0.0,0.0,0.458564,0.0,0.284152,0.0,0.0,0.0,0.0,0.225295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.450436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244017,0.0,0.0,0.0,0.484273,0.430684,0.0,0.0,0.0,0.0,0.0,0.266962,0.0,0.496272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.480411,0.0,0.247782,0.0,0.246629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.245969,0.0,0.0,0.0,0.0,0.068506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099975,0.0,0.0,0.756171,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.454216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.890892,0.0
4,0.0,0.50351,0.0,0.0,0.0,0.387731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257796,0.0,0.0,0.0,0.0,0.143601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104782,0.0,0.0,0.660441,0.0,0.0,0.2488,0.0,0.0,0.0
5,0.467399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.253206,0.0,0.0,0.0,0.50251,0.446903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.467399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.253206,0.0,0.0,0.0,0.50251,0.446903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.587745,0.0,0.0,0.0,0.422423,0.0,0.0,0.161008,0.0,0.0,0.0,0.0,0.0,0.0,0.421293,0.0,0.0,0.177523,0.0,0.117337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171237,0.0,0.0,0.431721,0.0,0.108242,0.0,0.0,0.0,0.0
8,0.0,0.832369,0.0,0.0,0.0,0.332355,0.0,0.089528,0.126678,0.0,0.0,0.0,0.0,0.0,0.0,0.331466,0.0,0.0,0.0,0.0,0.092319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134726,0.0,0.0,0.169835,0.0,0.085163,0.0,0.0,0.0,0.0
9,0.467399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.253206,0.0,0.0,0.0,0.50251,0.446903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# List the vocabulary terms that serve as the columns of the document-term matrix.
# NOTE(review): the saved output of this cell still lists terms that are in
# my_stopwords (e.g. 'avraham', 'com', 'www') and its execution count is lower
# than the stop-word cell's, so it looks stale -- re-run to confirm.
dtm.columns

Index(['account', 'amazon', 'avenue', 'avraham', 'click', 'com', 'completed', 'contact', 'dear', 'details', 'email', 'free', 'gmail', 'greetings', 'help', 'helptab', 'http', 'https', 'jacobsohn', 'just', 'kalman', 'mail', 'manage', 'message', 'new', 'north', 'note', 'optional', 'pages', 'payment', 'payments', 'questions', 'received', 'recipient', 'sdui', 'seattle', 'sender', 'sent', 'support', 'terry', 'thank', 'time', 'using', 'view', 'visit', 'www'], dtype='object')