In [21]:
# This is the cell where we do all the imports
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize.regexp import regexp_tokenize
import category_encoders as ce
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
# Connecting to an s3 bucket
role = get_execution_role()

region = boto3.Session().region_name

bucket='sagemaker-samuel'
prefix = 'sagemaker/nlp-email'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)

In [23]:
# Pull in the csv
emails = pd.read_csv('update_twelve_three.csv')
emails.head()

Unnamed: 0,From,Message,Subject,Tags,UID,text,tag_list,first_tag,sender_name,sender_email,domain_name,isNoReply
0,<grangepayments@westernunionspeedpay.com>,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",Grange Payment Confirmation,Finance,31780,<grangepayments@westernunionspeedpay.com> Gran...,['Finance'],Finance,,grangepayments@westernunionspeedpay.com,westernunionspeedpay,False
1,Chase <no.reply.alerts@chase.com>,This is an Alert to help manage your account ...,Your Debit Card Transaction,Finance,31779,Chase <no.reply.alerts@chase.com> Your Debit C...,['Finance'],Finance,Chase,no.reply.alerts@chase.com,chase,True
2,Amazon Web Services <no-reply-aws@amazon.com>,Please let us know if we helped resolve your i...,Resolved 6559329691: Limit Increase: SageMaker,Productivity,31738,Amazon Web Services <no-reply-aws@amazon.com> ...,['Productivity'],Productivity,Amazon Web Services,no-reply-aws@amazon.com,amazon,True
3,Lambda Labs <noreply@github.com>,Youve been added to the Labs 18 - Tagger team ...,Bernie Durfee added you to the Lambda Labs tea...,Productivity,31693,Lambda Labs <noreply@github.com> Bernie Durfee...,['Productivity'],Productivity,Lambda Labs,noreply@github.com,github,True
4,Amazon Web Services <no-reply-aws@amazon.com>,"Hello, We haven't heard back from you regard...",Attention required on case 6559329691: Limit I...,Productivity,31684,Amazon Web Services <no-reply-aws@amazon.com> ...,['Productivity'],Productivity,Amazon Web Services,no-reply-aws@amazon.com,amazon,True


In [24]:
# Take the columns we want
emails = emails[['sender_email', 'Message', 'Subject', 'Tags', 'UID', 'first_tag']]
emails.tail()

Unnamed: 0,sender_email,Message,Subject,Tags,UID,first_tag
11193,noreply@medium.com,Today's highlights Understanding Random For...,Understanding Random Forest | Tony Yiu in Towa...,Entertainment,3693,Entertainment
11194,noreply@glassdoor.com,...,You look like a good fit for the job at Procte...,Productivity,3702,Productivity
11195,aws-marketing-email-replies@amazon.com,Thank you for attending AWS Machine Learning W...,Thank you for attending AWS Machine Learning W...,"Events, Productivity",3706,Events
11196,no-reply-aws@amazon.com,"Hello again, I hope you're having a nice...",RE:[CASE 6570793521] Limit Increase: SageMaker,Productivity,3721,Productivity
11197,noreply@medium.com,Today's highlights How To Wake Up at 5 A.M....,How To Wake Up at 5 A.M. Every Day | Bryan Ye ...,Entertainment,3724,Entertainment


In [25]:
# Make sure all the columns we want to be strings are strings
emails['Message'] = emails['Message'].apply(str)
emails['sender_email'] = emails['sender_email'].apply(str)
emails['Subject'] = emails['Subject'].apply(str)
emails['Tags'] = emails['Tags'].apply(str)
emails['first_tag'] = emails['first_tag'].apply(str)

In [26]:
# Check out all the unique tags
emails['first_tag'].value_counts()

Finance          5678
Entertainment    1636
Other            1199
Shopping         1179
Productivity     1031
Events            199
Social            154
Travel            122
Name: first_tag, dtype: int64

In [27]:
# A little feature engineering to spice things up
emails['Text'] = emails['sender_email'] + ' ' + emails['Message'] + ' ' + emails['Subject']

In [28]:
# emails[['First_Tag','Second_Tag']] = emails['Tags'].str.split(',', expand=True)
# emails.tail()

In [29]:
# Make like a banana and split
train, test = train_test_split(emails, stratify=emails['first_tag'])

In [30]:
# 2's a party and four is a crowd
X_train = train['Text']
y_train = train['first_tag']

X_test = test['Text']
y_test = test['first_tag']

In [31]:
# A function that removes all unnecessary puncuation, html code, and/or any apostrophes lying around
def clean_text(text):
    # replace new line and carriage return with space
    text = text.replace("\n", " ").replace("\r", " ")
    
    # replace the numbers and punctuation (exclude single quote) with space
    punc_list = '!"#$%&()*+,-/:;<=>?[\]^_{|}~' + '0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.translate(t)
    
    # replace single quote with empty character
    t = str.maketrans(dict.fromkeys("''", ""))
    text = text.translate(t)
    
    return text

In [32]:
# The lemmatizer
lemmatizer = WordNetLemmatizer()

# nltk's regexp tokenizer mixed with our personal clean_text function
def regnltk_tokenize(text):
    text = clean_text(text)
    words = regexp_tokenize(text, pattern = '\s+', gaps = True)
    return [lemmatizer.lemmatize(word) for word in words if (len(word) >= 3)]

In [33]:
# gensim's stopwords mixed with a few I added
my_stopwords = STOPWORDS.union(set(['jacobsohn', 'avraham', 'http', 'https', 'kalman', 'com', 'sdui', 'www']))

In [34]:
# Vectorize the X's with the regex tokenize and my_stopwords
vect = TfidfVectorizer(tokenizer=regnltk_tokenize, stop_words=my_stopwords, min_df=0.02, max_df=0.98)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

  'stop_words.' % sorted(inconsistent))


In [35]:
# encode the y's so the computer understands
encoder = ce.OrdinalEncoder()

y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [36]:
model = RandomForestClassifier(n_estimators=500, max_depth=7, random_state=42)
model.fit(X_train, y_train)
model.score(X_test, y_test)

  from ipykernel import kernelapp as app


0.8471428571428572

In [37]:
preds = model.predict(X_test)
df = pd.DataFrame({'Predictions': preds})
df['Predictions'].value_counts()

2    1347
5     603
3     369
4     248
1     233
Name: Predictions, dtype: int64

In [38]:
test['preds'] = preds
test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,sender_email,Message,Subject,Tags,UID,first_tag,Text,preds
1807,ksemail7@gmail.com,"youre a doll, thanx :*\n\nOn Thu, Apr 15, 2010...",Re:,"Personal, Other",3137,Other,"ksemail7@gmail.com youre a doll, thanx :*\n\nO...",4
7509,no-reply@amazon.com,"Greetings from Amazon Payments,\n\n\nYour paym...",Payment completed,Finance,28432,Finance,no-reply@amazon.com Greetings from Amazon Paym...,2
1742,austin@successpodcast.com,( https://el2.convertkit-mail2.com/c/r8ul08o5q...,A Beginner's Guide To Body Language & Nonverba...,Entertainment,63232,Entertainment,austin@successpodcast.com ( https://el2.conver...,5
1547,chave@designsbyfmc.com,You are such a big boy.\n\n-----Original Messa...,RE: Did you have a chance to finish that one d...,"Personal, Other",2634,Other,chave@designsbyfmc.com You are such a big boy....,4
5493,no-reply@amazon.com,"Greetings from Amazon Payments,\n\n\nYou recei...",You received $1.00 from Kalman Jacobsohn sent ...,Finance,25508,Finance,no-reply@amazon.com Greetings from Amazon Paym...,2


In [39]:
test.head(50)

Unnamed: 0,sender_email,Message,Subject,Tags,UID,first_tag,Text,preds
1807,ksemail7@gmail.com,"youre a doll, thanx :*\n\nOn Thu, Apr 15, 2010...",Re:,"Personal, Other",3137,Other,"ksemail7@gmail.com youre a doll, thanx :*\n\nO...",4
7509,no-reply@amazon.com,"Greetings from Amazon Payments,\n\n\nYour paym...",Payment completed,Finance,28432,Finance,no-reply@amazon.com Greetings from Amazon Paym...,2
1742,austin@successpodcast.com,( https://el2.convertkit-mail2.com/c/r8ul08o5q...,A Beginner's Guide To Body Language & Nonverba...,Entertainment,63232,Entertainment,austin@successpodcast.com ( https://el2.conver...,5
1547,chave@designsbyfmc.com,You are such a big boy.\n\n-----Original Messa...,RE: Did you have a chance to finish that one d...,"Personal, Other",2634,Other,chave@designsbyfmc.com You are such a big boy....,4
5493,no-reply@amazon.com,"Greetings from Amazon Payments,\n\n\nYou recei...",You received $1.00 from Kalman Jacobsohn sent ...,Finance,25508,Finance,no-reply@amazon.com Greetings from Amazon Paym...,2
3537,no-reply@amazon.com,"Greetings from Amazon Payments,\n\n\nYour paym...",Payment completed,Finance,23258,Finance,no-reply@amazon.com Greetings from Amazon Paym...,2
7347,no-reply@amazon.com,"Greetings from Amazon Payments,\n\n\nYour paym...",Payment completed,Finance,28270,Finance,no-reply@amazon.com Greetings from Amazon Paym...,2
3366,no-reply@amazon.com,"Greetings from Amazon Payments,\n\n\nYou recei...",You received $1.00 from Kalman Jacobsohn sent ...,Finance,23027,Finance,no-reply@amazon.com Greetings from Amazon Paym...,2
9605,deals@jdeal.com,Its a jdeal!\r\n\r\nDonate Thermal Clothing to...,Protect our soldiers from sub zero temperatures,Shopping,3179,Shopping,deals@jdeal.com Its a jdeal!\r\n\r\nDonate The...,1
9366,news@linkedin.com,The week's most popular news:=20\r\n----------...,Top news today: Charles Best: How One Teacher ...,Social,2815,Social,news@linkedin.com The week's most popular news...,5


In [43]:
test['first_tag'].value_counts()

Finance          1420
Entertainment     409
Other             300
Shopping          295
Productivity      258
Events             50
Social             38
Travel             30
Name: first_tag, dtype: int64

In [44]:
train['first_tag'].value_counts()

Finance          4258
Entertainment    1227
Other             899
Shopping          884
Productivity      773
Events            149
Social            116
Travel             92
Name: first_tag, dtype: int64