In [1]:
# This is the cell where we do all the imports
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize.regexp import regexp_tokenize
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
import joblib
import pickle

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Connecting to an s3 bucket
role = get_execution_role()

region = boto3.Session().region_name

bucket='sagemaker-name'
prefix = 'sagemaker/nlp-email'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)

In [3]:
# Pull in the csv
emails = pd.read_csv('name.csv')
emails.head()

Unnamed: 0,From,Message,Subject,Tags,UID,text,tag_list,first_tag,sender_name,sender_email,domain_name,isNoReply
0,<grangepayments@westernunionspeedpay.com>,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",Grange Payment Confirmation,Finance,31780,<grangepayments@westernunionspeedpay.com> Gran...,['Finance'],Finance,,grangepayments@westernunionspeedpay.com,westernunionspeedpay,False
1,Chase <no.reply.alerts@chase.com>,This is an Alert to help manage your account ...,Your Debit Card Transaction,Finance,31779,Chase <no.reply.alerts@chase.com> Your Debit C...,['Finance'],Finance,Chase,no.reply.alerts@chase.com,chase,True
2,Amazon Web Services <no-reply-aws@amazon.com>,Please let us know if we helped resolve your i...,Resolved 6559329691: Limit Increase: SageMaker,Productivity,31738,Amazon Web Services <no-reply-aws@amazon.com> ...,['Productivity'],Productivity,Amazon Web Services,no-reply-aws@amazon.com,amazon,True
3,Lambda Labs <noreply@github.com>,Youve been added to the Labs 18 - Tagger team ...,Bernie Durfee added you to the Lambda Labs tea...,Productivity,31693,Lambda Labs <noreply@github.com> Bernie Durfee...,['Productivity'],Productivity,Lambda Labs,noreply@github.com,github,True
4,Amazon Web Services <no-reply-aws@amazon.com>,"Hello, We haven't heard back from you regard...",Attention required on case 6559329691: Limit I...,Productivity,31684,Amazon Web Services <no-reply-aws@amazon.com> ...,['Productivity'],Productivity,Amazon Web Services,no-reply-aws@amazon.com,amazon,True


In [4]:
# Take the columns we want
emails = emails[['sender_email', 'Message', 'Subject', 'Tags', 'UID', 'first_tag']]
emails.tail()

Unnamed: 0,sender_email,Message,Subject,Tags,UID,first_tag
11193,noreply@medium.com,Today's highlights Understanding Random For...,Understanding Random Forest | Tony Yiu in Towa...,Entertainment,3693,Entertainment
11194,noreply@glassdoor.com,...,You look like a good fit for the job at Procte...,Productivity,3702,Productivity
11195,aws-marketing-email-replies@amazon.com,Thank you for attending AWS Machine Learning W...,Thank you for attending AWS Machine Learning W...,"Events, Productivity",3706,Events
11196,no-reply-aws@amazon.com,"Hello again, I hope you're having a nice...",RE:[CASE 6570793521] Limit Increase: SageMaker,Productivity,3721,Productivity
11197,noreply@medium.com,Today's highlights How To Wake Up at 5 A.M....,How To Wake Up at 5 A.M. Every Day | Bryan Ye ...,Entertainment,3724,Entertainment


In [5]:
# Make sure all the columns we want to be strings are strings
emails['Message'] = emails['Message'].apply(str)
emails['sender_email'] = emails['sender_email'].apply(str)
emails['Subject'] = emails['Subject'].apply(str)
emails['Tags'] = emails['Tags'].apply(str)
emails['first_tag'] = emails['first_tag'].apply(str)

In [6]:
# Check out all the unique tags
emails['first_tag'].value_counts()

Finance          5678
Entertainment    1636
Other            1199
Shopping         1179
Productivity     1031
Events            199
Social            154
Travel            122
Name: first_tag, dtype: int64

In [7]:
# A little feature engineering to spice things up
emails['Text'] = emails['sender_email'] + ' ' + emails['Message'] + ' ' + emails['Subject']

In [8]:
# emails[['First_Tag','Second_Tag']] = emails['Tags'].str.split(',', expand=True)
# emails.tail()

In [9]:
# Make like a banana and split
train, test = train_test_split(emails, stratify=emails['first_tag'])

In [10]:
# 2's a party and four is a crowd
X_train = train['Text']
y_train = train['first_tag']

X_test = test['Text']
y_test = test['first_tag']

In [11]:
# A function that removes all unnecessary puncuation, html code, and/or any apostrophes lying around
def clean_text(text):
    # replace new line and carriage return with space
    text = text.replace("\n", " ").replace("\r", " ")
    
    # replace the numbers and punctuation (exclude single quote) with space
    punc_list = '!"#$%&()*+,-/:;<=>?[\]^_{|}~' + '0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.translate(t)
    
    # replace single quote with empty character
    t = str.maketrans(dict.fromkeys("''", ""))
    text = text.translate(t)
    
    return text

# joblib.dump(clean_text, 'clean_text.joblib')

In [12]:
# The lemmatizer
lemmatizer = WordNetLemmatizer()

# nltk's regexp tokenizer mixed with our personal clean_text function
class tokenize:
    def regnltk_tokenize(text):
        text = clean_text(text)
        words = regexp_tokenize(text, pattern = '\s+', gaps = True)
        return [lemmatizer.lemmatize(word) for word in words if (len(word) >= 3)]

# joblib.dump(regnltk_tokenize, 'regnltk_tokenize.joblib')

In [13]:
# gensim's stopwords mixed with a few I added
my_stopwords = STOPWORDS.union(set(['jacobsohn', 'avraham', 'http', 'https', 'kalman', 'com', 'sdui', 'www']))

In [14]:
# Vectorize the X's with the regex tokenize and my_stopwords
tokeni_zer = tokenize
vect = TfidfVectorizer(tokenizer=tokenize.regnltk_tokenize, stop_words=my_stopwords, min_df=0.02, max_df=0.98)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

pickle.dump(vect, open('vect.pkl', 'wb'))

  'stop_words.' % sorted(inconsistent))


In [15]:
# encode the y's so the computer understands
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

pickle.dump(encoder, open('labeller.pkl','wb'))

In [16]:
model = RandomForestClassifier(bootstrap=False, n_estimators=300, max_depth=110, min_samples_leaf=3, min_samples_split=10, random_state=42)
model.fit(X_train, y_train)

pickle.dump(model, open('randomforest.pkl', 'wb'))

model.score(X_test, y_test)

0.9185714285714286

In [17]:
preds = model.predict(X_test)
str_preds = encoder.inverse_transform(preds)

In [18]:
df = pd.DataFrame({'Predictions': preds,
                  'String Predictions': str_preds})
df['String Predictions'].value_counts()

Finance          1409
Entertainment     392
Productivity      369
Shopping          290
Other             267
Social             33
Travel             20
Events             20
Name: String Predictions, dtype: int64

In [19]:
test_tags = encoder.inverse_transform(y_test)
test_tags

array(['Finance', 'Other', 'Finance', ..., 'Productivity', 'Finance',
       'Entertainment'], dtype=object)

In [20]:
act_pre = pd.DataFrame({'Actual': test_tags,
                       'Preds': preds})

In [21]:
act_pre['Actual'].value_counts()

Finance          1420
Entertainment     409
Other             300
Shopping          295
Productivity      258
Events             50
Social             38
Travel             30
Name: Actual, dtype: int64

In [22]:
test['preds'] = str_preds
test[test['preds'] == 'Travel']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,sender_email,Message,Subject,Tags,UID,first_tag,Text,preds
9499,BAExecutiveClub_US@my.ba.com,\r\n\r\n<!--If you are unable to see the messa...,Important update: Your Executive Club account,Travel,3023,Travel,BAExecutiveClub_US@my.ba.com \r\n\r\n<!--If yo...,Travel
10857,normstormin@gmail.com,"ok On Mon, Mar 27, 2017 at 4:56 PM, <samuel...",Re: Prague travel,"Personal, Travel",18427,Travel,"normstormin@gmail.com ok On Mon, Mar 27, 20...",Travel
9946,express@airbnb.com,RE: Reservation request at Penthouse apartment...,RE: Reservation request at Penthouse apartment...,Travel,1188,Travel,express@airbnb.com RE: Reservation request at ...,Travel
9430,BAExecutiveClub_US@my.ba.com,\r\n\r\n<!--If you are unable to see the messa...,Executive Club Statement,Travel,2915,Travel,BAExecutiveClub_US@my.ba.com \r\n\r\n<!--If yo...,Travel
10117,accountservices@mercuryinsurance.com,[<email.message.Message object at 0x0000022BB4...,Paperless enrollment confirmation for policy F...,Finance,1362,Finance,accountservices@mercuryinsurance.com [<email.m...,Travel
919,BA.CustSvcs@email.ba.com,PLEASE DO NOT RESPOND DIRECTLY TO THIS EMAIL. ...,Password Reset on ba.com,Travel,765,Travel,BA.CustSvcs@email.ba.com PLEASE DO NOT RESPOND...,Travel
10813,rosal@uw.edu,T2suIFRoYW5rcywgU2FtIQ0KDQpSb3NhDQpSb3NhIEwuIE...,RE: FW: COPY: Appointment Confirmation - UW Bo...,"Productivity, Events",15860,Productivity,rosal@uw.edu T2suIFRoYW5rcywgU2FtIQ0KDQpSb3NhD...,Travel
8715,BAExecutiveClub_US@my.ba.com,\r\n\r\n<!--If you are unable to see the messa...,Convert your hotel loyalty points\r\n for Avio...,Travel,1912,Travel,BAExecutiveClub_US@my.ba.com \r\n\r\n<!--If yo...,Travel
10910,normstormin@gmail.com,---------- Forwarded message ---------- From:...,Fwd: Prague ticket purchased,"Personal, Travel",20916,Travel,normstormin@gmail.com ---------- Forwarded mes...,Travel
9939,MileagePlus@news.united.com,<=21doctype html><html><head><meta charset=3D=...,Choose how you use your miles with MileagePlus,Travel,1181,Travel,MileagePlus@news.united.com <=21doctype html><...,Travel


In [30]:
test[(test['sender_email'] == 'normstormin@gmail.com') & (test['first_tag'] == 'Travel') & (test['preds'] == 'Travel')]['Message'][10857]

'ok    On Mon, Mar 27, 2017 at 4:56 PM, <samuelithian@gmail.com> wrote:    > July 17th  >  >  >  > Sent from Mail <https://go.microsoft.com/fwlink/?LinkId=3D550986> for  > Windows 10  >  >  >  > *From: *norman hepner <normstormin@gmail.com>  > *Sent: *Monday, March 27, 2017 4:55 PM  > *To: *Samuel Hepner <samuelithian@gmail.com>  > *Subject: *Re: Prague travel  >  >  >  > when do you need to depart by?  >  >  >  > On Mon, Mar 27, 2017 at 4:53 PM, <samuelithian@gmail.com> wrote:  >  > I just reviewed the information on the flight date and it says that  > students must make their reservations to arrive on July 19th between 9am  > and 1pm at the Vlaclav Havel airport and thats where there will =  be people  > to pick me up. They also say that I should have a round trip ticket due t=  o  > previous experiences with students being denied boarding.  >  >  >  > Sent from Mail <https://go.microsoft.com/fwlink/?LinkId=3D550986> for  > Windows 10  >  >  >  >  >  >  >  '

In [23]:
test['first_tag'].value_counts()

Finance          1420
Entertainment     409
Other             300
Shopping          295
Productivity      258
Events             50
Social             38
Travel             30
Name: first_tag, dtype: int64

In [20]:
# Set up for the GridSearchCV
param_grid = {
    'bootstrap': [False],
    'max_depth': [7, 10, 110],
    'min_samples_leaf': [3, 6, 9],
    'min_samples_split': [10],
    'n_estimators': [500]
}

# Base model
rfc = RandomForestClassifier()

# grid search time!
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, n_jobs=-1, verbose=2)

In [21]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.1min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [False], 'max_depth': [7, 10, 110], 'min_samples_leaf': [3, 6, 9], 'min_samples_split': [10], 'n_estimators': [500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [22]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [23]:
print('Best params', grid_search.best_params_)

best_grid = grid_search.best_estimator_
print('Best grid', best_grid)
grid_accuracy = evaluate(best_grid, X_test, y_test.values)

Best params {'bootstrap': False, 'max_depth': 110, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 500}
Best grid RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=110, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


AttributeError: 'numpy.ndarray' object has no attribute 'values'