In [None]:
# This is the cell where we do all the imports
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize.regexp import regexp_tokenize
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
import joblib
import pickle

nltk.download('wordnet')

In [None]:
# S3 locations used by this notebook
bucket = 'sagemaker-name'
prefix = 'sagemaker/nlp-email'

# Execution role and the region of the default boto3 session
role = get_execution_role()
region = boto3.Session().region_name

# Base URL of the bucket, built from the session's region
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

In [None]:
# Load the raw email export from 'name.csv' (path is relative to the notebook's working directory)
emails = pd.read_csv('name.csv')
# Preview the first rows to sanity-check the load
emails.head()

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes `emails` an
# independent frame, so the column assignments in later cells write to this frame
# rather than to a slice of the original (avoids SettingWithCopyWarning).
emails = emails[['sender_email', 'Message', 'Subject', 'Tags', 'UID', 'first_tag']].copy()
emails.tail()

In [None]:
# Coerce every text-like column to str, element by element (missing values become the string 'nan')
for text_column in ['Message', 'sender_email', 'Subject', 'Tags', 'first_tag']:
    emails[text_column] = emails[text_column].apply(str)

In [None]:
# Class balance check: number of emails per 'first_tag' value, most frequent first
emails['first_tag'].value_counts()

In [None]:
# Build the single free-text feature: sender address, message body and subject joined by spaces
emails['Text'] = emails['sender_email'].str.cat(
    [emails['Message'], emails['Subject']],
    sep=' ',
)

In [None]:
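# Disabled experiment (kept for reference): split the comma-separated 'Tags' column into separate tag columns.
# emails[['First_Tag','Second_Tag']] = emails['Tags'].str.split(',', expand=True)
# emails.tail()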

In [None]:
# Stratified train/test split on the label column. The fixed seed makes the split
# (and every metric computed from it) reproducible across kernel restarts.
train, test = train_test_split(emails, stratify=emails['first_tag'], random_state=42)

In [None]:
# Separate the free-text feature from the label for each split
X_train, y_train = train['Text'], train['first_tag']
X_test, y_test = test['Text'], test['first_tag']

In [None]:
# Character-level clean-up applied to every email before tokenization.
# One translation table, built once at definition time:
#   - newline, carriage return, digits and the listed ASCII punctuation -> space
#   - apostrophe -> removed
# The punctuation literal is a raw string so the backslash is a literal character
# rather than an invalid escape sequence. Characters not listed (for example '.',
# '@' and the backtick) pass through unchanged, and no HTML handling is done here.
# NOTE(review): the original listed the apostrophe twice; if a typographic
# apostrophe was intended as the second entry, add it to the table - confirm.
_CLEAN_TEXT_TABLE = str.maketrans({
    **dict.fromkeys('\n\r' + r'!"#$%&()*+,-/:;<=>?[\]^_{|}~' + '0123456789', ' '),
    "'": '',
})


def clean_text(text):
    """Return `text` with line breaks, digits and most punctuation blanked out.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The text with each newline, carriage return, digit and listed
        punctuation character replaced by a single space, and every
        apostrophe deleted. Runs of spaces are not collapsed.
    """
    return text.translate(_CLEAN_TEXT_TABLE)

# joblib.dump(clean_text, 'clean_text.joblib')

In [None]:
# Shared WordNet lemmatizer used by the tokenizer below
lemmatizer = WordNetLemmatizer()


# nltk's regexp tokenizer combined with our own clean_text function
class tokenize:
    """Namespace holding the tokenizer callable that is handed to the vectorizer."""

    @staticmethod
    def regnltk_tokenize(text):
        """Clean, split and lemmatize one document.

        The text is passed through `clean_text`, split on runs of whitespace,
        tokens shorter than three characters are dropped, and the remaining
        tokens are lemmatized with the module-level `lemmatizer`.

        Declared as a staticmethod so it can be called both as
        `tokenize.regnltk_tokenize(text)` and on an instance.

        Parameters
        ----------
        text : str
            Raw document text.

        Returns
        -------
        list of str
            Lemmatized tokens in document order.
        """
        text = clean_text(text)
        # Raw string: '\s' is a regex escape, not a Python string escape
        words = regexp_tokenize(text, pattern=r'\s+', gaps=True)
        return [lemmatizer.lemmatize(word) for word in words if len(word) >= 3]

# joblib.dump(regnltk_tokenize, 'regnltk_tokenize.joblib')

In [None]:
# Stop words = gensim's built-in set plus a handful of extra tokens added for this dataset
extra_stopwords = {'jacobsohn', 'avraham', 'http', 'https', 'kalman', 'com', 'sdui', 'www'}
my_stopwords = STOPWORDS | extra_stopwords

In [None]:
# Vectorize the X's with the regex tokenizer and my_stopwords
tokeni_zer = tokenize  # alias kept as-is; not referenced by this cell
vect = TfidfVectorizer(
    tokenizer=tokenize.regnltk_tokenize,
    # scikit-learn validates stop_words as 'english', a list, or None, so the set is converted
    stop_words=list(my_stopwords),
    min_df=0.02,
    max_df=0.98,
)
# Vocabulary and IDF weights are learned from the training text only, then reused for the test text
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

# Persist the fitted vectorizer; the context manager guarantees the file handle is flushed and closed
with open('vect.pkl', 'wb') as vect_file:
    pickle.dump(vect, vect_file)

In [None]:
# Encode the tag strings as integer class ids; the encoder is fitted on the training labels only
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Persist the fitted encoder; the context manager guarantees the file handle is flushed and closed
with open('labeller.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [None]:
# Fit the random forest on the vectorized training data (seeded for reproducibility)
model = RandomForestClassifier(
    bootstrap=False,
    n_estimators=300,
    max_depth=110,
    min_samples_leaf=3,
    min_samples_split=10,
    random_state=42,
)
model.fit(X_train, y_train)

# Persist the fitted model; the context manager guarantees the file handle is flushed and closed
with open('randomforest.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Mean accuracy on the held-out test split
model.score(X_test, y_test)

In [None]:
# Predict encoded class ids for the test matrix, then map them back to the original tag strings
preds = model.predict(X_test)
str_preds = encoder.inverse_transform(preds)

In [None]:
# Encoded predictions next to their decoded tag names, then the count of each predicted tag
prediction_columns = {
    'Predictions': preds,
    'String Predictions': str_preds,
}
df = pd.DataFrame(prediction_columns)
df['String Predictions'].value_counts()

In [None]:
# Decode the encoded test labels back to their tag strings
test_tags = encoder.inverse_transform(y_test)
test_tags

In [None]:
# Pair each true tag with its predicted tag. Both columns hold decoded tag strings so
# they can be compared directly (the encoded integer ids in `preds` are not comparable
# to the tag names in `test_tags`).
act_pre = pd.DataFrame({'Actual': test_tags,
                        'Preds': str_preds})

In [None]:
# Number of test rows per true tag
act_pre['Actual'].value_counts()

In [None]:
# Attach the decoded predictions to the test rows. `assign` returns a new frame, which
# avoids writing into the slice produced by train_test_split (SettingWithCopyWarning)
# and keeps the cell safe to re-run.
test = test.assign(preds=str_preds)
# Rows the model tagged as 'Travel'
test[test['preds'] == 'Travel']

In [None]:
# Messages from one sender that are both labelled and predicted as 'Travel'
travel_hits = test.loc[
    (test['sender_email'] == 'normstormin@gmail.com')
    & (test['first_tag'] == 'Travel')
    & (test['preds'] == 'Travel'),
    'Message',
]
# Row label 10857 is only available when that row landed in the test split, so fall
# back to the first matching message (or None when there is no match) instead of
# raising KeyError.
travel_hits.get(10857, travel_hits.iloc[0] if len(travel_hits) else None)

In [None]:
# Number of test-split emails per true 'first_tag' value
test['first_tag'].value_counts()

In [None]:
# Hyper-parameter grid for GridSearchCV; single-valued entries are held fixed
param_grid = {
    'bootstrap': [False],
    'max_depth': [7, 10, 110],
    'min_samples_leaf': [3, 6, 9],
    'min_samples_split': [10],
    'n_estimators': [500]
}

# Base model, seeded so every parameter combination is compared under the same
# randomness and the search result is reproducible
rfc = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid on all available cores, with progress logging
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, n_jobs=-1, verbose=2)

In [None]:
# Run the grid search on the vectorized training data
grid_search.fit(X_train, y_train)

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return the classification accuracy of `model` on a held-out set.

    The labels are categorical class ids, so accuracy is the share of exact
    matches between predictions and labels. Arithmetic on the ids (absolute or
    percentage error) is not meaningful and would divide by zero for class id 0.

    Parameters
    ----------
    model : object
        Fitted estimator exposing `predict(test_features)`.
    test_features : array-like or sparse matrix
        Features passed unchanged to `model.predict`.
    test_labels : array-like
        True class labels, one per row of `test_features`.

    Returns
    -------
    float
        Accuracy as a percentage in the range [0, 100].

    Raises
    ------
    ValueError
        If the predictions and labels differ in shape, or the test set is empty.
    """
    predictions = np.asarray(model.predict(test_features))
    test_labels = np.asarray(test_labels)

    if predictions.shape != test_labels.shape:
        raise ValueError(
            'Predictions have shape {} but labels have shape {}.'.format(
                predictions.shape, test_labels.shape))
    if test_labels.size == 0:
        raise ValueError('Cannot evaluate on an empty test set.')

    matches = predictions == test_labels
    n_errors = int(test_labels.size - np.sum(matches))
    accuracy = 100 * np.mean(matches)

    print('Model Performance')
    print('Misclassified: {} of {} samples.'.format(n_errors, test_labels.size))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
# Report the winning parameter combination and the refitted best estimator
print('Best params', grid_search.best_params_)

best_grid = grid_search.best_estimator_
print('Best grid', best_grid)
# y_test is a NumPy array at this point (output of LabelEncoder.transform) and has no
# `.values` attribute; np.asarray accepts an array or a Series alike.
grid_accuracy = evaluate(best_grid, X_test, np.asarray(y_test))