## Inspecting and processing data manually

In [None]:
%%sh
# Quietly install gensim, which later cells import to build the dictionary.
pip install --quiet gensim

In [None]:
%%sh
# Dataset description: https://s3.amazonaws.com/amazon-reviews-pds/readme.html
# Copy the five per-category review archives read by the cells below into /tmp.
for category in Camera Luggage Software Jewelry Home_Improvement; do
    aws s3 cp "s3://amazon-reviews-pds/tsv/amazon_reviews_us_${category}_v1_00.tsv.gz" /tmp
done

In [None]:
import pandas as pd

In [None]:
# Number of rows read from the start of each category file.
num_lines = 1000


def _read_reviews(path, nrows):
    """Load the first `nrows` rows of a gzipped, tab-separated review file.

    Every column is read as a string. Malformed lines are skipped with a
    warning instead of aborting the load.

    Args:
        path: Location of the ``.tsv.gz`` file.
        nrows: Maximum number of rows to read.

    Returns:
        A pandas DataFrame with the rows that could be parsed.
    """
    options = dict(sep='\t', compression='gzip', dtype='str', nrows=nrows)
    try:
        # Current spelling (pandas >= 1.3). 'warn' matches the behavior of
        # the legacy error_bad_lines=False: drop the line, emit a warning.
        return pd.read_csv(path, on_bad_lines='warn', **options)
    except TypeError:
        # Older pandas rejects the unknown keyword; use the legacy one,
        # which was removed in pandas 2.0.
        return pd.read_csv(path, error_bad_lines=False, **options)


cameras = _read_reviews('/tmp/amazon_reviews_us_Camera_v1_00.tsv.gz', num_lines)
luggage = _read_reviews('/tmp/amazon_reviews_us_Luggage_v1_00.tsv.gz', num_lines)
software = _read_reviews('/tmp/amazon_reviews_us_Software_v1_00.tsv.gz', num_lines)
jewelry = _read_reviews('/tmp/amazon_reviews_us_Jewelry_v1_00.tsv.gz', num_lines)
home = _read_reviews('/tmp/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz', num_lines)

In [None]:
data = pd.concat([cameras, luggage, software, jewelry, home])

In [None]:
data = data.dropna()
data.shape

In [None]:
data = data.drop(['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title',
                  'product_category', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 
                  'review_headline', 'review_date', 'star_rating'], axis=1)

In [None]:
data.head()

In [None]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once so that each call strips punctuation in a single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def process_text(text):
    """Turn a piece of text into a list of lower-case tokens.

    Every character in ``string.punctuation`` is removed, the remainder is
    lower-cased and then split on whitespace. Characters outside that set
    (for example typographic quotes) are left untouched.

    Args:
        text: The raw text as a ``str``.

    Returns:
        A list of tokens; empty when nothing is left after stripping.
    """
    return text.translate(_PUNCTUATION_TABLE).lower().split()

In [None]:
%%time
data['review_body'] = data['review_body'].apply(process_text)

In [None]:
data.head()

In [None]:
%%time

from gensim import corpora
# Build the gensim Dictionary from the tokenized reviews. Later cells call
# its doc2bow() method and use len(dictionary) as the number of matrix columns.
dictionary = corpora.Dictionary(data['review_body'])

In [None]:
# Print the dictionary object.
print(dictionary)

In [None]:
%%time

data['tokens'] = data.apply(lambda row: dictionary.doc2bow(row['review_body']), axis=1)

In [None]:
data = data.drop(['review_body'], axis=1)

In [None]:
from scipy.sparse import lil_matrix

# One matrix row per remaining review, one column per dictionary entry.
# Note that this rebinds num_lines, which earlier held the per-file row limit.
num_lines, num_columns = data.shape[0], len(dictionary)
# Empty float32 LIL matrix; it is filled one cell at a time further down.
token_matrix = lil_matrix((num_lines, num_columns), dtype='float32')

In [None]:
def add_row_to_matrix(line, row):
    """Copy one record's token counts into row `line` of token_matrix.

    Args:
        line: Row index of the module-level ``token_matrix`` to fill.
        row: Record whose ``'tokens'`` entry is an iterable of
            ``(token_id, token_count)`` pairs.

    Returns:
        None. ``token_matrix`` is modified in place.
    """
    for column, count in row['tokens']:
        token_matrix[line, column] = count

In [None]:
%%time
line = 0
for _, row in data.iterrows():
    add_row_to_matrix(line, row)
    line+=1

In [None]:
token_matrix

In [None]:
import io, boto3
import sagemaker.amazon.common as smac

# Serialize the sparse matrix into an in-memory buffer with the SageMaker SDK
# helper. None is passed as the third argument (presumably the labels --
# TODO confirm against the SDK documentation).
buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, token_matrix, None)
# Rewind so that the next reader starts at the beginning of the buffer.
buf.seek(0)

In [None]:
import sagemaker

session = sagemaker.Session()

# Destination: the session's default bucket, under a fixed prefix and key.
bucket = session.default_bucket()
prefix = 'amazon-reviews-lda'
train_key = 'reviews.protobuf'
obj = '{}/{}'.format(prefix, train_key)

# Upload the serialized matrix held in buf to that object.
s3_bucket = boto3.resource('s3').Bucket(bucket)
s3_bucket.Object(obj).upload_fileobj(buf)

# Keep the full S3 URI of the uploaded object; it is the 'train' input later.
s3_train_path = 's3://{}/{}'.format(bucket,obj)
print(s3_train_path)

## Training

In [None]:
# S3 location passed to the estimator as output_path: an 'output/' folder
# under the same bucket and prefix as the training data.
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_output)

In [None]:
# Run this cell if you want to use data processed by SageMaker Processing

import pickle

s3_train_path =

!aws s3 cp DICTIONARY_PATH .

with open('dictionary.pkl', 'rb') as data:
    dictionary = pickle.load()

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Region of the default boto3 session.
region_name = boto3.Session().region_name
# Look up the container image for the "lda" algorithm in that region
# ("latest" is presumably the image version -- TODO confirm).
container = get_image_uri(region_name, "lda", "latest")
print(container)

In [None]:
# Execution role that is handed to the estimator.
role = sagemaker.get_execution_role()

# Estimator for the LDA container: a single ml.c5.2xlarge training instance,
# with output_path set to s3_output.
lda = sagemaker.estimator.Estimator(container,
                                   role, 
                                   train_instance_count=1, 
                                   train_instance_type='ml.c5.2xlarge',
                                   output_path=s3_output,
                                   sagemaker_session=session)

In [None]:
# feature_dim is the dictionary size (num_columns). mini_batch_size is
# num_lines, which at this point holds the number of rows in `data` (it was
# reassigned from data.shape[0]), not the per-file row limit of 1000.
lda.set_hyperparameters(num_topics=5, 
                        feature_dim=num_columns, 
                        mini_batch_size=num_lines,
                        alpha0=0.1)

In [None]:
# Train using the object at s3_train_path as the 'train' input.
lda.fit(inputs={'train': s3_train_path})

In [None]:
# Deploy the trained model to one ml.t2.medium instance; the returned
# predictor is used for the sample predictions below.
lda_predictor = lda.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

In [None]:
# Sample review texts that are passed to process_samples() for prediction.
# Each line break inside a string is a backslash continuation, so every
# continued line must end with a space to keep adjacent words separate.
samples = [
"I sold all of my Canon gear recently and went with this great Fujifilm camera. With the kit lens, it \
focuses with incredible speed and accuracy. The stabilization on the lens is great for video work as well. \
There are so many things that are great about this camera but I can't list them all. One of my favorite \
capabilities of this camera is the ability to use vintage manual focus lenses. With the focus peaking feature \
you are able to confidently nail your focus. This feature has really brought back the fun in photography for \
me and exploring the wealth of affordable lenses.",

"When I came across this camera here on Amazon I was only looking for a quick and reliable Vlogging camcorder. \
This is a quality camera and comes with everything you need to take great pictures. It is small, lightweight \
and priced decent for the quality and megapixels. It comes with an extra battery and memory card. It is a \
great entry-level vlogging camera and I do not think you will be disappointed for the price.",

"Absolutely love these tiny delicate necklaces. They are well made and so pretty on! Also the pearls are real! \
I didn’t think they would be but they are - pleasantly surprised. This is one of the two diferente sets and \
I love them both! Love love love them! I wonder if they have them in silver as well?? Wyd definitely buy \
them too! ;) you will not be disappointed!",

"Took this on a trip recently and it's everything I was hoping for. It's solid, elegant and easy to roll. \
The handle also feels sturdy. The outside is textured and is therefore resistant to apparent scratches. \
There is also a TSA approved lock in case you ever have to check this in the baggage hold. Dimensions are \
adequate and within specs for carry-on, however, you will have difficulty fitting this in the smaller \
regional planes."
]

In [None]:
def process_samples(samples, dictionary):
    """Encode raw text samples into a serialized sparse-tensor buffer.

    Each sample is tokenized with ``process_text``, converted to
    ``(token_id, token_count)`` pairs with ``dictionary.doc2bow`` and written
    into one row of a float32 sparse matrix. The matrix is then serialized
    with ``smac.write_spmatrix_to_sparse_tensor``.

    Args:
        samples: Sequence of raw text strings, one per matrix row.
        dictionary: Object providing ``doc2bow`` and ``len()``; its length
            is the number of matrix columns.

    Returns:
        An ``io.BytesIO`` holding the serialized matrix, rewound to offset 0.
    """
    sample_matrix = lil_matrix((len(samples), len(dictionary))).astype('float32')
    # enumerate() yields the row index; no manual counter is needed.
    for line, sample in enumerate(samples):
        bag_of_words = dictionary.doc2bow(process_text(sample))
        for token_id, token_count in bag_of_words:
            sample_matrix[line, token_id] = token_count
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, sample_matrix, None)
    # Rewind so the caller reads the payload from the start.
    buf.seek(0)
    return buf

In [None]:
from sagemaker.predictor import json_deserializer

# Declare the request payload as RecordIO-protobuf and decode responses with
# the SDK's json_deserializer.
lda_predictor.content_type = 'application/x-recordio-protobuf'
lda_predictor.deserializer = json_deserializer

# Encode the sample texts with process_samples and send them to the endpoint.
response = lda_predictor.predict(process_samples(samples, dictionary))

In [None]:
import pprint

# Pretty-print the deserialized prediction response.
pprint.pprint(response)

In [None]:
# Delete the endpoint created by lda.deploy() once it is no longer needed.
lda_predictor.delete_endpoint()