In [4]:
import sagemaker
import boto3
import pandas as pd

In [6]:
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket = sess.default_bucket()

## Download data

In [14]:
df = pd.read_csv('Reviews.csv')

In [15]:
df.head()

Unnamed: 0,sentiment,review_body
0,-1,This suit did nothing for me. the top has zero...
1,-1,Like other reviewers i saw this dress on the ...
2,-1,I wish i had read the reviews before purchasin...
3,-1,I ordered these pants in my usual size (xl) an...
4,-1,I noticed this top on one of the sales associa...


## Transforming data

In [16]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [17]:
def tokenize(review):
    # delete commas and quotation marks, apply tokenization and join back into a string separating by spaces
    return ' '.join([str(token) for token in nltk.word_tokenize(str(review).replace(',', '').replace('"', '').lower())])
    
def prepare_data(df):
    df['sentiment'] = df['sentiment'].map(lambda sentiment : '__label__{}'.format(str(sentiment).replace('__label__', '')))
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    df['sentiment'] = df['sentiment'].map(lambda review : tokenize(review)) # Replace all None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    return df

In [21]:
df = df[['sentiment', 'review_body']].reset_index(drop=True)
df = prepare_data(df_blazingtext)
df.head()

Unnamed: 0,sentiment,review_body
0,__label__-1,This suit did nothing for me. the top has zero...
1,__label__-1,Like other reviewers i saw this dress on the ...
2,__label__-1,I wish i had read the reviews before purchasin...
3,__label__-1,I ordered these pants in my usual size (xl) an...
4,__label__-1,I noticed this top on one of the sales associa...


## Split the data between training and validaions set

In [22]:
from sklearn.model_selection import train_test_split
# Split all data into 90% train and 10% holdout
df_train, df_validation = train_test_split(df, 
                                           test_size=0.10,
                                           stratify=df_blazingtext['sentiment'])

In [23]:
df_train.head()

Unnamed: 0,sentiment,review_body
4540,__label__0,Interesting slightly transparent fabric. neck...
1753,__label__-1,The cut of the bodice is very strange and unfl...
1158,__label__-1,I'll start by saying over the years i get mo...
239,__label__-1,I loved this dress before putting it on but i...
2423,__label__0,I love this blouse i've received so many comp...


### Save data to csv

In [24]:
train_path = './train.csv'
df_train[['sentiment', 'review_body']].to_csv(train_path, index=False, header=False, sep=' ')

validaion_path = './validation.csv'
df_train[['sentiment', 'review_body']].to_csv(validaion_path, index=False, header=False, sep=' ')




### Upload data to s3 buckets

In [25]:
train_uri = sess.upload_data(bucket=bucket, key_prefix='data', path=train_path )
validation_uri = sess.upload_data(bucket=bucket, key_prefix='data', path=validaion_path )

## Training the model

### Retrieve image container

In [28]:
image_uri = sagemaker.image_uris.retrieve(framework='blazingtext',
                                          region = region
                                         )

### Create an estimator instance

In [30]:
estimator = sagemaker.estimator.Estimator(image_uri=image_uri,
                                          role=role,
                                          instance_count=1,
                                          instance_type='ml.m5.large',
                                          volume_size=30,
                                          max_run = 7200
                                         )

### Tune the hyperparameters

In [31]:
estimator.set_hyperparameters(mode='supervised', 
                              epochs=10, 
                              learning_rate=0.01,
                              min_count=2,
                              vector_dim=300,
                              word_ngrams=3
                             )

### Create data channels

In [36]:
train_data = sagemaker.inputs.TrainingInput(s3_data = train_uri,
                                            distribution='FullyReplicated',
                                            content_type='text/plain',
                                            s3_data_type='S3Prefix'
                                           )

In [37]:
validation_data = sagemaker.inputs.TrainingInput(s3_data = validation_uri,
                                            distribution='FullyReplicated',
                                            content_type='text/plain',
                                            s3_data_type='S3Prefix'
                                           )

In [39]:
data_channels = {
    'train': train_data,
    'validation': validation_data
}

## Fitting the model

In [40]:
estimator.fit(inputs=data_channels,
             wait=False)

## Running the model ≈ 10 minutes

In [43]:
import time

In [44]:
%%time

estimator.latest_training_job.wait(logs=False)


2022-03-14 06:29:21 Starting - Preparing the instances for training
2022-03-14 06:29:21 Downloading - Downloading input data.
2022-03-14 06:29:32 Training - Training image download completed. Training in progress........
2022-03-14 06:30:13 Uploading - Uploading generated training model.....................................................................................
2022-03-14 06:37:24 Completed - Training job completed
CPU times: user 431 ms, sys: 26.9 ms, total: 458 ms
Wall time: 8min 4s


### Get training and validation accuracy

In [45]:
estimator.training_job_analytics.dataframe()



Unnamed: 0,timestamp,metric_name,value
0,0.0,train:accuracy,0.5076
1,0.0,validation:accuracy,0.5076


## Deploying model ≈ 10 minutes

In [48]:
text_classifier = estimator.deploy(initial_instance_count=1,
                                   instance_type='ml.m5.large',
                                   serializer=sagemaker.serializers.JSONSerializer(),
                                   deserializer=sagemaker.deserializers.JSONDeserializer()
                                  )

------!

## Test model

In [50]:
reviews = ['This product is great!',
           'OK, but not great',
           'This is not the right product.'] 


In [51]:
tokenized_reviews = [' '.join(nltk.word_tokenize(review)) for review in reviews]

payload = {'instances' : tokenized_reviews}

print(payload)

{'instances': ['This product is great !', 'OK , but not great', 'This is not the right product .']}


In [53]:
predictions = text_classifier.predict(payload)

for prediction in predictions:
    print('Predicted class: {}'.format(prediction['label'][0].lstrip('__label__')))

Predicted class: 1
Predicted class: 0
Predicted class: -1


In [54]:
predictions

[{'label': ['__label__1'], 'prob': [0.3333541452884674]},
 {'label': ['__label__0'], 'prob': [0.3333447277545929]},
 {'label': ['__label__-1'], 'prob': [0.3333498239517212]}]