## Prerequisites and Preprocessing

### Set Permissions and Environment Variables

In [1]:
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role

# S3 location of the review CSV that this notebook reads
bucket = 'test00001a'
data_key = 'comprehend_baseline/lang_to_be_processed.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

# Region name of the default boto3 session
region = boto3.Session().region_name

# IAM role ARN of the notebook's execution role (grants access to the data)
role = get_execution_role()

### Import Libraries

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Connect to Comprehend 

In [3]:
# Comprehend client; the region is pinned to us-east-1 here rather than taken
# from the session `region` defined above.
comprehend = boto3.client(service_name='comprehend', region_name='us-east-1')

### Load Data

In [4]:
# Read the review CSV from S3; the first CSV column is used as the index.
data = pd.read_csv(filepath_or_buffer=data_location, index_col=0)

# Summarise columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397365 entries, 474948 to 99999
Data columns (total 17 columns):
review_id         397365 non-null int64
review_heading    16141 non-null object
review_comment    397285 non-null object
brand_id          397365 non-null int64
store_id          397365 non-null int64
platform_id       397365 non-null int64
platform_name     397365 non-null object
brand_name        397365 non-null object
rating            397365 non-null int64
combined          397365 non-null object
aws_lang          0 non-null float64
aws_lang_score    0 non-null float64
aws_mix           0 non-null float64
aws_neg           0 non-null float64
aws_neu           0 non-null float64
aws_pos           0 non-null float64
aws_sent          0 non-null float64
dtypes: float64(7), int64(5), object(5)
memory usage: 54.6+ MB


## Language Detection

In [None]:
%%time

# Detect the dominant language of every review with Amazon Comprehend.
# Exactly one entry is appended per row of `data`, so both lists line up with
# the frame when they are later stored in `aws_lang` / `aws_lang_score`.
languages = []
scores = []

# Iterate over the column values directly instead of looking each row up by
# label: a label lookup returns a Series (not a string) when the index
# contains duplicate labels.
for text in data['combined']:
    # Reset per row so that a skipped row records NaN instead of silently
    # reusing the previous row's result (or raising NameError on the first row).
    lang = score = np.nan

    # Missing or empty texts are not sent to the API.
    if isinstance(text, str) and text != '':
        res = comprehend.detect_dominant_language(Text=text)
        detected = res.get('Languages') or []
        if detected:
            # The first returned entry is taken as the dominant language.
            lang = detected[0]['LanguageCode']
            score = detected[0]['Score']

    languages.append(lang)
    scores.append(score)

In [31]:
# Store the detection results in the frame, one column per result list.
for column, values in (('aws_lang', languages), ('aws_lang_score', scores)):
    data[column] = values

In [32]:
# True when every row has a detected language.
data['aws_lang'].notna().all()

True

In [40]:
# Function to upload to S3
from io import StringIO

def write_pd_s3_csv(df, bucket, filepath):
    """Upload ``df`` to S3 as a CSV object.

    The frame is serialised to CSV in memory (index included, as ``to_csv``
    does by default) and stored under ``filepath`` in ``bucket``.

    Parameters
    ----------
    df : object with a ``to_csv`` method (a pandas DataFrame in this notebook)
    bucket : str
        Name of the destination S3 bucket.
    filepath : str
        Object key inside the bucket.

    Returns
    -------
    None. A confirmation line is printed after the upload call returns.
    """
    buffer = StringIO()
    df.to_csv(buffer)

    target = boto3.resource('s3').Object(bucket, filepath)
    target.put(Body=buffer.getvalue())

    print("The data is successfully written to S3 path:", bucket+"/"+filepath)

# Write to S3
# Upload `data`, the frame that received the aws_lang / aws_lang_score columns;
# `data3` is never defined in this notebook.
s3_bucket = bucket  # same bucket the input was read from
file_path = 'comprehend_baseline/new_lang3.csv'
write_pd_s3_csv(data, s3_bucket, file_path)

The data is successfully written to S3 path: test00001a/comprehend_baseline/lang_to_be_processed2.csv


## Sentiment Analysis

In [17]:
%%time 

# Score the sentiment of every review with Amazon Comprehend.
# Exactly one entry is appended to each list per row of `data`, so the lists
# line up with the frame when they are stored as columns afterwards.
sentiments = []
positive = []
negative = []
neutral = []
mixed = []

# Only rows whose detected language is in this list are sent to detect_sentiment.
supported_languages = ['en','de', 'fr', 'es', 'it', 'pt']

# Walk the two columns in lockstep instead of looking rows up by label: a label
# lookup returns a Series (not a scalar) when the index has duplicate labels.
for lang, text in zip(data['aws_lang'], data['combined']):
    # Default every row to NaN. Without this reset, a row with a supported
    # language but an empty text would reuse the previous row's scores
    # (or raise NameError on the first row).
    sent = pos = neg = neu = mix = np.nan

    if lang in supported_languages and isinstance(text, str) and text != '':
        res = comprehend.detect_sentiment(Text=text, LanguageCode=lang)
        sent = res.get('Sentiment')
        sentiment_scores = res.get('SentimentScore')
        pos = sentiment_scores['Positive']
        neg = sentiment_scores['Negative']
        neu = sentiment_scores['Neutral']
        mix = sentiment_scores['Mixed']

    sentiments.append(sent)
    positive.append(pos)
    negative.append(neg)
    neutral.append(neu)
    mixed.append(mix)

CPU times: user 25.9 s, sys: 700 ms, total: 26.6 s
Wall time: 7min


In [18]:
# One sentiment entry must exist per row of the frame.
len(data) == len(sentiments)

True

In [19]:
# Store the sentiment label and the four class scores in the frame.
sentiment_columns = (
    ('aws_sent', sentiments),
    ('aws_pos', positive),
    ('aws_neg', negative),
    ('aws_neu', neutral),
    ('aws_mix', mixed),
)
for column, values in sentiment_columns:
    data[column] = values

### Save Data to S3

In [29]:
# Write to S3
# Upload `data`, the frame that received the aws_sent / aws_pos / aws_neg /
# aws_neu / aws_mix columns; uploading any other frame would drop them.
# NOTE: this key is the same as `data_key`, so the upload replaces the input file.
file_path = 'comprehend_baseline/lang_to_be_processed.csv'
write_pd_s3_csv(data, s3_bucket, file_path)

The data is successfully written to S3 path: test00001a/comprehend_baseline/lang_to_be_processed.csv
