                          ####  Using Amazon Comprehend Through the boto3 API ####

This notebook shows how to use boto3 Amazon API to use Amazon Comprehend for real time analysis as well as scheduling analysis jobs.

For boto3 to work you need to create an IAM User, receive aws_access_key_id and aws_secret_access_key and configure your credentials using AWS Command Line Interface (AWS CLI).
Cost. If you are using free AWS tier, you can analyze 50K units a month free. In my example, every tweet is a unit. In the scheduled job I am analyzing 10K tweets at once, so the free tier runs out pretty fast, and then it's $1 per 10K. Be sure to check pricing before you proceed. https://aws.amazon.com/comprehend/pricing/
Reference. Boto3 S3: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/comprehend.html Boto3 Comprehend: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html

In [1]:
import logging
import boto3
import pandas as pd 
from dotenv import load_dotenv
from botocore.exceptions import ClientError
import tarfile 
import json 
import os
load_dotenv()

def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

bucket_name = "comprihend-tweet-bucket"
local_file_name = "Comprehend\\amazon_tweets_1.csv"
s3_file_name = "amazon_tweets.csv"
upload_file(local_file_name, bucket_name, s3_file_name)


True

                        ### Downloading the data from the S3 bucket in the form of .tar.gz file ### 

In [2]:
entities_results_S3Url = "https://comprihend-tweet-bucket.s3.amazonaws.com/amazon_tweets_1.csv"
local_results_filename = 'Comprehend/outputs/entities.csv'

s3_name = 's3://' + bucket_name + '/'
results_aws_filename = entities_results_S3Url.replace(s3_name, '')
def download_all_files():
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket_name)
    for file in my_bucket.objects.all():
        print(file.key)
        s3.Bucket(bucket_name).download_file(file.key, local_results_filename)
            
download_all_files()
print(f"File downloaded from {entities_results_S3Url} to {local_results_filename}")

amazon_tweets.csv
File downloaded from https://comprihend-tweet-bucket.s3.amazonaws.com/amazon_tweets_1.csv to Comprehend/outputs/entities.csv


                            ### Extracting the result ###    

In [3]:
def extract_targz(targz_file, output_path = ''):
    if targz_file.endswith("tar.gz"):
        tar = tarfile.open(targz_file, "r:gz")
        tar.extractall(path = output_path)
        tar.close()
    elif targz_file.endswith("tar"):
        tar = tarfile.open(targz_file, "r:")
        tar.extractall(path = output_path)
        tar.close()
output_path = 'Comprehend/outputs/extracted'
extract_targz(local_results_filename, output_path)
print(f"file extracted to {output_path}")

file extracted to Comprehend/outputs/extracted


                                ### Live Single Record Processing ### 

In [4]:
local_file_name = 'Comprehend/amazon_tweets.csv'
df = pd.read_csv(local_file_name, header = None, names = ['amazon_tweets'], dtype = 'str')
df.loc[0].item()
comprehend = boto3.client(service_name='comprehend')
sentiment_output = comprehend.detect_sentiment(Text=df.loc[0].item(), LanguageCode='en')
# print(sentiment_output)
sentiment_output['SentimentScore']


{'Positive': 0.28518974781036377,
 'Negative': 0.0012132528936490417,
 'Neutral': 0.7135248184204102,
 'Mixed': 7.219231338240206e-05}

                                    ### Live Multiple Record Processing ### 

In [5]:
tweet25 = list(df.amazon_tweets[0:5])
for i in range(len(tweet25)):
    sentiment_output = comprehend.detect_sentiment(Text=tweet25[i], LanguageCode='en')
    print(sentiment_output)
    print(sentiment_output['SentimentScore']) 
    # write a output in a file 
    with open('sentiment_output.txt', 'a') as f:
        f.write(str(sentiment_output['SentimentScore']))
        f.write('\n')
        f.close()
tweets25 = list(df.amazon_tweets[0:6])
sentiment_batch = comprehend.batch_detect_sentiment(TextList=tweets25,
                                                    LanguageCode='en')
tweets25[4]
sentiment_batch['ResultList'][4]

In [6]:
entities = comprehend.batch_detect_entities(TextList=tweets25, LanguageCode='en')
tweets25_entities = entities['ResultList']
pd.DataFrame(entities['ResultList'][0]['Entities'])

Unnamed: 0,Score,Type,Text,BeginOffset,EndOffset
0,0.997289,LOCATION,USA,12,15
1,0.992791,ORGANIZATION,FDA,17,20
2,0.856835,ORGANIZATION,GMP,33,36
3,0.87406,ORGANIZATION,Kosher,41,47
4,0.945508,QUANTITY,every time,122,132
5,0.994793,OTHER,https://t.co/i6ZwFpeo4p,134,157
6,0.565103,OTHER,https://t.co/XRU2MvKLVy,158,181


In [7]:
# Function to parse the dictionary
def parse_entities_batch(data):
    df = pd.DataFrame() # declare an empty dataframe
    nested_json = 'Entities' # nested sub-dictiptionary to extract data from
    # populate the dataframe
    for line in data['ResultList']:
        dt_temp = pd.DataFrame(line[nested_json])  # extract data from sub-dictionary
        other_fields = list(line.keys())
        other_fields.remove(nested_json) # remove nested fields        
        for field in other_fields:  # add common fields
            dt_temp[field] = line[field]
            df = pd.DataFrame(pd.concat([df, dt_temp], sort=False))
    return(df)
entities_batch_df = parse_entities_batch(entities)
entities_batch_df.head(15)


Unnamed: 0,Score,Type,Text,BeginOffset,EndOffset,Index
0,0.997289,LOCATION,USA,12,15,0
1,0.992791,ORGANIZATION,FDA,17,20,0
2,0.856835,ORGANIZATION,GMP,33,36,0
3,0.87406,ORGANIZATION,Kosher,41,47,0
4,0.945508,QUANTITY,every time,122,132,0
5,0.994793,OTHER,https://t.co/i6ZwFpeo4p,134,157,0
6,0.565103,OTHER,https://t.co/XRU2MvKLVy,158,181,0
0,0.997085,QUANTITY,$200,0,4,1
1,0.949746,ORGANIZATION,Amazon,5,11,1
0,0.962676,PERSON,@SenSanders,0,11,2


In [8]:
sentiment_batch = comprehend.batch_detect_sentiment(TextList=tweets25,
                                                    LanguageCode='en')
tweets25[4]
sentiment_batch['ResultList'][4]

{'Index': 4,
 'Sentiment': 'NEUTRAL',
 'SentimentScore': {'Positive': 0.3611469268798828,
  'Negative': 0.00013270163617562503,
  'Neutral': 0.6384015679359436,
  'Mixed': 0.0003188896516803652}}

In [9]:
def parse_sentiment_batch(data):
    df = pd.DataFrame() 
    for line in data['ResultList']:
        try:
            dt_temp = pd.DataFrame(line['SentimentScore'], index = [0])  # extract data from sub-dictionary
            for field in list(line.keys())[:-1]:  # add common fields
                dt_temp[field] = line[field]
        
            df = pd.DataFrame(pd.concat([df, dt_temp], sort=False, ignore_index=True))
                  
        except:
            for field in list(line.keys())[:-1]:  # add common fields
                dt_temp[field] = line[field]
        
            df = pd.DataFrame(dt_temp, ignore_index = True)
    
    return(df)

sentiment_batch_df = parse_sentiment_batch(sentiment_batch)

sentiment_batch_df.head()

Unnamed: 0,Positive,Negative,Neutral,Mixed,Index,Sentiment
0,0.28519,0.001213,0.713525,7.2e-05,0,NEUTRAL
1,0.005474,0.000151,0.994367,7e-06,1,NEUTRAL
2,0.001153,0.941665,0.056931,0.000251,2,NEGATIVE
3,0.01145,0.526684,0.461734,0.000131,3,NEGATIVE
4,0.361147,0.000133,0.638402,0.000319,4,NEUTRAL


                                                            ### Key Phrases ###

In [10]:
tweet_dump = [''.join(tweets25)]
key_prase_batch_output = comprehend.batch_detect_key_phrases(TextList=tweet_dump, LanguageCode='en')
pd.DataFrame(key_prase_batch_output['ResultList'][0]['KeyPhrases']) 


Unnamed: 0,Score,Text,BeginOffset,EndOffset
0,0.985102,the USA,8,15
1,0.753778,"FDA registered, GMP and Kosher certified facility",17,66
2,0.974916,"the highest quality, potency and consistency",77,121
3,0.99321,every time,122,132
4,0.802611,https://t.co/i6ZwFpeo4p https://t.co/XRU2MvKLV...,134,211
5,0.773343,Gluten Free Giveaways,221,242
6,0.52755,SenSanders,243,253
7,0.997508,people,272,278
8,0.983171,amazon,293,299
9,0.988972,the richest,313,324
