                                            ***  Asynchronous Processing - Scheduling an Analysis Job  ***
I highly recommend that you run at least one Comprehend job from the point and click interface, especially, if you are new to AWS. This way you can create a data access role (aka data_access_role_arn), and then you can simply copy the role name from the job description.

You will need to create your S3 bucket through the web interface or through BOTO3 API.

Note that I use different folders s3://comprehend-api/input-data and s3://comprehend-api/results for input data and results output. This way, your results are not going to get confused for inputs if you were to analyze all files in the folder.

In [1]:
from dotenv import load_dotenv
import boto3
import json 
import pandas as pd 
from botocore.exceptions import ClientError 
load_dotenv()
import logging
import tarfile
# print(load_dotenv())

def create_bucket(bucket_name):
    """Create an S3 bucket in a specified region
    If a region is not specified, the bucket is created in the S3 default
    region (us-east-1).
    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """ 
    if bucket_name != None:
        region = 'us-east-1'
        s3_client = boto3.client('s3', region_name=region)
        s3_client.create_bucket(Bucket=bucket_name)
        return True
    elif bucket_name == None:
        print("Bucket name is empty")
        
        return False
bucket_name = "comphrend-wallmart-bucket"
if bucket_name == "comphrend-wallmart-bucket":
    print("Bucket name is Allready exist")
    pass
else:
    create_bucket(bucket_name)
    print("Bucket created")


Bucket name is Allready exist


                                    *** Uploding the data to S3 *** 

In [2]:
local_file_name = "../Comprehend/wallmarts_tweets_1k.csv"
bucket_name = "comphrend-wallmart-bucket"
aws_file_name = "input-data/wallmarts_tweets_1k.csv"

s3_client = boto3.client('s3')
def upload_file(local_file_name, bucket_name, aws_file_name):
    try:
        s3 = s3_client.upload_file(local_file_name, bucket_name, aws_file_name)
        print("File Uploaded")
    except ClientError as e:
        logging.error(e)
        return False
    return True
print(upload_file(local_file_name, bucket_name, aws_file_name)) 

File Uploaded
True


                                                *** Configure Sentiment Detection Job *** 

In [3]:
comprehend = boto3.client('comprehend')
input_s3_url = "s3://comphrend-wallmart-bucket/input-data/wallmarts_tweets_1k.csv"
output_s3_url = "s3://comphrend-wallmart-bucket/results"
input_doc_format = "ONE_DOC_PER_LINE"
data_acess_role_arn = "arn:aws:iam::893415859041:role/service-role/AmazonComprehendServiceRole-Com-S3"

input_data_config = {"S3Uri": input_s3_url, "InputFormat": input_doc_format}
output_data_config = {"S3Uri": output_s3_url}

start_job_sentiment = comprehend.start_sentiment_detection_job(
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    DataAccessRoleArn=data_acess_role_arn,
    LanguageCode='en',
    JobName='Walmart_1K_tweets')

job_id = start_job_sentiment['JobId']
print(f"Job ID: {job_id}")



Job ID: 7789fcaa43d6874376332374a69fe845


In [4]:
describe_result = comprehend.describe_sentiment_detection_job(JobId=job_id)
job_status = describe_result['SentimentDetectionJobProperties']['JobStatus']
print(f'Job Status: {job_status}')
if job_status == 'FAILED':
    print(f'Reason: {describe_result["SentimentDetectionJobProperties"]["Message"]}')


Job Status: SUBMITTED


In [11]:
results_S3Url = comprehend.describe_sentiment_detection_job(
    JobId=job_id)['SentimentDetectionJobProperties']['OutputDataConfig']['S3Uri']
print(f'Results S3 Url: {results_S3Url}')

Results S3 Url: s3://comphrend-wallmart-bucket/results/893415859041-SENTIMENT-7789fcaa43d6874376332374a69fe845/output/output.tar.gz


In [12]:
def s3_bucket_list_obj(bucket):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket)
    for obj in bucket.objects.all():
        print(obj.key)
s3_bucket_list_obj(bucket_name)


input-data/
input-data/wallmarts_tweets_1k.csv
results/
results/.write_access_check_file.temp
results/893415859041-SENTIMENT-7789fcaa43d6874376332374a69fe845/output/output.tar.gz
results/893415859041-SENTIMENT-83481910f807237a7ca5738c53288d8d/output/output.tar.gz


In [13]:
# Give your local results file a name
results_name = 'sentiment'

local_results_filename = '../Comprehend/outputs/' + results_name + '.tar.gz'
s3_name = 's3://' + bucket_name + '/'
results_aws_filename = results_S3Url.replace(s3_name, '')

# Download results
s3 = boto3.client('s3')
s3.download_file(bucket_name,
                results_aws_filename, 
                local_results_filename)

print('Results downloaded to: ' + local_results_filename)

Results downloaded to: ../Comprehend/outputs/sentiment.tar.gz


In [14]:
import tarfile
def extract_targz(targz_file, output_path = ''):
    if targz_file.endswith("tar.gz"):
        tar = tarfile.open(targz_file, "r:gz")
        tar.extractall(path = output_path)
        tar.close()
    elif targz_file.endswith("tar"):
        tar = tarfile.open(targz_file, "r:")
        tar.extractall(path = output_path)
        tar.close()
results_name = 'sentiment'
local_results_filename = '../Comprehend/outputs/' + results_name + '.tar.gz'
output_path = 'Comprehend/outputs/extracted' 
extract_targz(local_results_filename, output_path)
print('Results extracted to: ' + output_path)

Results extracted to: Comprehend/outputs/extracted


In [157]:
# Read JSON into a dictionary   
import pandas as pd
input_file = output_path + '/output'
results = [json.loads(line) for line in open(input_file, 'r')]
print('Number of records in the output:',len(results))

user_input = input("Enter the tweet number to see the sentiment: ") 
df = pd.read_csv(local_file_name, header=None) 
output = json.dumps(results[int(user_input)], indent=4, sort_keys=True)
print(df.iloc[int(user_input), 0] + '\n', output)

sentiment_results = []

Number of records in the output: 10000
@lxoG21 I love me some Walmart candles lol the Sweet Apples and Cactus Aloe 🥴 their wax melts are amazing too
 {
    "File": "wallmarts_tweets_1k.csv",
    "Line": 5,
    "Sentiment": "NEUTRAL",
    "SentimentScore": {
        "Mixed": 7.657633250346407e-05,
        "Negative": 0.002575723920017481,
        "Neutral": 0.9885324239730835,
        "Positive": 0.008815310895442963
    }
}


In [1]:
from datetime import datetime
import json
record_no = 1
# df = pd.read_csv(local_file_name, header=None)

# for i in range(records_no):
#     df.loc[i] = df.loc[i].str.replace(r'\s+', ' ').str.strip()
#     output = json.dumps(results[i], indent=4, sort_keys=True)
#     print('SYNCHRONOUS RESULTS:')
#     print(df.iloc[i, 0] + '\n', output)
#     print('\n')
#     print('Sentiment: ' + results[i]['Sentiment'])
# print('\nASYNCHRONOUS RESULTS:')

# for i in range(records_no):
#     output_asynch = json.dumps(results[i], indent=4, sort_keys=True) 
#     print(df.loc[i] + '\n', output_asynch)
#     print('\n')

print('TWEET TEXT:\n', 
      df.loc[record_no].item())
# Real Time Results
print('\nREAL TIME RESULTS:\n') 
print(comprehend.detect_sentiment(Text=df.loc[record_no].item(), LanguageCode='en')['SentimentScore'])
# Job Resutls
print('\nASYNCHRONOUS RESULTS:')
game = json.dumps(results[record_no], indent=4, sort_keys=True)
print(game)

#  write the results to a json file as current date and time 
output_file_name_json = '../Outputs_file/Wallmarts' + str({record_no}) + datetime.now().strftime("%d-%m-%Y_%H-%M-%S") + '.json' 

with open(output_file_name_json, 'w') as f:
      json.dump(results, f, indent=4, sort_keys=True) 
print('Results saved to: ' + output_file_name_json)

NameError: name 'df' is not defined

In [None]:

output_name = '../Outputs_file/Wallmarts/' + str({record_no}) + datetime.now().strftime("%d-%m-%Y_%H-%M-%S") + '.json'
sentiment_results['Text'] = df.walmart_tweets
# Convert json file to excel 
data_excel = pd.read_json(output_file_name_json).to_excel(output_name, engine = 'xlsxwriter',  encoding = 'utf-8')
print('Results saved to: ' + output_name) 