In [2]:
from IPython import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## Part 1: Existing Machine Learning Services

<a href="https://colab.research.google.com/github/peckjon/hosting-ml-as-microservice/blob/master/part1/score_reviews_via_service.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Obtain labelled reviews

In order to test any of the sentiment analysis APIs, we need a labelled dataset of reviews and their sentiment polarity. We'll use NLTK to download the movie_reviews corpus.

In [1]:
from nltk import download

download('movie_reviews')
download('stopwords')
download('wordnet')

from nltk.corpus import stopwords
stop = stopwords.words('english')

from textblob import Word

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Load the data

The files in movie_reviews have already been divided into two sets: positive ('pos') and negative ('neg'), so we can load the raw text of the reviews into two lists, one for each polarity.

In [9]:
from nltk.corpus import movie_reviews

# extract words from reviews, pair with label

reviews_pos = []
for fileid in movie_reviews.fileids('pos'):
    review = movie_reviews.raw(fileid)
    reviews_pos.append(review)

reviews_neg = []
for fileid in movie_reviews.fileids('neg'):
    review = movie_reviews.raw(fileid)
    reviews_neg.append(review)

In [10]:
len(reviews_pos) , len(reviews_neg)
import pandas as pd
import re

(1000, 1000)

In [11]:
# lists to df for faster preprocessing
df = pd.DataFrame(data = {'pos':reviews_pos,'neg':reviews_neg})
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pos     1000 non-null   object
 1   neg     1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


Unnamed: 0,pos,neg
0,films adapted from comic books have had plenty...,"plot : two teen couples go to a church party ,..."
1,every now and then a movie comes along from a ...,the happy bastard's quick movie review \ndamn ...
2,you've got mail works alot better than it dese...,it is movies like these that make a jaded movi...
3,""" jaws "" is a rare film that grabs your atten...",""" quest for camelot "" is warner bros . ' firs..."
4,moviemaking is a lot like being the general ma...,synopsis : a mentally unstable man undergoing ...


In [12]:
%%time
# clean punct,special chars
# df['pos'] = df['pos'].apply(lambda x: re.sub('[^A-Za-z|.]+',' ',x))
# df['neg'] = df['neg'].apply(lambda x: re.sub('[^A-Za-z|.]+',' ',x))

# clean punct,special chars
df['pos'] = df['pos'].apply(lambda x: re.sub('[^A-Za-z]+',' ',x))
df['neg'] = df['neg'].apply(lambda x: re.sub('[^A-Za-z]+',' ',x))

# remove stopwords
df['pos'] = df['pos'].apply(lambda x : ' '.join(x for x in x.split() if x not in stop))
df['neg'] = df['neg'].apply(lambda x : ' '.join(x for x in x.split() if x not in stop))

# lemmatization
df['pos'] = df['pos'].apply(lambda x : " ".join([Word(word).lemmatize() for word in x.split()]))
df['neg'] = df['neg'].apply(lambda x : " ".join([Word(word).lemmatize() for word in x.split()]))



CPU times: user 6.49 s, sys: 22.1 ms, total: 6.52 s
Wall time: 6.52 s


In [13]:
df.head()

Unnamed: 0,pos,neg
0,film adapted comic book plenty success whether...,plot two teen couple go church party drink dri...
1,every movie come along suspect studio every in...,happy bastard quick movie review damn k bug go...
2,got mail work alot better deserves order make ...,movie like make jaded movie viewer thankful in...
3,jaw rare film grab attention show single image...,quest camelot warner bros first feature length...
4,moviemaking lot like general manager nfl team ...,synopsis mentally unstable man undergoing psyc...


In [23]:
df['pos_count'] = df['pos'].apply(lambda x: len(''.join(x.split())))
df['neg_count'] = df['neg'].apply(lambda x: len(''.join(x.split())))
df.head()

Unnamed: 0,pos,neg,pos_count,neg_count
0,film adapted comic book plenty success whether...,plot two teen couple go church party drink dri...,2338,1824
1,every movie come along suspect studio every in...,happy bastard quick movie review damn k bug go...,2201,697
2,got mail work alot better deserves order make ...,movie like make jaded movie viewer thankful in...,1215,1511
3,jaw rare film grab attention show single image...,quest camelot warner bros first feature length...,3325,1603
4,moviemaking lot like general manager nfl team ...,synopsis mentally unstable man undergoing psyc...,2092,2376


In [180]:
len(df[(df['neg_count']>4256) & (df['neg_count']<=4300)]['neg']) , len(df[(df['pos_count']>4256) & (df['pos_count']<=4300)]['pos'])

(1, 3)

In [185]:
len(df[(df['pos_count']<=4400)]) , len(df[(df['neg_count']<=4400)])

(968, 989)

In [184]:
# back to lists
reviews_pos = df[df['pos_count']<=4300]['pos'].to_list()
reviews_neg = df[df['neg_count']<=4300]['neg'].to_list()

len(reviews_pos) , len(reviews_neg)

(960, 984)

### Connect to the scoring API

Fill in this function with code that connects to one of these APIs, and uses it to score a single review:

* [Amazon Comprehend: Detect Sentiment](https://docs.aws.amazon.com/comprehend/latest/dg/API_DetectSentiment.html)
* [Google Natural Language: Analyzing Sentiment](https://cloud.google.com/natural-language/docs/analyzing-sentiment)
* [Azure Cognitive Services: Sentiment Analysis](https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/how-tos/text-analytics-how-to-sentiment-analysis)
* [Algorithmia: Sentiment Analysis](https://algorithmia.com/algorithms/nlp/SentimentAnalysis)

Your function must return either 'pos' or 'neg', so you'll need to make some decisions about how to map the results of the API call to one of these values. For example, Amazon Comprehend can return "NEUTRAL" or "MIXED" for the Sentiment -- if this happens, you may with to inspect the numeric values under the SentimentScore to see whether it leans toward positive or negative.


In [20]:
import requests
# pprint is used to format the JSON response
from pprint import pprint

import os

subscription_key = "3133ab6ea6dd46d399aef0fc954e8a48"
endpoint = "https://text-akj.cognitiveservices.azure.com"

sentiment_url = endpoint + "/text/analytics/v3.0/sentiment"
headers = {"Ocp-Apim-Subscription-Key": subscription_key}

In [35]:
def score_review(review):
    # TBD: call the service and return 'pos' or 'neg'
    documents = {"documents": [{"id": "1", "language": "en","text": review }]}
    
    response = requests.post(sentiment_url, headers=headers, json=documents)
    sentiments = response.json()
    scores =sentiments['documents'][0]['confidenceScores']

    result = None
    if scores['neutral'] > scores['positive'] > scores['negative']:
      result = 'pos'
    elif scores['neutral'] > scores['negative'] > scores['positive']:
      result = 'neg'
    elif max(scores,key=scores.get) == 'negative':
      result = 'neg'
    else:
      result = 'pos'

    return result

### Score each review

Now, we can use the function you defined to score each of the reviews

In [None]:
# Blank result list init
results_pos = []
results_neg = []

In [186]:
# Positive reviews
li = len(results_pos)
for review in reviews_pos[li:li+99]:
    result = score_review(review)
    results_pos.append(result)

print(len(results_pos))
print('Next li:{}'.format(li+99))

951
CPU times: user 761 ms, sys: 49.1 ms, total: 810 ms
Wall time: 13.6 s


In [212]:
# Negative reviews
li = len(results_neg)
for review in reviews_neg[li:li+99]:
    result = score_review(review)
    results_neg.append(result)

print(len(results_neg))
print('Next li:{}'.format(li+99))

984
Next li:990


In [213]:
len(results_pos),len(results_neg)

(951, 984)

In [None]:
# %%time
# results_pos = []
# for review in reviews_pos:
#     result = score_review(review)
#     results_pos.append(result)

# results_neg = []
# for review in reviews_neg:
#     result = score_review(review)
#     results_neg.append(result)

### Calculate accuracy

For each of our known positive reviews, we can count the number which our function scored as 'pos', and use this to calculate the % accuracy. We repeaty this for negative reviews, and also for overall accuracy.

In [215]:
correct_pos = results_pos.count('pos')
accuracy_pos = float(correct_pos) / len(results_pos)
correct_neg = results_neg.count('neg')
accuracy_neg = float(correct_neg) / len(results_neg)
correct_all = correct_pos + correct_neg
accuracy_all = float(correct_all) / (len(results_pos)+len(results_neg))

print('Positive reviews: {}% correct'.format(accuracy_pos*100))
print('Negative reviews: {}% correct'.format(accuracy_neg*100))
print('Overall accuracy: {}% correct'.format(accuracy_all*100))

Positive reviews: 32.3869610935857% correct
Negative reviews: 94.10569105691057% correct
Overall accuracy: 63.77260981912145% correct
