# Testing Sentiment Analysis
-----

In this notebook we test sentiment analysis techniques for analysing natural language, organised into three experiments.

In the first experiment we compare different document scores for each text.

In the second experiment we compare scores for named entities identified in each document.

The sentiment analysis APIs we test are a general-purpose one provided by TextBlob and state-of-the-art ones provided by IBM's Watson and Google.

## Instantiate the Pipeline

In [2]:
%%time
import importlib
import pipeline
# Re-import the local module so edits made to it since the first import take effect.
importlib.reload(pipeline)

# Build the pipeline object, then show the component names exposed by cnd.nlp.
cnd = pipeline.CND()
print(list(cnd.nlp.pipe_names))

['tagger', 'parser', 'ner', 'Named Entity Matcher', 'merge_entities', 'Concept Matcher']
Wall time: 24.6 s


## Instantiate the Dataset

In [127]:
%%time
import importlib
from IPython.display import clear_output
import cndobjects
# Re-import the local module so edits made to it since the first import take effect.
importlib.reload(cndobjects)

import os

# Directory holding the dataset. The historical location is kept as the default
# (byte-for-byte, including the doubled backslashes of the raw string) so the
# notebook behaves as before on the original machine; set the CND_DATASET_DIR
# environment variable to run it anywhere else without editing this cell.
dirpath = os.environ.get(
    "CND_DATASET_DIR",
    r'C:\\Users\\Steve\\OneDrive - University of Southampton\\CNDPipeline\\dataset',
)

# Build the dataset from the pipeline and the directory, then clear whatever
# progress output the constructor produced before showing the summary table.
orators = cndobjects.Dataset(cnd, dirpath)
clear_output(wait=True)

display(orators.summarise())

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Text Count,Word Count
Ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hitler,0,Adolf Hitler,1,706100
bush,1,George Bush,14,143936
king,2,Martin Luther King,5,122815
laden,3,Osama bin Laden,6,93646
Totals,4,,26,1066497


Wall time: 1min


## Experiment 1 - Document Sentiment Scores

In this first experiment we analyse the overall document sentiment scores.

### Test 1 - Overall Document Scores

In this first test we look at how each API scores each text of the dataset overall.

#### - Get Watson API Results and Store on File

Get the data from the online API and store it on file to avoid repeated calls.

API Documentation
- https://cloud.ibm.com/apidocs/natural-language-understanding

Source Code
- http://watson-developer-cloud.github.io/python-sdk/v1.0.2/_modules/watson_developer_cloud/natural_language_understanding_v1.html

In [143]:
%%time

import json
import jsonlines
import os
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, ConceptsOptions, EmotionOptions, EntitiesOptions, KeywordsOptions, CategoriesOptions, SentimentOptions

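# NOTE: the request code below is commented out so that re-running the notebook
# does not call the online API again; it writes its results to
# "watsonresults.jsonl" in the current working directory.
# Before re-enabling it: supply credentials without committing them, and assign
# the list of entity targets to a variable named `targets` inside the loop -- as
# written, `EmotionOptions(targets = targets)` refers to a name that is never
# assigned (the list is only stored under the "targets" dictionary key).
# apikey = XXXX
# url = XXXX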

# authenticator = IAMAuthenticator(apikey)
# service = NaturalLanguageUnderstandingV1(version='2019-07-12', authenticator=authenticator)
# service.set_service_url(url)

# filepath = os.getcwd()

# watson_results = list()
    
# for text in orators.texts:
#     watson_results.append({
#         "ref" : text.ref,
#         "title" : text.title,
#         "targets" : list(set([str(ent) for ent in text.doc.ents if ent.label_ in ["GPE", "NORP", "ORG", "PERSON"]])),
#         "result" : service.analyze(
#                     text=str(text.doc.text),
#                     features=Features(
#                         concepts=ConceptsOptions(limit=50),
#                         emotion=EmotionOptions(targets = targets),
#                         entities=EntitiesOptions(emotion=True, sentiment=True),
#                         keywords=KeywordsOptions(emotion=True, sentiment=True),
#                         categories=CategoriesOptions(),
#                         sentiment=SentimentOptions()
#                     )).get_result()
#     })
    
# jsonl_filename = "watsonresults.jsonl"
# with jsonlines.open(os.path.join(filepath, jsonl_filename), 'w') as writer:
#     writer.write_all(watson_results)

Wall time: 0 ns


#### - Get Google API Results and store on file

Authentication
- https://cloud.google.com/docs/authentication/getting-started

Dashboard
- https://console.cloud.google.com/home/dashboard?project=modern-heading-262419


In [181]:
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# Create the Google Cloud Natural Language client.
client = language.LanguageServiceClient()

# Wrap a short sample string in a plain-text Document for the API.
text = u'Hello, world!'
document = types.Document(type=enums.Document.Type.PLAIN_TEXT, content=text)

# Request document-level sentiment and keep only that part of the response.
sentiment = client.analyze_sentiment(document=document).document_sentiment

# Report the input alongside its score and magnitude, one per line.
print(
    f'Text: {text}',
    f'Sentiment: {sentiment.score}, {sentiment.magnitude}',
    sep='\n',
)

Text: Hello, world!
Sentiment: 0.6000000238418579, 0.6000000238418579


In [162]:

import pandas as pd
from textblob import TextBlob
# display(pd.DataFrame(orators["bush"].framework()))

def sentiment_summary(dataset):
    """
    Build a one-row-per-text summary table for an iterable of text objects.

    For every item the row holds the string form of its ``ref``, ``datestamp``
    and ``title`` attributes, ``len(item)`` (stored under "word count") and the
    first component of ``TextBlob(str(item)).sentiment`` (stored under
    "TextBlob").

    Returns a pandas DataFrame with the columns
    ["ref", "datestamp", "title", "word count", "TextBlob"].
    """

    columns = ["ref", "datestamp", "title", "word count", "TextBlob"]
    # The first three column names double as attribute names on each item.
    metadata_fields = columns[:3]

    rows = []
    for item in dataset:
        row = [str(getattr(item, field)) for field in metadata_fields]
        row.append(len(item))  # get word count
        row.append(TextBlob(str(item)).sentiment[0])
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

df = sentiment_summary(orators.texts)

# The cell that queries Watson is commented out, so `watson_results` does not
# exist after a kernel restart. Reload the responses that cell cached in
# "watsonresults.jsonl" (written to the current working directory) instead of
# relying on a variable left over from an earlier session.
with jsonlines.open(os.path.join(os.getcwd(), "watsonresults.jsonl")) as reader:
    watson_results = list(reader)

# One document-level Watson score per cached response, in file order.
df["Watson"] = [obj["result"]["sentiment"]["document"]["score"] for obj in watson_results]

# Shade the two score columns (the last two: TextBlob and Watson) for comparison.
display(df.style.background_gradient(cmap="Blues", subset = df.columns[-2:]))


Unnamed: 0,ref,datestamp,title,word count,TextBlob,Watson
0,hitler,2020-06-30,Mein Kampf,706100,0.096408,0.373203
1,bush,2001-09-11,911 Address to the Nation,3482,0.093816,0.469524
2,bush,2001-09-14,Remarks at the National Day of Prayer & Remembrance Service,5116,0.153784,0.298916
3,bush,2001-09-15,First Radio Address following 911,2667,0.196519,0.35154
4,bush,2001-09-17,"Address at Islamic Center of Washington, D.C.",2695,-0.076026,0.333191
5,bush,2001-09-20,Address to Joint Session of Congress Following 911 Attacks,17321,0.132154,-0.331922
6,bush,2001-10-07,Operation Enduring Freedom in Afghanistan Address to the Nation,5693,0.167453,0.386644
7,bush,2001-10-11,911 Pentagon Remembrance Address,6673,0.061266,-0.331954
8,bush,2001-10-11,Prime Time News Conference on War on Terror,4262,0.147233,0.333918
9,bush,2001-10-11,Prime Time News Conference Q&A,32002,0.118774,0.297858


In [None]:
%%time
# https://stackoverflow.com/questions/20618804/how-to-smooth-a-curve-in-the-right-way

import matplotlib
import matplotlib.pyplot as plt
from collections import OrderedDict
from textblob import TextBlob
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter

def smoothing(sentiment_list, window_size):
    """Return the simple moving average of ``sentiment_list``.

    Parameters
    ----------
    sentiment_list : sequence of numbers
        The values to smooth; may be empty.
    window_size : int
        Number of consecutive values averaged per output point. Values below 1
        are treated as 1 (no smoothing).

    Returns
    -------
    list of float
        The averages of every full window, i.e.
        ``max(0, len(sentiment_list) - window_size + 1)`` values. The leading
        entries for which the window is not yet full are dropped.
    """
    # A window of 0 would make the slice below start at -1 and return only the
    # last (NaN) element, so fall back to a window of one value.
    window_size = max(1, window_size)

    # An explicit float dtype keeps an empty input numeric so it can be averaged.
    series = pd.Series(sentiment_list, dtype="float64")
    moving_averages = series.rolling(window_size).mean()

    # The first window_size - 1 entries are NaN (incomplete windows); skip them.
    return moving_averages.tolist()[window_size - 1:]

def smooth(y, box_pts):
    """Smooth ``y`` with a box (uniform) filter of ``box_pts`` points.

    The kernel is ``box_pts`` equal weights summing to one, convolved with
    ``y`` in 'same' mode. Returns a NumPy array.
    """
    kernel = np.ones(box_pts)
    kernel /= box_pts
    return np.convolve(y, kernel, mode='same')

def x_axis(sentiment_list):
    """Return the position of each element as a percentage of the list length.

    Element ``i`` of an ``n``-element list maps to ``(i / n) * 100``, so the
    values start at 0 and stay below 100. An empty list yields an empty list.
    """
    count = len(sentiment_list)
    positions = []
    for index in range(count):
        positions.append((index / count) * 100)
    return positions

def textblob_sentiment(document):
    """Score every sentence of ``document`` with TextBlob.

    Parameters
    ----------
    document
        Object whose ``doc.sents`` attribute iterates over sentences, each
        exposing its raw string as ``.text``.

    Returns
    -------
    list
        One value per sentence, in document order: the first component of
        ``TextBlob(sentence.text).sentiment``.

    Notes
    -----
    Earlier revisions also tracked the highest- and lowest-scoring sentences in
    local variables. That bookkeeping was never returned or displayed, rescanned
    the whole score list for every sentence (quadratic in the number of
    sentences) and compared list-wide extremes rather than the current score,
    so it has been removed. The returned list is unchanged.
    """
    return [TextBlob(sentence.text).sentiment[0] for sentence in document.doc.sents]

# Use a large default canvas for the comparison figure.
plt.rcParams.update({'figure.figsize': [20, 10]})

# Create the figure and the axes that add_plot() draws on.
fig, ax = plt.subplots()

# Fix the y-range to [-1, 1] on the current axes and switch the grid on.
axes = plt.gca()
axes.set_ylim(-1, 1)
ax.grid()


def add_plot(text, window, poly):
    """Plot the rolling-average sentence sentiment of ``text`` on the shared axes.

    Parameters
    ----------
    text
        Text object supporting ``len()``, exposing ``title`` (used as the legend
        label) and accepted by ``textblob_sentiment``.
    window, poly
        Currently unused. They are kept so existing calls keep working and for
        the disabled Savitzky-Golay alternative noted below.

    The smoothing window is derived from the text itself as
    ``len(text) // 1000``, with a floor of 1 so that texts shorter than 1000
    units are still plotted instead of collapsing to a single point.
    The curve is drawn on the module-level ``ax``.
    """
    #y_axis = savgol_filter(get_sentiment(text), window, poly)
    auto_window = max(1, len(text) // 1000)
    print(auto_window)
    y_axis = smoothing(textblob_sentiment(text), auto_window)

    ax.plot(x_axis(y_axis), y_axis, label = text.title)
    
    
window = 100
poly = 1

# Draw one curve per selected text on the shared axes.
text = orators["king"][1]
add_plot(text, window=20, poly=poly)
text = orators["hitler"][0]
add_plot(text, window=100, poly=poly)

# Legend, axis labels and title so the figure stands on its own.
ax.legend(prop={"size": 15})
ax.set_xlabel('percentage through the document')
ax.set_ylabel('rolling average sentiment')
ax.set_title("Comparing Sentiment Analysis of Different Texts")

plt.show()