# Get Google and Watson API Results and store on file
-----

## Instantiate the Pipeline

In [59]:
%%time
import importlib
import pipeline

# Reload so that edits made to the pipeline module since the kernel started take effect.
importlib.reload(pipeline)

# The pipeline wrapper; the dataset cell below passes `cnd` to cndobjects.Dataset.
cnd = pipeline.CND()

# Show the component names of the wrapped pipeline, in order.
print(list(cnd.nlp.pipe_names))

['tagger', 'parser', 'ner', 'Named Entity Matcher', 'merge_entities', 'Concept Matcher']
Wall time: 16.7 s


## Instantiate the Dataset

In [60]:
%%time
import importlib
import os
from IPython.display import clear_output, display
import cndobjects

# Reload so that edits made to the cndobjects module since the kernel started take effect.
importlib.reload(cndobjects)

# Directory holding the dataset. The original hard-coded path is kept, byte-for-byte,
# as the default; set the CND_DATASET_DIR environment variable to run the notebook on
# another machine without editing this cell.
# NOTE(review): the default is a raw string, so every `\\` is two literal backslashes -
# confirm this is the intended spelling of the path.
dirpath = os.environ.get(
    "CND_DATASET_DIR",
    r'C:\\Users\\Steve\\OneDrive - University of Southampton\\CNDPipeline\\dataset')

# Build the dataset with the pipeline object created in the previous cell.
orators = cndobjects.Dataset(cnd, dirpath)

# Remove whatever output was produced while the dataset was being built, so that only
# the summary below remains in the cell output.
clear_output(wait=True)

display(orators.summarise())

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Text Count,Word Count
Ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hitler,0,Adolf Hitler,1,706100
bush,1,George Bush,14,143936
king,2,Martin Luther King,5,122815
laden,3,Osama bin Laden,6,93646
Totals,4,,26,1066497


Wall time: 1min 11s


## Get Google API Results

Get the data from the online API and store on file to save from repeated calls.

Authentication
- https://cloud.google.com/docs/authentication/getting-started

Dashboard
- https://console.cloud.google.com/home/dashboard?project=modern-heading-262419

Documents
- https://cloud.google.com/natural-language/docs/reference/rest

### Initiate API

In [None]:
%%time
import os
# Imports the Google Cloud client library
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# Path to the Google credentials file (blank in the saved notebook).
# Only export it when a path has actually been filled in: assigning the empty
# placeholder unconditionally would overwrite a GOOGLE_APPLICATION_CREDENTIALS
# value that is already set in the environment.
GOOGLE_CREDENTIALS_PATH = r""
if GOOGLE_CREDENTIALS_PATH:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDENTIALS_PATH

# Instantiates a client
client = language.LanguageServiceClient()

### Get overall document analytics for the Google API

In [None]:
%%time
import os
from tqdm import tqdm
import pickle

# Document-level Google results: {orator ref: [one dict per text, in orator.texts order]}.
document_results = dict()

# Per-orator lists of document dicts; document_results ends up pointing at the same lists.
orators_dict = {}

# Encoding passed to the entity-sentiment and syntax requests (same for every document).
encoding_type = enums.EncodingType.UTF8

# iterate through each orator() object
for ref, orator in orators.orators_dict.items():

    orators_dict[ref] = list()

    # iterate over each Text() of the orator() object
    for text in tqdm(orator.texts, total=len(orator.texts), desc=ref):
        document = dict()
        document["title"] = text.title

        # Mein Kampf is too large for the Google API; its document sentiment is
        # calculated later from an average of the sentence sentiments.
        if ref == "hitler":
            # Placeholder entry, wrapped in a list as the original cell built it.
            document["analytics"] = [
                {"sentiment": 0,
                 "entities": '',
                 "classifications": '',
                 "syntax": ''
                 }]
            orators_dict[ref].append(document)
            # `continue` rather than `break`: every text still gets an entry, so the
            # list stays index-aligned with orator.texts.
            continue

        doc_obj = types.Document(
            content=str(text.doc.text),
            type=enums.Document.Type.PLAIN_TEXT)

        # One request per analysis type for the whole document.
        document["analytics"] = {
            "sentiment": client.analyze_sentiment(document=doc_obj).document_sentiment,
            "entities": client.analyze_entity_sentiment(doc_obj, encoding_type=encoding_type),
            "classifications": client.classify_text(doc_obj),
            "syntax": client.analyze_syntax(doc_obj, encoding_type=encoding_type),
        }

        # append the document object to the orator list
        orators_dict[ref].append(document)

    # register this orator's list in the overall dataset
    document_results[ref] = orators_dict[ref]

google_document_results = document_results
# time = 3min 51s

### Save the Google document object to file

The NLP objects returned by Google can't be stored as a JSON object, so they are serialised using pickle.

In [None]:
%%time

# Disabled: if uncommented, the lines below pickle `google_document_results` to a file
# named "google_document_analytics" in the current working directory. The file is
# opened in 'wb' mode, so an existing file of that name is replaced.

# import os
# import pickle
# filepath = os.getcwd()
# pickle_filename = "google_document_analytics"
# with open(os.path.join(filepath, pickle_filename), 'wb') as file:
#     pickle.dump(google_document_results, file)

### Get sentence level sentiment analytics for Google API

In [None]:
%%time
import os
import pickle
from tqdm import tqdm


def _google_sentence_score(sentence):
    """Send one stripped sentence to the Google client and return its document_sentiment."""
    request_doc = types.Document(
        content=str(sentence).strip(),
        type=enums.Document.Type.PLAIN_TEXT)
    return client.analyze_sentiment(document=request_doc).document_sentiment


# Sentence-level Google sentiment: {orator ref: [{"title", "sentiments"} per text]}.
sentence_sentiment = dict()

# Per-orator lists; sentence_sentiment ends up pointing at the same lists.
orators_dict = {}

for ref, orator in orators.orators_dict.items():

    orators_dict[ref] = list()

    for text in orator.texts:
        title = text.title
        sentences = list(text.doc.sents)

        # One request per sentence, in document order, with a progress bar per text.
        document = {
            "title": title,
            "sentiments": [
                _google_sentence_score(sentence)
                for sentence in tqdm(sentences, total=len(sentences), desc=title)
            ],
        }

        orators_dict[ref].append(document)

    sentence_sentiment[ref] = orators_dict[ref]

google_sentence_sentiment = sentence_sentiment
# time = 27min 52s

### Get average sentence sentiment score for Google API

Since Mein Kampf is too large for Google's open source API, the score is derived from an average of the sentence sentiment scores.

In [55]:
from statistics import mean

# NOTE: `google_document_analytics` is only created by the "Code for loading Data" cell
# further down, so that cell has to be run before this one.

# Average sentence score for the first "hitler" text, which has no document-level score.
hitler_scores = [sentiment.score for sentiment in google_sentence_sentiment["hitler"][0]["sentiments"]]
sentence_sentiment_mean = mean(hitler_scores)

# Number of decimal places used when reporting the difference.
n = 2

# For every other text, print |document score - mean sentence score|.
for ref, documents in google_document_analytics.items():
    if ref == "hitler":
        continue
    for index, entry in enumerate(documents):
        doc_score = entry["analytics"]["sentiment"].score
        sent_score = mean([sentiment.score for sentiment in google_sentence_sentiment[ref][index]["sentiments"]])
        difference = round(abs(doc_score - sent_score), n)
        print(f"difference ({n} decimal places) =  {difference}")



difference (2 decimal places) =  0.06
difference (2 decimal places) =  0.0
difference (2 decimal places) =  0.04
difference (2 decimal places) =  0.07
difference (2 decimal places) =  0.01
difference (2 decimal places) =  0.08
difference (2 decimal places) =  0.02
difference (2 decimal places) =  0.02
difference (2 decimal places) =  0.01
difference (2 decimal places) =  0.06
difference (2 decimal places) =  0.01
difference (2 decimal places) =  0.0
difference (2 decimal places) =  0.03
difference (2 decimal places) =  0.01
difference (2 decimal places) =  0.03
difference (2 decimal places) =  0.02
difference (2 decimal places) =  0.0
difference (2 decimal places) =  0.02
difference (2 decimal places) =  0.03
difference (2 decimal places) =  0.06
difference (2 decimal places) =  0.02
difference (2 decimal places) =  0.01
difference (2 decimal places) =  0.03
difference (2 decimal places) =  0.1
difference (2 decimal places) =  0.0


### Save the Google sentence sentiment object to file

The NLP objects returned by Google can't be stored as a JSON object, so they are serialised using pickle.

In [None]:
%%time
import os
import pickle

# Disabled: if uncommented, the lines below pickle `google_sentence_sentiment` to a file
# named "google_sentence_sentiment" in the current working directory. The file is
# opened in 'wb' mode, so an existing file of that name is replaced.
# filepath = os.getcwd()
# pickle_filename = "google_sentence_sentiment"
# with open(os.path.join(filepath, pickle_filename), 'wb') as file:
#     pickle.dump(google_sentence_sentiment, file)

### Code for loading Data

In [1]:
%%time
import os
import pickle

google_document_analytics_filename = "google_document_analytics"
google_sentence_sentiment_filename = "google_sentence_sentiment"

# Both files are read from the current working directory.
filepath = os.getcwd()


def _load_google_pickle(filename):
    """Return the object unpickled from `filename` inside `filepath`.

    pickle can execute arbitrary code while loading, so only use this on files
    written by this notebook.
    """
    with open(os.path.join(filepath, filename), 'rb') as handle:
        return pickle.load(handle)


google_document_analytics = _load_google_pickle(google_document_analytics_filename)
google_sentence_sentiment = _load_google_pickle(google_sentence_sentiment_filename)

# Quick sanity check: number of top-level entries in each loaded object.
print("doc analytics size:", len(google_document_analytics))
print("sentence sentiment size:", len(google_sentence_sentiment))

doc analytics size: 4
sentence sentiment size: 4
Wall time: 1.38 s


## Get Watson API Results

Get the data from the online API and store on file to save from repeated calls.

API Documentation
- https://cloud.ibm.com/apidocs/natural-language-understanding

Source Code
- http://watson-developer-cloud.github.io/python-sdk/v1.0.2/_modules/watson_developer_cloud/natural_language_understanding_v1.html

### Initiate Watson API

In [None]:
%%time
import os

from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import (
    Features, ConceptsOptions, EmotionOptions, EntitiesOptions,
    KeywordsOptions, CategoriesOptions, SentimentOptions)

# Credentials are taken from the environment so they never have to be typed into the
# notebook. When the variables are unset the values fall back to the empty
# placeholders the cell originally contained.
apikey = os.environ.get("NATURAL_LANGUAGE_UNDERSTANDING_APIKEY", '')
url = os.environ.get("NATURAL_LANGUAGE_UNDERSTANDING_URL", '')

# Authenticate and point the client at the service instance.
authenticator = IAMAuthenticator(apikey)
service = NaturalLanguageUnderstandingV1(version='2019-07-12', authenticator=authenticator)
service.set_service_url(url)

### Get overall document analytics for the Watson API

Generates output at the document level for all of Watson's natural language processing features.

Output object is in jsonlines format and stored to file.

In [None]:
%%time
import os
from tqdm import tqdm
import json

# Entity labels whose mentions are sent to Watson as emotion targets.
TARGET_ENTITY_LABELS = ("GPE", "NORP", "ORG", "PERSON")

# Document-level Watson results: {orator ref: [one dict per text, in orator.texts order]}.
document_results = dict()

# Per-orator lists of document dicts; document_results ends up pointing at the same lists.
orators_dict = {}

# iterate through each orator() object
for ref, orator in orators.orators_dict.items():

    orators_dict[ref] = list()

    # iterate over each Text() of the orator() object
    for text in tqdm(orator.texts, total=len(orator.texts), desc=ref):
        document = dict()
        document["title"] = text.title

        # This was written as an annotation (`document["ref"] : text.ref`), which
        # stores nothing; assign it so the key actually exists.
        # NOTE(review): confirm text.ref is JSON-serialisable before saving with json.dumps.
        document["ref"] = text.ref

        # Unique entity strings used as emotion targets. Sorted so the order is
        # reproducible, and stored as a plain list: the original trailing comma
        # wrapped the list in a 1-tuple for both `targets` and document["targets"].
        targets = sorted({str(ent) for ent in text.doc.ents if ent.label_ in TARGET_ENTITY_LABELS})
        document["targets"] = targets

        # Single request covering every feature used in this notebook.
        document["analytics"] = service.analyze(
            text=str(text.doc.text),
            features=Features(
                concepts=ConceptsOptions(limit=50),
                emotion=EmotionOptions(targets=targets),
                entities=EntitiesOptions(emotion=True, sentiment=True),
                keywords=KeywordsOptions(emotion=True, sentiment=True),
                categories=CategoriesOptions(),
                sentiment=SentimentOptions()
            )).get_result()

        # append the document object to the orator list
        orators_dict[ref].append(document)

    # register this orator's list in the overall dataset
    document_results[ref] = orators_dict[ref]

watson_document_results = document_results
# time = 1m3s

### Save Watson document object to file

In [None]:
%%time
# Disabled: if uncommented, the lines below write `watson_document_results` as JSON to
# "watson_document_analytics.json" in the current working directory ('w' mode, so an
# existing file of that name is replaced). The commented code uses `os` without
# importing it, so it relies on an earlier cell having imported it.
# import json
# filepath = os.getcwd()
# json_filename = "watson_document_analytics.json"
# with open(os.path.join(filepath, json_filename), 'w') as file:
#     file.write(json.dumps(watson_document_results))

### Get sentence level sentiment analytics for Watson API

The output object uses the same structure as the orators dataset object.

In [None]:
%%time
import json
import os
from tqdm import tqdm


def _watson_sentence_result(sentence):
    """Request sentiment only for one sentence and return the parsed result."""
    response = service.analyze(
        text=str(sentence.text),
        features=Features(sentiment=SentimentOptions()),
        language="en")
    return response.get_result()


# Sentence-level Watson sentiment: {orator ref: [{"title", "sentiments"} per text]}.
watson_sentiment = dict()

# Per-orator lists; watson_sentiment ends up pointing at the same lists.
orators_dict = {}

for ref, orator in orators.orators_dict.items():

    orators_dict[ref] = list()

    for text in orator.texts:
        sentences = list(text.doc.sents)

        # One request per sentence, in document order, with a progress bar per text.
        document = {
            "title": text.title,
            "sentiments": [
                _watson_sentence_result(sentence)
                for sentence in tqdm(sentences, total=len(sentences))
            ],
        }

        # append the document object to the orator list
        orators_dict[ref].append(document)

    # register this orator's list in the overall dataset
    watson_sentiment[ref] = orators_dict[ref]

### Save Watson sentence sentiment document to file

In [None]:
%%time
import os
import json

# Disabled: if uncommented, the lines below write `watson_sentiment` as JSON to
# "watson_sentence_sentiment.json" in the current working directory ('w' mode, so an
# existing file of that name is replaced).
# filepath = os.getcwd()
# json_filename = "watson_sentence_sentiment.json"
# with open(os.path.join(filepath, json_filename), 'w') as file:
#     file.write(json.dumps(watson_sentiment))

### Code for loading Watson data

In [56]:
%%time

import os
import json

watson_document_analytics_filename = "watson_document_analytics.json"
watson_sentence_sentiment_filename = "watson_sentence_sentiment.json"

# Both files are read from the current working directory.
filepath = os.getcwd()


def _read_watson_json(filename):
    """Return the parsed JSON content of `filename` inside `filepath`."""
    with open(os.path.join(filepath, filename), 'r') as handle:
        return json.load(handle)


watson_document_analytics = _read_watson_json(watson_document_analytics_filename)
watson_sentence_sentiment = _read_watson_json(watson_sentence_sentiment_filename)

# Quick sanity check: number of top-level entries in each loaded object.
print("doc analytics size:", len(watson_document_analytics))
print("sentence sentiment size:", len(watson_sentence_sentiment))

doc analytics size: 4
sentence sentiment size: 4
Wall time: 87.8 ms


## Extract Relevant Information to Make a Usable Object for this Task.

Output at the document level is in the following format

`


    google_document_analytics["bush"][4]["analytics"] = #(remember to sort out the metadata clash)
    {"sentiment": {"magnitude": 87.0, "score": -0.10000000149011612},
        "entities": {"entities": [{"name": "al Qaeda", "type": "ORGANIZATION", "metadata": {"key": "mid", "value": "/m/0v74"},
                                   "metadata": {"key": "wikipedia_url", "value": "https://en.wikipedia.org/wiki/Al-Qaeda"},
                                   "salience": 0.009386186487972736,
                                   "mentions": [{"text": {"content": "al Qaeda", "begin_offset": 4680}, "type": "PROPER", "sentiment": {"magnitude": 0.699999988079071, "score": 0.699999988079071}}]}]},
        "classifications": {"categories": {"name": "/Sensitive Subjects",
                                         "confidence": 0.7900000214576721}},
        "syntax": {
        "sentences": {"text": {"content": "The evidence we have gathered all points to a collection of loosely affiliated terrorist organizations known as al Qaeda.", "begin_offset": 3309}},
        "tokens": {"text": {"content": "terrorist", "begin_offset": 3388},
          "part_of_speech": {"tag": "ADJ"},
          "dependency_edge": {"head_token_index": 679, "label": "AMOD"},
          "lemma": "terrorist"}}}

`

`

    google_sentence_sentiment["bush"][4]["sentiments"][32].magnitude = 0.20000000298023224

    google_sentence_sentiment["bush"][4]["sentiments"][32].score = -0.20000000298023224

`

Outputs for Watson are the following format:

`
    
    watson_document_analytics["bush"][4]["analytics"] = 
    {"usage": {"text_units": 2, "text_characters": "17321", "features": 5},
     "sentiment": {"document": {"score": -0.331922, "label": "negative"}},
     "language": "en",
     "keywords": [{"text": "United States of America",
                   "sentiment": {"score": -0.535138,
                                 "mixed": "1",
                                 "label": "negative"},
                   "relevance": 0.527585,
                   "emotion": {"sadness": 0.28403,
                               "joy": 0.355131,
                               "fear": 0.328408,
                               "disgust": 0.099274,
                               "anger": 0.113816},
                   "count": 3}],
     "entities": [{"type": "Organization", 
                   "text": "al Qaeda",
                   "sentiment": {"score": -0.738021, 
                                 "mixed": "1", 
                                 "label": "negative"},
                   "relevance": 0.624347,
                   "emotion": {"sadness": 0.174989,
                               "joy": 0.139256,
                               "fear": 0.699752,
                               "disgust": 0.154138,
                               "anger": 0.129947},
                   "disambiguation": {"subtype": ["MembershipOrganization"],
                                      "name": "Al-Qaeda", "dbpedia_resource": "http://dbpedia.org/resource/Al-Qaeda"},
                   "count": 6,
                   "confidence": 1}],
     "concepts": {"text": "Taliban", "relevance": 0.695602, "dbpedia_resource": "http://dbpedia.org/resource/Taliban"},
     "categories": [{"score": 0.954255, "label": "/society/unrest and war"}]
    }

`
                                                            
`
    
    watson_sentence_sentiment["bush"][4]["sentiments"][32] = {'usage': 
                                                             {'text_units': 1, 'text_characters': 134, 'features': 1}, 
                                                              'sentiment': {'document': {'score': 0.920489, 'label': 'positive'}}, 
                                                              'language': 'en'}
                                                        
`
                                                    
The relevant information is as follows:

`
    
    watson_sentiment["bush"][4]["sentiments"][32]["sentiment"]["document"] = 'scores': {'watson': {'score': 0, 'label': 'neutral'}}

`

the output document object for each orator will be a list() of dict() in the following format:

`
    
    document = {"title" : "",
        "sentiment_scores" : {"textblob" : score, "watson" : score, "google" : score},
        "most_pos_sents" : {"textblob" : [(index)], "watson" : [(index)], "google" : [(index)]}, # list of sentences with a score of +1
        "most_neg_sents" : {"textblob" : [(index)], "watson" : [(index)], "google" : [(index)]}, # list of sentence indices with a score of -1
        "pos_sents" : {"textblob" : [(index, score)], "watson" : [(index, score)], "google" : [(index, score)]}, # sentence indices with highest score other than +1
        "neg_sents" : {"textblob" : [(index, score)], "watson" : [(index, score)], "google" : [(index, score)]}, # sentence indices with lowest score other than -1
        "sentences" : []}

`

The output sentence object for each sentence will be a list of dict() objects:

`
    
    sent_obj = {"text" : "", "scores" : { 
        "watson" : 0,
        "google" : 0,
        "textblob" : 0
        }}

`

Each sentence object is accessed as follows:

`
    
    <API>_sentiment_analysis["bush"][4]["sentences"][32] = 
    {'text': 'The evidence we have gathered all points to a collection of loosely affiliated terrorist organizations known as al Qaeda.',
    'scores': {'watson': {'score': 0, 'label': 'neutral'}}}

`

In [61]:
import os
import json
from statistics import mean
from tqdm import tqdm
from textblob import TextBlob

# Sentiment back-ends compared here, in the order their results are stored.
SENTIMENT_APIS = ("textblob", "watson", "google")


def _record_extremes(document, api, sentence_text, score):
    """Update the per-API extreme-sentence records of `document` with one sentence.

    document      -- document dict holding the "most_pos_sents", "most_neg_sents",
                     "pos_sents" and "neg_sents" sub-dicts, each keyed by API name
    api           -- one of SENTIMENT_APIS
    sentence_text -- the stripped sentence text
    score         -- the score `api` gave to that sentence

    Sentences scoring exactly +1 / -1 are appended to the "most_*" lists. Otherwise the
    sentence replaces the current "pos_sents" / "neg_sents" record when its score is
    strictly more extreme than the recorded one (the first qualifying sentence wins ties).
    """
    if score == 1:
        document["most_pos_sents"][api].append(sentence_text)

    if score == -1:
        document["most_neg_sents"][api].append(sentence_text)

    best_pos = document["pos_sents"][api]
    if 0 < score < 1 and (best_pos is None or score > best_pos["score"]):
        # Record this API's own score (previously the TextBlob score was stored for all).
        document["pos_sents"][api] = {"text": sentence_text, "score": score}

    best_neg = document["neg_sents"][api]
    if -1 < score < 0 and (best_neg is None or score < best_neg["score"]):
        document["neg_sents"][api] = {"text": sentence_text, "score": score}


# Combined results: {orator ref: [one document dict per text, in orator.texts order]}.
sentiment_analysis = dict()

# Per-orator lists; sentiment_analysis ends up pointing at the same lists.
orators_dict = {}

# access each orator
for ref, orator in orators.orators_dict.items():

    orators_dict[ref] = list()

    # iterate through the orator's texts by index, because the Watson and Google
    # results are stored positionally
    for text in range(len(orator.texts)):

        text_obj = orators[ref][text]

        document = dict()
        document["ref"] = ref
        document["datestamp"] = text_obj.datestamp
        document["title"] = text_obj.title
        # NOTE(review): this is the length of the text in characters, not a word count;
        # left unchanged so existing consumers of the key see the same values.
        document["word count"] = len(text_obj.doc.text)

        # Each key gets its own container. Previously the two "most_*" keys shared one
        # dict object, as did the two "*_sents" keys, so positive and negative results
        # overwrote each other.
        document["most_pos_sents"] = {api: list() for api in SENTIMENT_APIS}  # sentences scoring +1
        document["most_neg_sents"] = {api: list() for api in SENTIMENT_APIS}  # sentences scoring -1
        document["pos_sents"] = {api: None for api in SENTIMENT_APIS}  # highest score below +1
        document["neg_sents"] = {api: None for api in SENTIMENT_APIS}  # lowest score above -1

        # document-level sentiment results
        document["sentiment_scores"] = dict()
        document["sentiment_scores"]["textblob"] = TextBlob(str(text_obj.doc.text)).sentiment[0]
        document["sentiment_scores"]["watson"] = watson_document_analytics[ref][text]["analytics"]["sentiment"]["document"]["score"]

        if ref == "hitler":
            # No document-level Google result exists for this orator; use the mean of
            # the sentence-level scores instead.
            document["sentiment_scores"]["google"] = mean(
                [sentiment.score for sentiment in google_sentence_sentiment[ref][text]["sentiments"]])
        else:
            document["sentiment_scores"]["google"] = google_document_analytics[ref][text]["analytics"]["sentiment"].score

        # list of per-sentence objects
        document["sentences"] = list()

        # sentence spans of the text, materialised so they can be indexed
        sentence_text = list(text_obj.doc.sents)

        # iterate through the sentences by index, matching the stored API results
        for sentence in tqdm(range(len(sentence_text)), total=len(sentence_text), desc=document["title"]):

            sent = dict()
            sent["text"] = str(sentence_text[sentence]).strip()
            sent["scores"] = {
                "textblob": TextBlob(sent["text"]).sentiment[0],
                "watson": watson_sentence_sentiment[ref][text]["sentiments"][sentence]["sentiment"]["document"]["score"],
                "google": google_sentence_sentiment[ref][text]["sentiments"][sentence].score,
            }

            for api in SENTIMENT_APIS:
                _record_extremes(document, api, sent["text"], sent["scores"][api])

            # append the sent object to the document's sentence list
            document["sentences"].append(sent)

        # append the document object to the orator list
        orators_dict[ref].append(document)

    # register this orator's list in the overall dataset
    sentiment_analysis[ref] = orators_dict[ref]

Mein Kampf: 100%|██████████| 4527/4527 [00:01<00:00, 2723.18it/s]
911 Address to the Nation: 100%|██████████| 37/37 [00:00<00:00, 2318.58it/s]
Remarks at the National Day of Prayer & Remembrance Service: 100%|██████████| 57/57 [00:00<00:00, 2041.22it/s]
First Radio Address following 911: 100%|██████████| 30/30 [00:00<00:00, 2313.67it/s]
Address at Islamic Center of Washington, D.C.: 100%|██████████| 39/39 [00:00<00:00, 2172.15it/s]
Address to Joint Session of Congress Following 911 Attacks: 100%|██████████| 186/186 [00:00<00:00, 3727.79it/s]
Operation Enduring Freedom in Afghanistan Address to the Nation: 100%|██████████| 57/57 [00:00<00:00, 2381.44it/s]
911 Pentagon Remembrance Address: 100%|██████████| 93/93 [00:00<00:00, 2451.73it/s]
Prime Time News Conference on War on Terror: 100%|██████████| 37/37 [00:00<00:00, 1612.94it/s]
Prime Time News Conference Q&A: 100%|██████████| 410/410 [00:00<00:00, 3453.75it/s]
Address on Signing the USA Patriot Act of 2001: 100%|██████████| 64/64 [00

### Save the sentence sentiments object to file

In [62]:
%%time
import os
import pickle

# Target: a file called "sentiment_analysis" in the current working directory.
filepath = os.getcwd()
pickle_filename = "sentiment_analysis"
pickle_path = os.path.join(filepath, pickle_filename)

# 'wb' replaces any existing file of the same name.
with open(pickle_path, 'wb') as handle:
    pickle.dump(sentiment_analysis, handle)

Wall time: 15 ms


### Code for loading the file

In [None]:
%%time
import os
import pickle

# Source: the file called "sentiment_analysis" in the current working directory.
filepath = os.getcwd()
pickle_filename = "sentiment_analysis"
pickle_path = os.path.join(filepath, pickle_filename)

# pickle can execute arbitrary code while loading; only load files written by this notebook.
with open(pickle_path, 'rb') as handle:
    sentiment_analysis = pickle.load(handle)