## Instantiate the Pipeline

In [None]:
%%time
import importlib
import cndlib.pipeline
importlib.reload(cndlib.pipeline)

cnd = cndlib.pipeline.CND()

print([name for name in cnd.nlp.pipe_names])

## Instantiate the Dataset

In [None]:
%%time
import importlib
from IPython.display import clear_output
import cndlib.cndobjects
importlib.reload(cndlib.cndobjects)


dirpath = r'C:\\Users\\spa1e17\\OneDrive - University of Southampton\\hostile-narrative-analysis\\dataset'

orators = cndlib.cndobjects.Dataset(cnd, dirpath)
clear_output(wait=True)

display(orators.summarise())

## Create .csv Files of Sentences of Each Orator for Annotation

In this experiment we test whether sentiment analysis can detect ingroup elevation and outgroup othering phrases. For this experiment, each sentence of Bush’s and bin Laden’s datasets has been annotated as either ingroup elevation or outgroup othering. Annotation was based on two criteria. Firstly, the sentence must contain a reference to a named entity, whether explicitly or by either noun phrase or pronoun. Secondly, the reference must be associated with a term in the sentence which either elevates or others the referenced entity. For example, with an implicit reference to al Qaeda, the following two sentences from Bush are annotated as othering: “These terrorists don't represent peace” and “They represent evil and war”. Equally, from both datasets the clauses “God bless America” or “Allah blessed be upon him” are annotated as elevation.

An extra annotation was also added for hostile and antisemitic sentences. Hostile sentences are those containing a threat of violence. For example, from bin Laden, “And whoever has killed our civilians, then we have the right to kill theirs”, or from Bush, “We are sending a signal to the world as we speak that if you harbor a terrorist, there will be a price to pay.” Some hostile sentences are veiled threats, but in the context of the narrative are determined to be threatening. Bin Laden’s explicit outgroup comprises Jews and Israel; as such, many of his sentences have been annotated as antisemitic. The International Holocaust Remembrance Alliance (IHRA) definition of antisemitism was used as a guide for these annotations. An example of one annotation is, “Behind them stand the Jews, who control your policies, media and economy”. In this sentence bin Laden suggests Jewish people control the wealthy Americans, which conforms with the IHRA’s characterisation of antisemitism as including claims of “Jews controlling the media, economy, government or other societal institutions”.


In [None]:
%time
import os
import csv
import pandas as pd


docs = {"bush" : {"name" : "George Bush", "filename" : "bush_sentences_gold.txt", "sentences" : []},
       "laden" : {"name" : "Osama bin Laden", "filename" : "bush_sentences_gold.txt", "sentences" : []},
       "king" : {"name" : "Martin Luther King", "sentences" : []},
       "hitler" : {"name" : "Adolf Hitler", "sentences" : []}}

dirpath = os.getcwd()

for orator, texts in orators.orators_dict.items():
    
    
    for text in texts.texts:
        for sentence in text.doc.sents:
            sent = {"function" : "", "hostile" : "", "text" : sentence.text.replace('\n', ' ').strip()}
            docs[orator]['sentences'].append(sent)
    
    filename = f"{orator}_sentences.csv"
    df = pd.DataFrame(docs[orator]['sentences'])
    filepath = os.path.join(dirpath, filename)
    
    df.to_csv(filepath, sep=',',index=False)

            
pd.DataFrame([{"Orator" : doc['name'], 
               "Number of Sentences" : len(doc['sentences'])} 
              for doc in docs.values()
             ])
            
            
    

## Import Annotation Results for Scoring by Sentiment Analysis APIs

In [None]:
import os
import csv
import pandas as pd
from cndlib.visuals import display_side_by_side

# Records for the two orators that have a gold file; "sentences" is filled
# from that file below.
docs = {
    "bush" : {"name" : "George Bush", "filename" : "bush_sentences_gold.txt", "sentences" : None},
    "laden" : {"name" : "Osama bin Laden", "filename" : "laden_sentences_gold.txt", "sentences" : None}
}

# Read each gold file from the working directory. The files are parsed as
# tab-delimited with the first row as header, one dict per data row.
for record in docs.values():
    filename = os.path.join(os.getcwd(), record['filename'])

    with open(filename, newline = "") as fp:
        record['sentences'] = list(csv.DictReader(fp, delimiter = '\t'))

# One summary row per orator: the total number of sentences followed by the
# count of every non-empty label in the "function" and "hostile" columns.
df = pd.DataFrame()
for record in docs.values():

    annotated = pd.DataFrame(record['sentences'])

    counts = {"Number of Sentences" : len(record['sentences'])}
    for column in ('function', 'hostile'):
        for label, total in annotated[column].value_counts().items():
            if label:
                counts[f"{label.title()} Sentences"] = str(total)

    df = pd.concat([df, pd.DataFrame(counts, index = [record['name']])])

# Labels absent for an orator are shown as blanks rather than NaN.
display_side_by_side([df.fillna('')], ["Elevation and Othering Annotation Results"])

### Get Google API Results

https://cloud.google.com/natural-language/docs/basics#:~:text=score%20of%200.8%20.-,Interpreting%20sentiment%20analysis%20values,the%20length%20of%20the%20document

In [None]:
%%time
import os
from tqdm import tqdm
import pickle

# instantiate Google Sentiment Analysis
from google.cloud import language_v1
client = language_v1.LanguageServiceClient()

    
# iterate through each orator() object
for orator in docs.values():
   
    # iterate over each Text() of the orator() object
    for sent_obj in tqdm(orator['sentences'], total = len(orator['sentences']), desc = orator['name']):

        text = sent_obj['text']
#         document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
        
#         sentiment = client.analyze_sentiment(request={'document': document}).document_sentiment
#         sent_obj['google sentiment score'] = sentiment.score
#         sent_obj['google sentiment magnitude'] = sentiment.magnitude

display(pd.DataFrame([obj for obj in docs['laden']['sentences']]))
    
# google_document_results = document_results

### Get IBM Watson API Results

In [None]:
%%time
import json
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, SentimentOptions, EmotionOptions

apikey = 'D3ptPkoLkoQNJvIav-reiA5137cr3m8Y1f-mhX1bLile'
url = 'https://api.eu-gb.natural-language-understanding.watson.cloud.ibm.com/instances/204e6ba7-952c-41ae-99e9-fe4e8208bfde'

authenticator = IAMAuthenticator(apikey)
service = NaturalLanguageUnderstandingV1(version='2019-07-12', authenticator=authenticator)
service.set_service_url(url)

In [None]:
# Score every sentence record with Watson, one request per sentence, and
# store the results on the record itself.
for record in docs.values():

    for sent_obj in tqdm(record['sentences'], total = len(record['sentences']), desc = record['name']):

        # Ask for both sentiment and emotion in the same request.
        requested = Features(sentiment=SentimentOptions(), emotion=EmotionOptions())
        response = service.analyze(text=sent_obj['text'], features=requested, language = "en")
        analytics = response.get_result()

        # Document-level sentiment score of the sentence.
        sent_obj['watson sentiment'] = analytics['sentiment']['document']['score']

        # One "Watson <emotion>" entry per emotion returned for the document.
        for emotion_name, emotion_score in analytics['emotion']['document']['emotion'].items():
            sent_obj[f"Watson {emotion_name}"] = emotion_score

### Get Microsoft Azure API Results

https://docs.microsoft.com/en-us/azure/cognitive-services/text-analytics/quickstarts/client-libraries-rest-api?tabs=version-3-1&pivots=programming-language-python#sentiment-analysis

In [None]:
import json
from tqdm import tqdm
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

# Key and endpoint are kept in a JSON file outside the notebook.
filename = "C:\\Users\\spa1e17\\OneDrive - University of Southampton\\CNDWip\\APIKeys\\AzureKeys.json"

with open(filename, 'r') as fp:
    keys = json.load(fp)
    apikey = keys['KEY 1']
    endpoint = keys['Endpoint']

credential = AzureKeyCredential(apikey)

text_analytics_client = TextAnalyticsClient(endpoint, credential)

# Score every sentence record, one request per sentence, and store a single
# signed number under 'azure sentiment':
#   positive / neutral -> confidence of that label
#   negative           -> minus the negative confidence
#   mixed              -> positive confidence minus negative confidence
#   request error      -> None
for orator in docs.values():

    for sent_obj in tqdm(orator['sentences'], total = len(orator['sentences']), desc = orator['name']):

        text = [sent_obj['text']]

        response = text_analytics_client.analyze_sentiment(text, language="en")
        result = response[0]

        # A per-document failure is returned as an error item that carries no
        # sentiment; record a gap instead of aborting the whole run.
        if result.is_error:
            sent_obj['azure sentiment'] = None
            continue

        label = result.sentiment
        confidence = result.confidence_scores

        if label == "mixed":
            # There is no "mixed" confidence score, so use the net polarity.
            score = confidence.positive - confidence.negative
        elif label == "negative":
            score = confidence.negative * -1
        else:
            score = getattr(confidence, label)

        sent_obj['azure sentiment'] = score

## Get TextBlob API Results

https://textblob.readthedocs.io/en/dev/quickstart.html#sentiment-analysis

In [None]:
from textblob import TextBlob

# Score every sentence record locally with TextBlob and store the result on
# the record.
for record in docs.values():

    for sent_obj in tqdm(record['sentences'], total = len(record['sentences']), desc = record['name']):

        # `sentiment` is a (polarity, subjectivity) pair; only the polarity
        # (its first field) is kept.
        polarity = TextBlob(sent_obj['text']).sentiment.polarity
        sent_obj['textblob sentiment'] = polarity

## Write Results to Disk

In [None]:
import os
from cndlib.cndutils import dump_jsonl
import json
print(type(docs))
filename = os.path.join(os.getcwd(), "gold_results.json")

# Serialise before opening the file: mode 'w' truncates on open, so a
# serialisation error raised inside the `with` would leave an empty file in
# place of any previously saved results.
payload = json.dumps(docs)

with open(filename, 'w') as file:
    file.write(payload)
    
    
    

In [None]:
# Table of bin Laden's sentence records, one row per sentence.
df = pd.DataFrame(docs['laden']['sentences'])

# Keep the annotation columns plus every column whose name mentions
# "sentiment", leaving out any "magnitude" column.
labels = ['text', 'function', 'hostile'] + [
    label for label in df.columns
    if 'sentiment' in label and 'magnitude' not in label
]

# Rows annotated as elevation, restricted to the selected columns. A single
# .loc lookup replaces the chained df[...][...] indexing.
display(df.loc[df['function'].eq('elevation'), labels])


In [None]:
def get_function(orator, entity):

    """
    Look up which of the orator's groups an entity belongs to.

    orator: key of the orator in the notebook-level `docs` dict
    entity: value searched for in docs[orator]['text']['groups']

    Returns "ingroup" or "outgroup" (ingroup is checked first), or
    "not found" when the entity is in neither group.
    """
    for grouping in ("ingroup", "outgroup"):
        if entity in docs[orator]['text']['groups'][grouping]:
            return grouping
    return "not found"

def assessment_test(col1, col2):

    """
    Test whether a sentiment label matches an ingroup/outgroup grouping.

    col1: sentiment label ("positive", "neutral" or "negative")
    col2: grouping ("ingroup" or "outgroup")

    Returns "pass" when a positive or neutral label meets an ingroup, or a
    negative label meets an outgroup; "fail" for the opposite pairings; and
    None for any other label or grouping (e.g. "not found").
    """

    # Evaluate the label on its own first: written inline as
    # `a or b and c`, `and` binds tighter than `or`, which would let
    # "positive" match regardless of the grouping.
    favourable = col1 in ("positive", "neutral")
    unfavourable = col1 == "negative"

    if col2 == "ingroup":
        if favourable:
            return "pass"
        if unfavourable:
            return "fail"
    elif col2 == "outgroup":
        if unfavourable:
            return "pass"
        if favourable:
            return "fail"

    return None
    
# create new dataframe based on filtered columns
def scores(table, column, labels):

    """
    Return the rows of a dataframe whose value in one column is in `labels`.

    table: pandas DataFrame to filter
    column: name of the column to test
    labels: collection of accepted values for that column

    The result is a new DataFrame with a fresh 0..n-1 index.
    """
    # table[column] looks the column up by the name passed in; attribute
    # access (table.column) would look for a column literally called "column".
    return table[table[column].isin(labels)].reset_index(drop=True)

## iterate through the docs
# NOTE(review): this cell reads columns 'label', 'mixed' and 'score' and the
# nested docs[orator]['text'] entry; none of them is created by the cells
# visible in this notebook, so confirm `docs` has that shape before running.
for orator in docs:

    # capture results
    results = pd.DataFrame(docs[orator]['sentences'])

    ## create a dataframe for positive and negative results
    dfs = {"elevation" : {"result" : None, "df" : scores(results, 'function', ['elevation'])},
           "othering" : {"result" : None, "df" : scores(results, 'function', ['othering'])}}

    for obj in dfs.values():

        df = obj["df"]

        # get the grouping for each entity; get_function is the lookup helper
        # defined in this notebook (no get_group is defined here). Building
        # the column from a list also works when the dataframe is empty.
        df["grouping"] = [get_function(orator, text) for text in df["text"]]

        # test whether sentiment score matches ingroup/outgroup
        df["test result"] = [assessment_test(label, grouping)
                             for label, grouping in zip(df["label"], df["grouping"])]

        # get the success scores for ingroup and outgroup; .get() gives 0
        # instead of a KeyError when no row passed
        pass_rate = df["test result"].value_counts(normalize = True).get("pass", 0.0)
        obj["result"] = format(pass_rate, '.0%')

        # format dataframe
        df.drop('mixed', axis = 1, inplace = True)
        df['text'] = df['text'].str.title()
        df.rename(columns = {"score" : "sentiment score", "text" : "entity text"}, inplace = True)
        df.columns = df.columns.str.title()

    # keep both tables and their pass rates on the orator's record
    docs[orator]['text']['analytics']['sentiment']['dfs'] = dfs
    
#     # display the outputs
#     display_side_by_side([output["df"] for output in dfs.values()],
#                          [f"{key.title()} scores for {docs[orator]['name']} has a True Positive Score of {obj['result']} from a total of {len(obj['df'])} Entities"
#                          for key, obj in dfs.items()])
#     print()

# Flatten the per-orator result tables into (orator name, group, entry)
# triples, keeping orator order and, within an orator, group order.
sentiment_tables = [
    (record['name'], group, entry)
    for record in docs.values()
    for group, entry in record['text']['analytics']['sentiment']['dfs'].items()
]

# Parallel lists: one dataframe and one caption per table.
dfs = [entry['df'] for _, _, entry in sentiment_tables]
captions = [
    f"{group.title()} scores for {name} has a Success of {entry['result']} from a total of {len(entry['df'])} Entities"
    for name, group, entry in sentiment_tables
]

display_side_by_side(dfs, captions)