# Creating Benchmark Data
---
 
In this notebook the benchmark data is creating for testing the NLP technologies in each experiment.

## InGroup and Outgroup For Each Orator

In this cell a JSON object is created containing the ingroups and outgroups of each orator. These groups are noun phrases identifying the groups and are taken from the speech in which each orator identified their outgroup. For bin Laden, this was his first speech published on 23/08/1996; for Bush he first identified hi outgroup in his State of the Union address on 20/09/2001

In [5]:
import os
import json
from datetime import datetime

groups_benchmark = {
    
    "bush" : {
        "ingroup" : ["america", "american people", "americans", "united states", "united states of America", "my fellow americans", "fellow americans"],
        "outgroup" : ["al qaeda", "taliban regime", "taliban", "egyptian islamic jihad", "islamic movement of uzbekistan"]
    },
    
    "binladen" : {
        "ingroup" : ["people of islam", "islamic world", "ummah of islam", "muslims", "muslim people", "muslim nation"],
        "outgroup" : ["zionist-crusaders alliance", "american crusaders", "american zionist alliance", "american-israeli alliance", \
                      "Jewish-crusade alliance", "saudi regime", "american enemy", "zionist-crusaders", "Christian armies of the Americans", 
                     "american people", "american army", "the bush administration"]
    }
}

filepath = "C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Data/"

with open(os.path.join(filepath, "groups_benchmark.json"), "wb") as f:
    f.write(json.dumps(groups_benchmark).encode("utf-8"))
    
print("complete at: ", datetime.now().strftime("%d/%m/%Y - %H:%M:%S"))    

complete at:  19/06/2020 - 15:22:57


In [6]:
# https://stackoverflow.com/questions/19736080/creating-dataframe-from-a-dictionary-where-entries-have-different-lengths

import pandas as pd

keys = list(groups_benchmark.keys())
print(keys)

frames = []
for value in groups_benchmark.values():
    frames.append(pd.DataFrame(dict([ (k, pd.Series(v)) for k, v in value.items() ]), index = None).fillna(""))

display(pd.concat(frames , keys = keys))

['bush', 'binladen']


Unnamed: 0,Unnamed: 1,ingroup,outgroup
bush,0,america,al qaeda
bush,1,american people,taliban regime
bush,2,americans,taliban
bush,3,united states,egyptian islamic jihad
bush,4,united states of America,islamic movement of uzbekistan
bush,5,my fellow americans,
bush,6,fellow americans,
binladen,0,people of islam,zionist-crusaders alliance
binladen,1,islamic world,american crusaders
binladen,2,ummah of islam,american zionist alliance


## Instantiate the Pipeline

In [1]:
%%time
import importlib
import pipeline
importlib.reload(pipeline)

dirpath = r'C:\\Users\\Steve\\OneDrive - University of Southampton\\CNDPipeline\\dataset'
print("initiating custom pipeline")
cnd = pipeline.CND()

print([name for name in cnd.nlp.pipe_names])

initiating custom pipeline
['tagger', 'parser', 'ner', 'Named Entity Matcher', 'merge_entities', 'Concept Matcher']
Wall time: 19.9 s


## Creating the data

In [2]:
%%time

filepath = r"C:\Users\Steve\OneDrive - University of Southampton\CNDPipeline\dataset\Osama bin Laden\fulltext.txt"

with open(filepath, "r") as f:
    fulltext = f.read()
    
doc = cnd(fulltext)

sents_dict = dict()

for sent in doc.sents:
    if doc[sent.end -1].text == '\n':
        sents_dict[len(sents_dict)] = str(sent)
    else:
        sents_dict[len(sents_dict)] = str(sent)
        
print(len(sents_dict))

740
Wall time: 13 s


## Capturing Sentences Relating to Ingroup and Outgroup

In this notebook we iterate over all the sentence in the speech if appropriate manually classify each sentence as either ingroup elevation or outgroup othering.

In [8]:
from datetime import datetime
import os
import json
from IPython.display import clear_output
from spacy import displacy
from visuals import sent_frame

sents_dict = dict()

dirpath = os.getcwd()
ingroup = dict()
outgroup = dict()
index = dict()
ingroup_file = "ingroup_sents.json"
ingroup_filepath = os.path.join(dirpath, ingroup_file)
outgroup_file = "outgroup_sents.json"
outgroup_filepath = os.path.join(dirpath, outgroup_file)
index_filepath = os.path.join(dirpath, "index.json")

# open previous file and progress index

try:
    with open(ingroup_filepath, 'r') as fp:
        ingroup = json.load(fp)
except:
    pass

try:
    with open(outgroup_filepath, 'r') as fp:
        outgroup = json.load(fp)
except:
    pass

try:
    with open(index_filepath, 'r') as fp:
        index = json.load(fp)
except:
    index = 0

#iterate over each sentence dictionary for classification of ingroup or outgroup
while index < len(sents_dict):

    # record progress  through dictionary object
    with open(index_filepath, "wb") as f:
            f.write(json.dumps(index).encode("utf-8"))

    # clear screen
    clear_output(wait=True)# get text
    
    # show progress through input_dict
    print(f'{index} / {len(sents_dict)}')
    
    # get sentence text
    text = sents_dict[index]

    # parse text
    doc = cnd(text)

    # if the option to show the dependency parse is passed display it
#     displacy.render(doc, style="dep")

    # display the sentence frame in compact form
    display(sent_frame(doc))

    entry = input('ingroup(i) / outgroup(o) / delete (d) / back(b)').lower()
    
    # ask if sentence is refering to an ingroup or outgroup
    if entry in ['i', 'o']:        
        if entry == 'i': # add sentence to ingroup dictionary if user selects ingroup
            print(len(ingroup), ' => ingroup add: ', text)
            ingroup[len(ingroup)] = text
            
            # write dictionary to file
            with open(ingroup_filepath, "wb") as f:
                f.write(json.dumps(ingroup).encode("utf-8"))
            
        else: # else add sentence to outgroup dictionary
            print(len(outgroup), ' => outgroup add: ', text)
            outgroup[len(outgroup)] = text
            
            # write dictionary to file
            with open(outgroup_filepath, "wb") as f:
                f.write(json.dumps(outgroup).encode("utf-8"))
                
        # increase index by 1
        index += 1
    
    # if user enters 'd' then go back by 1 in the dictionary and delete
    elif entry == 'd': 
        if index != 0:
            
            # test whether the previous sentence was ingroup or outgroup and delete from respective dictionary
            
            if index >= 0 and len(ingroup) - 1 >= 0 and sents_dict[index-1] == ingroup[len(ingroup) - 1]:
                print('deleting from ingroup: ', ingroup.pop())

                with open(ingroup_filepath, "wb") as f:
                    f.write(json.dumps(ingroup).encode("utf-8"))

            elif index >= 0 and len(outgroup) - 1 >= 0 and sents_dict[index-1] == outgroup[len(outgroup) - 1]:
                print('deleting from outgroup: ', outgroup.pop())

                with open(outgroup_filepath, "wb") as f:
                    f.write(json.dumps(outgroup).encode("utf-8"))

            index -= 1
        
        else:
            print('iterating backwards by one sentence')
            pass
        
    # quit    
    elif entry == 'q':
        break
        
    else:
        index += 1

print("complete at: ", datetime.now().strftime("%d/%m/%Y - %H:%M:%S"))  #1220

complete at:  19/06/2020 - 15:23:36
