# Creating Benchmark Data
---
 
In this notebook the benchmark data is created for testing the NLP technologies used in each experiment.

## Ingroup and Outgroup for Each Orator

In this cell a JSON object is created containing the ingroups and outgroups of each orator. These groups are noun phrases identifying the groups and are taken from the speech in which each orator identified their outgroup. For bin Laden, this was his first speech, published on 23/08/1996; Bush first identified his outgroup in his State of the Union address on 20/09/2001.

In [2]:
import os
import json
from datetime import datetime

# Benchmark noun phrases, keyed first by orator and then by group type.
groups_benchmark = {
    "bush": {
        "ingroup": [
            "america",
            "american people",
            "americans",
            "united states",
            "united states of America",
            "my fellow americans",
            "fellow americans",
        ],
        "outgroup": [
            "al qaeda",
            "taliban regime",
            "taliban",
            "egyptian islamic jihad",
            "islamic movement of uzbekistan",
        ],
    },
    "binladen": {
        "ingroup": [
            "people of islam",
            "islamic world",
            "ummah of islam",
            "muslims",
            "muslim people",
            "muslim nation",
        ],
        "outgroup": [
            "zionist-crusaders alliance",
            "american crusaders",
            "american zionist alliance",
            "american-israeli alliance",
            "saudi regime",
            "american enemy",
            "zionist-crusaders",
        ],
    },
}

# Folder that receives the benchmark file.
filepath = "C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Data/"

# Serialise first, then write the UTF-8 bytes to groups_benchmark.json in one go.
benchmark_json = json.dumps(groups_benchmark)
benchmark_path = os.path.join(filepath, "groups_benchmark.json")
with open(benchmark_path, "wb") as benchmark_file:
    benchmark_file.write(benchmark_json.encode("utf-8"))

# Timestamp the run so the cell output shows when the file was last written.
finished_at = datetime.now().strftime("%d/%m/%Y - %H:%M:%S")
print("complete at: ", finished_at)

complete at:  24/02/2020 - 15:40:04


In [14]:
# Building a DataFrame from lists of unequal length, approach taken from:
# https://stackoverflow.com/questions/19736080/creating-dataframe-from-a-dictionary-where-entries-have-different-lengths

import pandas as pd

# One top-level index entry per orator.
keys = list(groups_benchmark)
print(keys)

# Wrapping each phrase list in a Series lets pandas pad the shorter column with
# NaN; the NaN cells are then blanked out for display.
frames = [
    pd.DataFrame(
        {group: pd.Series(phrases) for group, phrases in orator_groups.items()},
        index=None,
    ).fillna("")
    for orator_groups in groups_benchmark.values()
]

display(pd.concat(frames, keys=keys))

['bush', 'binladen']


Unnamed: 0,Unnamed: 1,ingroup,outgroup
bush,0,america,al qaeda
bush,1,american people,taliban regime
bush,2,americans,taliban
bush,3,united states,egyptian islamic jihad
bush,4,united states of America,islamic movement of uzbekistan
bush,5,my fellow americans,
bush,6,fellow americans,
binladen,0,people of islam,zionist-crusaders alliance
binladen,1,islamic world,american crusaders
binladen,2,ummah of islam,american zionist alliance


## Creating the dataset

In [None]:
%%time

import datetime
import os

# Load every speech for both orators into memory.
#
# Fixes relative to the previous version of this cell:
#   * `bushraw` / `binladenraw` were appended to without ever being initialised,
#     so the cell raised NameError on a fresh kernel.
#   * `raw` stayed an empty string, so the reported document length was always 0.


def _read_speeches(folder, filenames):
    """Return the text of ``filenames`` (read from ``folder``) concatenated in list order."""
    texts = []
    for name in filenames:
        # `folder` already ends with a separator, so this equals folder + name.
        with open(os.path.join(folder, name), 'r') as text:
            texts.append(text.read())
    return ''.join(texts)


filepath = 'C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Speeches/'

binladenpath = os.path.join(filepath, 'Osama bin Laden/')
bushpath = os.path.join(filepath, 'George Bush/')

# George Bush speeches; the names must match the files on disk exactly.
FileList = ['20010114-Remarks at the National Day of Prayer & Remembrance Service.txt',
            '20010115-First Radio Address following 911.txt',
            '20010117-Address at Islamic Center of Washington, D.C..txt',
            '20010120-Address to Joint Session of Congress Following 911 Attacks.txt',
            '20010911-Address to the Nation.txt',
            '20011007-Operation Enduring Freedom in Afghanistan Address to the Nation.txt',
            '20011011-911 Pentagon Remembrance Address.txt',
            '20011011-Prime Time News Conference on War on Terror.txt',
            '20011026-Address on Signing the USA Patriot Act of 2001.txt',
            '20011110-First Address to the United Nations General Assembly.txt',
            '20011211-Address to Citadel Cadets.txt',
            '20011211-The World Will Always Remember 911.txt',
            '20020129-First (Official) Presidential State of the Union Address.txt',
            ]

bushraw = _read_speeches(bushpath, FileList)

# Osama bin Laden speeches.
FileList = ['19960823-OBL Declaration.txt',
            '20011007-OBL Full Warning.txt',
            '20011109-OBL.txt',
            '20021124-OBL Letter to America.txt',
            '20041101-Al Jazeera Speech.txt'
            ]

binladenraw = _read_speeches(binladenpath, FileList)

# NOTE(review): `raw` is taken to be the full corpus (Bush followed by bin Laden),
# which is what the length report and the commented-out export below suggest -
# confirm this is the intended ordering.
raw = bushraw + binladenraw

# with open(os.path.join(filepath, "fulltext.txt"), 'w') as text:
#         text.write(raw)

print('length of doc: ', len(raw))
print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')

## Capturing Sentences Relating to Ingroup and Outgroup

In this cell we iterate over every sentence in the speeches and, where appropriate, manually classify it as either ingroup elevation or outgroup othering.

In [None]:
from spacy import displacy
import os

# Manual annotation loop.
#
# Each sentence of `doc` is shown in turn and the annotator files it under
# ingroup ('i') or outgroup ('o'), steps back one sentence ('b'), or quits ('q');
# any other input skips the sentence. Progress and both result sets are written
# to JSON after every answer so an interrupted session can be resumed.
#
# NOTE(review): `doc` is not created in this cell - it is assumed to be a spaCy
# Doc built from the speech text in an earlier cell; confirm before running.

# dictionary object for the sentences from each file
sentences = dict()

# Sentences are stored as [start, end] token offsets into `doc` rather than as
# text; a trailing newline token is left out of the span.
for sentence in doc.sents:
    end = sentence.end - 1 if doc[sentence.end - 1].text == '\n' else sentence.end
    sentences[len(sentences)] = [sentence.start, end]

# print the first five sentences as a sanity check
for key in list(sentences)[:5]:
    print(key, '=>', doc[sentences[key][0]:sentences[key][1]])
print()

filepath = 'C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/'


def _load_json(filename, default):
    """Return the parsed content of ``filename``, or ``default`` if it is missing or unreadable."""
    try:
        with open(os.path.join(filepath, filename), 'r') as fp:
            return json.load(fp)
    except (OSError, ValueError):
        # Best effort: no earlier session (or a damaged file) means start from `default`.
        # json.JSONDecodeError is a ValueError. KeyboardInterrupt is no longer swallowed.
        return default


def _save_json(filename, payload):
    """Write ``payload`` to ``filename`` under ``filepath`` as UTF-8 encoded JSON."""
    with open(os.path.join(filepath, filename), "wb") as f:
        f.write(json.dumps(payload).encode("utf-8"))


def _int_keys(mapping):
    """Return ``mapping`` with its keys converted to int.

    JSON object keys are always strings, so a reloaded dictionary would otherwise
    no longer be addressable by the integer positions this cell uses.
    """
    return {int(key): value for key, value in mapping.items()}


# open previous files and progress index
ingroup = _int_keys(_load_json("ingroup_sentences.json", {}))
outgroup = _int_keys(_load_json("outgroup_sentences.json", {}))
index = _load_json("index.json", {"index": 0})

print(index)

# iterate over the sentences for classification as ingroup or outgroup;
# the saved index (not a stale loop variable) decides when the loop is finished
while index["index"] < len(sentences):
    # get the current progress through the dictionary object
    i = index["index"]

    # record progress through dictionary object
    _save_json("index.json", index)

    start, end = sentences[i]
    current = doc[start:end]

    # print current sentence
    print('-----')
    print('index: ', i)
    displacy.render(current, style = 'ent')
    print(i, '/', len(sentences), '=>', current)
    entry = input('ingroup(i) / outgroup(o) / back(b)').lower()

    if entry == 'i':
        # add sentence to ingroup dictionary and write it to file
        print(len(ingroup), ' => ingroup add: ', current)
        ingroup[len(ingroup)] = [start, end]
        _save_json("ingroup_sentences.json", ingroup)
        index["index"] += 1

    elif entry == 'o':
        # add sentence to outgroup dictionary and write it to file
        print(len(outgroup), ' => outgroup add: ', current)
        outgroup[len(outgroup)] = [start, end]
        _save_json("outgroup_sentences.json", outgroup)
        index["index"] += 1

    elif entry == 'b':
        # go back by one sentence and undo its classification, if it had one
        if i != 0:
            print('iterating backwards by one sentence')
            previous = sentences[i - 1]
            last_in = len(ingroup) - 1
            last_out = len(outgroup) - 1

            # the previous sentence can only be the most recent entry of either dictionary
            if last_in >= 0 and previous == ingroup[last_in]:
                print('deleting from ingroup: ', doc[ingroup[last_in][0]:ingroup[last_in][1]])
                del ingroup[last_in]
                _save_json("ingroup_sentences.json", ingroup)

            elif last_out >= 0 and previous == outgroup[last_out]:
                print('deleting from outgroup: ', doc[outgroup[last_out][0]:outgroup[last_out][1]])
                del outgroup[last_out]
                _save_json("outgroup_sentences.json", outgroup)

            index["index"] -= 1

        else:
            print('already at the first sentence')

    # quit
    elif entry == 'q':
        break

    else:
        # any other input skips the sentence without classifying it
        index["index"] += 1

# persist the final position so a finished run is not resumed at the last sentence
_save_json("index.json", index)

print(f'completed at {str(datetime.datetime.now())}') #1220