In [None]:
import gensim
import datetime

print('creating word2vec model from GoogleNews-vectors-negative300.bin.gz')

text = 'C:/Users/Steve/Documents/Cultural Violence/GoogleNews-vectors-negative300.bin.gz'
#text = ''

model = gensim.models.KeyedVectors.load_word2vec_format(text, binary=True)

vocab = model.vocab.keys()
print(f"{len(vocab):,d}")
print(f'completed at {str(datetime.datetime.now())}')

In [2]:
%%time

import datetime
import os

def get_dataset_dirpath(cwd):
    
    one_up = os.path.dirname(cwd)
    
    return os.path.join(one_up, 'dataset')

dataset_dirpath = get_dataset_dirpath(os.getcwd())

binladenpath = os.path.join(dataset_dirpath, 'Osama bin Laden/')
bushpath = os.path.join(dataset_dirpath, 'George Bush/')


Bush_FileList = [
    '20010914-Remarks at the National Day of Prayer & Remembrance Service.txt',
    '20010915-First Radio Address following 911.txt',
    '20010917-Address at Islamic Center of Washington, D.C..txt',
    '20010920-Address to Joint Session of Congress Following 911 Attacks.txt',
    '20010911-911 Address to the Nation.txt',
    '20011007-Operation Enduring Freedom in Afghanistan Address to the Nation.txt',
    '20011011-911 Pentagon Remembrance Address.txt',
    '20011011-Prime Time News Conference on War on Terror.txt',
    '20011026-Address on Signing the USA Patriot Act of 2001.txt',
    '20011110-First Address to the United Nations General Assembly.txt',
    '20011211-Address to Citadel Cadets.txt',
    '20011211-The World Will Always Remember 911.txt',
    '20020129-First (Official) Presidential State of the Union Address.txt'
]

binLaden_FileList = [
    '19960823-Declaration of Jihad Against the Americans Occupying the Land of the Two Holiest Sites.txt',
    '20010107-Osama Bin Laden Letter Calling For Global Islamic State.txt',
#     '20011109-Bin Laden\'s Statement The Sword Fell.txt',
    '20021124-OBL Letter to America.txt',
    '20041101-Al Jazeera Speech.txt'
           ]

records = {
    "bush": {"filepath": bushpath, "texts": Bush_FileList},
    "binladen": {"filepath": binladenpath, "texts": binLaden_FileList}
}

for orator, record in records.items():

    raw = ""
    for text in record["texts"]:
        with open(os.path.join(record["filepath"], text), 'r') as text:
            raw = raw + text.read()
                         
    record['fulltext'] = raw
    print(f'{orator} doc length = {len(raw)}')
    
    fulltext_path = os.path.join(record["filepath"], 'fulltext.txt')
        
    with open(fulltext_path, 'w') as text:
            text.write(raw)

print(f'completed at: {datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")}')

bush doc length = 111934
binladen doc length = 86384
completed at: Oct 09 2022 07:26:12
CPU times: user 2.39 ms, sys: 3.62 ms, total: 6.02 ms
Wall time: 8.03 ms


In [5]:
# Applying pipeline

import spacy


print('setting up pipeline')
nlp = spacy.load('en_core_web_md')
print('applying pipeline')

bush_text = records['bush']['fulltext']

doc = nlp(bush_text)

print(f"document length: {len(doc)}")
print(f"number of entities: {len(doc.ents)}")

entset = set()

for ent in doc.ents:
    if ent.label_ in {'ORG', 'GPE', 'NORP'}:
        entset.add(ent.text.replace(' ', '_'))
    
print(f"number of entities: {len(entset)}")

print(f'completed at {str(datetime.datetime.now())}')

setting up pipeline
applying pipeline
document length: 22591
number of entities: 995
number of entities: 155
completed at 2022-10-09 07:26:42.191008


In [6]:
### the purpose of this cell is to test the different ways to find similarity

import pandas as pd

array = {'Enemy', 'Friend', 'Terrorist', 'al_Qaida'}

word1 = input('Target Word 1: ')
word2 = input('Opposing Word 2: ')

print()
print(f"{word1}-{word2}") # showing words most similar to the target word, minus the opposing word
print(pd.DataFrame(model.most_similar(positive=[word1]), columns = [f"Closest to {word1} |", 'Similarity |']).T)
print('-----')
print('Similarity')
print(f"{word1}-{word2} similarity = {model.similarity(word1, word2)}") #showing the similarity of each word

Target Word 1:  Enemy
Opposing Word 2:  Friend



Enemy-Friend


NameError: name 'model' is not defined

In [None]:
import pandas as pd
### the purpose of this cell is to find the closest entity to a word

entity_scores = []
target_word = input('target word:')

#terrorism, islam, taliban, al_Qaeda, afghans, Usama_bin_Laden, terrorist, somalia, islamic jihad, iraq

#gypsy,

for ent in entset:
    try: 
        entity_scores.append([ent, model.similarity(target_word, ent)]) #showing the entities most closely associated with the target word
    except:
        pass

df2 = pd.DataFrame(entity_scores).sort_values(by=[1], ascending=False)
print(df2[0:31])

## rank all entities of the text relative to the in-group and out-group to each seed words, at least 5 seed words per group
## sum up scores of each entity to each word and divide by number of seed words
## create a method to measure closeness to seed word and distance from its opposite
## output: 2 scores: average distance to ingroup and average to outgroup chose lowest score to infer group membership

## for entities e
##    average = 0
##    for seed words w
##       average = average + distance(e,w)
##       divide by number of seed words
## average = average/seed words
## e.outgroupscore = average
## if 

##{entity1: {'ingroup': {'friend': 0.5, 'ally': 0.4}, 'outgroup': {'enemy': 0.1, 'adversary': 0.15}}}


In [None]:
### the purpose of this cell is to determine whether entities are ingroup or outgroup

entity_dict = {}

#ingroup_nouns = model.most_similar('friend')
#outgroup_nouns = model.most_similar('enemy')

## note, Google News model does not have all words in lower case and noun phrases are connected with underscore
## note, model also does not convert words to lower case

outgroup_nouns = {'Enemies', 'Enemy', 'Terrorists', 'Terrorist', 'Nazi', 'Nazis', 'enemies', 'enemy', 'terrorists', 'terrorist', 'nazi', 'nazis'} # ingroup and outgroup words taken from commonly used words in speeches
ingroup_nouns = {'Friend', 'Friends', 'Sons', 'Daughters', 'Brethren', 'Hero', 'Heroes', 'friend', 'friends', 'sons', 'daughters', 'brethren', 'hero', 'heroes'} 

for ent in new_doc.ents: # entity extraction completed by Spacy pipeline
    if ent.label_ in {'ORG', 'GPE', 'NORP'}: #creates a set of entity types: organisation (ORG), geopolitical (GPE) or nationality or religious or political group (NORP)
        entset.add(ent.text.replace(' ', '_')) # noun phrases in model are created using _

entity_dict = {}

for i, ent in enumerate(entset):
    ingroup_list = []
    outgroup_list = []
    ingp_score = 0
    outgp_score = 0
       
    for ingp, outgp in zip(ingroup_nouns, outgroup_nouns): # this needs changing since the lists are of different length
        try:
            ingp_sim =  model.similarity(ent, ingp) # score for most similar word to ingroup entity
            outgp_sim = model.similarity(ent, outgp) # score for most similar word to outgroup entity

            ingroup_list.append((ingp, ingp_sim )) # create a list of words most similar to ingroup entity
            outgroup_list.append((outgp, outgp_sim)) # create a list of word most similar to outgroup entity

            ingp_score += ingp_sim # create total score for words most similar to ingroup words
            outgp_score += outgp_sim # create total score for words most similary to outgroup words
            
            entity_dict[ent] = { # create a dict object for each entity
                              'ingroup': sorted(ingroup_list, key=lambda tup:(-tup[1], tup[0])), # add list of ingroup words similar to entity and sort
                              'outgroup' : sorted(outgroup_list, key=lambda tup:(-tup[1], tup[0])), # add list of outgroup words similar to entity and sort
                              'ingroup_score': ingp_score/len(ingroup_nouns), # average score for ingroup association
                              'outgroup_score': outgp_score/len(outgroup_nouns) # average score for outgroup association
                             }
        except:
            pass
       

    
for ent, key in sorted(zip(entity_dict, entity_dict.values())):
    if key['ingroup_score'] < key['outgroup_score']: # if average outgroup score is higher, the entity is more likely to be outgroup
        print(f"{ent} is outgroup:")
        print(f"ingroup score = {key['ingroup_score']}, outgroup score = {key['outgroup_score']}")
        print(f"distance between ingroup average and outgroup average: {key['outgroup_score'] - key['ingroup_score']}")
    else:
        print(f"{ent} is ingroup:") # else the entity is more likely to be ingroup
        print(f"ingroup score = {key['ingroup_score']}, outgroup score = {key['outgroup_score']}")
        print(f"distance between ingroup average and outgroup average: {key['ingroup_score'] - key['outgroup_score']}")
    print()          
    
    print("outgroup words: ", end = "")
    for word in key['outgroup'][0:3]:
              print(word, end = ' | ')
    print()
    
    print("ingroup word: ", end = "")          
    for word in key['ingroup'][0:3]:
              print(word, end = ' | ')
    print()
    print('-----')
    

In [None]:
for i, ent in enumerate(entset):
    print(f"{i}/{len(entset)}")
    try:
        print(f"{ent}:")
        for word in model.wv.most_similar(ent):
            print(word, end = ' | ')
    except:
        print(f"{ent}: not in vocabulary")
    print('-----')

In [110]:
### The purpose of this cell is to identify the ingroup and outgroups recursively

import itertools as it
import statistics

ent_set = set()

# organisation (ORG), geopolitical (GPE) or nationality or religious or political group (NORP)

for ent in new_doc.ents: # entity extraction completed by Spacy pipeline 
    if ent.label_ in {'ORG', 'GPE', 'NORP'} and ent.text in model: #creates a set of entity types and checks if in model: 
        ent_set.add(ent.text.replace(' ', '_')) # noun phrases in model are created using _
       

seed_dict = {'IN' : ['friend', 'Friend'],
             'OUT' : ['enemy', 'Enemy']}

#data initialisation complete

##recursive categoriser

def grp_categoriser(ent_set, seed_dict):

    def ent_scores(ent_set, seed_dict): #function to score each entity of ent_set relative to ingroup/outgroup similarity
        entity_dict = {}
        n = 0
        for i, entity in enumerate(ent_set):
            ingrp_scores = {}
            outgrp_scores = {}
            ingrp_ave = 0
            outgrp_ave = 0
            
            try:
                #create dict of ingroup scores for each seed word
                ingrp_scores = {'seed words' : seed_dict['IN'], 'ingrp_scores' : [model.similarity(entity, ingp) for ingp in seed_dict['IN']]} 
                
                #create dict of outgroup scores for each seed word
                outgrp_scores = {'seed words' : seed_dict['OUT'], 'outgrp_scores' : [model.similarity(entity, ingp) for ingp in seed_dict['OUT']]} 

                # calculate averge score signifying closeness to ingroup
                ingrp_ave = sum(ingrp_scores['ingrp_scores']) / float(len(ingrp_scores['ingrp_scores'])) 
                
                # calculate averge score signifying closeness to outgroup
                outgrp_ave = sum(outgrp_scores['outgrp_scores']) / float(len(outgrp_scores['outgrp_scores'])) 

                
                if entity == 'America':
                    print(f'InGroup average: {ingrp_ave}')
                    print(pd.DataFrame.from_dict(ingrp_scores).T)
                    print('-----')
                    print(f'OutGroup average: {outgrp_ave}')
                    print(pd.DataFrame.from_dict(outgrp_scores).T)
                    print('-------------')
                
                if ingrp_ave > outgrp_ave: # if the ingrp_ave is higher that outgrp entity is in the ingroup
                    entity_dict[i] = {'entity' : entity,
                                        'group' : 'IN',
                                        'score' : ingrp_ave}
                else:
                    entity_dict[i] = {'entity' : entity, # ...otherwise entity is in the outgroup
                                        'group' : 'OUT',
                                        'score' : outgrp_ave}
            except:
                print('passing: ', n)
                n+= 1
                pass

        return entity_dict.pop(max(entity_dict.items(), key=lambda score: score[1]['score'])[0]) #return max entry from entity dictionary

    def group_append(max_key, ent_set, seed_dict):
        if len(ent_set) == 1:
            return seed_dict

        else:  
            
            ent_set.remove(max_key['entity']) # remove max entity from entity set
            
            seed_dict[max_key['group']].append(max_key['entity']) # add max entity to appropriate seed word grouping
            
            return group_append(ent_scores(ent_set, seed_dict), ent_set, seed_dict) # re-evaluate based on new seed words
        
    return group_append(ent_scores(ent_set, seed_dict), ent_set, seed_dict)


print('Total ', len(ent_set), 'entities')
groupings = (grp_categoriser(ent_set, seed_dict))
print('InGroup', len(groupings['IN']), 'entities')
print(sorted(groupings['IN']))
print('----')
print('OutGroup', len(groupings['OUT']), 'entities')
print(sorted(groupings['OUT']))




Total  96 entities


  if np.issubdtype(vec.dtype, np.int):


InGroup average: 0.03736474830657244
                      0          1
seed words       friend     Friend
ingrp_scores -0.0187747  0.0935042
-----
OutGroup average: 0.16214321553707123
                      0         1
seed words        enemy     Enemy
outgrp_scores  0.174609  0.149677
-------------
InGroup average: 0.03736474830657244
                      0          1
seed words       friend     Friend
ingrp_scores -0.0187747  0.0935042
-----
OutGroup average: 0.14297782878081003
                      0         1         2
seed words        enemy     Enemy   Taliban
outgrp_scores  0.174609  0.149677  0.104647
-------------
InGroup average: 0.03736474830657244
                      0          1
seed words       friend     Friend
ingrp_scores -0.0187747  0.0935042
-----
OutGroup average: 0.1353284865617752
                      0         1         2        3
seed words        enemy     Enemy   Taliban   Afghan
outgrp_scores  0.174609  0.149677  0.104647  0.11238
-------------
InGroup 