In [5]:
%%time

import datetime
import os

def get_dataset_dirpath(cwd):
    
    one_up = os.path.dirname(cwd)
    
    return os.path.join(one_up, 'dataset')

dataset_dirpath = get_dataset_dirpath(os.getcwd())

binladenpath = os.path.join(dataset_dirpath, 'Osama bin Laden/')
bushpath = os.path.join(dataset_dirpath, 'George Bush/')


Bush_FileList = [
    '20010914-Remarks at the National Day of Prayer & Remembrance Service.txt',
    '20010915-First Radio Address following 911.txt',
    '20010917-Address at Islamic Center of Washington, D.C..txt',
    '20010920-Address to Joint Session of Congress Following 911 Attacks.txt',
    '20010911-911 Address to the Nation.txt',
    '20011007-Operation Enduring Freedom in Afghanistan Address to the Nation.txt',
    '20011011-911 Pentagon Remembrance Address.txt',
    '20011011-Prime Time News Conference on War on Terror.txt',
    '20011026-Address on Signing the USA Patriot Act of 2001.txt',
    '20011110-First Address to the United Nations General Assembly.txt',
    '20011211-Address to Citadel Cadets.txt',
    '20011211-The World Will Always Remember 911.txt',
    '20020129-First (Official) Presidential State of the Union Address.txt'
]

text = ''

raw = ""
for file in Bush_FileList:
    with open(os.path.join(bushpath, file), 'r') as text:
        raw = raw + text.read()
        
print(f'doc length: {len(raw)}')

doc length: 111934
CPU times: user 1.72 ms, sys: 3.28 ms, total: 5 ms
Wall time: 5.56 ms


In [None]:
import spacy
from spacy.pipeline import merge_entities


print('setting up pipeline')
coref_nlp = spacy.load('en_coref_md')
spacy_nlp = spacy.load('en_core_web_md')

spacy_nlp.add_pipe(merge_entities)

print('applying pipelines')
coref_doc = coref_nlp(doc)
new_doc = spacy_nlp(coref_doc._.coref_resolved) #replace co-reference entities with root reference

print('complete')

In [None]:
print(coref_doc._.coref_resolved)

In [None]:
#extract named entities

named_entities = [x.text for x in new_doc.ents]

In [None]:
from spacy import displacy
from pathlib import Path

# displacy.render(new_doc, jupyter=True, style='ent')

html = displacy.render(new_doc, jupyter=True, style='dep', page=True)
file_name = 'GB Speech' + '.html'
output_path = Path('C:/Users/Steve/Documents/Cultural Violence/George Bush/' + file_name)
output_path.open('w', encoding='utf-8').write(html)

#sentences = [x for x in new_doc.sents]

#for sentence in sentences[0:5]:
#    displacy.render(sentence, jupyter=True, style='ent')
    
# ent = entity resolution
# dep = dependency resolution

In [2]:
%%time

#clean document

entity_list = []
doc_array = []

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

print('Tokenizing document')

for word in spacy_nlp.Defaults.stop_words:
    lex = spacy_nlp.vocab[word]
    lex.is_stop = True

for sentence in new_doc.sents:
    doc_array.append([lemmatizer.lemmatize(token.text).strip().lower() for token in sentence if 
                      (not token.is_punct and token.text.find('\n') and not token.is_space and token.text !="'s")])
    
# not token.is_stop and
    
print('complete')
# for i in doc_array:
#     print(i)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/stephenanningcorp/nltk_data'
    - '/Users/stephenanningcorp/opt/anaconda3/envs/spaCy_v3/nltk_data'
    - '/Users/stephenanningcorp/opt/anaconda3/envs/spaCy_v3/share/nltk_data'
    - '/Users/stephenanningcorp/opt/anaconda3/envs/spaCy_v3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [3]:
nltk.download('stopwords')


NameError: name 'nltk' is not defined

In [None]:
print('Creating Word2Vec model')
from gensim.models import Word2Vec
model = Word2Vec(doc_array, size=500, window=5, min_count=1, workers=4, sg=0)
print('complete')

In [None]:
print('Create a list of named entities') #does this need to be named entities or mentions?
    
# document level
entities = [(e.text, e.start_char, e.end_char, e.label_) for e in new_doc.ents]

# entity level
entity_list = []

for entity in entities:
    if entity[3] in ['PERSON', 'ORG', 'NORP', 'GPE'] and entity[0] != '\n':
        entity_list.append(entity[0].lower())
        
entity_set = set(entity_list)
print('complete')

In [None]:
print('Creating a list of influence words')

# this is a set of elevation and otherising words. The idea is to measure the proximity of these words to the named entities

biblical_set = {'god', 'evil'}
ideology_set = {'nazism'}
crime_set = {'murder', 'murderers', 'murderous', 'terrorist', 'terrorism'}
total_set = biblical_set | entity_set | ideology_set | crime_set

print('complete')

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

def tsne_plot(model, set_array):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    print('creating tokens and labels')
    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    print('building tsne model')
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    
    print('constructing graph')
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        
        if labels[i] in set_array:
            plt.annotate(labels[i],
                xy=(x[i], y[i]),
                xytext=(5, 2),
                textcoords='offset points',
                ha='right',
                va='bottom')
    plt.show()
    
tsne_plot(model, total_set)

In [None]:
#model.wv.most_similar_cosmul(positive=['taliban', 'america'], negative = ['enemy'])

model.wv.most_similar('them')


In [None]:
model.wv.similarity('enemy', 'al qaeda')

In [None]:
from gensim.summarization import keywords

keyword_array = []

#words = keywords(doc[0])
#words = words.split('\n')

print(keywords(doc))