https://radimrehurek.com/gensim/models/word2vec.html

https://rare-technologies.com/word2vec-tutorial/

In [None]:
%%time
import pandas as pd
import dataframe_image as dfi

from gensim.models import Word2Vec

import spacy

# Build the spaCy pipeline: load the medium English model, then register the
# two extra components in order ('merge_entities' right after 'ner',
# 'entityfishing' right after 'merge_entities').
nlp = spacy.load("en_core_web_md")
for component, predecessor in (('merge_entities', 'ner'), ('entityfishing', 'merge_entities')):
    nlp.add_pipe(component, after=predecessor)

# Show the resulting component order as a single-row table.
pipeline_overview = pd.DataFrame({'spaCy pipeline components': nlp.pipe_names})
display(pipeline_overview.T)

In [None]:
%%time

import datetime
import os

def get_dataset_dirpath(cwd):
    """Return the path of the ``dataset`` directory two levels above ``cwd``.

    Args:
        cwd: directory path to start from (typically ``os.getcwd()``).

    Returns:
        ``<grandparent of cwd>/dataset`` as a string. The path is only
        computed, never checked for existence.
    """
    grandparent = cwd
    for _ in range(2):
        # Each pass strips the last path component.
        grandparent = os.path.dirname(grandparent)

    return os.path.join(grandparent, 'dataset')

# Locate the dataset directory relative to the notebook's working directory.
dataset_dirpath = get_dataset_dirpath(os.getcwd())

# One sub-directory per speaker. `binladenpath` is not used anywhere in the
# visible part of this notebook.
binladenpath = os.path.join(dataset_dirpath, 'Osama bin Laden/')
bushpath = os.path.join(dataset_dirpath, 'George Bush/')


# Speech transcripts (file names inside `bushpath`) that make up the corpus.
# The files are read in exactly this order.
Bush_FileList = [
    '20010914-Remarks at the National Day of Prayer & Remembrance Service.txt',
    '20010915-First Radio Address following 911.txt',
    '20010917-Address at Islamic Center of Washington, D.C..txt',
    '20010920-Address to Joint Session of Congress Following 911 Attacks.txt',
    # NOTE(review): this entry's date prefix (20010911) is earlier than the
    # entries above it, so the list is not sorted by date — confirm whether
    # the order matters.
    '20010911-911 Address to the Nation.txt',
    '20011007-Operation Enduring Freedom in Afghanistan Address to the Nation.txt',
    '20011011-911 Pentagon Remembrance Address.txt',
    '20011011-Prime Time News Conference on War on Terror.txt',
    '20011026-Address on Signing the USA Patriot Act of 2001.txt',
    '20011110-First Address to the United Nations General Assembly.txt',
    '20011211-Address to Citadel Cadets.txt',
    '20011211-The World Will Always Remember 911.txt',
    '20020129-First (Official) Presidential State of the Union Address.txt'
]

# Read every speech and concatenate the contents, in list order, into `raw`.
# - The file handle has its own name, so it no longer shadows a module-level
#   `text` variable (the old `text = ''` placeholder was never used).
# - The encoding is explicit so the result does not depend on the platform's
#   default locale. NOTE(review): assumes the transcripts are UTF-8 — confirm.
# - The parts are collected and joined once instead of growing a string with
#   repeated concatenation.
speech_texts = []
for file in Bush_FileList:
    with open(os.path.join(bushpath, file), 'r', encoding='utf-8') as speech_file:
        speech_texts.append(speech_file.read())

raw = ''.join(speech_texts)

print(f'doc length: {len(raw)}')

# Pre-Process text using spaCy

https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/

In [None]:
%%time
import string
from typing import Iterator, List

from spacy.tokens import Doc
from tqdm import tqdm

def create_doc_array(doc: Doc) -> Iterator[List[str]]:
    """Lazily yield one list of normalised token strings per sentence of ``doc``.

    Tokens flagged as punctuation, stop words or whitespace, and tokens whose
    text contains a newline, are dropped. Every remaining token is represented
    by its lower-cased ``token._.normal_term`` when that extension attribute is
    set, otherwise by its lower-cased lemma. ASCII punctuation
    (``string.punctuation``) is then removed from that string; tokens left
    empty by this step are skipped so no ``''`` entry reaches the output.

    Args:
        doc: parsed spaCy ``Doc`` with sentence boundaries, whose tokens
            expose the ``_.normal_term`` extension attribute.

    Yields:
        The cleaned tokens of each sentence, in document order. A sentence
        with no surviving tokens yields an empty list.

    Note:
        This is a generator and can be consumed only once; wrap the call in
        ``list(...)`` when the result is needed repeatedly.
    """
    # Build the translation table once instead of once per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    for sent in tqdm(doc.sents):

        sent_array = []

        for token in sent:

            if token.is_punct or token.is_stop or token.is_space or '\n' in token.text:
                continue

            # Prefer the normalised entity term when one is set.
            text = (token._.normal_term or token.lemma_).lower()
            text = text.translate(strip_punctuation)

            # A token made up solely of punctuation-like characters collapses
            # to '' here; emitting it would add an empty "word" to the vocab.
            if text:
                sent_array.append(text)

        yield sent_array

doc = nlp(raw)

# create_doc_array() returns a one-shot generator. Materialise it so that
# `doc_array` stays usable after the preview below, instead of being left
# bound to an exhausted generator.
doc_array = list(create_doc_array(doc))

# Side-by-side preview of the first few sentences before and after cleaning.
preview_rows = 4
data = {
    'Original': [sent.text for sent in list(doc.sents)[:preview_rows]],
    'Pre-Processed': [' '.join(sent) for sent in doc_array[:preview_rows]]
}

display(pd.DataFrame(data))


# Create word2vec model

In [None]:
%%time
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Materialise the pre-processed sentences: Word2Vec iterates over them several
# times, which a one-shot generator cannot support.
doc_array = list(create_doc_array(doc))

# Skip-gram model (sg=1) that keeps every token (min_count=1).
# Training is stochastic. A fixed seed together with a single worker thread
# makes the similarity tables repeatable between runs; gensim additionally
# requires a fixed PYTHONHASHSEED for fully identical results.
model = Word2Vec(sentences=doc_array, vector_size=500, window=5, min_count=1, workers=1, sg=1, seed=23)

def most_similar_terms(seed_terms, model, topn=10):
    """Tabulate the nearest neighbours of each seed term in a word-vector model.

    Args:
        seed_terms: iterable of terms; each one is lower-cased before lookup.
        model: object exposing ``model.wv.most_similar(term, topn=...)`` that
            returns ``(word, similarity)`` pairs.
        topn: number of neighbours to request per term (default 10).

    Returns:
        A DataFrame with one column per seed term that was found (named with
        the term as passed in). Each cell reads ``'Word, (similarity)'`` with
        the word title-cased and the similarity rounded to three decimals.
        Terms missing from the vocabulary are reported on stdout and omitted.

    Raises:
        Any exception other than ``KeyError`` raised by the lookup propagates,
        so genuine failures are not mislabelled as "not in vocab".
    """
    data = {}

    for seed_term in seed_terms:
        try:
            neighbours = model.wv.most_similar(seed_term.lower(), topn=topn)
        except KeyError:
            # Out-of-vocabulary keys are signalled with KeyError.
            print(f'{seed_term} not in vocab')
            continue

        data[seed_term] = [f'{word.title()}, ({round(score, 3)})' for word, score in neighbours]

    return pd.DataFrame(data)
    
# Seed vocabulary: the positive terms first, followed by the negative ones.
seed_terms_good = ['friend', 'good']
seed_terms_bad = ['enemy', 'terrorist', 'terror', 'bad', 'evil', 'murder']
seed_terms = [*seed_terms_good, *seed_terms_bad]

# Tabulate each seed term's nearest neighbours, show the table and export it
# as an image.
df = most_similar_terms(seed_terms=seed_terms, model=model)
display(df)
dfi.export(df, 'seed_terms.png')

In [None]:
# Out-group entities whose nearest neighbours are tabulated and exported.
outgroups = ['al Qaeda', 'Taliban', 'Usama bin Laden', 'the Egyptian Islamic Jihad', 'the Islamic Movement of Uzbekistan', 'North Korea', 'Iran', 'Iraq', 'axis of evil']
# most_similar_terms() requires the model as its second argument; omitting it
# raises a TypeError.
df = most_similar_terms(outgroups, model)
display(df)
dfi.export(df, 'outgroup_terms.png')

In [None]:
# For every out-group term, count the pre-processed sentences that contain it
# and record whether, in at least one of them, a seed term appears within
# `context_window` tokens on either side of the term.
context_window = 5

# Pre-process once instead of once per out-group term.
sentences = list(create_doc_array(doc))
seed_lookup = {seed_term.lower() for seed_term in seed_terms}

data = {}

for outgroup in outgroups:

    needle = outgroup.lower()  # the sentence tokens are lower-cased
    count = 0
    co_occurs = False

    for sentence in sentences:
        if needle not in sentence:
            continue

        index = sentence.index(needle)

        # Symmetric window: `context_window` tokens before and after the term.
        # Slicing clamps the right edge to the sentence length by itself.
        left = max(index - context_window, 0)
        right = index + context_window + 1
        new_sent = sentence[left:right]

        count += 1
        # Test the seed terms (not the out-group name itself) and keep a
        # positive result from any sentence rather than only the last one.
        co_occurs = co_occurs or any(token in seed_lookup for token in new_sent)

    # Both keys are always present, so terms that never occur show an
    # explicit False instead of a missing value.
    data[outgroup] = {'count': count, 'Co-Occurring with seed term': co_occurs}

display(pd.DataFrame(data))

In [None]:
# In-group entities whose nearest neighbours are tabulated and exported.
ingroups = ['America', 'Americans', 'Great Britain', 'The United States', 'The United States of America', 'the United States']
# most_similar_terms() requires the model as its second argument; omitting it
# raises a TypeError.
df = most_similar_terms(ingroups, model)
display(df)
dfi.export(df, 'ingroup_terms.png')

In [None]:
# Mixed selection of in-group and out-group entities for a combined table.
entities = ['The United States of America', 'Americans', 'al Qaeda', 'Taliban', 'Usama bin Laden', 'the Egyptian Islamic Jihad', 'the Islamic Movement of Uzbekistan']
# most_similar_terms() requires the model as its second argument; omitting it
# raises a TypeError.
df = most_similar_terms(entities, model)
display(df)
dfi.export(df, 'entities.png')

In [None]:
# Nearest neighbours for every named entity of the selected types.
ents_of_interest = ['GPE', 'ORG', 'NORP', 'PERSON']
# dict.fromkeys() drops repeated mentions while keeping first-seen order, so
# each entity is looked up (and reported as missing) only once.
ents_refined = list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ in ents_of_interest))
pd.set_option('display.max_rows', 20)
# most_similar_terms() requires the model as its second argument; omitting it
# raises a TypeError.
display(most_similar_terms(ents_refined, model).T)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

def tsne_plot(model, set_array, filename='vector_distribution.png'):
    """Project the vectors of selected terms to 2-D with t-SNE and plot them.

    Args:
        model: trained model exposing ``model.wv`` (``key_to_index`` lookup
            and vector access by key).
        set_array: iterable of terms. Each term is lower-cased; terms missing
            from the model vocabulary are ignored and duplicates are plotted
            once.
        filename: path the figure is saved to (default
            ``'vector_distribution.png'``).

    Returns:
        None. The figure is saved to ``filename`` and shown. When fewer than
        two of the requested terms are in the vocabulary a message is printed
        and nothing is plotted.
    """
    print('creating tokens and labels')
    # dict.fromkeys() de-duplicates while keeping first-seen order;
    # key_to_index is a dict, so the membership test is a hash lookup.
    labels = [
        word
        for word in dict.fromkeys(token.lower() for token in set_array)
        if word in model.wv.key_to_index
    ]

    if len(labels) < 2:
        print(f'only {len(labels)} of the requested terms found in the vocabulary; nothing to plot')
        return

    # Looking up a list of keys returns one row per key.
    tokens = model.wv[labels]

    print('building tsne model')
    # scikit-learn requires perplexity < n_samples, so cap the preferred value
    # of 40 for small term lists.
    perplexity = min(40, len(labels) - 1)
    # NOTE(review): scikit-learn 1.5 deprecates `n_iter` in favour of
    # `max_iter`; rename the argument when the environment is upgraded.
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    print('constructing graph')
    plt.figure(figsize=(10, 10))
    for label, (x, y) in zip(labels, new_values):
        # One scatter call per point so each term gets its own colour.
        plt.scatter(x, y)
        plt.annotate(label,
            xy=(x, y),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom')

    plt.savefig(filename)
    plt.show()
    
# Plot the seed terms together with the in-group and out-group entities.
terms = [*seed_terms, *ingroups, *outgroups]
tsne_plot(model=model, set_array=terms)

In [None]:
# Plot every distinct noun chunk of the corpus. The chunks are sorted because
# set iteration order for strings can differ between kernel sessions, which
# would change the row order fed to t-SNE and therefore the resulting layout.
terms = sorted({span.text for span in doc.noun_chunks})
tsne_plot(model, terms)

In [None]:
# NOTE(review): these two assignments rebind the module-level `outgroups` and
# `seed_terms` that earlier cells defined with longer lists. Every cell run
# after this one (and any earlier cell re-run afterwards) sees these shorter
# lists — confirm that this is intended.
outgroups = ['Taliban', 'al Qaeda']
seed_terms = ['terrorist', 'terror', 'murder', 'regime']
    
def get_word_contexts(iterable, terms_of_interest, seed_terms, window=5):
    """Yield the context window around each term of interest, flagged for seed terms.

    Args:
        iterable: iterable of sentences, each a list of (lower-cased) tokens.
        terms_of_interest: terms to look for; each is lower-cased before it is
            matched against the sentence tokens.
        seed_terms: terms whose presence inside the window is flagged; they
            are compared with the tokens as given.
        window: number of tokens kept on each side of the matched term
            (default 5).

    Yields:
        For every (sentence, term) pair where the term occurs, a list whose
        first element is a bool — True when any seed term is in the window —
        followed by the window tokens: up to ``window`` tokens before the
        first occurrence of the term, the term itself, and up to ``window``
        tokens after it.
    """
    lowered_terms = [term.lower() for term in terms_of_interest]

    for sentence in iterable:

        for term in lowered_terms:

            if term not in sentence:
                continue

            index = sentence.index(term)

            # Symmetric window; slicing clamps the right edge to the sentence
            # length, so only the left edge needs an explicit floor.
            left = max(index - window, 0)
            context = list(sentence[left : index + window + 1])

            # Consider every seed term, not only the first one.
            has_seed = any(seed_term in context for seed_term in seed_terms)

            yield [has_seed] + context
                
# Show every context window as one table row. The display limits are relaxed
# only inside this block, and cells missing from shorter rows are blanked.
with pd.option_context('display.max_rows', 100, 'display.max_colwidth', None):
    display(
        pd.DataFrame(
            get_word_contexts(create_doc_array(doc), outgroups, seed_terms)
        ).fillna(value='')
    )

In [None]:
terms_array = [outgroups, ingroups, seed_terms_bad]

# Lower-case the corpus once; it does not change between terms.
corpus_lower = doc.text.lower()

# One table per term group with the raw number of occurrences of each term.
# NOTE: str.count() counts substrings, so a term is also counted inside
# longer words that contain it.
for terms in terms_array:
    data = {
        term: {'Occurence count': corpus_lower.count(term.lower())}
        for term in terms
    }

    display(pd.DataFrame(data))