## Detecting the emergence of new terminology and establishing automated ways of understanding the meaning of these terms (by analyzing their similar words)

<br /> 
<ul>
<li>Analyzing the feasibility of maintaining word lists that are automatically augmented with similar words from word embedding models.</li>
<li>Because words derive their meanings from the context words that they keep!</li>
</ul>
<br /> 

#### General Pipeline v.02 (2020 July)
1. Scan the entire data file and count the number of unique words (create a dictionary at this time)
<ul>
    <li>Set a minimum frequency (e.g. 5) and drop words that occur less often</li>
</ul>

In [None]:
import sys
import os
import io 
import csv

import numpy as np
from pdb import set_trace as bp
from string import punctuation

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.test.utils import datapath
from gensim import utils

import gensim.models
from gensim.models import KeyedVectors

In [None]:
# Path of the original (unmodified) corpus file
fname = "wiki_processed.txt"

# Sanity check: report a missing corpus file. The exit call is left disabled,
# so execution continues past this point even when the file is absent.
if not os.path.isfile(fname):
    print("File {} does not exist. Exiting...".format(fname))
    #sys.exit()

# Translation table that maps every punctuation character to None (deletes it)
punc_table = str.maketrans('', '', punctuation)
vocab_dict = {}  # word -> number of occurrences in the corpus

with open(fname, encoding='utf8', errors='ignore') as fp:
    cnt = 0  # number of lines read so far
    for line in fp:
        # Normalise the line: lower-case, trim, then strip punctuation
        line = line.lower().strip()
        new_line = line.translate(punc_table)

        for word in new_line.split():
            # .get() supplies 0 for a word that has not been seen yet
            vocab_dict[word] = vocab_dict.get(word, 0) + 1
        cnt += 1

print("Finished reading the data file!")
print("Number of unique words in vocab_dict: {}".format(len(vocab_dict)))

# Move every word seen fewer than 5 times out of the vocabulary;
# the removed words are kept in a list for future use.
unknown_dict = [word for word, count in vocab_dict.items() if count < 5]
for word in unknown_dict:
    del vocab_dict[word]

print("After removing less frequent words, ")
print("Number of unique words in vocab_dict: {}".format(len(vocab_dict)))

2. Build three word embedding models (skip-gram, CBOW, GloVe) and query the top K neighbour words for every word in the dictionary (for each word embedding model)

In [None]:
"""  If True (sg=1), skip-gram is used; if False (sg=0), CBOW is used.  """
### Building a Skip-gram model with default settings
def build_skipgram_model(sentences, model_name):
    """Train a skip-gram Word2Vec model and save its word vectors.

    Args:
        sentences: Corpus handed to gensim's Word2Vec as ``sentences``.
        model_name: Output path passed to ``save_word2vec_format``.

    Returns:
        None. The trained vectors are only written to ``model_name``.
    """
    training_options = dict(
        window=5,
        min_count=5,
        seed=1,
        workers=1,
        sg=1,  # sg: 1 for skip-gram; otherwise CBOW
    )
    trained = gensim.models.Word2Vec(sentences=sentences, **training_options)
    word_vectors = trained.wv
    word_vectors.save_word2vec_format(model_name)  # save the model

def build_cbow_model(sentences, model_name):
    """Train a CBOW Word2Vec model and save its word vectors.

    Args:
        sentences: Corpus handed to gensim's Word2Vec as ``sentences``.
        model_name: Output path passed to ``save_word2vec_format``.

    Returns:
        None. The trained vectors are only written to ``model_name``.
    """
    training_options = dict(
        window=5,
        min_count=5,
        seed=1,
        workers=1,
        sg=0,  # sg: 1 for skip-gram; otherwise CBOW
    )
    trained = gensim.models.Word2Vec(sentences=sentences, **training_options)
    word_vectors = trained.wv
    word_vectors.save_word2vec_format(model_name)  # save the model

class MyCorpus(object):
    """An iterable that yields sentences (lists of str) from the original corpus.

    Every call to ``__iter__`` re-opens the file named by the module-level
    ``fname``, so the corpus can be streamed more than once.
    """
    def __iter__(self):
        # The context manager closes the file when iteration finishes or the
        # generator is discarded; a bare open() in the for-statement leaked it.
        with open(fname, encoding='utf8', errors='ignore') as corpus_file:
            for line in corpus_file:
                yield utils.simple_preprocess(line)

In [None]:
sentences = MyCorpus()

### This can be a one-time process
print("... Start building a Skipgram model ...")
#build_skipgram_model(sentences, 'w2v_model_skipgram')
print("... Start building a CBOW model ...")
#build_cbow_model(sentences, 'w2v_model_cbow')

### Load the built models to save some time
#skipgram_model = KeyedVectors.load_word2vec_format("w2v_model_skipgram") 
#cbow_model = KeyedVectors.load_word2vec_format("w2v_model_cbow") 
# glove vector file still needs to be generated on Linux environment
#glove_model = KeyedVectors.load_word2vec_format("wiki_glove.word2vec") 
print("---> Three word embedding models were loaded!!")

# Get top K similar words for each word in the dictionary.
# NOTE(review): `cbow_model` must already exist (the load above is commented
# out). A missing model now surfaces as NameError instead of being reported as
# "not found in model vocabulary" for every single word.
with open('Data/original_top10.tsv', 'w', newline='') as fp:
    headers = ['target_word'] + ['top_{}'.format(rank) for rank in range(1, 11)]
    tsv_writer = csv.writer(fp, delimiter='\t')
    tsv_writer.writerow(headers)

    for word, count in vocab_dict.items():
        print("Word: {}, Frequency: {}".format(word, count))
        try:
            # Only the vocabulary lookup is guarded: gensim raises KeyError
            # for a word that is absent from the model vocabulary.
            neighbours = cbow_model.similar_by_word(word)
        except KeyError:
            print("--> {} not found in model vocabulary".format(word))
            continue
        # Each neighbour is written as a (word, similarity) tuple
        tsv_writer.writerow([word] + list(neighbours))

3. Pick two random words (i_word and j_word) from the dictionary

In [None]:
####### Pick two random words from the dictionary
# Materialise the (word, count) pairs once so they can be indexed by position.
vocab_items = list(vocab_dict.items())

# Sample two *distinct* positions in one call. Two independent randint() draws
# can collide, which made the old `assert rand_i != rand_j` fail at random.
# Raises ValueError when the vocabulary holds fewer than two words.
rand_i, rand_j = (int(idx) for idx in np.random.choice(len(vocab_items), size=2, replace=False))

# vocab_items[k] is a (word, count) tuple
i_word = vocab_items[rand_i][0]
j_word = vocab_items[rand_j][0]

print("Two randomly picked words from the dictionary:")
print(vocab_items[rand_i])
print(vocab_items[rand_j])

4.  Generate new corpus files with random word replacements 

In [None]:
def get_context_words(line, phrase, window_size):
    """Return the words surrounding the first occurrence of `phrase` in `line`.

    Both arguments are split on whitespace and matched token by token, so
    `phrase` may span several words.

    Args:
        line: Text to search.
        phrase: One or more whitespace-separated words to look for.
        window_size: Maximum number of words taken from each side of the match.

    Returns:
        The left and right context joined as ``left + ' ' + right``. The
        result keeps a leading or trailing space when one side is empty.
        Returns None when `phrase` is empty or does not occur in `line`.
    """
    tokens = line.split()
    target = phrase.split()
    if not target:
        # An empty phrase cannot match anything; without this guard the
        # `target[0]` lookup below raised IndexError on a non-empty line.
        return None
    span = len(target)

    for pos, token in enumerate(tokens):
        # The cheap first-word comparison runs before the slice comparison
        if token == target[0] and tokens[pos:pos + span] == target:
            left = ' '.join(tokens[max(0, pos - window_size):pos])
            right = ' '.join(tokens[pos + span:pos + span + window_size])
            return left + ' ' + right
    return None

# new data file with random words replacement
replaced_fname = "Data/replaced_" + i_word +"_with_" + j_word + ".txt"

l_i_word = [] # context windows of sentences that contain the target word
l_j_word = [] # context windows of sentences that contain the replacing word

with open(fname, encoding='utf8', errors='ignore') as fp, open(replaced_fname, 'w', encoding='utf8') as rfp:# replace 100%
    cnt = 0
    for line in fp:
        line = line.lower().strip()
        new_line = line.translate(punc_table)  # Output: string without punctuation
        tokens = new_line.split()
        if i_word in tokens:
            # get_context_words only reports the first occurrence in the line
            l_i_word.append(get_context_words(new_line, i_word, 5))
            # Replace whole tokens only. str.replace() works on substrings and
            # would also rewrite i_word wherever it appears inside a longer word.
            line = ' '.join(j_word if token == i_word else token for token in tokens)
        if j_word in tokens:
            l_j_word.append(get_context_words(new_line, j_word, 5))
        # NOTE(review): lines without i_word are written with their punctuation
        # intact, while replaced lines are written punctuation-free -- confirm
        # that this asymmetry is intended.
        rfp.write(line) # for writing to our new corpus file
        rfp.write("\n")
        cnt += 1

5. Calculate the average Jaccard Similarity (with window size 5) of two random words

In [None]:
# Tokenise every context window once, instead of re-splitting inside the
# nested loop below.
token_sets_i = [set(win_i.split()) for win_i in l_i_word]
token_sets_j = [set(win_j.split()) for win_j in l_j_word]

js_sum = 0
for i, tok_i in enumerate(token_sets_i):
    for j, tok_j in enumerate(token_sets_j):
        print("Processing {} - {} th item ...".format(i+1,j+1))
        # union and intersection of two sets
        union = tok_i | tok_j
        intersection = tok_i & tok_j
        # Two empty windows have an empty union; score that pair as 0.0
        # rather than dividing by zero.
        js_score = len(intersection)/len(union) if union else 0.0
        print("Jaccard similarity score: {}".format(js_score))
        js_sum += js_score


# The average is undefined when either word has no context window at all;
# report NaN instead of raising ZeroDivisionError.
pair_count = len(l_i_word)*len(l_j_word)
avg_js = js_sum / pair_count if pair_count else float('nan')
print("i_word: {} and j_word: {}".format(i_word, j_word))
print("Average Jaccard similarity: {}".format(avg_js))

6. Build 3 word embedding models with the new data files and query top K neighbour words for every word in a dictionary again
<br>
<i>(i_word doesn’t exist in the new data file at this point)</i>

In [None]:
class MyCorpus_v2(object):
    """An iterable that yields sentences (lists of str) from the replaced corpus.

    Every call to ``__iter__`` re-opens the file named by the module-level
    ``replaced_fname``, so the corpus can be streamed more than once.
    """
    def __iter__(self):
        # The context manager closes the file when iteration finishes or the
        # generator is discarded; a bare open() in the for-statement leaked it.
        with open(replaced_fname, encoding='utf8', errors='ignore') as corpus_file:
            for line in corpus_file:
                yield utils.simple_preprocess(line)

#sentences_v2 = MyCorpus_v2()
##### Sanity checking
# NOTE(review): this streams the ORIGINAL corpus (MyCorpus), not the replaced
# one -- switch back to MyCorpus_v2() once the sanity check is done.
sentences_v2 = MyCorpus()

# because we're replacing different random word every time, can't skip building models this time
print("... Start building a Skipgram model ...")
#build_skipgram_model(sentences_v2, 'w2v_model_skipgram_replaced')
print("... Start building a CBOW model ...")
#build_cbow_model(sentences_v2, 'w2v_model_cbow_replaced')


### Load the built models to save some time
#skipgram_model_v2 = KeyedVectors.load_word2vec_format("w2v_model_skipgram_replaced") 
# cbow and glove?
#cbow_model_v2 = KeyedVectors.load_word2vec_format("w2v_model_cbow_replaced") 
print("---> new word embedding models (after random word replacements) were loaded!!")

# Get top K similar words for each word in the dictionary.
# NOTE(review): `cbow_model_v2` must already exist (the load above is commented
# out). A missing model now surfaces as NameError instead of being reported as
# "not found in model vocabulary" for every single word.
with open('Data/replaced_top10.tsv', 'w', newline='') as fp:
    headers = ['target_word'] + ['top_{}'.format(rank) for rank in range(1, 11)]
    tsv_writer = csv.writer(fp, delimiter='\t')
    tsv_writer.writerow(headers)

    for word, count in vocab_dict.items():
        print("Word: {}, Frequency: {}".format(word, count))
        try:
            # Only the vocabulary lookup is guarded: gensim raises KeyError
            # for a word that is absent from the model vocabulary.
            neighbours = cbow_model_v2.similar_by_word(word)
        except KeyError:
            print("--> {} not found in model vocabulary".format(word))
            continue
        # Each neighbour is written as a (word, similarity) tuple
        tsv_writer.writerow([word] + list(neighbours))

7. Analyze the generated CSV/TSV files and detect the words with the fewest overlaps between their top-K similar words before and after replacement (these words most likely changed in meaning)

In [None]:
import pandas as pd

# Inputs for this step:
#   Data/original_top10.tsv  -- neighbours from the original corpus
#   Data/replaced_top10.tsv  -- neighbours after the word replacement
# NOTE(review): default NA parsing may turn target words such as "nan" or
# "null" into NaN -- confirm whether keep_default_na=False is wanted here.
df1 = pd.read_table(
    "Data/original_top10.tsv",
    sep='\t',
    encoding="ISO-8859-1",
)
df1.head()

In [None]:
# Neighbour table produced after the random word replacement.
# NOTE(review): default NA parsing may turn target words such as "nan" or
# "null" into NaN -- confirm whether keep_default_na=False is wanted here.
df2 = pd.read_table(
    "Data/replaced_top10.tsv",
    sep='\t',
    encoding="ISO-8859-1",
)
df2.head()

In [None]:
# preprocess the dataframes -- we only need words from the tuple strings.
# Each cell looks like "('word', 0.93)": drop the parentheses and quotes, split
# on the comma and keep the first piece (the word). Selecting that piece with
# .str[0] yields a Series; split(expand=True) yields a multi-column DataFrame,
# which newer pandas versions refuse to assign to a single column (ValueError).
neighbour_columns = ['top_{}'.format(rank) for rank in range(1, 11)]
for column in neighbour_columns:
    df1[column] = (
        df1[column]
        .str.strip("()")
        .str.replace("'", "")
        .str.split(',')
        .str[0]
    )
df1.head()

In [None]:
# dataframe to dictionary: target_word -> list of the remaining column values
df1_dict = (
    df1.set_index('target_word')
    .transpose()
    .to_dict(orient='list')
)
print(len(df1_dict))

In [None]:
# preprocess the dataframes -- we only need words from the tuple strings.
# Each cell looks like "('word', 0.93)": drop the parentheses and quotes, split
# on the comma and keep the first piece (the word). Selecting that piece with
# .str[0] yields a Series; split(expand=True) yields a multi-column DataFrame,
# which newer pandas versions refuse to assign to a single column (ValueError).
neighbour_columns = ['top_{}'.format(rank) for rank in range(1, 11)]
for column in neighbour_columns:
    df2[column] = (
        df2[column]
        .str.strip("()")
        .str.replace("'", "")
        .str.split(',')
        .str[0]
    )
df2.head()
# dataframe to dictionary: target_word -> list of the remaining column values
df2_dict = df2.set_index('target_word').T.to_dict('list')
print(len(df2_dict))

In [None]:
overlap_dict = {}  # target word -> number of neighbours shared by both tables
notFound = []      # target words present in df2_dict but missing from df1_dict
for key, replaced_neighbours in df2_dict.items():
    if key not in df1_dict:
        notFound.append(key)
        continue

    overlap = set(replaced_neighbours).intersection(df1_dict[key])
    overlap_dict[key] = len(overlap)
    # debugging purpose
    if len(overlap) > 0:
        print("{} -- {}".format(key, overlap))  # or save this info into another dictionary

In [None]:
# Number of collected target words that were missing from df1_dict
# (shown as the cell output).
len(notFound)

In [None]:
# Order the words by their overlap count (int), smallest first
from collections import OrderedDict

od = OrderedDict()
for word, overlap_count in sorted(overlap_dict.items(), key=lambda pair: pair[1]):
    od[word] = overlap_count
od

In [None]:
hist = {}  # overlap count -> number of words with that count
zero = []  # words whose overlap count is 0

for word, count in od.items():
    # Create missing buckets on the fly. A bare `hist[count] += 1` raises
    # KeyError the first time each count is seen.
    hist[count] = hist.get(count, 0) + 1
    if count == 0:
        zero.append(word)
        
import matplotlib.pyplot as plt

# Bar chart: x = overlap count, height = number of words with that count
plt.bar(list(hist.keys()), list(hist.values()))
plt.show()

In [None]:
# Display the histogram dictionary (keyed by overlap count) as the cell output.
hist

In [None]:
# Number of target words in the dictionary built from the replaced-corpus table.
print(len(df2_dict))
