In [6]:
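# This remains here as a trophy of triumph over tribulation.
# NOTE: the two literals below look alike yet compare unequal (see the output) -
# presumably they contain different whitespace characters.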
" " == " "

False

In [12]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from scipy import spatial
import pandas as pd
import json
import sys
import os.path
from scipy.stats import pearsonr
import os
# Base directory for the data/, models/ and results/ folders used below, obtained by
# dropping the last 4 characters of the current working directory.
# NOTE(review): presumably the notebook is run from a 4-character subfolder of the
# project root - confirm; the slice silently yields a wrong path from anywhere else.
parentdir = os.getcwd()[:-4]

### Helper Methods

In [2]:
def find_intersection_of_words(m1, m2, targetWord, k):
    '''
    description:
        Collects the k nearest neighbours of targetWord in m1 that are also
        present in m2, keeping the order in which m1 ranks them.
    inputs:
        m1: model whose neighbourhood of targetWord is searched
        m2: model used as a vocabulary filter
        targetWord: word whose neighbours are retrieved
        k: number of shared neighbours to return
    output:
        list of exactly k words. If the candidate pool holds fewer than k shared
        words, execution stops through sys.exit with an explanatory message.
    '''
    # candidates requested per wanted neighbour; "village" is especially sparse
    # and therefore gets a much larger pool
    candidates_per_result = 3000 if targetWord == "village" else 150
    neighbours = m1.wv.most_similar(targetWord, topn=k * candidates_per_result)

    shared = []
    for candidate, _ in neighbours:
        try:
            # lookup raises KeyError when m2 does not know the word
            m2.wv[candidate]
        except KeyError:
            continue
        if len(shared) >= k:
            break
        shared.append(candidate)

    if len(shared) != k:
        sys.exit(f"Neighborhood too sparse for word: {targetWord}")
    return shared

In [3]:
def compare_nearest_neighbors(m1, m2, targetWord, k):
    '''
    inputs:
        m1: model the top k words will be derived from
        m2: other model
        targetWord: word whose neighbourhood is compared
        k: number of neighbours to compare

    output:
        pearson correlation coefficient between the two similarity lists

    description:
        Takes the k nearest neighbours of targetWord in m1 that also exist in m2,
        measures the cosine similarity of every neighbour to targetWord in each
        model, and returns the pearson correlation between m1's similarities and
        m2's similarities.
    '''
    # k most similar words in m1 that are in m2 as well
    neighbours = find_intersection_of_words(m1, m2, targetWord, k)

    def similarities_in(model):
        # cosine similarity is 1 minus the cosine distance
        target_vec = model.wv[targetWord]
        return [1 - spatial.distance.cosine(target_vec, model.wv[neighbour])
                for neighbour in neighbours]

    # only the coefficient is reported; the p-value is discarded
    correlation, _ = pearsonr(similarities_in(m1), similarities_in(m2))
    return correlation
        

In [4]:
def compare_a_domain(m1, m2, wordList, k):
    '''
    description:
        Scores every word of a domain by averaging the pearson correlation obtained in both
        directions (neighbours taken from m1, then neighbours taken from m2), and reports the
        mean and standard deviation of those scores as a formatted string.

        NOTE: Function is for testing/debugging purposes only.
    '''
    scores = pd.Series([
        (compare_nearest_neighbors(m1, m2, word, k) + compare_nearest_neighbors(m2, m1, word, k)) / 2
        for word in wordList
    ])
    mean_score = round(scores.mean(), 4)
    spread = round(scores.std(), 4)
    return f"Mean difference: {mean_score}     Standard Deviation: {spread}"
    
    


In [5]:
def compare_all_domains(m1, m2, k, domains_dict):
    '''
    description:
        Calculates the average and standard deviation of alignment of words across domains.
        A word's alignment is the mean of compare_nearest_neighbors taken in both
        directions (m1 -> m2 and m2 -> m1), which is why the model order doesn't matter.
    input:
        m1: one of two models. Order doesn't matter
        m2: second of two models. Order doesn't matter
        k: how many nearest words to compare when calculating alignment of single word
        domains_dict: keys are domain names, values are lists of words of that domain
    output:
        dataframe with one row per domain (in domains_dict order) and the columns
        "Domain", "Alignments" and "Standard Deviation", both statistics rounded to
        4 decimals. A domain without words yields NaN for both statistics.
    '''
    domain_names = list(domains_dict.keys())
    alignments = []
    deviations = []
    for domain in domain_names:
        # explicit float dtype keeps an empty domain numeric (NaN statistics)
        # instead of producing an object-dtype Series
        scores = pd.Series(
            [
                (compare_nearest_neighbors(m1, m2, word, k) + compare_nearest_neighbors(m2, m1, word, k)) / 2
                for word in domains_dict[domain]
            ],
            dtype=float,
        )
        alignments.append(round(scores.mean(), 4))
        deviations.append(round(scores.std(), 4))

    return pd.DataFrame({"Domain": domain_names, "Alignments": alignments, "Standard Deviation": deviations})


### Load and Run

In [13]:
# domains.json includes all domain words that exist in both datasets
# The parsed object is later handed to compare_all_domains, which expects a
# mapping of domain name -> list of words.
with open(os.path.join(parentdir, "data", "domains.json"), "rt") as f:
    domains = json.load(f) 

In [20]:
# Load the two saved Word2Vec models that are compared below; both files are
# looked up in the models/ folder under parentdir.
model1 = Word2Vec.load(os.path.join(parentdir, "models", "NANTeC_clean.bin"))
model2 = Word2Vec.load(os.path.join(parentdir, "models", "indicorp_clean.bin"))

In [27]:
# k = how many nearest words to compare when calculating alignment of single word
k = 100
# run the comparison with the configured k, so the statistics always match
# the k that appears in the results file name below
results = compare_all_domains(model1, model2, k, domains)
# best-aligned domains first
results = results.sort_values(by=['Alignments'], ascending=False, ignore_index=True)
# write results to a CSV file named after k
with open(os.path.join(parentdir, "results", f"results_{k}.csv"), "wt") as f:
    results.to_csv(f)
# display results
results

Unnamed: 0,Domain,Alignments,Standard Deviation
12,Quantity,0.6105,0.2143
7,Kinship,0.4674,0.1253
20,Time,0.4639,0.1415
11,Possession,0.4264,0.1702
16,Speech and language,0.4031,0.1314
8,Miscellaneous function words,0.3931,0.2298
4,Cognition,0.3787,0.1267
10,Motion,0.352,0.1603
2,Basic actions and technology,0.3298,0.1771
9,Modern world,0.3239,0.157
