In [1]:
# This remains here as a trophy of triumph over tribulation
" " == " "

False

In [2]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from scipy import spatial
import pandas as pd
import json
import sys
import os.path
from scipy.stats import pearsonr
import os
# Base directory used to build the data/, models/ and results/ paths in later cells.
# It is the current working directory with its last four characters removed.
# NOTE(review): presumably this relies on the notebook being started from a
# sub-folder with a short, fixed name -- confirm before running from elsewhere.
parentdir = os.getcwd()[:-4]

### Helper Methods

In [3]:
def find_intersection_of_words(m1, m2, targetWord, k, searchFactor=None):
    '''
    description:
        Finds the top k nearest neighbors of targetWord in m1 that are also in m2.

    inputs:
        m1: model the candidate neighbors are taken from (via m1.wv.most_similar)
        m2: model every returned word must exist in (m2.wv[word] must not raise KeyError)
        targetWord: word whose neighborhood is searched
        k: number of shared words to return
        searchFactor: optional multiplier; k*searchFactor candidates are requested from m1.
            When omitted it is 150, or 3000 for "village" whose neighborhood is
            especially sparse.

    output:
        list of exactly k words, in the order m1 ranks them

    raises:
        SystemExit (through sys.exit) when fewer than k of the candidates exist in m2
    '''
    if searchFactor is None:
        # village is especially sparse, so it gets a much wider candidate pool
        searchFactor = 3000 if targetWord == "village" else 150

    # retrieve k*searchFactor nearest words as candidates
    candidates = m1.wv.most_similar(targetWord, topn=k * searchFactor)

    final_words = list()
    for word, _ in candidates:
        # stop as soon as enough shared words were collected
        if len(final_words) >= k:
            break
        try:
            # lookup raises KeyError if word not in m2
            m2.wv[word]
        except KeyError:
            continue
        final_words.append(word)

    if len(final_words) != k:
        # kept as sys.exit so existing callers observe the same failure mode
        sys.exit(f"Neighborhood too sparse for word: {targetWord}")
    return final_words

In [11]:
def compare_nearest_neighbors(m1, m2, targetWord, k):
    '''
    inputs:
        m1: model the top k words will be derived from
        m2: other model
        targetWord: word from which top k words will be derived
        k: number of words to compare

    output:
        pearson correlation coefficient between the two lists of similarities

    description:
        Takes the k nearest neighbors of the target word in m1 that also exist in m2,
        measures the cosine similarity of each neighbor to the target word inside m1
        and inside m2, and correlates the two resulting lists.
    '''
    # k most similar words in m1 that are in m2
    shared_neighbors = find_intersection_of_words(m1, m2, targetWord, k)

    target_in_m1 = m1.wv[targetWord]
    target_in_m2 = m2.wv[targetWord]

    # cosine similarity = 1 - cosine distance
    sims_in_m1 = [
        1 - spatial.distance.cosine(target_in_m1, m1.wv[neighbor])
        for neighbor in shared_neighbors
    ]
    sims_in_m2 = [
        1 - spatial.distance.cosine(target_in_m2, m2.wv[neighbor])
        for neighbor in shared_neighbors
    ]

    # only the coefficient is reported; the p-value is discarded
    coefficient, _ = pearsonr(sims_in_m1, sims_in_m2)
    return coefficient
        

In [12]:
def compare_a_domain(m1, m2, wordList, k):
    '''
    description:
        Averages, for every word of a domain, the pearson correlation obtained in both
        directions (m1 -> m2 and m2 -> m1), then reports the mean and the standard
        deviation of those per-word values as a formatted string.

        NOTE: Function is for testing/debugging purposes only.
    '''
    def symmetric_score(word):
        # mean of the two directed comparisons
        forward = compare_nearest_neighbors(m1, m2, word, k)
        backward = compare_nearest_neighbors(m2, m1, word, k)
        return (forward + backward) / 2

    scores = pd.Series([symmetric_score(word) for word in wordList])
    mean_value = round(scores.mean(), 4)
    spread = round(scores.std(), 4)

    return f"Mean difference: {mean_value}     Standard Deviation: {spread}"
    
    


In [13]:
def compare_all_domains(m1, m2, k, domains_dict):
    '''
    description:
        Calculates the average and standard deviation of alignment of words across domains.
        The alignment of a single word is the mean of compare_nearest_neighbors run in
        both directions (m1 -> m2 and m2 -> m1).
    input:
        m1: one of two models. Order doesn't matter
        m2: second of two models. Order doesn't matter
        k: how many nearest words to compare when calculating alignment of single word
        domains_dict: keys are domain names, values are lists of words of that domain
    output:
        granular results: dict mapping each domain name to the list of per-word alignments
        dataframe containing alignment statistics across domains
            (columns "Domain", "Alignments", "Standard Deviation")
    '''
    domains = list(domains_dict.keys())
    alignments = []
    std = []
    granular_results = dict()

    for domain in domains:
        similarities = [
            (compare_nearest_neighbors(m1, m2, word, k) + compare_nearest_neighbors(m2, m1, word, k)) / 2
            for word in domains_dict[domain]
        ]

        # keep every per-word value of the domain
        granular_results[domain] = similarities

        # summary statistics of the domain, rounded to 4 decimals
        similarity_stats = pd.Series(similarities)
        alignments.append(round(similarity_stats.mean(), 4))
        std.append(round(similarity_stats.std(), 4))

    return granular_results, pd.DataFrame({"Domain" : domains, "Alignments": alignments, "Standard Deviation": std})


### Load and Run

In [7]:
# domains.json includes all domain words that exist in both datasets
domains_path = os.path.join(parentdir, "data", "domains.json")
with open(domains_path, "rt") as f:
    domains = json.load(f)

In [8]:
# Load the two saved Word2Vec models that are compared below
models_dir = os.path.join(parentdir, "models")
model1 = Word2Vec.load(os.path.join(models_dir, "NANTeC_clean.bin"))
model2 = Word2Vec.load(os.path.join(models_dir, "indicorp_clean.bin"))

In [21]:
# k = how many nearest words to compare when calculating alignment of single word
k = 100
# run the comparison over every domain
granular_results, stat_results = compare_all_domains(model1, model2, k, domains)
stat_results = stat_results.sort_values(by=['Alignments'], ascending=False, ignore_index=True)
# write per-domain statistics to csv
# newline="" is what pandas asks for when to_csv receives an open file object;
# without it, text-mode newline translation can double the row endings on Windows
with open(os.path.join(parentdir, "results", f"results_statistical_{k}.csv"), "wt", newline="") as f:
    stat_results.to_csv(f)
# load and write granular results to csv
# one column per domain; wrapping in Series lets domains of different sizes share a frame
granular_results_df = pd.DataFrame(
    {domain: pd.Series(values) for domain, values in granular_results.items()}
)
with open(os.path.join(parentdir, "results", f"results_full_{k}.csv"), "wt", newline="") as f:
    granular_results_df.to_csv(f)
# display results
granular_results

## Workspace

In [76]:
def compare_nearest_neighbors2(m1, m2, targetWord, k):
    '''
    description:
        Used to facilitate making the alignments figure.

    output:
        list with one row per shared neighbor, in the form
        [word, similarity in m1, similarity in m2, m2 similarity minus m1 similarity]
        where similarities are rounded to 3 decimals and the difference, computed
        from the rounded values, to 2 decimals.
    '''
    # k most similar words in m1 that are in m2
    shared_neighbors = find_intersection_of_words(m1, m2, targetWord, k)

    target_in_m1 = m1.wv[targetWord]
    target_in_m2 = m2.wv[targetWord]

    rows = []
    for neighbor in shared_neighbors:
        # cosine similarity = 1 - cosine distance
        sim_m1 = round(1 - spatial.distance.cosine(target_in_m1, m1.wv[neighbor]), 3)
        sim_m2 = round(1 - spatial.distance.cosine(target_in_m2, m2.wv[neighbor]), 3)
        rows.append([neighbor, sim_m1, sim_m2, round((sim_m2 - sim_m1), 2)])

    return rows
        

In [78]:
# Example for the alignments figure: 11 nearest neighbors of "heat" taken from
# model2 that also exist in model1. Each row of the result is
# [word, similarity in model2, similarity in model1, model1 minus model2].
compare_nearest_neighbors2(model2, model1, "heat", 11)

[['sweltering', 0.674, 0.509, -0.17],
 ['humid', 0.659, 0.466, -0.19],
 ['scorching', 0.653, 0.358, -0.3],
 ['broiling', 0.642, 0.345, -0.3],
 ['humidity', 0.634, 0.546, -0.09],
 ['dust', 0.621, 0.461, -0.16],
 ['vapors', 0.614, 0.398, -0.22],
 ['heatwave', 0.612, 0.334, -0.28],
 ['cooler', 0.598, 0.421, -0.18],
 ['heating', 0.596, 0.455, -0.14],
 ['convection', 0.588, 0.438, -0.15]]