In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
import multiprocessing
import os

In [2]:
def guess_encoding(csv_file):
    import io
    import locale
    with io.open(csv_file, "rb") as f:
        data = f.read(5)
    if data.startswith(b"\xEF\xBB\xBF"):  # UTF-8 with a "BOM"
        return "utf-8-sig"
    elif data.startswith(b"\xFF\xFE") or data.startswith(b"\xFE\xFF"):
        return "utf-16"
    else:  # in Windows, guessing utf-8 doesn't work, so we have to try
        try:
            with io.open(csv_file, encoding="utf-8") as f:
                preview = f.read(222222)
                return "utf-8"
        except:
            return locale.getdefaultlocale()[1]

In [3]:
def preprocess_data(embeddings_filename):
    embeddings_fp = open(embeddings_filename, encoding = guess_encoding(embeddings_file))
    file_content = embeddings_fp.readlines()
    file_content = [line.split(' ') for line in file_content]
    file_content = [ [elem[0]] + list(map(float, elem[1:])) for elem in file_content]
    
    embeddings_colnames = ['E_' + str(i) for i in range(len(file_content[0]) - 1)]
    embeddings_df = pd.DataFrame(file_content, columns = ['Word'] + embeddings_colnames)
    
    return embeddings_df

In [4]:
embeddings_file = "D:/Google Drive/_cloudifier_data/09_tests/_glove_words/glove.6B.50d.txt"
embeddings_df = preprocess_data(embeddings_file)

In [7]:
embeddings_df.head()

(400000,)

In [39]:
class WordEmbeddingsExperiment():
    def __init__(self, embeddings_df):
        self.embeddings_df = embeddings_df
        self.all_words = embeddings_df['Word'].values
        self.np_embeddings = embeddings_df.iloc[:, 1:].values
        self.words_by_pos = dict(zip(self.all_words, range(0, len(self.all_words))))
        
    def _print_stats(self, word, word_vec, k):
                
        dist_word = pairwise_distances(word_vec.reshape(1,-1), self.np_embeddings).flatten()
       
        print("Top {} closest words for {}".format(k, word))
        top_k_indexes = np.argsort(dist_word1)[1 : (k + 1)]
        print("{}".format(np.take(self.all_words, top_k_indexes)))
        print()
        
    def word_simple_op(self, word1, word2, simple_op):
        word1_vec = self.np_embeddings[self.words_by_pos[word1]]
        word2_vec = self.np_embeddings[self.words_by_pos[word2]]
        
        words_dist = np.sqrt(np.sum((word1_vec - word2_vec)**2))
        print("Distance between {} and {} is {}".format(word1, word2, words_dist))
        
        self._print_stats(word1, word1_vec, 20)
        self._print_stats(word2, word2_vec, 20)
        
        res_vec = simple_op(word1_vec, word2_vec)
        
        print()
        
        if simple_op == operator.sub:
            k = 10
            print("Top {} closest words for difference".format(k*2))
            res_diff = pairwise_distances(diff_vec.reshape(1,-1), self.np_embeddings).flatten()
        
            sort_indexes = np.argsort(dist_diff)
            top_k_indexes_minus = sort_indexes[:k]
            top_k_indexes_plus = sort_indexes[-(k+1):]
            print("{}".format(np.take(self.all_words, top_k_indexes_minus)))
            print("{}".format(np.take(self.all_words, top_k_indexes_plus)))
        else:
            self._print_stats("difference", word)
        
        print()
        print()
        
    def word_avg(self, words_list):

        words_vec = [self.np_embeddings[self.words_by_pos[word]] for word in  words_list]
        
        avg_vec = np.average(words_vec, axis = 0)
        
        k = 20
        print("Top {} closest words for average of {}".format(k, words_list))
        avg_diff = pairwise_distances(avg_vec.reshape(1,-1), self.np_embeddings).flatten()
        
        top_k_indexes = np.argsort(avg_diff)[1 : (k + 1)]
        print("{}".format(np.take(self.all_words, top_k_indexes)))
        
        print()
        print()

In [40]:
experiment = WordEmbeddingsExperiment(embeddings_df)
experiment.word_diff('man', 'gender')
experiment.word_diff('day', 'sun')
experiment.word_diff('man', 'sex')

experiment.word_avg(['day', 'monday', 'weekend', 'today'])

Distance between man and gender is 6.838671318666295
Top 20 closest words for man
['woman' 'another' 'boy' 'one' 'old' 'turned' 'whose' 'himself' 'who'
 'friend' 'him' 'gets' 'a' 'blind' 'once' 'young' 'person' 'victim' 'his'
 'thought']

Top 20 closest words for gender
['ethnicity' 'orientation' 'racial' 'mainstreaming' 'societal' 'defining'
 'preferences' 'sexes' 'bias' 'irrespective' 'regardless' 'attitudes'
 'equality' 'workplace' 'define' 'discrimination' 'makeup' 'defines'
 'demographics' 'sexuality']

Top 20 closest words for difference
['landed' 'roger' 'nicknamed' 'cole' 'fisherman' 'blew' 'barge' 'chased'
 'captain' 'crewman']
['www.star' 'story3d' 'officership' '20003' 'daybook' 'non-obligatory'
 'afptv' '202-383-7824' 'eighteens' '25-64' 'non-families']


Distance between day and sun is 4.95315822564458
Top 20 closest words for day
['days' 'next' 'coming' 'weekend' 'came' 'week' 'here' 'night' 'time'
 'morning' 'on' 'weeks' 'last' 'before' 'took' 'month' 'start' 'starts'
 '