In [6]:
import os
import numpy as np
from sklearn.preprocessing import StandardScaler

In [4]:
class Dataset:
    """Ordered collection of ``Datapoint`` objects with lexicon and embedding helpers.

    Stored items are expected to expose ``noun``, ``denoms``, ``non_denoms``,
    ``others``, ``get_all_words()`` and ``display_content()``.
    """

    def __init__(self):
        self.datapoints = []

    def add_datapoint(self, datapoint):
        """Append ``datapoint`` to the end of the dataset."""
        self.datapoints.append(datapoint)

    def remove_datapoint(self, datapoint):
        """Remove the first stored item equal to ``datapoint``.

        Raises:
            ValueError: If ``datapoint`` is not in the dataset.
        """
        self.datapoints.remove(datapoint)

    def display_content(self):
        """Print every datapoint, preceded by its 1-based position."""
        for i, dp in enumerate(self.datapoints):
            print(f'Datapoint #{i+1}:')
            dp.display_content()

    def get_lexicon(self):
        """Return every distinct word used by any datapoint.

        Words are returned in order of first appearance, so the result (and
        therefore the row order of ``get_vecs``) is reproducible across runs.
        """
        lexicon = {}
        for dp in self.datapoints:
            # A dict is used as an insertion-ordered set.
            lexicon.update(dict.fromkeys(dp.get_all_words()))
        return list(lexicon)

    def filter_against_embedding(self, embedding):
        """Drop, in place, every datapoint that uses a word missing from ``embedding``.

        Args:
            embedding: Mapping-like object supporting ``word in embedding``.
        """
        print(f'Number of datapoints before adjustment: {len(self.datapoints)}')
        kept = [
            dp for dp in self.datapoints
            if all(w in embedding for w in dp.get_all_words())
        ]
        # Slice assignment keeps the same list object alive for any outside
        # reference to ``self.datapoints``.
        self.datapoints[:] = kept
        print(f'Number of datapoints after adjustment: {len(self.datapoints)}')

    def get_triplets(self):
        """Return one ``[noun, denominal, non_denoms + others]`` list per denominal."""
        triplets = []
        for dp in self.datapoints:
            for dn in dp.denoms:
                triplets.append([dp.noun, dn, dp.non_denoms+dp.others])
        return triplets

    def get_vecs(self, embedding):
        """Return the standardized embedding vectors of all lexicon words.

        Rows follow the order of ``get_lexicon()``. Every lexicon word is
        looked up with ``embedding[word]``, so words absent from ``embedding``
        must be removed first (see ``filter_against_embedding``).

        Args:
            embedding: Mapping from word to vector.

        Returns:
            2-D array produced by ``StandardScaler().fit_transform``.
        """
        ws = self.get_lexicon()
        vecs = np.array([embedding[w] for w in ws])
        sc = StandardScaler()
        vecs = sc.fit_transform(vecs)
        return vecs
        
    
        
class Datapoint:
    """One dataset record: a noun plus three lists of related words."""

    def __init__(self, noun, denoms, non_denoms, others):
        self.noun, self.denoms = noun, denoms
        self.non_denoms, self.others = non_denoms, others

    def get_all_words(self):
        """Return a new list: the noun, then denoms, non_denoms and others, in that order."""
        return [self.noun, *self.denoms, *self.non_denoms, *self.others]

    def display_content(self):
        """Print the four fields, one labelled line each."""
        report = [
            f'Noun: {self.noun}',
            f'Denominal(s): {self.denoms}',
            f'Non-denominal(s): {self.non_denoms}',
            f'Other(s): {self.others}',
        ]
        print(*report, sep='\n')
        


In [5]:
def build_dict_from_vector_file(path_to_vecs, filename):
    """Load a GloVe-style ``.txt`` model into a ``{word: vector}`` dict.

    Each non-empty line is split on single spaces; the first field is the word
    and the remaining fields are parsed as floats. Lines whose vector part
    cannot be parsed are reported on stdout and skipped.

    Might no longer be needed for newer versions of gensim, as the gensim
    load_word2vec_format function can handle GloVe outputs as well.

    Args:
        path_to_vecs: Directory containing the vector file, with or without a
            trailing path separator.
        filename: Name of the vector file inside ``path_to_vecs``.

    Returns:
        dict mapping each word to a 1-D numpy float array.

    Raises:
        FileNotFoundError: If the vector file does not exist.
    """
    full_path = os.path.join(path_to_vecs, filename)
    if not os.path.isfile(full_path):
        print('File not found. Generate it using GloVe.')
        raise FileNotFoundError(full_path)

    with open(full_path, 'r', encoding='utf-8') as f:
        # split('\n') rather than splitlines(): the latter also breaks on
        # other Unicode line boundaries, which could occur inside a token.
        lines = f.read().split('\n')

    embedding = {}
    n_entries = len(lines)
    for i, line in enumerate(lines):
        if line:  # a trailing newline yields an empty last element; skip it
            word, *values = line.split(' ')
            try:
                vector = np.array([float(x) for x in values])
            except ValueError:
                # Report the malformed line and skip it, so the word is never
                # stored with a missing or stale vector.
                print(f'Line: {i}')
                print(f'Word: {word}')
                print(f'Vector: {values}')
            else:
                embedding[word] = vector
        if (i % 50000 == 0):
            print(f'Processed {i} / {n_entries} entries')

    if embedding:
        display_embedding_properties(embedding)
    else:
        # Nothing parsable in the file: there is no vector to measure.
        print('Embedding entries: 0')
    return embedding

def display_embedding_properties(embedding):
    """Print the number of entries and the vector dimension of ``embedding``.

    The dimension is the length of the first entry's vector only; other
    entries are not checked. An empty embedding is reported instead of
    raising.

    Args:
        embedding: Mapping from word to vector.
    """
    n_entries = len(embedding.keys())
    print(f'Embedding entries: {n_entries}')
    if n_entries == 0:
        print('Embedding dimension: n/a (embedding is empty)')
        return
    # Peek at the first key without copying every key into a list.
    first_word = next(iter(embedding.keys()))
    print(f'Embedding dimension: {len(embedding[first_word])}')

In [None]:
def parse_cell(cell_content):
    """Split a cell on single spaces and return its non-empty pieces as a list."""
    # filter(None, ...) drops the empty strings produced by repeated,
    # leading or trailing spaces.
    return list(filter(None, cell_content.split(' ')))

def load_dataset_from_csv(path):
    """Build a ``Dataset`` from a comma-separated file with a header row.

    The first line is treated as a header and skipped. Each following
    non-blank line is split on commas; its first four fields are read as:
    noun, denominals, non-denominals, others. The last three fields are
    space-separated word lists parsed with ``parse_cell``.

    Args:
        path: Path of the CSV file.

    Returns:
        Dataset holding one ``Datapoint`` per data row, in file order.
    """
    dataset = Dataset()
    with open(path, 'r') as f:
        # Drop only the header. The final row is kept even when the file has
        # no trailing newline.
        rows = f.read().split('\n')[1:]
    for row in rows:
        if not row.strip():
            # Blank lines (including the empty piece left by a trailing
            # newline) carry no data.
            continue
        cells = row.split(',')
        n = cells[0]
        dns = parse_cell(cells[1])
        ndns = parse_cell(cells[2])
        others = parse_cell(cells[3])
        dataset.add_datapoint(Datapoint(n, dns, ndns, others))
    return dataset