In [1]:
%run Utils.ipynb
%run CRV.ipynb

In [2]:
import matplotlib.pyplot as plt
from itertools import *
import numpy as np
import random

In [None]:

class Vectorizer:
    """Translate between vocabulary words, integer indices and matrix rows.

    ``matrix`` holds one row per vocabulary word; ``self[word]`` (or
    ``self[index]``) returns that row. The class also converts between dense
    vectors and ``CRV`` objects, rates words against a vector under several
    similarity measures, samples words from a probability distribution and
    inspects row-to-row correlations.

    NOTE(review): the ``CRV`` conversions (``to_vector`` for a ``CRV`` input,
    ``to_CRV``, ``get_CRV``) index vector components by vocabulary position,
    so they presumably assume a square matrix whose columns correspond to the
    same vocabulary as its rows -- confirm against the callers.

    ``argmap``, ``CRV``, ``sort_hl`` and ``setup_plot`` are not defined in this
    file; presumably they come from the notebooks loaded with ``%run``.
    """

    # Types accepted as integer row indices (plain ints and NumPy integers).
    _INT_TYPES = (int, np.integer)
    # Similarity measures understood by rate_words / rate_sequence.
    _RATING_MODES = ('min', 'diff', 'mult', 'min/max', 'sqrt')

    def __init__(self, vocab, matrix):
        """Store the vocabulary and matrix and build the word -> index lookup.

        Args:
            vocab: indexable sequence of words; position ``i`` belongs to
                ``matrix[i]``.
            matrix: 2-D array of shape (vocab size, vector size).
        """
        self.vocab = vocab
        # vocab size, vector size
        self.matrix = matrix

        # vsize: number of rows (words); csize: number of components per row.
        self.vsize, self.csize = self.matrix.shape

        self.indices = {word: i for i, word in enumerate(self.vocab)}

    # NOTE(review): ``argmap`` is defined outside this file; presumably it
    # applies the method element-wise to collections -- verify.
    @argmap
    def vectorize(self, word, mode='vec'):
        """Convert ``word`` into the representation selected by ``mode``.

        Modes:
            'vec'  : str or int -> the word's matrix row.
            'int'  : str -> the word's vocabulary index.
            'str'  : int -> the word at that vocabulary index.
            '1hot' : str or int -> one-hot vector over the vocabulary.
            'CRV'  : anything ``to_CRV`` accepts -> a ``CRV``.

        Any input/mode combination not listed above is returned unchanged.
        """
        is_str = isinstance(word, str)
        is_int = isinstance(word, self._INT_TYPES)

        if mode == 'vec' and (is_str or is_int):
            return self[word]

        if mode == 'int' and is_str:
            return self.indices[word]

        if mode == 'str' and is_int:
            return self.vocab[word]

        if mode == '1hot' and (is_str or is_int):
            # One slot per vocabulary word: the slot index is a vocabulary
            # index, so the length must be vsize rather than csize.
            result = np.zeros(self.vsize)
            result[self.to_int(word)] = 1
            return result

        if mode == 'CRV':
            return self.to_CRV(word)

        return word

    def to_int(self, item):
        """Return ``item`` if it is already an integer index, else look it up.

        Raises:
            KeyError: if ``item`` is not an integer and not in the vocabulary.
        """
        if isinstance(item, self._INT_TYPES):
            return item
        return self.indices[item]

    def to_vector(self, item):
        """Convert an array, word, index or ``CRV`` into a dense vector.

        Raises:
            ValueError: if ``item`` is an array whose shape is not
                ``(csize,)``.
            TypeError: if ``item`` is of an unsupported type (previously this
                case silently returned ``None``).
        """
        if isinstance(item, np.ndarray):
            if item.shape == (self.csize,):
                return item
            raise ValueError(f'Cannot convert array of shape {item.shape} into a vector with this Vectorizer')

        if isinstance(item, (str,) + self._INT_TYPES):
            return self[item]

        if isinstance(item, CRV):
            # Scatter the stored word -> value pairs into a dense vector
            # indexed by vocabulary position; unlisted words stay at zero.
            result = np.zeros(self.vsize)
            for word, val in item.values.items():
                result[self.indices[word]] = val
            return result

        raise TypeError(f'Cannot convert object of type {type(item).__name__} into a vector')

    def _vector_to_CRV(self, vector):
        """Build a ``CRV`` from the non-zero components of a dense vector."""
        return CRV({word: val for word, val in zip(self.vocab, vector) if val != 0})

    def to_CRV(self, item):
        """Return ``item`` as a ``CRV``, keeping only non-zero components.

        A ``CRV`` input is returned unchanged; anything else goes through
        ``to_vector`` first.
        """
        if isinstance(item, CRV):
            return item

        return self._vector_to_CRV(self.to_vector(item))

    def average(self, *args):
        """Return the element-wise mean of the vectors given as arguments.

        Each argument may be anything ``to_vector`` accepts.

        Raises:
            ValueError: if called without arguments (the mean would be a
                division by zero).
        """
        if not args:
            raise ValueError('average() requires at least one vector')

        result = np.zeros(self.csize)
        for arg in args:
            result += self.to_vector(arg)

        return result / len(args)

    def from_probabilities(self, probabilities, top_p=.8, temperature=1):
        """Sample one word from ``probabilities`` using top-p (nucleus) clipping.

        Words are taken in the order returned by ``sort_hl`` (presumably
        highest probability first -- verify) until their cumulative
        probability reaches ``top_p``; the top word is always kept. Each kept
        probability is raised to the power ``temperature`` and used as a
        sampling weight.

        Args:
            probabilities: iterable of per-word probabilities, position ``i``
                belonging to ``self.vocab[i]``.
            top_p: cumulative probability mass to keep.
            temperature: exponent applied to the kept probabilities.

        Raises:
            IndexError: if ``probabilities`` is empty.
        """
        # Open question left by the original author: should the temperature
        # be applied before or after clipping? It is applied after, as before.
        ranked = sort_hl({self.vocab[idx]: val for idx, val in enumerate(probabilities)})

        clipped_probabilities = {}
        prob_so_far = 0
        # Iterating the ranked items advances through the candidates (the
        # original loop never advanced its index) and stops at the end of the
        # vocabulary even when the total mass never reaches top_p.
        for word, val in ranked.items():
            clipped_probabilities[word] = val
            prob_so_far += val
            if prob_so_far >= top_p:
                break

        words = list(clipped_probabilities.keys())
        weights = [val ** temperature for val in clipped_probabilities.values()]
        return random.choices(words, weights, k=1)[0]

    @staticmethod
    def _rate_rows(rows, vector, mode):
        """Score every row of ``rows`` against ``vector`` under ``mode``.

        Modes:
            'min'     : sum of element-wise minima.
            'diff'    : 1 minus the sum of absolute differences.
            'mult'    : dot product.
            'min/max' : sum of element-wise min/max ratios (0 where max is 0).
            'sqrt'    : squared sum of the square roots of the products.

        Raises:
            ValueError: if ``mode`` is not one of the modes above.
        """
        if mode == 'min':
            return np.sum(np.minimum(rows, vector), axis=1)

        if mode == 'diff':
            return 1 - np.sum(np.abs(rows - vector), axis=1)

        if mode == 'mult':
            return np.einsum('wv,v->w', rows, vector)

        if mode == 'min/max':
            max_vals = np.maximum(rows, vector)
            # A float output buffer keeps the division valid for integer
            # inputs; positions where the maximum is 0 keep the initial 0.
            ratios = np.divide(
                np.minimum(rows, vector),
                max_vals,
                out=np.zeros_like(max_vals, dtype=float),
                where=max_vals != 0,
            )
            return np.sum(ratios, axis=1)

        if mode == 'sqrt':
            ratings = np.sum(np.sqrt(rows * vector), axis=-1)
            return ratings * ratings

        raise ValueError(f'Unknown rating mode {mode!r}; expected one of {Vectorizer._RATING_MODES}')

    def rate_words(self, vector, mode='min', space='union'):
        """Rate every vocabulary word against ``vector``.

        Args:
            vector: anything ``to_vector`` accepts.
            mode: similarity measure, see ``_rate_rows``.
            space: currently unused; kept for interface compatibility.

        Returns:
            The word -> rating mapping passed through ``sort_hl``.

        Raises:
            ValueError: if ``mode`` is unknown.
        """
        vector = self.to_vector(vector)
        ratings = self._rate_rows(self.matrix, vector, mode)
        return sort_hl({word: rating for word, rating in zip(self.vocab, list(ratings))})

    def rate_sequence(self, vector, word_sequence, mode='min'):
        """Rate each word of ``word_sequence`` against ``vector``.

        Args:
            vector: anything ``to_vector`` accepts.
            word_sequence: sequence of words (or indices) to rate.
            mode: similarity measure, see ``_rate_rows``.

        Returns:
            A list of ``(word, rating)`` tuples in sequence order.

        Raises:
            ValueError: if ``mode`` is unknown.
        """
        if mode not in self._RATING_MODES:
            raise ValueError(f'Unknown rating mode {mode!r}; expected one of {self._RATING_MODES}')

        vector = self.to_vector(vector)
        sequence = self.vectorize(word_sequence)

        if isinstance(sequence, list):
            sequence = np.array(sequence)

        try:
            ratings = self._rate_rows(sequence, vector, mode)
        except Exception:
            # Diagnostic output for shape/dtype mismatches, then re-raise.
            print(type(vector), type(sequence))
            print(getattr(sequence, 'shape', None), getattr(vector, 'shape', None))
            print(getattr(sequence, 'dtype', None), getattr(vector, 'dtype', None))
            raise

        return [(word, rating) for word, rating in zip(word_sequence, list(ratings))]

    def __getitem__(self, idx):
        """Return the matrix row for a word or an integer index."""
        return self.matrix[self.to_int(idx)]

    def get_CRV(self, idx):
        """Return the matrix row for ``idx`` as a ``CRV`` of non-zero values."""
        return self._vector_to_CRV(self[idx])

    def sub(self, vec_a, vec_b, mode='union'):
        """Subtract ``vec_b`` from ``vec_a``.

        Modes:
            'union'        : plain element-wise difference.
            'intersection' : subtract only where ``vec_a`` is non-zero.
            'exclusion'    : subtract only where ``vec_a`` is zero.

        Raises:
            ValueError: if ``mode`` is unknown (previously this silently
                returned ``None``).
        """
        vec_a, vec_b = self.to_vector(vec_a), self.to_vector(vec_b)

        if mode == 'union':
            return vec_a - vec_b
        if mode == 'intersection':
            return vec_a - vec_b * ((vec_a != 0) * 1)
        if mode == 'exclusion':
            return vec_a - vec_b * ((vec_a == 0) * 1)

        raise ValueError(f"Unknown sub mode {mode!r}; expected 'union', 'intersection' or 'exclusion'")

    # Correlation

    def normalize(self):
        """Replace the matrix with its element-wise square root, in place.

        Despite the name no rescaling happens, and calling it repeatedly takes
        the square root again each time.
        """
        self.matrix = np.sqrt(self.matrix)

    def get_cross_correlation_matrix(self, mask=True):
        """Compute the row-by-row correlation matrix and print its extremes.

        Args:
            mask: when true, keep only the strict lower triangle (the diagonal
                and upper triangle are set to 0) so each pair appears once.
                Note that the printed maximum/minimum then also consider those
                zeroed cells.

        Returns:
            The (optionally masked) correlation matrix as a NumPy array.
        """
        result = np.corrcoef(self.matrix)

        if mask:
            result = np.tril(result, -1)

        max_pos = np.unravel_index(np.argmax(result, axis=None), result.shape)
        min_pos = np.unravel_index(np.argmin(result, axis=None), result.shape)
        print(f'Maximum correlation: {np.max(result)}, between {self.vocab[max_pos[0]]} and {self.vocab[max_pos[1]]}')
        print(f'Minimin correlation: {np.min(result)}, between {self.vocab[min_pos[0]]} and {self.vocab[min_pos[1]]}')
        print(' ')

        return result

    def get_top_n_correlations(self, n=10):
        """Print the ``n`` most strongly correlated distinct word pairs.

        Also triggers the summary printed by
        ``get_cross_correlation_matrix``. Fewer than ``n`` lines are printed
        when there are fewer valid pairs.
        """
        # np.corrcoef already returns a NumPy array, so no ``.numpy()`` call.
        result = np.asarray(self.get_cross_correlation_matrix())

        # Rank only the strict lower triangle: every unordered pair once, and
        # never the zeroed-out (masked) cells.
        rows, cols = np.tril_indices_from(result, k=-1)
        values = result[rows, cols]

        # Undefined correlations (NaN, e.g. from constant rows) are skipped.
        valid = ~np.isnan(values)
        rows, cols, values = rows[valid], cols[valid], values[valid]

        # A stable sort on the negated values gives descending order with
        # ties kept in row-major order.
        order = np.argsort(-values, kind='stable')[:max(n, 0)]
        for pos in order:
            print(f'Maximum correlation: {values[pos]}, between {self.vocab[rows[pos]]} and {self.vocab[cols[pos]]}')

    def num_correlations_over(self, n=.9):
        """Return how many distinct word pairs correlate more strongly than ``n``.

        Also triggers the summary printed by
        ``get_cross_correlation_matrix``.
        """
        # np.corrcoef already returns a NumPy array, so no ``.numpy()`` call.
        matrix = np.asarray(self.get_cross_correlation_matrix())
        return (matrix > n).sum()

    def plot_correlation(self, word_a, word_b):
        """Scatter-plot the vector of ``word_a`` against that of ``word_b``.

        The rows of the matrix are plotted, matching what
        ``get_cross_correlation_matrix`` correlates (``np.corrcoef`` treats
        each row as one variable).
        """
        setup_plot()
        plt.scatter(
            np.squeeze(self[word_a]),
            np.squeeze(self[word_b]))
