## Nbdev import session

In [242]:
#default_exp kernel

In [243]:
#hide
from nbdev.showdoc import *

# reload edited project modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..') #adds the parent directory (project root) to the import path, since this notebook does not live in the root


# Code

In [193]:
#export
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import normalize, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.pipeline import make_pipeline

from scipy import sparse
import numpy as np
import pandas as pd

from sparse_dot_topn import awesome_cossim_topn


In [2]:
#export
def make_batches(arr, batch_size = 100):
    '''
    yields consecutive batches of at most batch_size rows of arr, for batch queries.

    arr must expose `shape` and support row slicing (numpy array, scipy sparse matrix...).
    the last batch holds the remaining rows and may be smaller than batch_size.
    an input with fewer rows than batch_size is yielded as one single batch,
    and an input with no rows yields nothing.

    raises ValueError (when iteration starts) if batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    n_rows = arr.shape[0]
    #stepping by batch_size covers the full batches and the trailing partial one alike
    for start in range(0, n_rows, batch_size):
        yield arr[start:start + batch_size]
    

In [3]:
def similarity_plot(vector, query_matrix):
    '''
    placeholder for similarity plots like the ones in
    https://gdmarmerola.github.io/forest-embeddings/

    not implemented yet: both arguments are ignored and None is returned
    '''
    return None

In [4]:
# export
def sparsify(*arrs):
    '''
    returns a list with every input as a sparse matrix.
    inputs that are already sparse are passed through untouched,
    anything else is converted to a csr_matrix
    '''
    return [
        arr if sparse.issparse(arr) else sparse.csr_matrix(arr)
        for arr in arrs
    ]

def sim_matrix_to_idx_and_score(sim_matrix):
    '''
    returns list of indexes (col index of row vector) and scores (similarity value) for each row, given a similarity matrix.

    every entry stored in the sparse matrix is reported, in storage order, so the
    i-th index of a row always matches its i-th score (this holds even for entries
    whose stored value is 0).

    returns a tuple (idxs, scores): two lists with one array per row of sim_matrix
    '''
    csr = sparse.csr_matrix(sim_matrix)

    idxs = []
    scores = []
    #indptr[i]:indptr[i + 1] delimits the stored entries of row i, for indices and data alike
    for start, stop in zip(csr.indptr[:-1], csr.indptr[1:]):
        idxs.append(csr.indices[start:stop].copy())
        scores.append(csr.data[start:stop].copy())

    return idxs, scores

def cosine_similarity(A, B, topn = 30, remove_diagonal = False, **kwargs):
    '''
    sparse cosine similarity between the rows of A and the rows of B.

    both inputs are made sparse, l2 normalized row by row and cast to float64;
    awesome_cossim_topn then computes the dot products, receiving topn as `ntop`.
    extra keyword arguments are forwarded to awesome_cossim_topn.
    if remove_diagonal is True, the diagonal entries are dropped from the result.
    '''
    lhs, rhs = (
        normalize(matrix, norm = 'l2').astype(np.float64)
        for matrix in sparsify(A, B)
    )
    sims = awesome_cossim_topn(lhs, rhs.T, ntop = topn, **kwargs)

    if not remove_diagonal:
        return sims

    #zero the diagonal and then drop the zeros from the sparse structure
    sims.setdiag(0)
    sims.eliminate_zeros()
    return sims

def jaccard_similarity(A, B, topn = 30, remove_diagonal = False, **kwargs):
    '''
    sparse jaccard similarity between the rows of A and the rows of B.

    assumes that all the nonzero elements of A and B are equal to 1, so that the
    dot product of two rows is the size of their intersection and the sum of a
    row is its cardinality. rows are allowed to have different amounts of nonzero
    elements: the union is computed per pair as |a| + |b| - |a & b|.

    topn is passed to awesome_cossim_topn as `ntop`, extra keyword arguments are
    forwarded to it as well. if remove_diagonal is True, the diagonal entries are
    dropped from the result.
    '''
    A,B = sparsify(A,B)

    A = A.astype(np.float64)
    B = B.astype(np.float64)

    #cardinality of every row
    a_sizes = np.asarray(A.sum(axis = 1)).ravel()
    b_sizes = np.asarray(B.sum(axis = 1)).ravel()

    intersection = awesome_cossim_topn(A, B.T, ntop = topn, **kwargs).tocsr()

    #row index of every stored entry, expanded from the csr row pointer
    entry_rows = np.repeat(np.arange(intersection.shape[0]), np.diff(intersection.indptr))
    union = a_sizes[entry_rows] + b_sizes[intersection.indices] - intersection.data
    intersection.data = intersection.data/union

    if remove_diagonal:
        intersection.setdiag(0)
        intersection.eliminate_zeros()

    return intersection

def cosine_distance(A, B, topn = 30, remove_diagonal = False, **kwargs):
    '''
    cosine distance, computed as 1 - cosine_similarity on the entries stored in
    the sparse similarity matrix. takes the same arguments as cosine_similarity
    '''
    result = cosine_similarity(A, B, topn = topn, remove_diagonal = remove_diagonal, **kwargs)
    #only the stored entries are converted from similarity to distance
    result.data = 1 - result.data
    return result

def jaccard_distance(A, B, topn = 30, remove_diagonal = False, **kwargs):
    '''
    jaccard distance, computed as 1 - jaccard_similarity on the entries stored in
    the sparse similarity matrix. takes the same arguments as jaccard_similarity
    '''
    result = jaccard_similarity(A, B, topn = topn, remove_diagonal = remove_diagonal, **kwargs)
    #only the stored entries are converted from similarity to distance
    result.data = 1 - result.data
    return result


In [210]:
#export
class RobustEncoder(BaseEstimator, TransformerMixin):

    def __init__(self):
        '''
        A robust one hot encoder. Always returns the same amount of nonzero values in each transformed row.
        Every feature gets an extra column that holds the values unknown at fit time
        '''

    def fit(self, X, y = None, **kwawrgs):
        '''
        learns the categories of each column of X. y and extra keyword arguments are ignored
        '''
        #unknown categories are mapped to the code -1
        ordinal = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
        self.ordinalencoder_ = ordinal.fit(X)

        codes = self.ordinalencoder_.transform(X)

        #one column per code of each feature: -1 (unknown) followed by the known categories
        code_ranges = [
            np.arange(-1, len(known))
            for known in self.ordinalencoder_.categories_
        ]
        self.onehotencoder_ = OneHotEncoder(categories = code_ranges).fit(codes)
        return self

    def transform(self, X, **kwargs):
        '''
        one hot encodes X, sending values unseen during fit to the unknown column of their feature
        '''
        codes = self.ordinalencoder_.transform(X)
        return self.onehotencoder_.transform(codes)

In [97]:
x = [['aaa'], ['bbb'], ['ccc'], ['ddd']]
enc = RobustEncoder().fit(x)

# values unseen during fit ('asdasd', 'asd') are encoded in the first column
enc.transform([['aaa'],['asdasd'], ['asd'],['ccc']]).toarray()

[array([-1,  0,  1,  2,  3])]
[[ 0.]
 [-1.]
 [-1.]
 [ 2.]]


array([[0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [239]:
#export
class EstimatorKernel(BaseEstimator, TransformerMixin):    
    '''
    creates a kernel with some specified estimator.
    projection method will be performed according to projection_method.
    projection method can be a string referring to the estimator's method used to project,
    or a callable, that receives the estimator and X (vector to be projected) as the inputs.
    it should return the projections of X according to estimator.
    norm is the norm used to normalize the matrices prior to applying dot products.
    '''
    def __init__(self, estimator, projection_method = 'predict_proba', norm = 'l2'):
        '''
        estimator: object that is fitted in `fit` and used to project X.
        projection_method: name of the estimator method used to project, or a callable
            that receives (estimator, X) and returns the projection of X.
        norm: value passed as `norm` to sklearn's `normalize` in `similarity_matrix`.
        '''
        self.estimator = estimator
        self.projection_method = projection_method
        self.norm = norm
    
    @staticmethod
    def _shape_of(obj):
        '''
        shape of obj, falling back to (len(obj),) for objects without a `shape` attribute (e.g. lists)
        '''
        shape = getattr(obj, 'shape', None)
        return (len(obj),) if shape is None else tuple(shape)
    
    def project(self, X):
        '''
        projects X into new space, according to projection_method
        '''
        if callable(self.projection_method):
            return self.projection_method(self.estimator, X)
        else:
            return getattr(self.estimator, self.projection_method)(X)                        
    
    def transform(self, X):
        '''
        alias to self.project        
        '''
        return self.project(X)
    
    def fit(self, X, y = None, save_values = None, **kwargs):
        '''
        X is the feature space,
        y is used only for supervised Kernels
        save_values are values associated with each "Embedding". In `similarity`,
        the values of save_values are retrieved according to the indexes returned by the similarity query.
        extra keyword arguments are passed to the estimator's fit.
        
        raises IndexError if X and save_values do not have the same number of rows.
        '''
        if save_values is not None:
            #compare through shapes: len() is not defined for scipy sparse matrices
            #and plain lists have no `shape` attribute
            x_shape = self._shape_of(X)
            values_shape = self._shape_of(save_values)
            if x_shape[0] != values_shape[0]:
                raise IndexError(f'X and save_values must have the same shape along the first dimension. Got {x_shape} and {values_shape}')
        
        self.estimator.fit(X, y, **kwargs)
        
        self.train_projection_space_ = self.project(X) #saves projection space of X in train
        self.saved_values_ = save_values #saves values to be retrieved by some query
        return self
        
    def similarity_matrix(self, X = None, topn = 30, remove_diagonal = False, lower_bound = 0.0, metric = 'cosine'):
        '''
        returns pairwise similarity of X and self.train_projection_space_
        if X is None, returns pairwise similarity of self.train_projection_space_ with itself.
        X is compared as it is given: this method does not call self.project on it.
        
        metric is one of 'cosine' or 'jaccard' (case insensitive); AttributeError is raised otherwise.
        topn, remove_diagonal and lower_bound are forwarded to the metric function.
        '''
        
        METRICS = {
            'cosine':cosine_similarity,
            'jaccard':jaccard_similarity,
        }
        
        #resolve the metric on its own, so that a KeyError raised while computing
        #the similarity is not misreported as an unknown metric
        try:
            metric_function = METRICS[metric.lower()]
        except KeyError:
            raise AttributeError(f'metric should be one of {list(METRICS)}, got {metric}')
        
        if X is None:
            X = self.train_projection_space_
        
        return metric_function(
            #normalize first
            normalize(X, norm = self.norm, axis = 0),
            normalize(self.train_projection_space_, norm = self.norm, axis = 0),
            topn,
            remove_diagonal,
            lower_bound = lower_bound
        )
            
    def similarity_idxs(self, X = None, topn = 30, remove_diagonal = False, lower_bound = 0.0, metric = 'cosine'):
        '''
        performs dot product based similarity of normalized X versus normalized self.train_projection_space_.
        if X is None, returns similarity of self.train_projection_space_ within itself
        
        returns two lists, one of indexes and other of scores, the indexes refer to self.train_projection_space_ rows
        '''        
            
        sim_matrix = self.similarity_matrix(X, topn, remove_diagonal, lower_bound, metric)
        idxs, sim = sim_matrix_to_idx_and_score(sim_matrix)
        return idxs, sim
    
    def similarity(self, X = None, topn = 30, remove_diagonal = False, lower_bound = 0.0, metric = 'cosine'):
        '''
        same as similarity_idxs, but instead of returning indexes, returns the values in self.saved_values_
        found at those indexes. the saved values are indexed with an array of integer positions.
        
        raises TypeError if the kernel was fitted without save_values.
        '''
        if self.saved_values_ is None:
            raise TypeError('there are no saved values to retrieve: pass save_values to fit, or use similarity_idxs instead')
        
        idxs, sim = self.similarity_idxs(X, topn, remove_diagonal, lower_bound, metric)
        values = [self.saved_values_[idx] for idx in idxs]
        return values, sim

In [240]:
#export
class ForestKernel(EstimatorKernel):
    '''
    A Space transformation performed based on Forest transformations.
    Can be supervised or not (CARTs, RandomTreeEmbeddings, Boosted trees...)
    
    samples are projected by one hot encoding the node indexes returned by the estimator's `apply` method.
    '''                
    def fit(self, X, y = None, save_values = None, **kwargs):
        '''
        same as EstimatorKernel.fit, but discards the node encoder of a previous fit first,
        so that the encoder always matches the nodes of the newly fitted estimator.
        '''
        if hasattr(self, 'one_hot_node_embeddings_encoder_'):
            del self.one_hot_node_embeddings_encoder_
        
        return super().fit(X, y, save_values, **kwargs)
    
    def project(self, X):
        '''
        one hot encodes the node indexes of X.
        the encoder is fitted the first time this method runs after a fit (on the training data) and reused afterwards.
        '''
        nodes = self.estimator.apply(X)
        
        if not hasattr(self, 'one_hot_node_embeddings_encoder_'):
            self.one_hot_node_embeddings_encoder_ = OneHotEncoder().fit(nodes)
            
        return self.one_hot_node_embeddings_encoder_.transform(nodes)

In [None]:
# `estimator` is a required argument; any forest exposing `apply` can be used
kernel = ForestKernel(RandomTreesEmbedding())

In [226]:
#export
class CategoricalKernel(EstimatorKernel):    
    '''
    Linear model kernel for high cardinality categorical variables.
    kernel space is defined by linear model coefficients indexed by the nonzero elements
    of X.
    
    if use_encoder is True, X is one hot encoded with a RobustEncoder before reaching the estimator.
    '''
    def __init__(self, estimator, norm = 'l2', use_encoder = False):
        self.use_encoder = use_encoder
        #norm goes by keyword: the second positional argument of the parent is projection_method
        super().__init__(estimator, norm = norm)
    
    def _encoder_is_attached(self):
        '''
        True if self.estimator is already a pipeline starting with a RobustEncoder
        '''
        steps = getattr(self.estimator, 'steps', None)
        return bool(steps) and isinstance(steps[0][1], RobustEncoder)
    
    def fit(self, X, y = None, save_values = None, **kwargs):
        '''
        same as EstimatorKernel.fit. if use_encoder is True, the estimator is replaced by a
        pipeline of a RobustEncoder followed by the estimator (only once, refitting reuses the pipeline).
        '''
        if self.use_encoder and not self._encoder_is_attached():
            self.estimator = make_pipeline(RobustEncoder(), self.estimator)
                
        return super().fit(X, y, save_values, **kwargs)
            
    
    def project(self, X):
        '''
        multiplies sparse vector to its coef_ s from linear model.
        if multiclass classification, the number of final features will be
        n*original_n_features_before_one_hot_encoding, n being the number of rows of coef_
        '''
        
        if self.use_encoder:
            coefs = self.estimator[-1].coef_
            X = self.estimator[0].transform(X)
        else:            
            coefs = self.estimator.coef_
                
        if len(coefs.shape) == 1:
            coefs = coefs.reshape(1,-1)
        
        #the positions of the nonzero elements do not depend on the coefficient row, compute them once
        nonzero_cols = X.nonzero()[1]
        n_rows = X.shape[0]
        #assumes all rows have the same amount of nonzero elements
        nonzero_per_row = len(X[0].data)
        
        embeddings = [
            coefs[dim, nonzero_cols].reshape(n_rows, nonzero_per_row)
            for dim in range(coefs.shape[0])
        ]
                
        return np.hstack(embeddings)
    

In [237]:
from sklearn.linear_model import LinearRegression, LogisticRegression

n_samples = 1000
n_features = 7
cardinality_per_feature = 1000
n_classes = 5
n_reg_dims = 2

np.random.seed(0) #fixed seed, so that the demo data is the same on every run

X = np.random.randint(0,cardinality_per_feature,(n_samples,n_features))

y_class = np.random.randint(0,n_classes, n_samples)
y_reg = np.random.randn(n_samples,n_reg_dims)

kernel_class = CategoricalKernel(LogisticRegression(), use_encoder = True)
kernel_reg = CategoricalKernel(LinearRegression(), use_encoder = True)
kernel_class.fit(X,y_class)
kernel_reg.fit(X,y_reg)

# one block of n_features columns per row of the linear model coefficients
kernel_class.project(X).shape, kernel_reg.project(X).shape

((1000, 35), (1000, 14))

# Export

In [246]:
#hide
# convert the notebook cells flagged for export into the python module
from nbdev.export import notebook2script
notebook2script()

Converted kernel.ipynb.
