In [1]:
#default_exp metrics

In [2]:
#hide
from nbdev.showdoc import *

In [129]:
#hide
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dev comments

- entropy
- KL divergence between a flat Dirichlet distribution and the quantile distribution
- 

# Imports -

In [788]:
#export
import numpy as np
from sklearn.metrics import pairwise
from scipy import stats

from scikit_density.core.random_variable import KDE, RandomVariable
from scikit_density.utils import (
    _fix_one_sample_2d, _fix_one_dist_2d, _fix_dist_1d,
    _fix_X_1d, _assert_dim_3d, _fix_one_dist_1d
)



# Density Metrics

## Quantile, entropy and variance metrics

In [None]:
#export    

def quantile(y_true, pred_dist):
    '''
    Locates each y_true point inside its predicted distribution.

    For every point the result is the fraction of samples of the matching
    distribution for which `y_true <= sample` holds.
    NOTE(review): that is the share of samples at or above y_true (upper tail);
    confirm this is the intended orientation of the "quantile".

    y_true and pred_dist must contain the same number of entries along the first axis.
    '''
    points = _fix_one_sample_2d(_fix_X_1d(y_true))
    samples = _assert_dim_3d(pred_dist)
    assert points.shape[0] == samples.shape[0], 'number of dists should be the same as number of points'
    fractions = []
    for idx in range(len(points)):
        # boolean indicator averaged over the sample axis
        below_or_equal = points[idx].T <= samples[idx].T
        fractions.append(below_or_equal.mean(axis = 1))
    return _fix_one_dist_2d(np.array(fractions))

def quantile_value_sklearn(y_true, pred_dist):
    '''
    Checks in which quantile y_true lies, given the predicted distribution,
    using sklearn's QuantileTransformer.

    The transformer is fitted on pred_dist with its default settings and then
    applied to y_true; the transformed values are returned.
    '''
    # imported here because the module never imports QuantileTransformer,
    # which made every call fail with a NameError
    from sklearn.preprocessing import QuantileTransformer
    return QuantileTransformer().fit(pred_dist).transform(y_true)

def theoretical_entropy(data, dist):
    '''
    Wraps data in a RandomVariable, fits `dist` to it and returns the value of
    the fitted object's `entropy(dist)`.
    '''
    fitted = RandomVariable(data).fit_dist(dist)
    return fitted.entropy(dist)

def kde_entropy(data, sample_size = 10000, bw = 'ISJ'):
    '''
    Calculates the entropy of multiple distributions, one KDE fit per distribution.
    input should be of shape (n_distributions, n_sample_per_distribution, n_dims_in_distribution)

    sample_size is forwarded to the KDE `entropy` call and bw to the KDE constructor.
    Returns an array with one entropy value per distribution.
    '''
    # iterate over the checked array (as `quantile` does) rather than the raw input
    dists = _assert_dim_3d(data)
    return np.array([KDE(bw = bw).fit(d).entropy(sample_size = sample_size) for d in dists])

def variance(data):
    '''
    Calculates the variance for each dimension (marginal) of multiple distributions
    input should be of shape (n_distributions, n_sample_per_distribution, n_dims_in_distribution)

    Returns an array of shape (n_distributions, n_dims_in_distribution).
    '''
    # use the checked array (as `quantile` does) instead of discarding it
    dists = _assert_dim_3d(data)
    # axis -2 is the sample axis
    return dists.var(axis = -2)

def covariance_matrix(data):
    '''
    Calculates the covariance matrix between the dimensions of each of multiple distributions
    input should be of shape (n_distributions, n_sample_per_distribution, n_dims_in_distribution)

    Returns an array of shape (n_distributions, n_dims_in_distribution, n_dims_in_distribution).
    The population normalization (division by the number of samples) is used so that
    the diagonal of each matrix equals the output of `variance` for the same input.
    '''
    dists = np.asarray(_assert_dim_3d(data))
    n_samples = dists.shape[-2]
    # center every distribution around its own mean along the sample axis
    centered = dists - dists.mean(axis = -2, keepdims = True)
    # (n_dims, n_samples) @ (n_samples, n_dims) for every distribution at once
    return np.matmul(np.swapaxes(centered, -1, -2), centered) / n_samples

def bimodal_variance(data, pct_conv = 0.05, lb = 0.1,ub = 0.9):
    '''
    Splits data in two at the largest (smoothed) gap between consecutive sorted
    values and returns the size-weighted average of the variances of the two parts.

    data: 1-d array-like of values (not modified)
    pct_conv: width of the moving-average filter applied to the gaps, as a
        fraction of the number of points left after border filtering
    lb, ub: fractions passed to `filter_borders` to trim the sorted data

    Raises ValueError when fewer than two points remain after trimming.
    '''
    #GENERALIZE FOR MULTIDIM
    # sort a copy: the original sorted the caller's array in place
    data = np.sort(np.asarray(data))
    data = filter_borders(data,lb,ub)
    if len(data) < 2:
        raise ValueError('bimodal_variance needs at least two points after border filtering')
    #make split point
    diff = np.diff(data)
    # np.ones needs an int; keep the filter between 1 tap and len(diff) taps so
    # the 'same' convolution keeps the length of diff
    filter_size = int(np.floor(pct_conv*len(data)))
    filter_size = min(max(filter_size, 1), len(diff))
    diff = np.convolve(diff,np.ones(filter_size)/filter_size, mode = 'same')
    split_point = data[np.argmax(diff)+1]
    #average variance
    arr1, arr2 = data[data >= split_point], data[data < split_point]
    # skip an empty side, whose variance would be nan
    parts = [arr for arr in (arr1, arr2) if len(arr) > 0]
    total = sum(len(arr) for arr in parts)
    return sum(len(arr)*arr.var() for arr in parts)/total


def gaussian_distance_entropy(data):
    '''
    Negative natural log of the expected gaussian likelihood of the
    distances-to-centroid, computed for every distribution in data
    by `expected_distance_gaussian_likelihood`.
    '''
    expected_likelihood = expected_distance_gaussian_likelihood(data)
    return -np.log(expected_likelihood)

def expected_distance_gaussian_likelihood(data):
    '''
    Calculates the expected likelihood of the distances from centroid of samples, for each distribution in data
    input should be of shape (n_distributions, n_sample_per_distribution, n_dims_in_distribution)

    Returns an array with one mean likelihood per distribution.
    '''
    # iterate over the checked array (as `quantile` does) rather than the raw input
    dists = _assert_dim_3d(data)
    # np.mean accepts both arrays and plain scalars, whereas `.mean()` failed when
    # the per-distribution likelihood came back as the python int 1 (zero spread)
    return np.array([np.mean(distance_gaussian_likelihood(d)) for d in dists])

def distance_gaussian_likelihood(data):
    '''
    Calculates the gaussian likelihood of the distance from the centroid for every sample in data

    data: array of shape (n_samples, n_dims)
    Returns an array of shape (n_samples,): the normal density, with the mean and
    standard deviation of the distances, evaluated at each sample's distance.
    When all distances are identical (zero standard deviation) an array of ones is returned.
    '''
    data = np.asarray(data, dtype = float)
    centroid = data.mean(axis = 0, keepdims = True)
    # euclidean distance of every sample to the centroid
    distances = np.linalg.norm(data - centroid, axis = 1)
    distance_std = distances.std()
    if distance_std == 0:
        # keep the per-sample shape so callers can always use array methods
        return np.ones_like(distances)
    z = (distances - distances.mean())/distance_std
    # normal pdf: the normalizing constant is std*sqrt(2*pi)
    return np.exp(-1/2*z**2)/(distance_std*np.sqrt(2*np.pi))

In [1150]:
#export
def _make_outlier_filter(data, lb = 25, ub = 75, c = 1):
    a = np.array(data)
    upper_quartile = np.percentile(a, ub)
    lower_quartile = np.percentile(a, lb)
    iqr = (upper_quartile - lower_quartile) * c
    lb = lower_quartile - iqr
    ub = upper_quartile + iqr
    filter_ = np.zeros(a.shape)
    filter_[(a >= lb) & (a <= ub)] = 1
    return filter_

def filter_borders(data, lb = 0.05, ub = 0.95):
    '''
    Returns the slice of data between the positions int(len(data)*lb) (inclusive)
    and int(len(data)*ub) (exclusive), i.e. drops the leading and trailing borders.
    '''
    n_items = len(data)
    start, stop = int(n_items*lb), int(n_items*ub)
    return data[start:stop]

#def distance_log_variance(dist):
#    '''variance of the distances of points to centroid of distribution'''
#    centroid = dist.mean(axis = 0).reshape(1,-1)
#    distances =  pairwise.euclidean_distances(dist, centroid).flatten()    
#    return distances.var()

In [1291]:
# Demo data: 300 distributions, each with 299 standard-normal samples of 1 dimension
# NOTE(review): no random seed is set, so the outputs below change between runs
dists = np.random.randn(300,299,1)
# one standard-normal point per distribution
y_true = np.random.randn(300,1,1)

In [1292]:
# variance along the sample axis of each distribution in `dists`
variance(dists)

array([[1.03840303],
       [0.99429577],
       [0.85657209],
       [1.00802389],
       [0.91129206],
       [1.20180636],
       [0.96826585],
       [1.13715654],
       [0.98933793],
       [1.01014119],
       [1.01212786],
       [0.84589956],
       [0.98587586],
       [0.91998712],
       [0.98106139],
       [1.05764596],
       [0.92059322],
       [0.9275175 ],
       [0.96779067],
       [1.00345712],
       [0.93999219],
       [1.02603654],
       [1.01191194],
       [1.0282939 ],
       [0.79073409],
       [1.04607534],
       [1.23230325],
       [0.89371801],
       [0.97529275],
       [1.12663384],
       [0.97863051],
       [1.00237994],
       [0.94278942],
       [1.00163499],
       [0.92007089],
       [1.063153  ],
       [1.15099136],
       [0.90896666],
       [0.96672634],
       [0.85489839],
       [0.81842303],
       [1.11224568],
       [1.07926474],
       [0.96433255],
       [0.9698845 ],
       [1.0247836 ],
       [0.97963916],
       [1.044

In [1293]:
# KDE entropy of the values returned by `quantile` for the demo points
kde_entropy(quantile(y_true,dists))

array([0.28690482])

# Export -

In [1295]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_ensemble.ipynb.
Converted 02_core.random_variable.ipynb.
Converted 03_utils.ipynb.
Converted 04_metrics.ipynb.
Converted 05_neighbors.ipynb.
