In [1]:
import numpy as np,scipy.stats as ss
from sklearn.metrics import mutual_info_score
import numpy as np

In [2]:
class DistanceMetrics:
    """
    Metrics that help to quantify the amount of information contained in data
    
    
    Parameters
    ----------
    :param X: observations of x random variable
    :param Y: observations of y random variable
    :bins: using histogram-based estimation for estimation distance metrics
    
    :nObs: number of observation
    :corr: correlation between observable random variables
    :normalised: give output of normalized data
    """
        
#--------------------------------------------------------------------------     

    @staticmethod 
    def numBins(nObs: int , corr: int = None) -> int:
        
        """
        Function for finding optimal amount of bins for 
        dataset of discretized continious random variables
        """
        
        if corr is None: 
            z = (8 + 324*nObs + 12*(36*nObs + 729*nObs**2)**.5)**(1/3.)
            b = round(z/6. + 2./(3*z) + 1./3)
            
        else: 
            b = round(2**(-.5) * (1 + (1 + 24*nObs/(1. - corr**2))**.5)**.5)
            
        return int(b)
    
#--------------------------------------------------------------------------  

    @staticmethod 
    def entropy(X: np.ndarray, bins: int) -> float:
        
        """
        Function for finding entropy, i.e the amount of uncertainty associated with X.
        """
        
        hX = ss.entropy(np.histogram(X, bins)[0]) 
        
        return hX
    
#--------------------------------------------------------------------------        

    def mutual_info(self, X: np.ndarray, Y: np.ndarray, bins: int, normalised: bool = False) -> float:
        
        """
        Functions for finding mutual information of X,Y , i.e the decrease in uncertainty (or informational
        gain) in X that results from knowing the value of Y.
        """
        
    
        cXY = np.histogram2d(X, Y, bins)[0]
        iXY = mutual_info_score(None, None, contingency = cXY)

        if normalised:
            hX = self.entropy(X, bins) 
            hY = self.entropy(Y, bins) 
            iXY = iXY / min(hX, hY)
        
        return iXY
    
#--------------------------------------------------------------------------        
    
    def join_entropy(self, X: np.ndarray, Y: np.ndarray, bins: int)  -> float:
        """
        Function for finding the joint entropy of X and Y, i.e.
        the measure of the uncertainty associated with a set of X and Y
        """
        
        hX = self.entropy(X, bins)
        hY = self.entropy(X, bins)

        iXY = self.mutual_info(X, Y, bins)

        hXY = hX + hY - iXY #join entropy by formula

        return hXY

#--------------------------------------------------------------------------    

    def conditional_entropy(self, X: np.ndarray, Y: np.ndarray, bins: int)  -> float:
        
        """
        Functions for condtional entropy of X if Y is known ,
        i.e the amount of information needed to describe the outcome of a 
        random variable Y given that the value of another random variable X is known.
        """
                
        hXY = self.join_entropy(X, Y, bins)
        hY = self.entropy(Y, bins)
        hX_Y = hXY - hY #conditional entropy by formula
    
        return hX_Y
    
#--------------------------------------------------------------------------    

    def varInfo(self, X: np.ndarray, Y: np.ndarray, bins: int, normalised: bool = False) -> float:
        
        """
        Functions for finding variarion  of information of X,Y , i.e the uncertainty 
        we expect in one variable if we are told the value of other
        """
        
        iXY = self.mutual_info(X, Y, bins)
        hX = self.entropy(X, bins) 
        hY = self.entropy(Y, bins) 

        vXY = hX + hY - 2*iXY #Variation of information by formula
        
        if normalised:
            hXY = hX + hY - iXY 
            vXY = vXY/hXY 
            
        return vXY

# Checking working of fuctions


In [3]:
size,seed=5000,0
np.random.seed(seed)
x=np.random.normal(size=size)
e=np.random.normal(size=size)
y=0*x+e
corr=np.corrcoef(x,y)[0,1]

In [4]:
metric = DistanceMetrics()

### numbins

In [7]:
n_bins = metric.numBins(x.shape[0], corr)

In [8]:
n_bins

13

### entropy

In [9]:
metric.entropy(x, n_bins)

1.9638844237154696

In [10]:
metric.entropy(y, n_bins)

2.119199896008615

### mutual_info

In [11]:
metric.mutual_info(x,y, n_bins, True)

0.006764619108279091

### join_entropy

In [12]:
metric.join_entropy(x,y, n_bins)

3.9144839173318218

### condtional_entropy

In [13]:
metric.conditional_entropy(x,y, n_bins)

1.7952840213232069

In [14]:
metric.conditional_entropy(y,x, n_bins)

2.261230438202643

### varInfo

In [15]:
metric.varInfo(x,y, n_bins, True)

0.9967357285145346