In [1]:
import numpy as np
%load_ext cython

## Function code and checks

In [2]:
def conditional_entropy_python(X, Y):
    """ 
    Calculate conditional entropy of all columns of X against Y (i.e. \sum_i=1^{N} H(X_i | Y)).
    """
    # Calculate distribution of y    
    Y_dist = np.zeros(shape=(int(Y.max()) + 1, ), dtype=np.float32)
    for y in range(Y.max() + 1):
        Y_dist[y] = (float(len(np.where(Y==y)[0]))/len(Y))
        
    Y_max = Y.max()
    X_max = X.max()
    
    ce_sum = 0.
    for i in range(X.shape[1]):
        ce_sum_partial = 0.
        
        # Count
        counts = np.zeros(shape=(X_max + 1, Y_max + 1), dtype=np.int32)
        for row, x in enumerate(X[:, i]):
            counts[x, Y[row]] += 1
        
        # For each value of y add conditional probability
        for y in range(Y.max() + 1):
            count_sum = float(counts[:, y].sum())
            probs = counts[:, y] / count_sum
            entropy = -probs * np.log2(probs)
            ce_sum_partial += (entropy * Y_dist[y]).sum()

        ce_sum += ce_sum_partial
        
    return ce_sum

In [3]:
# Some data: two conditionally independent variables
X = np.random.randint(0, 2, size=(100, 80))
Y = np.random.randint(0, 2, size=(100))

In [4]:
# Check that function calculates correctly entropy
val = conditional_entropy_python(X, Y)
assert abs(val - X.shape[1]) < 0.1, "X and Y are conditionally independent. 1 bit per column"

IndexError: tuple index out of range

In [None]:
%timeit conditional_entropy_python(X, Y)