In [1]:
import localutils as lu
import numpy as np
import pandas as pd
from sklearn.preprocessing import label_binarize

Load the Hugin dataset

In [2]:
# Read the dataset file into the feature matrix X, the label vector y and the
# feature names. The second argument, 'Class', is presumably the name of the
# target column -- confirm against localutils.load_hugin_dataset.
X, y, feature_names = lu.load_hugin_dataset("synthetic_v1_100.dat", 'Class')

In [3]:
# Dimensions of X as (rows, columns).
X.shape

(100, 3)

In [4]:
# Dimensions of y; its length should equal the number of rows of X.
y.shape

(100,)

In [5]:
# Names of the columns of X, in column order (note the order shown below:
# the position in this array is the column index used in the cells that follow).
feature_names

array(['F3', 'F2', 'F1'], dtype=object)

Mutual information

In [6]:
# Encode the two class labels as a single 0/1 column (2-D, one row per sample).
# NOTE(review): with classes=['Yes', 'No'] the 1s presumably mark 'No' (the
# second listed class) -- confirm against the label_binarize documentation.
yb = label_binarize(y, classes=['Yes', 'No'])

In [7]:
# Two-column class indicator matrix: column 0 is the complement of yb,
# column 1 is yb itself, so every row contains exactly one 1.
Y = np.hstack([1 - yb, yb])

F1

In [8]:
# F1 is column 2 of X. Summing it against each indicator column of Y gives the
# per-class number of samples with F1 == 1 (assuming 0/1-valued features).
np.dot(X[:, 2], Y)

array([39, 19])

In [9]:
# Per-class number of samples with F1 == 0: same sum as above, taken over the
# complement of the F1 column (assuming 0/1-valued features).
np.dot(1 - X[:, 2], Y)

array([15, 27])

In [10]:
# Mutual information between F1 and the class, from the 2x2 contingency table.
# The table is rebuilt from X and Y instead of being retyped from the two
# outputs above, so it cannot drift from the data. For this dataset it equals
# [[39, 19], [15, 27]] (row 0: F1 == 1, row 1: F1 == 0; columns follow Y).
# .tolist() keeps the argument a nested list of plain Python ints.
f1_table = [np.dot(Y.T, X[:, 2]).tolist(), np.dot(Y.T, 1 - X[:, 2]).tolist()]
lu.mutual_info(f1_table)

0.07124245191512682

F2

In [11]:
# F2 is column 1 of X: per-class number of samples with F2 == 1
# (assuming 0/1-valued features).
np.dot(X[:, 1], Y)

array([20, 33])

In [12]:
# Per-class number of samples with F2 == 0 (complement of the F2 column,
# assuming 0/1-valued features).
np.dot(1 - X[:, 1], Y)

array([34, 13])

In [13]:
# Mutual information between F2 and the class, from the 2x2 contingency table.
# The table is rebuilt from X and Y instead of being retyped from the two
# outputs above, so it cannot drift from the data. For this dataset it equals
# [[20, 33], [34, 13]] (row 0: F2 == 1, row 1: F2 == 0; columns follow Y).
# .tolist() keeps the argument a nested list of plain Python ints.
f2_table = [np.dot(Y.T, X[:, 1]).tolist(), np.dot(Y.T, 1 - X[:, 1]).tolist()]
lu.mutual_info(f2_table)

0.08875404536246008

F3

In [14]:
# F3 is column 0 of X: per-class number of samples with F3 == 1
# (assuming 0/1-valued features).
np.dot(X[:, 0], Y)

array([19, 17])

In [15]:
# Per-class number of samples with F3 == 0 (complement of the F3 column,
# assuming 0/1-valued features).
np.dot(1 - X[:, 0], Y)

array([35, 29])

In [16]:
# Mutual information between F3 and the class, from the 2x2 contingency table.
# The table is rebuilt from X and Y instead of being retyped from the two
# outputs above, so it cannot drift from the data. For this dataset it equals
# [[19, 17], [35, 29]] (row 0: F3 == 1, row 1: F3 == 0; columns follow Y).
# .tolist() keeps the argument a nested list of plain Python ints.
f3_table = [np.dot(Y.T, X[:, 0]).tolist(), np.dot(Y.T, 1 - X[:, 0]).tolist()]
lu.mutual_info(f3_table)

0.00024389338556663961

Information Gain

In [17]:
# F3
# Information gain of F3. Both arguments are derived from X and Y rather than
# typed in by hand: the first is the per-class sample total (column sums of Y,
# [54, 46] for this dataset) and the second is the F3 contingency table
# ([[19, 17], [35, 29]]; row 0: F3 == 1, row 1: F3 == 0).
# .tolist() keeps both arguments as lists of plain Python ints.
class_totals = Y.sum(axis=0).tolist()
f3_table = [np.dot(Y.T, X[:, 0]).tolist(), np.dot(Y.T, 1 - X[:, 0]).tolist()]
lu.information_gain(class_totals, f3_table)

0.00024389338556629614

Chi2

In [18]:
# F1
# Chi-squared statistic of the F1 contingency table. The table is rebuilt from
# X and Y instead of repeating hand-typed counts; for this dataset it equals
# [[39, 19], [15, 27]] (row 0: F1 == 1, row 1: F1 == 0; columns follow Y).
f1_table = [np.dot(Y.T, X[:, 2]).tolist(), np.dot(Y.T, 1 - X[:, 2]).tolist()]
lu.chi2(f1_table)

9.747507198781559

In [19]:
# F2
# Chi-squared statistic of the F2 contingency table. The table is rebuilt from
# X and Y instead of repeating hand-typed counts; for this dataset it equals
# [[20, 33], [34, 13]] (row 0: F2 == 1, row 1: F2 == 0; columns follow Y).
f2_table = [np.dot(Y.T, X[:, 1]).tolist(), np.dot(Y.T, 1 - X[:, 1]).tolist()]
lu.chi2(f2_table)

12.008512448356756

In [20]:
# F3
# Chi-squared statistic of the F3 contingency table. The table is rebuilt from
# X and Y instead of repeating hand-typed counts; for this dataset it equals
# [[19, 17], [35, 29]] (row 0: F3 == 1, row 1: F3 == 0; columns follow Y).
f3_table = [np.dot(Y.T, X[:, 0]).tolist(), np.dot(Y.T, 1 - X[:, 0]).tolist()]
lu.chi2(f3_table)

0.03382760780103783

Conditional Mutual Information

In [21]:
# Conditional Mutual Information, F2 = True
# Keep only the rows where F2 (column 1 of X) is True; the cells below then
# measure the dependence between F1 and the class inside this subset.
chosen = (X[:, 1] == True)  # element-wise NumPy comparison -> boolean row mask
chosen_X, chosen_yb = X[chosen], yb[chosen]
# Two-column class indicator for the selected rows, laid out like Y.
chosen_Y = np.hstack([1 - chosen_yb, chosen_yb])

In [22]:
# Within the F2 = True subset: per-class number of samples with F1 == 1
# (F1 is column 2; assuming 0/1-valued features).
np.dot(chosen_X[:, 2], chosen_Y)

array([13, 12])

In [23]:
# Within the F2 = True subset: per-class number of samples with F1 == 0
# (complement of the F1 column, assuming 0/1-valued features).
np.dot(1 - chosen_X[:, 2], chosen_Y)

array([ 7, 21])

In [24]:
# Mutual information between F1 and the class within the F2 = True subset.
# The table is rebuilt from chosen_X / chosen_Y instead of being retyped from
# the two outputs above; for this dataset it equals [[13, 12], [7, 21]]
# (row 0: F1 == 1, row 1: F1 == 0; columns follow chosen_Y).
# Run the cells in order: chosen_X / chosen_Y must still hold the F2 = True
# selection when this cell executes.
f1_table_f2_true = [
    np.dot(chosen_Y.T, chosen_X[:, 2]).tolist(),
    np.dot(chosen_Y.T, 1 - chosen_X[:, 2]).tolist(),
]
lu.mutual_info(f1_table_f2_true)

0.05640170510448253

In [25]:
# Conditional Mutual Information, F2 = False
# Keep only the rows where F2 (column 1 of X) is False; the cells below then
# measure the dependence between F1 and the class inside this subset.
# This overwrites the chosen* variables from the F2 = True selection.
chosen = (X[:, 1] == False)  # element-wise NumPy comparison -> boolean row mask
chosen_X, chosen_yb = X[chosen], yb[chosen]
# Two-column class indicator for the selected rows, laid out like Y.
chosen_Y = np.hstack([1 - chosen_yb, chosen_yb])

In [26]:
# Within the F2 = False subset: per-class number of samples with F1 == 1
# (F1 is column 2; assuming 0/1-valued features).
np.dot(chosen_X[:, 2], chosen_Y)

array([26,  7])

In [27]:
# Within the F2 = False subset: per-class number of samples with F1 == 0
# (complement of the F1 column, assuming 0/1-valued features).
np.dot(1 - chosen_X[:, 2], chosen_Y)

array([8, 6])

In [28]:
# Mutual information between F1 and the class within the F2 = False subset.
# The table is rebuilt from chosen_X / chosen_Y instead of being retyped from
# the two outputs above; for this dataset it equals [[26, 7], [8, 6]]
# (row 0: F1 == 1, row 1: F1 == 0; columns follow chosen_Y).
# Run the cells in order: chosen_X / chosen_Y must hold the F2 = False
# selection when this cell executes.
f1_table_f2_false = [
    np.dot(chosen_Y.T, chosen_X[:, 2]).tolist(),
    np.dot(chosen_Y.T, 1 - chosen_X[:, 2]).tolist(),
]
lu.mutual_info(f1_table_f2_false)

0.033849795175880035

Scikit-learn

In [29]:
from sklearn.feature_selection import chi2, mutual_info_classif

In [30]:
# scikit-learn's mutual information between each column of X and y, one value
# per column in the order of feature_names. discrete_features=True declares
# every column discrete.
mutual_info_classif(X, y, discrete_features=True)

array([0.00016905, 0.06151962, 0.0493815 ])

In [31]:
# scikit-learn's chi-squared test for each column of X against y. The result
# is a pair of arrays, (statistics, p-values), with one entry per column.
chi2(X, y)

(array([0.02164967, 5.64400085, 4.09395302]),
 array([0.88302289, 0.01751519, 0.04303688]))

Notes
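1. Sklearn's mutual_info_classif uses the natural logarithm (np.log), so its result is in nats, whereas the values computed above use np.log2 (bits). Dividing by np.log(2) converts nats to bits.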
2. If you do not specify discrete_features, mutual_info_classif assumes all features are continuous for dense matrices and all features are discrete for sparse matrices.
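3. Sklearn's chi2 statistic is implemented with sparse matrices in mind: it only uses the per-class sums of the feature values, so the False (or 0) cases are ignored. This is why its values differ from the chi2 of the full 2x2 contingency tables computed above.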

In [32]:
# scikit-learn reports mutual information in nats (natural logarithm);
# dividing by ln(2) converts it to bits so it can be compared with the
# lu.mutual_info results above.
mi_nats = mutual_info_classif(X, y, discrete_features=True)
mi_nats / np.log(2)

array([0.00024389, 0.08875405, 0.07124245])