In [2]:
import operator
import pandas as pd
import numpy as np
from scipy.stats import entropy
from numpy import histogram
from scipy.stats import iqr

In [3]:
# Read dataset about breast cancer detection
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv")

In [4]:
# Feature Selection
# Separate input and targets

target = df['Classification']
df.drop(['Classification'], axis=1, inplace=True)

def kl(x,y):
	# Compute Kullback-Leibler (KL) divergence
	bins_x = max(2,int(2*iqr(x)*len(x)**-(1/3))) # use Freedman-Diaconis's Rule of thumb
	bins_y = max(2,int(2*iqr(y)*len(y)**-(1/3))) 
	bins = np.min([bins_x,bins_y]) # entropy function requires vectors of same length
	prob_x = histogram(x,bins)[0] + np.finfo(float).eps #avoid division by zero error
	prob_y = histogram(y,bins)[0] + np.finfo(float).eps
	kl_dist = entropy(prob_x, prob_y)
	return kl_dist

# Rank features based on KL divergence
KL_dict = {}
for col in df:
	obs_class1 = df[col][target == 1]
	obs_class2 = df[col][target == 2]
	KL_dict[col] = kl(obs_class1,obs_class2)
sorted_KL = sorted(KL_dict.items(), key=operator.itemgetter(1),reverse=True)
print(sorted_KL)

[('MCP.1', 23.108155830201042), ('Glucose', 0.9088589926775091), ('Leptin', 0.7579402055498925), ('Age', 0.3180042601000994), ('Resistin', 0.2743823759084119), ('BMI', 0.07925847583930119), ('Adiponectin', 0.0027814238004698517), ('HOMA', 0.0016607020266663202), ('Insulin', 1.007652530981098e-05)]
