In [48]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from hellinger_distance_criterion import HellingerDistanceCriterion

# Random Forest criteria comparison

In [50]:
def compare(X_train, y_train, X_test, y_test, random_state=None):
    """Fit one random forest per split criterion and print each test score.

    Three forests (``max_depth=4``, ``n_estimators=100``) are trained on the
    same training data. They differ only in the split criterion: ``'gini'``,
    ``'entropy'`` and a ``HellingerDistanceCriterion`` instance. After each
    fit, the value returned by ``clf.score`` on the test data is printed.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to ``fit``.
    X_test, y_test
        Held-out features and labels, passed unchanged to ``score``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to every ``RandomForestClassifier``. The default ``None``
        keeps the forests unseeded; pass an integer to make the printed
        scores repeatable for a given split.

    Returns
    -------
    None
        Results are only printed, so calling this as the last expression of
        a cell produces no extra ``Out[]`` value.
    """
    # The criterion is constructed with (1, [2]); presumably this means one
    # output with two classes, i.e. a single binary target -- TODO confirm
    # against the HellingerDistanceCriterion signature.
    hellinger = HellingerDistanceCriterion(1, np.array([2], dtype='int64'))

    # (label used in the printed line, value passed as `criterion`)
    criteria = (
        ('gini', 'gini'),
        ('entropy', 'entropy'),
        ('hellinger distance', hellinger),
    )

    for label, criterion in criteria:
        clf = RandomForestClassifier(
            criterion=criterion,
            max_depth=4,
            n_estimators=100,
            random_state=random_state,
        )
        clf.fit(X_train, y_train)
        print(label + ' score: ', clf.score(X_test, y_test))

# Comparison on breast cancer dataset

In [57]:
bc = load_breast_cancer()
# Seed the split so the same train/test partition is drawn on every run.
# The forests fitted by compare() are not seeded by this call, so the
# printed scores can still vary slightly between runs.
X_train, X_test, y_train, y_test = train_test_split(
    bc.data, bc.target, test_size=0.3, random_state=1
)
compare(X_train, y_train, X_test, y_test)

gini score:  0.9415204678362573
entropy score:  0.9415204678362573
hellinger distance score:  0.9298245614035088


# Comparison on custom dataset

In [64]:
# Synthetic two-class problem; `weights` requests a roughly 5% / 95% class
# imbalance, and random_state=1 makes the generated data repeatable.
X, y = make_classification(
    n_samples=10000,
    n_features=40,
    n_informative=5,
    n_classes=2,
    weights=[0.05, 0.95],
    random_state=1,
)
# Seed the split as well, otherwise the partition changes on every run even
# though the generated data is fixed. The forests fitted by compare() are not
# seeded by this call, so the printed scores can still vary slightly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)
compare(X_train, y_train, X_test, y_test)

gini score:  0.94575
entropy score:  0.9455
hellinger distance score:  0.94525
