In [299]:
import pandas as pd
import numpy as np
import matplotlib as mp
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### Creation of a test Dataframe

In [301]:
# 10,000 uniform draws in [0, 1), stored as the single column "Feature"
s = pd.Series(np.random.random(10000))
df = s.to_frame(name="Feature")

In [302]:
# Quick look at the first rows; at this point Feature still holds the raw random floats
df.head()

Unnamed: 0,Feature
0,0.062234
1,0.57379
2,0.104597
3,0.312247
4,0.631774


In [303]:
# Binary target derived from the raw value: 1 when strictly above 0.5, otherwise 0
df['Target'] = (df.Feature > 0.5).astype('int64')

In [304]:
# Replace the raw value with one of four category labels:
#   A - medium-sized, leans toward target 1   (x < 0.05 or x > 0.8)
#   B - large, perfectly balanced             (0.3 < x < 0.7)
#   C - small, leans toward target 0          (0.2 < x < 0.3 or 0.78 < x < 0.8)
#   D - everything left over
# The numeric column is overwritten, so recreate df before running this cell again.
df['Feature'] = df.Feature.apply(
    lambda x: 'A' if (x < 0.05 or x > 0.8)
    else 'B' if (0.3 < x < 0.7)
    else 'C' if (0.2 < x < 0.3 or 0.78 < x < 0.8)
    else 'D'
)

In [305]:
# Last 10 rows after bucketing: Feature is now a category label, Target is 0/1
df.tail(10)

Unnamed: 0,Feature,Target
9990,B,0
9991,B,1
9992,B,1
9993,D,0
9994,B,0
9995,B,0
9996,A,1
9997,B,0
9998,B,1
9999,A,1


In [306]:
# Number of rows in each category
df.Feature.value_counts()

B    3953
A    2527
D    2333
C    1187
Name: Feature, dtype: int64

### Functions

In [308]:
"""
Computes entropy of a category with respect to the target
"""

def compute_category_entropy(feature, target, category):
    """
    Compute the entropy of one category with respect to the target.

    Parameters
    ----------
    feature : pandas.Series
        Categorical feature values.
    target : pandas.Series
        Target values, matched to `feature` by index label.
    category : scalar
        The category of `feature` to evaluate.

    Returns
    -------
    tuple
        (category_entropy, category_ratio) where category_entropy is the
        entropy (natural logarithm) of the target distribution among the rows
        whose feature equals `category`, and category_ratio is the share of
        non-null feature values equal to `category`.
        Returns (0, 0) when no row belongs to `category`.
    """
    in_category = feature == category
    category_count = float(in_category.sum())

    # An empty group carries no entropy and no weight, as in
    # compute_others_entropy; this also avoids dividing by zero below.
    if category_count == 0:
        return (0, 0)

    probas = []
    for t in target.unique():
        # Probability, within this category, of being associated with target t.
        # A boolean mask is used rather than intersecting index objects, so the
        # count stays correct even if index labels are duplicated.
        joint_count = (in_category & (target == t)).sum()
        probas.append(joint_count / category_count)

    # By convention 0 * log(0) = 0, so zero probabilities are skipped
    category_entropy = sum(-p * np.log(p) for p in probas if p != 0)
    category_ratio = category_count / float(feature.count())

    return (category_entropy, category_ratio)

In [314]:
"""
Computes entropy of all the non-selected categories (the 'others')
"""

def compute_others_entropy(feature, target, selected_categories):
    """
    Compute the entropy of all the non-selected categories (the 'others').

    Every non-null feature value that is not in `selected_categories` is
    pooled into a single 'others' group.

    Parameters
    ----------
    feature : pandas.Series
        Categorical feature values.
    target : pandas.Series
        Target values, matched to `feature` by index label.
    selected_categories : list
        Categories kept as-is, i.e. excluded from the 'others' group.

    Returns
    -------
    tuple
        (other_entropy, other_ratio) where other_entropy is the entropy
        (natural logarithm) of the target distribution inside the 'others'
        group and other_ratio is the share of non-null feature values that
        fall in it. Returns (0, 0) when the group is empty.
    """
    # Null feature values are left out, matching Series.count() semantics
    is_other = ~feature.isin(selected_categories) & feature.notnull()
    other_count = float(is_other.sum())

    # If there are no others, then entropy is 0
    if other_count == 0:
        return (0, 0)

    other_probas = []
    for t in target.unique():
        # Probability for any 'other' value of being associated with target t.
        # A boolean mask is used rather than intersecting index objects, so the
        # count stays correct even if index labels are duplicated.
        joint_count = (is_other & (target == t)).sum()
        other_probas.append(joint_count / other_count)

    # By convention 0 * log(0) = 0, so zero probabilities are skipped
    other_entropy = sum(-p * np.log(p) for p in other_probas if p != 0)
    other_ratio = other_count / float(feature.count())

    return (other_entropy, other_ratio)

In [None]:
"""
Computes entropy given the individual entropies of each category and a set of selected categories
"""

def compute_entropy(entropies, feature, target, selected_categories):
    """
    Compute the overall entropy for a given set of selected categories.

    The result is the sum of each selected category's entropy weighted by its
    ratio, plus the entropy of the pooled 'others' group weighted by its ratio.
    Intermediate values are printed as a trace.

    Parameters
    ----------
    entropies : dict
        Maps each category to its (entropy, ratio) tuple.
    feature : pandas.Series
        Categorical feature values.
    target : pandas.Series
        Target values.
    selected_categories : list
        Categories kept out of the 'others' group.

    Returns
    -------
    The weighted entropy as a number.
    """
    # First we compute the entropy of the others
    others_entropy, others_ratio = compute_others_entropy(feature, target, selected_categories)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print("Computing entropy")
    print("Others entropy : "+str(others_entropy)+" with ratio "+str(others_ratio))

    # .items() exists on both Python 2 and 3 (iteritems is Python 2 only)
    selected_entropies = {k: v for k, v in entropies.items() if k in selected_categories}
    print("Selected categories entropy : "+str(selected_entropies))

    # We sum the entropies of all categories, modulated with their importance (the ratio)
    selected_part = sum(ratio * cat_entropy for cat_entropy, ratio in selected_entropies.values())
    entropy = selected_part + others_entropy * others_ratio

    return entropy

In [315]:
def otherize_feature(feature, target, max_categories=0, max_entropy=0.):
    """
    Greedily choose which categories to keep; the rest are pooled as 'others'.

    Starting with every value in the 'others' group, each round evaluates, for
    every remaining category, the weighted entropy obtained by taking that
    category out of 'others'. The category with the largest strictly positive
    entropy gain is selected, and the process repeats. Progress is printed as
    a trace.

    Parameters
    ----------
    feature : pandas.Series
        Categorical feature values.
    target : pandas.Series
        Target values.
    max_categories : int, optional
        Stop once this many categories are selected; 0 means no limit.
    max_entropy : float, optional
        Stop once the current entropy is strictly below this threshold.

    Returns
    -------
    list
        The selected categories, in the order they were selected.
    """
    selected_categories = []
    categories = feature.unique()
    categories_entropies = {}

    # We compute the entropy of each category
    for c in categories:
        categories_entropies[c] = compute_category_entropy(feature, target, c)

    print("Base entropy computation")
    base_entropy = compute_entropy(categories_entropies, feature, target, selected_categories)
    print("Base entropy is : "+str(base_entropy))

    while True:
        # Explicit `and`: the previous `&` form only worked through operator
        # precedence and comparison chaining.
        if max_categories != 0 and len(selected_categories) >= max_categories:
            print("Maximum number of categories reached, otherization complete")
            break
        # base_entropy is the entropy of the current selection
        if base_entropy < max_entropy:
            print("Entropy treshold reached, otherization complete")
            break

        print("")
        print("Selected categories are "+str(selected_categories))
        print("Base entropy : "+str(base_entropy))
        gains = []

        # We compute the entropy gain for each scenario where we select a category and put it out of the others
        entropies = {}
        for c in categories:
            # str(c) so that non-string categories can be reported too
            print("Including category "+str(c))
            new_entropy = compute_entropy(categories_entropies, feature, target, selected_categories+[c])
            entropies[c] = new_entropy
            print("New entropy with "+str(c)+" included would be: "+str(new_entropy))
            gains.append(base_entropy - new_entropy)

        # If there is a gain, we select the category with the best gain.
        # `gains` is empty once every category has been selected; max([]) would raise.
        if gains and max(gains) > 0:
            j = gains.index(max(gains))
            best_category = categories[j]
            selected_categories.append(best_category)
            print("")
            print("Adding category "+str(best_category))
            base_entropy = entropies[best_category]
            # The chosen category is no longer a candidate for the next round
            categories = np.delete(categories, j)
        else:
            print("No more entropy to gain, otherization complete")
            break

    return selected_categories

In [318]:
#Small test: stop after at most 2 categories have been selected
otherize_feature(df.Feature, df.Target, max_categories=2)

Base entropy computation
Computing entropy
Others entropy : 0.693082199152 with ratio 1.0
Selected categories entropy : {}
Base entropy is : 0.693082199152

Selected categories are []
Base entropy : 0.693082199152
Including category D
Computing entropy
Others entropy : 0.687734398048 with ratio 0.7667
Selected categories entropy : {'D': (0.64965997927967822, 0.23330000000000001)}
New entropy with D included would be: 0.67885163615
Including category B
Computing entropy
Others entropy : 0.693002107918 with ratio 0.6047
Selected categories entropy : {'B': (0.69314330885452535, 0.39529999999999998)}
New entropy with B included would be: 0.693057924648
Including category A
Computing entropy
Others entropy : 0.674906330777 with ratio 0.7473
Selected categories entropy : {'A': (0.4946438510579233, 0.25269999999999998)}
New entropy with A included would be: 0.629354002152
Including category C
Computing entropy
Others entropy : 0.688322660535 with ratio 0.8813
Selected categories entropy : {'C

['A', 'B']