In [30]:
import pandas as pd
from collections import Counter
# Set index_col to False to avoid pandas thinking that the first column is row indexes (it's age)
colum_names = ['age', 'workclass', 'fnlwgt', 'education', 
               'education_num', 'marital_status', 'occupation',
               'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'high_income']
income = pd.read_csv("income.csv", index_col=False,names=colum_names)


In [31]:
income.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [32]:
income.columns


Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'high_income'],
      dtype='object')

In [33]:
#convert all categorical values into numeric in order to do computations
col = pd.Categorical(income["workclass"])
income["workclass"] = col.codes

for name in ["education", "marital_status", "occupation", "relationship", "race", "sex", "native_country", "high_income"]:
    col = pd.Categorical(income[name])
    income[name] = col.codes

In [34]:
# new dataframe where contains only the workclass private
income[income.workclass ==4].head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
5,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0
6,49,4,160187,6,5,3,8,1,2,0,0,0,16,23,0


## how to perform a split in a decision tree

* For example, for a binary classification problem . High entropy means that 1's and 0's are very mixed.

In [35]:
import math
def class_probabilities(labels):
    total_count = len(labels)
    return [count / total_count for count in Counter(labels).values()]

def entropy(class_probabilities):
    return  sum(- p * math.log(p, 2) for p in class_probabilities if p )

def data_entropy(labeled_data):
    labels = [label for _, label in labeled_data]
    probabilities = class_probabilities(labels)
    return entropy(probabilities)

def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.
    """
    # Compute the counts of each unique value in the column
    counts = np.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)
    
    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities, and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            entropy += prob * math.log(prob, 2)
    
    return -entropy

In [36]:
print(Counter(income.workclass))
print(Counter(income.workclass).values())

Counter({4: 22696, 6: 2541, 2: 2093, 0: 1836, 7: 1298, 5: 1116, 1: 960, 8: 14, 3: 7})
dict_values([1298, 2541, 22696, 960, 2093, 1836, 1116, 14, 7])


In [37]:
cl = class_probabilities(income.marital_status)
entropy(cl)
print(cl)

[0.328091889069746, 0.4599367341297872, 0.1364515831823347, 0.012837443567458001, 0.03147937716900587, 0.0007063665120850096, 0.030496606369583245]


In [38]:
print(entropy(cl))
print(calc_entropy(income.marital_status))

1.8336493538835446


NameError: name 'np' is not defined

In [39]:
-(0.8 * math.log(0.8, 2) + 0.2 * math.log(0.2, 2))

0.7219280948873623

In [40]:
import numpy as np

np.bincount(income.workclass) /len(income.workclass)


array([5.63864746e-02, 2.94831240e-02, 6.42793526e-02, 2.14981112e-04,
       6.97030189e-01, 3.42741316e-02, 7.80381438e-02, 3.98636406e-02,
       4.29962225e-04])

In [41]:
income_entropy = calc_entropy(income["high_income"])

median_age = income["age"].median()

left_split = income[income["age"] <= median_age]
right_split = income[income["age"] > median_age]

age_information_gain = income_entropy - ((left_split.shape[0] / income.shape[0]) * calc_entropy(left_split["high_income"]) +
 ((right_split.shape[0] / income.shape[0]) * calc_entropy(right_split["high_income"])))

In [42]:
age_information_gain

0.047028661304691965

In [43]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on, and target
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])
    
    # Find the median of the column we're splitting
    column = data[split_name]
    median = column.median()
    
    # Make two subsets of the data, based on the median
    left_split = data[column <= median]
    right_split = data[column > median]
    
    # Loop through the splits and calculate the subset entropies
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * calc_entropy(subset[target_name])
    
    # Return information gain
    return original_entropy - to_subtract

# Verify that our answer is the same as on the last screen
print(calc_information_gain(income, "age", "high_income"))

columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

information_gains = [calc_information_gain(income,ig,'high_income') for ig in columns]

highest_gain = columns[np.argmax(information_gains)]

0.047028661304691965


In [51]:
def find_best_column(data, target_name, columns):
    information_gains = []
    information_gains = [calc_information_gain(data,col,target_name) for col in columns]
    highest_gain = columns[np.argmax(information_gains)]
    return highest_gain

In [52]:
unique_targets = pd.unique(income.high_income)


In [53]:

unique_targets

array([0, 1])

In [54]:
data = pd.DataFrame([
    [0,20,0],
    [0,60,2],
    [0,40,1],
    [1,25,1],
    [1,35,2],
    [1,55,1]
    ])
# Assign column names to the data
data.columns = ["high_income", "age", "marital_status"]

# Call the function on our data to set the counters properly
#id3(data, "high_income", ["age", "marital_status"])
label_1s = []
label_0s = []

def id3(data, target, columns):
    unique_targets = pd.unique(data[target])

    if len(unique_targets) == 1:
        if 0 in unique_targets:
            label_0s.append(0)
        elif 1 in unique_targets:
            label_1s.append(1)
        return
    
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    
    for split in [left_split, right_split]:
        id3(split, target, columns)



In [56]:

id3(data, "high_income", ["age", "marital_status"])