In [5]:
# decision_trees.ipynb
# based on https://github.com/joelgrus/data-science-from-scratch/blob/master/first-edition/code/decision_trees.py
# adapted to python 3 
from collections import Counter, defaultdict
from functools import partial
import math
from pprint import pprint as pp

In [6]:
def entropy(labeled_data):        
    labels = [label for _, label in labeled_data]
    class_probabilities = [count / len(labels) 
            for count in Counter(labels).values()]
    # given a list of class probabilities, compute the entropy
    return sum(-p * math.log(p, 2) for p in class_probabilities if p)

def partition_entropy(subsets):
    """find the entropy from this partition of data into subsets"""
    total_count = sum(len(subset) for subset in subsets)
    return sum( entropy(subset) * len(subset) / total_count
                for subset in subsets )
    
def partition_by(inputs, key):
    """returns a dict of inputs partitioned by the key
    each input is a pair (attribute_dict, label)"""
    groups = defaultdict(list)
    # a python dictionary throws a KeyError if you try to get an item 
    # with a _key that is not currently in the dictionary. 
    # The defaultdict() in contrast will simply create any items 
    # that you try to access if they do not exist yet. 
    # To create such a "default" item, it calls the function object 
    # that you pass to the constructor, list() in our case.
    for input in inputs:
        _key = input[0][key]
        groups[_key].append(input)
    return groups   

def partition_entropy_by(inputs, key):
    """computes the entropy corresponding to the given partition"""        
    partitions = partition_by(inputs, key)
    return partition_entropy(partitions.values())

In [7]:
def build_tree_id3(inputs, split_candidates=None):
    # Quinlan, J. R. 1986. Induction of Decision Trees. Mach. Learn. 1, 1 (Mar. 1986), 81–106
    # https://hunch.net/~coms-4771/quinlan.pdf

    # if this is our first pass, 
    # all keys of the first input are split candidates
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()

    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = len([label for item, label in inputs if label])
    num_falses = num_inputs - num_trues
    
    if num_trues == 0:                  # if only Falses are left
        return False                    # return a "False" leaf
        
    if num_falses == 0:                 # if only Trues are left
        return True                     # return a "True" leaf

    if not split_candidates:            # if no split candidates left
        return num_trues >= num_falses  # return the majority leaf
                            
    # otherwise, split on the best attribute
    best_attribute = min(split_candidates,
                         key=partial(partition_entropy_by, inputs))

    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates 
                      if a != best_attribute]
    
    # recursively build the subtrees
    subtrees = { attribute : build_tree_id3(subset, new_candidates)
                 for attribute, subset in partitions.items() }

    subtrees[None] = num_trues > num_falses # default case

    return (best_attribute, subtrees)

In [8]:
def classify(tree, input):
    """classify the input using the given decision tree"""
    
    # if this is a leaf node, return its value
    if tree in [True, False]:
        return tree
   
    # otherwise find the correct subtree
    attribute, subtree_dict = tree
    
    subtree_key = input.get(attribute)  # None if input is missing attribute

    if subtree_key not in subtree_dict: # if no subtree for key,
        subtree_key = None              # we'll use the None subtree
    
    subtree = subtree_dict[subtree_key] # choose the appropriate subtree
    return classify(subtree, input)     # and use it to classify the input

In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt   # data visualization
import sys
import matplotlib
matplotlib.use('Agg')
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

In [10]:
import pandas as pd  # Importera pandas och använd "pd" som alias

# Nu kan du använda pandas-funktioner med "pd" prefix
df = pd.read_csv("./hr_train_1.csv")
df.head(1)

Unnamed: 0,level,lang,tweets,phd,ignore
0,Senior,Java,no,no,True


#here converting dataframe to each row into the desired format

In [11]:
import pandas as pd

# Load your DataFrame from the CSV file
df = pd.read_csv("./hr_train_1.csv")

# Define a function to convert each row into the desired format
def row_to_input(row):
    feature_dict = {
        'level': row['level'],
        'lang': row['lang'],
        'tweets': row['tweets'],
        'phd': row['phd']
    }
    ignore = row['ignore']  # Assuming 'ignore' column contains Boolean values
    return (feature_dict, ignore)

# Apply the function to each row of the DataFrame and create a list of tuples
inputs = [row_to_input(row) for _, row in df.iterrows()]

# Print the first 5 entries of the 'inputs' list
print(inputs[:5])  # Adjust the range if you want to print more entries

[({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'no'}, True), ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'yes'}, True), ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True), ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True), ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, False)]


In [12]:
# inputs = [
#     ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
#     ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
#     ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
#     ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
#     ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
#     ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
#     ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
#     ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
#     ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
#     ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
#     ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
#     ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
#     ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
#     ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
# ]


print('Partition entropy by key (lower is better)')
for key in ['level','lang','tweets','phd']:
    print (f'{key:<10}', f'{partition_entropy_by(inputs, key):.3f}')
print()

senior_inputs = [(input, label)
                 for input, label in inputs if input["level"] == "Senior"]

print('Partition entropy by key for Seniors (lower is better)')
for key in ['lang', 'tweets', 'phd']:
    print (f'{key:<10}', f'{partition_entropy_by(senior_inputs, key):.3f}')
print()

print ("building the tree")
tree = build_tree_id3(inputs)
pp (tree)
print('\n', "-"*6, 'TEST', "-"*6)
print ("Junior / Java / tweets / no phd")
print (classify(tree, 
                {"level"  : "Junior", 
                "lang"   : "Java", 
                "tweets" : "yes", 
                "phd"    : "no"})
      ) 

print ("Junior / Java / tweets / phd")
print (classify(tree, 
                {"level"  : "Junior", 
                  "lang"   : "Java", 
                  "tweets" : "yes", 
                  "phd"    : "yes"})
      )

print ("Intern", classify(tree, { "level" : "Intern" } ))
print ("Senior", classify(tree, { "level" : "Senior" } ))

Partition entropy by key (lower is better)
level      0.805
lang       0.864
tweets     0.562
phd        0.855

Partition entropy by key for Seniors (lower is better)
lang       0.928
tweets     0.530
phd        0.961

building the tree
('tweets',
 {None: True,
  'no': ('phd',
         {None: True,
          'no': True,
          'yes': ('level',
                  {None: True,
                   'Junior': True,
                   'Mid': True,
                   'Senior': ('lang',
                              {None: True,
                               'Java': True,
                               'Python': False,
                               'R': False})})}),
  'yes': ('level',
          {None: False,
           'Junior': ('lang',
                      {None: True, 'Java': True, 'Python': True, 'R': False}),
           'Mid': ('phd',
                   {None: False,
                    'no': ('lang',
                           {None: False,
                            'Java': False,


In [13]:
print ("Senior", classify(tree, { "Java" : "yes", "tweets" : "yes","phd" : "no"  } ))
print ("Senior", classify(tree, { "Python" : "yes", "tweets" : "no","phd" : "yes"  } ))
print ("Mid", classify(tree, { "Java" : "yes", "tweets" : "yes","phd" : "no"  } ))
print ("Junior", classify(tree, { "Python" : "yes", "tweets" : "yes","phd" : "no"  } ))

Senior False
Senior True
Mid False
Junior False
