In [70]:
import math
from tqdm import tqdm
import pandas as pd
from collections import Counter

In [71]:
def entropy(data, target_attr):
    frequencies = Counter([record[target_attr] for record in data])
    entropy = 0
    for freq in frequencies.values():
        probability = freq / len(data)
        entropy += -probability * math.log2(probability)
    return entropy

In [72]:
def gain_ratio_nominal(data, attribute, target_attr, subset, positives, negatives):
    attribute_entropy = entropy(subset, target_attr)
    values = [record[attribute] for record in data]
    unique_values = set(values)

    iv = 0
    gain = 0
    for value in unique_values:
        value_subset = [record for record in subset if record[attribute] == value]
        value_positives = [record for record in value_subset if record[target_attr] == "yes"]
        value_negatives = [record for record in value_subset if record[target_attr] == "no"]

        if len(value_subset) == 0:
            value_gain_ratio = 0
            value_iv = 0
        else:
            value_gain_ratio = entropy(value_subset, target_attr)
            value_iv = 0
            if len(value_positives) > 0:
                value_iv -= (len(value_positives) / len(value_subset)) * math.log2(len(value_positives) / len(value_subset))
                if len(value_negatives) > 0:
                    value_iv -= (len(value_negatives) / len(value_subset)) * math.log2(len(value_negatives) / len(value_subset))


        iv += (len(value_subset) / len(data)) * value_iv
        gain += (len(value_subset) / len(data)) * value_gain_ratio

    if iv == 0:
        gain_ratio = float('inf')
    else:
        gain_ratio = (entropy(data, target_attr) - gain) / iv

    return gain_ratio

In [240]:
def attribute_values(data, attribute):
    values = [record[attribute] for record in data]
    return list(set(values))

In [74]:
def majority_value(data, target_attr):
    value_counts = Counter(record[target_attr] for record in data)
    majority_value = max(value_counts, key=value_counts.get)
    return majority_value

In [292]:
def cn2(data, attributes, target_attr):
    """
    CN2 algorithm for inducing a decision rule-based classifier.
    """
    # Initialize the list of induced rules
    rules = []
    
    # Initialize a list of positive and negative examples
    positives = [record for record in data if record[target_attr] == "yes"]
    negatives = [record for record in data if record[target_attr] == "no"]

    # Start the algorithm
    length_attr = len(attributes)
    for i in range(length_attr):
        while positives:
            # Select the best attribute and its corresponding subset
            best_subset, best_attr = None, None
            max_gain_ratio = 0
            for attr in attributes:
                for value in attribute_values(data, attr):
                    subset = [record for record in positives if record[attr] == value]
                    gain_ratio = gain_ratio_nominal(data, attr, target_attr, subset, positives, negatives)
                    if gain_ratio > max_gain_ratio:
                        best_subset, best_attr, max_gain_ratio = subset, attr, gain_ratio
            if max_gain_ratio == 0:
                break
            rule = {"attribute": best_attr, "subset": best_subset}
            attributes.remove(best_attr)
            # pbar.update(1)


            # Remove the covered examples from the positive examples
            positives = [record for record in positives if record not in best_subset]

            # Find the majority value for the covered examples and add it to the rule
            rule["conclusion"] = majority_value(best_subset, target_attr)

            # Add the rule to the list of induced rules
            rules.append(rule)
            
        # Add a default rule for negative examples
        rules.append({"attribute": "default", "subset": negatives, "conclusion": "no"})

    return rules

In [286]:
data = [{"outlook": "sunny", "temperature": "hot", "humidity": "high", "windy": "false", "play": "no"},
        {"outlook": "sunny", "temperature": "hot", "humidity": "high", "windy": "true", "play": "no"},
        {"outlook": "overcast", "temperature": "hot", "humidity": "high", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "mild", "humidity": "high", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "cool", "humidity": "normal", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "cool", "humidity": "normal", "windy": "true", "play": "no"},
        {"outlook": "overcast", "temperature": "cool", "humidity": "normal", "windy": "true", "play": "yes"},
        {"outlook": "sunny", "temperature": "mild", "humidity": "high", "windy": "false", "play": "no"},
        {"outlook": "sunny", "temperature": "cool", "humidity": "normal", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "mild", "humidity": "normal", "windy": "false", "play": "yes"},
        {"outlook": "sunny", "temperature": "mild", "humidity": "normal", "windy": "true", "play": "yes"},
        {"outlook": "overcast", "temperature": "mild", "humidity": "high", "windy": "true", "play": "yes"},
        {"outlook": "overcast", "temperature": "hot", "humidity": "normal", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "mild", "humidity": "high", "windy": "true", "play": "no"},]

In [198]:
attributes = ["outlook", "temperature", "humidity", "windy"]
target_attr = "play"

In [295]:
def print_rules(rules):
    for rule in rules:
        if rule["attribute"] == "default":
            print("Default rule: {} -> {}".format(rule["subset"], rule["conclusion"]))
        else:
            attribute_value = [record[rule["attribute"]] for record in rule["subset"]][0]
            print("If {} = {} then class is {}".format(rule["attribute"], attribute_value, rule["conclusion"]))
# rules = cn2(data, attributes, target_attr)


In [296]:
data = None
csv_data = None
csv_attributes = None
csv_target_attr = None
# attribute_values = None
attributes = None
target_attr = None
data = [{"outlook": "sunny", "temperature": "hot", "humidity": "high", "windy": "false", "play": "no"},
        {"outlook": "sunny", "temperature": "hot", "humidity": "high", "windy": "true", "play": "no"},
        {"outlook": "overcast", "temperature": "hot", "humidity": "high", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "mild", "humidity": "high", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "cool", "humidity": "normal", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "cool", "humidity": "normal", "windy": "true", "play": "no"},
        {"outlook": "overcast", "temperature": "cool", "humidity": "normal", "windy": "true", "play": "yes"},
        {"outlook": "sunny", "temperature": "mild", "humidity": "high", "windy": "false", "play": "no"},
        {"outlook": "sunny", "temperature": "cool", "humidity": "normal", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "mild", "humidity": "normal", "windy": "false", "play": "yes"},
        {"outlook": "sunny", "temperature": "mild", "humidity": "normal", "windy": "true", "play": "yes"},
        {"outlook": "overcast", "temperature": "mild", "humidity": "high", "windy": "true", "play": "yes"},
        {"outlook": "overcast", "temperature": "hot", "humidity": "normal", "windy": "false", "play": "yes"},
        {"outlook": "rainy", "temperature": "mild", "humidity": "high", "windy": "true", "play": "no"},]
attributes = ["outlook", "temperature", "humidity", "windy"]
target_attr = "play"
csv_data = pd.read_csv('file.csv')
csv_attributes = csv_data.columns.values.tolist()[:-1]
# print(attributes)
csv_target_attr = csv_data.columns.values.tolist()[-1]
print_rules(cn2(csv_data.to_dict('records'), csv_attributes, csv_target_attr))
print_rules(cn2(data, attributes, target_attr))


If outlook = sunny then class is yes
If temperature = mild then class is yes
If humidity = normal then class is yes
If windy = False then class is yes
Default rule: [{'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': False, 'play': 'no'}, {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': True, 'play': 'no'}, {'outlook': 'rainy', 'temperature': 'cool', 'humidity': 'normal', 'windy': True, 'play': 'no'}, {'outlook': 'sunny', 'temperature': 'mild', 'humidity': 'high', 'windy': False, 'play': 'no'}, {'outlook': 'rainy', 'temperature': 'mild', 'humidity': 'high', 'windy': True, 'play': 'no'}] -> no
Default rule: [{'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': False, 'play': 'no'}, {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': True, 'play': 'no'}, {'outlook': 'rainy', 'temperature': 'cool', 'humidity': 'normal', 'windy': True, 'play': 'no'}, {'outlook': 'sunny', 'temperature': 'mild', 'humidity': 'h

In [297]:
csv_data.to_dict('records')

[{'outlook': 'sunny',
  'temperature': 'hot',
  'humidity': 'high',
  'windy': False,
  'play': 'no'},
 {'outlook': 'sunny',
  'temperature': 'hot',
  'humidity': 'high',
  'windy': True,
  'play': 'no'},
 {'outlook': 'overcast',
  'temperature': 'hot',
  'humidity': 'high',
  'windy': False,
  'play': 'yes'},
 {'outlook': 'rainy',
  'temperature': 'mild',
  'humidity': 'high',
  'windy': False,
  'play': 'yes'},
 {'outlook': 'rainy',
  'temperature': 'cool',
  'humidity': 'normal',
  'windy': False,
  'play': 'yes'},
 {'outlook': 'rainy',
  'temperature': 'cool',
  'humidity': 'normal',
  'windy': True,
  'play': 'no'},
 {'outlook': 'overcast',
  'temperature': 'cool',
  'humidity': 'normal',
  'windy': True,
  'play': 'yes'},
 {'outlook': 'sunny',
  'temperature': 'mild',
  'humidity': 'high',
  'windy': False,
  'play': 'no'},
 {'outlook': 'sunny',
  'temperature': 'cool',
  'humidity': 'normal',
  'windy': False,
  'play': 'yes'},
 {'outlook': 'rainy',
  'temperature': 'mild',
  'h