In [1]:
class DecisionNode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        self.col = col # attribute on which to split
        self.value = value # value on which to split
        self.results = results #If the node has no children - we store here class labels with their counts
        self.tb = tb  # True branch
        self.fb = fb  # False branch

In [2]:
def split(rows, column, value):
    # define split function according to the value type
    split_function = None
    if isinstance(value, int) or isinstance(value, float):
        split_function = lambda row: row[column] >= value
    else:
        split_function = lambda row: row[column] == value

    # Divide the rows into two sets and return them
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1, set2)

In [3]:
def count_labels(rows):
    label_count = {}
    for row in rows:
        # The class label is in the last column
        label = row[- 1]
        if label not in label_count:
            label_count[label] = 0
        label_count[label] += 1
    return label_count

In [4]:
data_file = "C:/Users/cuiji/Desktop/ML2020LAB/ml_covid_rules_lab-master/covid_categorical_good.csv"

In [5]:
import pandas as pd
data = pd.read_csv(data_file)
data = data.dropna(how="any")
data.columns

Index(['sex', 'age', 'diabetes', 'copd', 'asthma', 'imm_supr', 'hypertension',
       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'outcome'],
      dtype='object')

In [6]:
data_rows = data.to_numpy().tolist()
len(data_rows)

219179

In [7]:
columns_list = data.columns.to_numpy().tolist()
print(columns_list)

['sex', 'age', 'diabetes', 'copd', 'asthma', 'imm_supr', 'hypertension', 'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'outcome']


In [9]:
outcomes = []
for r in data_rows:
    if r[-1] not in outcomes:
        outcomes.append(r[-1])
print(outcomes)

['alive', 'dead']


In [58]:
R = []
a = 0
b = 0
gsize = len(data_rows)
def accuracy(s, col, value, c):
    total = len(s)
    count = 0
    true = split(s, col, value)[0]
    for r in true:
        if r[-1] == c:
            count += 1
    return count/total

def learnOneRule(E, c):
    global a
    global b
    global R
    global gsize
    M = None

    column_count= len(E[0]) - 1


    best_accuracy = 0
    best_coverage = 0
    best_rule = None
    r_cover = None

    for col in range(0, column_count):
        column_values = set()
        for row in E:
            column_values.add(row[col])

        for value in column_values:
            (set1, set2) = split(E, col, value)

            acc = accuracy(set1, col, value, c)
            if type(value) is int or type(value) is float:
                acc = max(acc, 1-acc)
            p = float(len(set1)) / len(E)

            if acc > best_accuracy or (acc == best_accuracy and p > best_coverage):
                best_accuracy = acc
                best_coverage = p
                best_rule = (col, value)
                r_cover = set1
                M = set2
    print(best_rule)
    print(best_accuracy)
    print(best_coverage)
    R.append(best_rule)
    a += best_accuracy*len(set1)/gsize
    b += best_coverage
    return M

def PRISM(rows, c, accuracy_threshold=1, coverage_threshold=0):
    R = []
    a = 0
    b = 0
    gsize = len(rows)
    while len(rows) != 0 and (b == 0 or a < accuracy_threshold) and b < coverage_threshold:
        rows = learnOneRule(rows, c)

PRISM(data_rows, 'alive', 0.99, 0.95)

(1, 106)
1.0
2.7374885367667522e-05
(6, 'no')
0.9118518814640526
0.7989259625957577
(4, 'yes')
0.8127159640635798
0.03283412752439301
(2, 'no')
0.788655724080615
0.5646012716139174
(1, 103)
1.0
5.3885116930703737e-05
(1, 100)
0.75
0.0002155520827719998
(1, 99)
1.0
0.0001077992777448391
(1, 97)
0.7142857142857143
0.00037733814888685246
(0, 'female')
0.7080097368726093
0.4652178602243313
(8, 'yes')
0.6834862385321101
0.2857719068266613
(9, 'no')
0.6595744680851063
0.8759000423549343
(1, 92)
1.0
0.0011376564277588168
(1, 90)
1.0
0.002277904328018223
(1, 88)
0.6666666666666667
0.00684931506849315
(1, 84)
0.6363636363636364
0.01264367816091954
(1, 78)
0.6428571428571428
0.048894062863795114
(1, 77)
0.5882352941176471
0.0208078335373317
(1, 75)
0.5714285714285714
0.035
(1, 73)
0.5454545454545454
0.05699481865284974
(1, 72)
0.5555555555555556
0.03708791208791209
(1, 65)
0.5502645502645502
0.26961483594864477
(1, 64)
0.65
0.0390625
(5, 'yes')
0.5714285714285714
0.08536585365853659
(1, 61)
0.54

In [None]:
def prediction(leaf_labels):
    total = 0
    result = {}
    for label, count in leaf_labels.items():
        total += count
        result[label] = count

    for label, val in result.items():
        result[label] = str(int(result[label]/total * 100))+"%"

    return result

def print_tree(tree, current_branch, attributes=None,  indent='', leaf_funct=prediction):
    # Is this a leaf node?
    if tree.results != None:
        print(indent + current_branch + str(leaf_funct(tree.results)))
    else:
        # Print the split question
        split_col = str(tree.col)
        if attributes is not None:
            split_col = attributes[tree.col]
        split_val = str(tree.value)
        if type(tree.value) == int or type(tree.value) == float:
            split_val = ">=" + str(tree.value)
        print(indent + current_branch + split_col + ': ' + split_val + '? ')

        # Print the branches
        indent = indent + '  '
        print_tree(tree.tb, 'T->', attributes, indent)
        print_tree(tree.fb, 'F->', attributes, indent)

print_tree(tree, " ", columns_list)