In [1]:
import pandas as pd
import numpy as np
import warnings

# Suppress FutureWarning messages. pandas emitted them while this notebook ran; the fix was looked up and the root cause was not investigated.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# The Rule class.
class Rule:
    
    def __init__(self, class_label):
        self.conditions = []
        self.class_label = class_label
    
    def __repr__(self):
        return "If {} then {}. ".format(self.conditions, self.class_label)

# The Condition class.
class Condition:
    
    def __init__(self, attribute, value, numeric_greater=None):
        self.attribute = attribute
        self.value = value
        self.numeric_greater = numeric_greater
    
    def __repr__(self):
        if self.numeric_greater is None:
            return self.attribute + "=" + self.value
        else:
            return self.attribute + ">=" + str(self.value) + ":" + str(self.numeric_greater)
    
    # Takes any subset of the data and a class label, returns accuracy, coverage and the covered portion of the subset.
    # Accuracy is returned as 0 if coverage is 0, to avoid ArithmeticError
    def score(self, class_label, frame):
        if self.numeric_greater is None:
            covered = frame[frame[self.attribute] == self.value]
            true = len(covered[covered['outcome'] == class_label])
            if len(covered) == 0:
                return (0, 0, frame)
            return (true/len(covered), len(covered), covered)
        elif self.numeric_greater == True:
            covered = frame[frame[self.attribute] >= self.value]
            true = len(covered[covered['outcome'] == class_label])
            if len(covered) == 0:
                return (0, 0, frame)
            return (true/len(covered), len(covered), covered)
        else:
            covered = frame[frame[self.attribute] < self.value]
            true = len(covered[covered['outcome'] == class_label])
            if len(covered) == 0:
                return (0, 0, frame)
            return (true/len(covered), len(covered), covered)

In [3]:
# It tries to either learns a new Rule with one Condition, or adds one Condition to an existing Rule.
# Returns the Rule, the new attribute list after removing the attribute in the Condition, the Condition, 
# the accuracy, the coverage and the covered subset in that order.
# If it fails to produce new Rule, then returns an empty Rule, the original attribute list, a None Condition,
# a 0 accuracy, a 0 coverage and the original subset.
# If it fails to update existing Rule, then returns the original Rule, the original attribute list,
# a None Condition, the original accuracy, the original coverage and the original subset.
def learn_one_rule(attributes, data, class_label, cov_threshold, prev_acc=0, prev_cov=0, prev_rule=None):
    
    rule = prev_rule if prev_rule is not None else Rule(class_label)
    
    best_acc = prev_acc
    best_cov = prev_cov
    best_condition = None
    best_covered = data
    
    # Iterates over every attribute other than "outcome"
    for attribute in attributes:
        if attribute == "outcome":
            continue
        
        # All values of attributes are tested to find which value yields the best condition for the rule
        unique_values = data[attribute].unique().tolist()
        if len(unique_values) == 0:
            continue
        
        if type(unique_values[0]) == int or type(unique_values[0]) == float:
            for val in unique_values:
                new_condition = Condition(attribute, val, True)
                acc, cov, covered = new_condition.score(class_label, data)
                
                if cov > cov_threshold and (acc > best_acc or (acc == best_acc and cov > best_cov)):
                    best_acc = acc
                    best_cov = cov
                    best_condition = new_condition
                    best_covered = covered
            for val in unique_values:
                new_condition = Condition(attribute, val, False)
                acc, cov, covered = new_condition.score(class_label, data)
                
                if cov > cov_threshold and (acc > best_acc or (acc == best_acc and cov > best_cov)):
                    best_acc = acc
                    best_cov = cov
                    best_condition = new_condition
                    best_covered = covered
        else:
            for val in unique_values:
                new_condition = Condition(attribute, val, None)
                acc, cov, covered = new_condition.score(class_label, data)
                
                if cov > cov_threshold and (acc > best_acc or (acc == best_acc and cov > best_cov)):
                    best_acc = acc
                    best_cov = cov
                    best_condition = new_condition
                    best_covered = covered
    
    new_attributes = attributes.copy()
    if best_condition is not None:
        rule.conditions.append(best_condition)
        new_attributes.remove(best_condition.attribute)
    
    return (rule, new_attributes, best_condition, best_acc, best_cov, best_covered)

In [4]:
def PRISM(attributes, data, class_label, acc_threshold, cov_threshold, max_iterations=30):
    rules = []
    
    # Copies the data. The variable 'frame' will be modified once a Rule is established.
    frame = data.copy()
    
    # Keeps going until the remaining elements can no longer create a Rule that has large enough coverage.
    while len(frame) > cov_threshold:
        
        # Copies the subset of data. This will be modified every time the Rule is updated; 
        # it is the subset that satisfies old Rule and needs to be used in trying to update the Rule.
        subframe = frame.copy()
        
        # Learns a Rule, then keeps updating it until it can be updated no further.
        rule, remaining_attributes, new_condition, acc, cov, subframe = learn_one_rule(attributes, frame, class_label, cov_threshold)
        if new_condition is None:
            return ([])
        while new_condition is not None and acc < 1 and cov > cov_threshold:
            rule, remaining_attributes, new_condition, acc, cov, subframe = learn_one_rule(remaining_attributes, subframe, class_label, cov_threshold, acc, cov, rule)
        
        # If the Rule can be refined to the extent that it passes acc_threshold, it gets remembered.
        if acc > acc_threshold:
            rules.append("{}Accuracy: {}, Coverage: {}.".format(rule, acc, cov))

            # From frame, remove the elements covered by the Rule.
            # https://stackoverflow.com/questions/37313691/how-to-remove-a-pandas-dataframe-from-another-dataframe
            frame = pd.concat([frame, subframe, subframe]).drop_duplicates(keep=False)

            # Breaks if max_iterations is reached.
            if len(rules) >= max_iterations:
                return rules
        # If the Rule doesn't pass, then we know the best Rule can't pass; hence no Rule after this.
        else:
            break
    return rules

In [5]:
data = pd.read_csv("covid_categorical_good.csv")
data = data.dropna(how="any")
data.columns

Index(['sex', 'age', 'diabetes', 'copd', 'asthma', 'imm_supr', 'hypertension',
       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'outcome'],
      dtype='object')

In [6]:
data_categorical = data.drop(columns=['age'])
column_list = data_categorical.columns.to_numpy().tolist()
column_list

['sex',
 'diabetes',
 'copd',
 'asthma',
 'imm_supr',
 'hypertension',
 'cardiovascular',
 'obesity',
 'renal_chronic',
 'tobacco',
 'outcome']

In [7]:
column_list = data.columns.to_numpy().tolist()
column_list

['sex',
 'age',
 'diabetes',
 'copd',
 'asthma',
 'imm_supr',
 'hypertension',
 'cardiovascular',
 'obesity',
 'renal_chronic',
 'tobacco',
 'outcome']

In [8]:
# 20 Rules for the "alive" class label. It ran for less than half a minute.
rules = PRISM(column_list, data, "alive", 0.6, 30, 20)
for line in rules:
    print(line)

If [age>=26:False, tobacco=yes, asthma=yes] then alive. Accuracy: 1.0, Coverage: 47.
If [age>=9:False, imm_supr=no, diabetes=no] then alive. Accuracy: 0.925, Coverage: 40.
If [age>=15:False, imm_supr=no, hypertension=no] then alive. Accuracy: 0.8780487804878049, Coverage: 41.
If [age>=20:False, hypertension=yes, imm_supr=no] then alive. Accuracy: 0.9117647058823529, Coverage: 34.
If [age>=21:False, obesity=yes] then alive. Accuracy: 0.8157894736842105, Coverage: 38.
If [age>=33:False, copd=yes, hypertension=no, diabetes=no] then alive. Accuracy: 0.9117647058823529, Coverage: 34.
If [age>=99:True, cardiovascular=no] then alive. Accuracy: 0.7352941176470589, Coverage: 34.
If [age>=33:False, copd=yes] then alive. Accuracy: 0.8709677419354839, Coverage: 31.
If [age>=44:False, copd=yes, asthma=yes] then alive. Accuracy: 0.96875, Coverage: 32.
If [age>=21:False, sex=male] then alive. Accuracy: 0.7741935483870968, Coverage: 31.
If [age>=34:False, cardiovascular=yes, obesity=yes, imm_supr=no, 

In [9]:
# Attempt at retrieving 5 rules for the "dead" category. Looks like no Rule of accuracy greater than 0.6 exist.
rules = PRISM(column_list, data, "dead", 0.6, 30, 5)
for line in rules:
    print(line)