In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification, make_blobs, fetch_openml
import sklearn.datasets as datasets

import sys
sys.path.append('../')
from prism_rules import PrismRules

## Example: Simple example with synthetic data

In [2]:
# Create 3 feature columns (a, b, c) and one target column (Y). Each feature
# column draws from the same 6 unique values, and Y follows a simple rule with
# no exceptions: Y is True exactly when a >= 2 and b >= 4 and c <= 2.

# Seed the generator so that Restart & Run All reproduces the same data, and
# therefore the same rules, on every run.
rng = np.random.default_rng(0)

vals = [0, 1, 2, 3, 4, 5]
n_rows = 1000
a_arr = rng.choice(vals, n_rows)
b_arr = rng.choice(vals, n_rows)
c_arr = rng.choice(vals, n_rows)

# Vectorized form of the rule; yields a boolean array with one entry per row.
y = (a_arr >= 2) & (b_arr >= 4) & (c_arr <= 2)
df = pd.DataFrame({"a": a_arr, "b": b_arr, "c": c_arr, "Y": y})

prism = PrismRules()
# Assign the return value to `_` so the notebook does not echo it.
_ = prism.get_prism_rules(df, 'Y')


Target: False
b = 3
   Support:  the target has value: 'False' for 100.000% of the 196 rows matching the rule 
   Coverage: the rule matches: 196 out of 884 rows for target value: 0. This is:
      22.172% of total rows for target value: 0
      19.600% of total rows in data
b = 1
   Support:  The target has value: 'False' for 100.000% of the 171 remaining rows matching the rule
   Coverage: The rule matches: 171 out of 688 rows remaining for target value: '0'. This is:
      24.855% of remaining rows for target value: '0'
      19.344% of total rows for target value: 0
      17.100% of total rows in data
b = 2
   Support:  The target has value: 'False' for 100.000% of the 162 remaining rows matching the rule
   Coverage: The rule matches: 162 out of 517 rows remaining for target value: '0'. This is:
      31.335% of remaining rows for target value: '0'
      18.326% of total rows for target value: 0
      16.200% of total rows in data
b = 0
   Support:  The target has value: 'False' 

## Example: Using sklearn's make_classification

In [20]:
# make_classification returns a (features, labels) pair; unpack it directly.
features, labels = make_classification(n_samples=1000, flip_y=0, random_state=0)

# Feature columns keep their default integer names; the labels go in 'Y'.
df = pd.DataFrame(features).assign(Y=labels)

# Use a higher min_coverage than the default for this larger dataset.
prism = PrismRules(min_coverage=100)
_ = prism.get_prism_rules(df, 'Y')


Target: 1
1 = High AND 16 = High
   Support:  the target has value: '1' for 100.000% of the 158 rows matching the rule 
   Coverage: the rule matches: 158 out of 500 rows for target value: 0. This is:
      31.600% of total rows for target value: 0
      15.800% of total rows in data
1 = High AND 16 = Med
   Support:  The target has value: '1' for 100.000% of the 117 remaining rows matching the rule
   Coverage: The rule matches: 117 out of 342 rows remaining for target value: '0'. This is:
      34.211% of remaining rows for target value: '0'
      23.400% of total rows for target value: 0
      11.700% of total rows in data
4 = High
   Support:  The target has value: '1' for 81.343% of the 109 remaining rows matching the rule
   Coverage: The rule matches: 109 out of 225 rows remaining for target value: '0'. This is:
      48.444% of remaining rows for target value: '0'
      21.800% of total rows for target value: 0
      10.900% of total rows in data

Target: 0
1 = Low AND 4 = Low

## Example: Using sklearn's make_blobs

In [6]:
# Generate 2 blobs in a 20-dimensional feature space; make_blobs returns a
# (features, labels) pair.
features, labels = make_blobs(n_samples=1000, n_features=20, centers=2, random_state=0)

# Feature columns keep their default integer names; the blob ids go in 'Y'.
df = pd.DataFrame(features).assign(Y=labels)

prism = PrismRules()
_ = prism.get_prism_rules(df, 'Y')


Target: 0
0 = Low
   Support:  the target has value: '0' for 100.000% of the 334 rows matching the rule 
   Coverage: the rule matches: 334 out of 500 rows for target value: 0. This is:
      66.800% of total rows for target value: 0
      33.400% of total rows in data
19 = High
   Support:  The target has value: '0' for 100.000% of the 114 remaining rows matching the rule
   Coverage: The rule matches: 114 out of 166 rows remaining for target value: '0'. This is:
      68.675% of remaining rows for target value: '0'
      22.800% of total rows for target value: 0
      11.400% of total rows in data
17 = High
   Support:  The target has value: '0' for 100.000% of the 38 remaining rows matching the rule
   Coverage: The rule matches: 38 out of 52 rows remaining for target value: '0'. This is:
      73.077% of remaining rows for target value: '0'
      7.600% of total rows for target value: 0
      3.800% of total rows in data
13 = High
   Support:  The target has value: '0' for 100.000

## Example: Using the Iris dataset

In [7]:
# Load Iris and build a single frame: named feature columns plus the target 'Y'.
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(Y=iris['target'])
display(df.head())

prism = PrismRules()
_ = prism.get_prism_rules(df, 'Y')

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Y
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0



Target: 0
petal length (cm) = Low
   Support:  the target has value: '0' for 100.000% of the 50 rows matching the rule 
   Coverage: the rule matches: 50 out of 50 rows for target value: 0. This is:
      100.000% of total rows for target value: 0
      33.333% of total rows in data

Target: 1
petal width (cm) = Med AND petal length (cm) = Med
   Support:  the target has value: '1' for 100.000% of the 47 rows matching the rule 
   Coverage: the rule matches: 47 out of 50 rows for target value: 1. This is:
      94.000% of total rows for target value: 1
      31.333% of total rows in data

Target: 2
petal width (cm) = High AND sepal width (cm) = Low
   Support:  the target has value: '2' for 100.000% of the 18 rows matching the rule 
   Coverage: the rule matches: 18 out of 50 rows for target value: 2. This is:
      36.000% of total rows for target value: 2
      12.000% of total rows in data
petal length (cm) = High AND petal width (cm) = High
   Support:  The target has value: '2' f

## Example: Using the Wine dataset

In [8]:
# Load Wine and build a single frame: named feature columns plus the target 'Y'.
wine = datasets.load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(Y=wine['target'])
display(df.head())

prism = PrismRules()
_ = prism.get_prism_rules(df, 'Y')

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Y
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0



Target: 0
proline = High AND alcohol = High
   Support:  the target has value: '0' for 100.000% of the 39 rows matching the rule 
   Coverage: the rule matches: 39 out of 59 rows for target value: 0. This is:
      66.102% of total rows for target value: 0
      21.910% of total rows in data
proline = High AND alcalinity_of_ash = Low
   Support:  The target has value: '0' for 100.000% of the 10 remaining rows matching the rule
   Coverage: The rule matches: 10 out of 20 rows remaining for target value: '0'. This is:
      50.000% of remaining rows for target value: '0'
      16.949% of total rows for target value: 0
      5.618% of total rows in data

Target: 1
color_intensity = Low AND alcohol = Low
   Support:  the target has value: '1' for 100.000% of the 46 rows matching the rule 
   Coverage: the rule matches: 46 out of 71 rows for target value: 1. This is:
      64.789% of total rows for target value: 1
      25.843% of total rows in data
color_intensity = Low
   Support:  The t

## Examples from OpenML

In [9]:
def test_openml_dataset(dataset_name, min_coverage=100, verbose=1, version='active'):
    """Fetch a dataset from OpenML, preview it, and print its PRISM rules.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset, passed to sklearn's ``fetch_openml``.
    min_coverage : int, default 100
        Passed through to ``PrismRules(min_coverage=...)``.
    verbose : int, default 1
        Passed through to ``PrismRules(verbose=...)``.
    version : int or 'active', default 'active'
        Dataset version passed to ``fetch_openml``. The default matches
        ``fetch_openml``'s own default; pass an explicit integer to pin a
        version and avoid the "Multiple active versions" warning.

    Returns
    -------
    None
        The head of the data is displayed and the rules are printed by
        ``PrismRules``; nothing is returned.
    """
    data = fetch_openml(dataset_name, version=version)
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['Y'] = data.target
    display(df.head())

    prism = PrismRules(min_coverage=min_coverage, verbose=verbose)
    # Assign the return value to `_`; the function intentionally returns
    # nothing so that calling it as the last line of a cell echoes no value.
    _ = prism.get_prism_rules(df, 'Y')

In [10]:
# This finds rules for only one class, which can be a byproduct of some classes
# being much more frequent than others. Such a rule set is not typically
# useful, though it can still provide some insight into the data.

test_openml_dataset("hypothyroid")

  warn("Multiple active versions of the dataset matching the name"


Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Y
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative


Determining rules for each of 4 target classes:
Determining rules for target class: negative
  Determining next rule. 3,481 rows remaining for target class
  Determining next rule. 2,371 rows remaining for target class
  Determining next rule. 2,002 rows remaining for target class
  Determining next rule. 873 rows remaining for target class
  Determining next rule. 770 rows remaining for target class
  Determining next rule. 659 rows remaining for target class
  Determining next rule. 559 rows remaining for target class
  Determining next rule. 441 rows remaining for target class
  Determining next rule. 341 rows remaining for target class
Determining rules for target class: compensated_hypothyroid
  Determining next rule. 194 rows remaining for target class
Determining rules for target class: primary_hypothyroid
Determining rules for target class: secondary_hypothyroid

Target: negative
TSH = Med
   Support:  the target has value: 'negative' for 100.000% of the 1110 rows matching the 

In [11]:
# Fetch the CreditCardSubset data from OpenML and assemble features + target.
credit = fetch_openml("CreditCardSubset")
df = pd.DataFrame(credit.data, columns=credit.feature_names).assign(Y=credit.target)
display(df.head())

# A low min_coverage is used here, with progress messages turned off.
prism = PrismRules(min_coverage=5, verbose=0)
_ = prism.get_prism_rules(df, 'Y')

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Y
0,160466.0,-2.248971,-1.982501,-1.285486,-2.092348,1.512952,-2.149212,0.39884,0.009753,-1.278035,...,0.217312,0.1005,-0.407628,0.536201,1.470398,0.48021,-0.141845,-0.358375,145.8,0
1,117596.0,2.061359,-0.047083,-1.060125,0.41825,-0.1443,-1.217912,0.188331,-0.338308,0.533794,...,-0.28803,-0.693703,0.340591,0.047605,-0.298054,0.195206,-0.07045,-0.059714,1.98,0
2,29633.0,-0.697894,1.010449,1.575473,1.779442,-0.212302,-0.227076,0.554638,-0.155702,-0.255471,...,-0.055732,-0.069504,-0.289604,0.390298,0.456975,-0.089808,-0.371611,0.068766,32.38,0
3,41416.0,0.271994,-2.737118,0.114183,-0.090708,-1.750009,0.424753,-0.381162,-0.018583,0.186178,...,0.290923,-0.335537,-0.627308,-0.523477,0.284506,-0.246078,-0.060874,0.117236,598.52,0
4,47454.0,-1.176577,-1.172877,1.432388,-1.431982,-0.056468,-0.517569,0.184114,0.114787,-0.688048,...,0.261313,0.516481,0.253206,0.180029,0.008194,-0.346822,0.10633,-0.034765,142.9,0



Target: 0
V4 = Low
   Support:  the target has value: '0' for 100.000% of the 4747 rows matching the rule 
   Coverage: the rule matches: 4747 out of 14217 rows for target value: 0. This is:
      33.390% of total rows for target value: 0
      33.336% of total rows in data
V10 = Med
   Support:  The target has value: '0' for 100.000% of the 3902 remaining rows matching the rule
   Coverage: The rule matches: 3902 out of 9470 rows remaining for target value: '0'. This is:
      41.204% of remaining rows for target value: '0'
      27.446% of total rows for target value: 0
      27.402% of total rows in data
V12 = Med
   Support:  The target has value: '0' for 100.000% of the 1906 remaining rows matching the rule
   Coverage: The rule matches: 1906 out of 5568 rows remaining for target value: '0'. This is:
      34.231% of remaining rows for target value: '0'
      13.406% of total rows for target value: 0
      13.385% of total rows in data
V16 = High
   Support:  The target has value

In [13]:
# Run the same helper on the OpenML "cardiotocography" dataset. In the output
# recorded below, the target has 10 classes and no rules are found for some of
# them (e.g. class 9).
test_openml_dataset("cardiotocography")

  warn("Multiple active versions of the dataset matching the name"


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V27,V28,V29,V30,V31,V32,V33,V34,V35,Y
0,23.0,240.0,357.0,120.0,120.0,0.0,0.0,0.0,73.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9
1,45.0,5.0,632.0,132.0,132.0,4.0,0.0,4.0,17.0,2.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
2,45.0,177.0,779.0,133.0,133.0,2.0,0.0,5.0,16.0,2.1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
3,45.0,411.0,1192.0,134.0,134.0,2.0,0.0,6.0,16.0,2.4,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
4,45.0,533.0,1147.0,132.0,132.0,4.0,0.0,5.0,16.0,2.4,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


Determining rules for each of 10 target classes:
Determining rules for target class: 9
Determining rules for target class: 6
  Determining next rule. 332 rows remaining for target class
Determining rules for target class: 2
  Determining next rule. 579 rows remaining for target class
Determining rules for target class: 8
  Determining next rule. 107 rows remaining for target class
Determining rules for target class: 10
  Determining next rule. 197 rows remaining for target class
Determining rules for target class: 7
  Determining next rule. 252 rows remaining for target class
Determining rules for target class: 1
  Determining next rule. 384 rows remaining for target class
Determining rules for target class: 3
Determining rules for target class: 5
Determining rules for target class: 4

Target: 9
  No rules imputed for target value 9. There are 69 rows for this class

Target: 6
V31 = 1.0
   Support:  the target has value: '6' for 100.000% of the 332 rows matching the rule 
   Coverage: 

## Example: Using a dataset described in Principles of Data Mining

In [14]:
# The lens24 dataset, presented on p.163 of the book (see the github page for
# references). Four categorical features plus the target column 'class'.
lens24_columns = {
    "age":    [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3],
    "specRX": [1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2],
    "astig":  [1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2],
    "tears":  [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
    "class":  [3, 2, 3, 1, 3, 2, 3, 1, 3, 2, 3, 1, 3, 2, 3, 3, 3, 3, 3, 1, 3, 2, 3, 3],
}
df = pd.DataFrame(lens24_columns)

# With only 24 rows, the default min_coverage of 10 is too high, so lower it.
prism = PrismRules(min_coverage=2)
_ = prism.get_prism_rules(df, 'class')


Target: 3
tears = 1
   Support:  the target has value: '3' for 100.000% of the 12 rows matching the rule 
   Coverage: the rule matches: 12 out of 15 rows for target value: 0. This is:
      80.000% of total rows for target value: 0
      50.000% of total rows in data

Target: 2
astig = 1 AND tears = 2 AND specRX = 2
   Support:  the target has value: '2' for 100.000% of the 3 rows matching the rule 
   Coverage: the rule matches: 3 out of 5 rows for target value: 1. This is:
      60.000% of total rows for target value: 1
      12.500% of total rows in data

Target: 1
astig = 2 AND tears = 2 AND specRX = 1
   Support:  the target has value: '1' for 100.000% of the 3 rows matching the rule 
   Coverage: the rule matches: 3 out of 4 rows for target value: 2. This is:
      75.000% of total rows for target value: 2
      12.500% of total rows in data
