In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification, make_blobs, fetch_openml
import sklearn.datasets as datasets

import sys
sys.path.append('../')
from prism_rules import PrismRules

This notebook provides an example of looking for all patterns in the data, not just those related to the target column.

This may be done to understand the data, as opposed to discovering rules that are predictive of the target column.

Here we take a sample dataset, the Wine dataset from scikit-learn's set of toy datasets, and derive rules,
where possible, for each column from the other columns.

In [2]:
# Build a DataFrame from the Wine toy dataset, with the class label appended as column 'Y'.
data = datasets.load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Y=data['target'])
display(df.head())

prism = PrismRules()

# Treat every column (including 'Y') in turn as the target and print the rules
# found for it, without the per-rule statistics.
banner = "*********************************************************************************"
for col_name in df.columns:
    # Leading empty string yields the blank line that precedes each header.
    header = "\n".join(["", banner, f"Rules for {col_name}", banner])
    print(header)

    _ = prism.get_prism_rules(df, col_name, display_stats=False)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Y
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0



*********************************************************************************
Rules for alcohol
*********************************************************************************

Target: Low
color_intensity = Low AND nonflavanoid_phenols = Med AND alcalinity_of_ash = Med
Y = 1 AND ash = High
color_intensity = Low AND od280/od315_of_diluted_wines = Med AND Y = 1 AND flavanoids = Med

Target: Med
Y = 2 AND magnesium = High AND flavanoids = Low AND hue = Low

Target: High
Y = 0 AND ash = Med AND magnesium = High
Y = 0 AND color_intensity = High AND alcalinity_of_ash = Low
Y = 0 AND hue = High AND proline = High

*********************************************************************************
Rules for malic_acid
*********************************************************************************

Target: Low
hue = High AND Y = 1 AND total_phenols = Med
hue = High AND nonflavanoid_phenols = Low AND proanthocyanins = High

Target: Med
Y = 0 AND ash = Med AND flavanoids = High AND alcalin

In [3]:
# Again, with the statistics for each rule (display_stats=True).

# Reload the Wine toy dataset and append the class label as column 'Y'.
data = datasets.load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Y=data['target'])
display(df.head())

prism = PrismRules()

# Treat every column (including 'Y') in turn as the target.
separator = "*********************************************************************************"
for col_name in df.columns:
    # Blank line, separator, title, separator -- one item per output line.
    print("", separator, f"Rules for {col_name}", separator, sep="\n")

    _ = prism.get_prism_rules(df, col_name, display_stats=True)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Y
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0



*********************************************************************************
Rules for alcohol
*********************************************************************************

Target: Low
color_intensity = Low AND nonflavanoid_phenols = Med AND alcalinity_of_ash = Med
   Support:  the target has value: 'Low' for 100.000% of the 13 rows matching the rule 
   Coverage: the rule matches: 13 out of 60 rows for target value: 0. This is:
      21.667% of total rows for target value: 0
      7.303% of total rows in data
Y = 1 AND ash = High
   Support:  The target has value: 'Low' for 85.714% of the 12 remaining rows matching the rule
   Coverage: The rule matches: 12 out of 47 rows remaining for target value: '0'. This is:
      25.532% of remaining rows for target value: '0'
      20.000% of total rows for target value: 0
      6.742% of total rows in data
color_intensity = Low AND od280/od315_of_diluted_wines = Med AND Y = 1 AND flavanoids = Med
   Support:  The target has value: '


Target: Low
proline = Low AND od280/od315_of_diluted_wines = Med AND malic_acid = Low
   Support:  the target has value: 'Low' for 90.909% of the 10 rows matching the rule 
   Coverage: the rule matches: 10 out of 63 rows for target value: 0. This is:
      15.873% of total rows for target value: 0
      5.618% of total rows in data
proline = Low AND proanthocyanins = Med AND Y = 1
   Support:  The target has value: 'Low' for 84.615% of the 11 remaining rows matching the rule
   Coverage: The rule matches: 11 out of 53 rows remaining for target value: '0'. This is:
      20.755% of remaining rows for target value: '0'
      17.460% of total rows for target value: 0
      6.180% of total rows in data
proline = Low AND ash = Low
   Support:  The target has value: 'Low' for 75.000% of the 12 remaining rows matching the rule
   Coverage: The rule matches: 12 out of 42 rows remaining for target value: '0'. This is:
      28.571% of remaining rows for target value: '0'
      19.048% of tota


Target: Low
Y = 2 AND color_intensity = Med
   Support:  the target has value: 'Low' for 100.000% of the 16 rows matching the rule 
   Coverage: the rule matches: 16 out of 63 rows for target value: 0. This is:
      25.397% of total rows for target value: 0
      8.989% of total rows in data

Target: Med
flavanoids = Med AND hue = High AND nonflavanoid_phenols = Low
   Support:  the target has value: 'Med' for 76.923% of the 10 rows matching the rule 
   Coverage: the rule matches: 10 out of 56 rows for target value: 1. This is:
      17.857% of total rows for target value: 1
      5.618% of total rows in data

Target: High
total_phenols = High AND ash = Low AND flavanoids = High
   Support:  the target has value: 'High' for 92.308% of the 12 rows matching the rule 
   Coverage: the rule matches: 12 out of 59 rows for target value: 2. This is:
      20.339% of total rows for target value: 2
      6.742% of total rows in data
total_phenols = High AND nonflavanoid_phenols = Low AND mag


Target: 0
proline = High AND alcohol = High
   Support:  the target has value: '0' for 100.000% of the 39 rows matching the rule 
   Coverage: the rule matches: 39 out of 59 rows for target value: 0. This is:
      66.102% of total rows for target value: 0
      21.910% of total rows in data
proline = High AND alcalinity_of_ash = Low
   Support:  The target has value: '0' for 100.000% of the 10 remaining rows matching the rule
   Coverage: The rule matches: 10 out of 20 rows remaining for target value: '0'. This is:
      50.000% of remaining rows for target value: '0'
      16.949% of total rows for target value: 0
      5.618% of total rows in data

Target: 1
color_intensity = Low AND alcohol = Low
   Support:  the target has value: '1' for 100.000% of the 46 rows matching the rule 
   Coverage: the rule matches: 46 out of 71 rows for target value: 1. This is:
      64.789% of total rows for target value: 1
      25.843% of total rows in data
color_intensity = Low
   Support:  The t