Apriori

In [1]:
# Apriori is an algorithm used to extract frequent itemsets from transactions.

# e.g. grocery store transactions: each inner list is one transaction (a basket of items)
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

In [2]:
# Apriori needs the data to be one-hot encoded (one boolean column per item)

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# Learn the set of distinct items and build the boolean transaction matrix in one call
te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
# One row per transaction, one column per distinct item
df = pd.DataFrame(data=te_ary, columns=te.columns_)

df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [9]:
# Find every item and itemset that occurs in at least 60% of the transactions.

# support(itemset) = (number of transactions containing the itemset) / (total number of transactions)

from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df, min_support=0.6)

# By default the itemsets are reported as column indices of `df` rather than
# item names, e.g. index 3 is the "Eggs" column.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(3)
1,1.0,(5)
2,0.6,(6)
3,0.6,(8)
4,0.6,(10)
5,0.8,"(3, 5)"
6,0.6,"(8, 3)"
7,0.6,"(5, 6)"
8,0.6,"(8, 5)"
9,0.6,"(10, 5)"


In [11]:
# More human-friendly: use_colnames=True reports the itemsets by item name
# instead of by column index.
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Eggs, Onion)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


In [13]:
# The result is an ordinary pandas DataFrame, so it can be extended and filtered.

# Record how many items each itemset contains
frequent_itemsets['lengths'] = frequent_itemsets['itemsets'].apply(len)

frequent_itemsets

Unnamed: 0,support,itemsets,lengths
0,0.8,(Eggs),1
1,1.0,(Kidney Beans),1
2,0.6,(Milk),1
3,0.6,(Onion),1
4,0.6,(Yogurt),1
5,0.8,"(Eggs, Kidney Beans)",2
6,0.6,"(Eggs, Onion)",2
7,0.6,"(Milk, Kidney Beans)",2
8,0.6,"(Onion, Kidney Beans)",2
9,0.6,"(Yogurt, Kidney Beans)",2


In [16]:
# Keep only the two-item sets whose support is at least 0.8

desired_itemsets = frequent_itemsets.loc[
    frequent_itemsets['lengths'].eq(2) & frequent_itemsets['support'].ge(0.8)
]

desired_itemsets

Unnamed: 0,support,itemsets,lengths
5,0.8,"(Eggs, Kidney Beans)",2


In [22]:
# Select the row(s) whose itemset is exactly {Eggs, Onion}

desired_itemsets = frequent_itemsets[
    frequent_itemsets["itemsets"] == {"Eggs", "Onion"} # sets are unordered, so {'Onion', 'Eggs'} selects the same row
]  

desired_itemsets

Unnamed: 0,support,itemsets,lengths
6,0.6,"(Eggs, Onion)",2


In [2]:
from pyarc import CBA, TransactionDB
import pandas as pd

# NOTE(review): these are absolute, machine-specific paths; the cell will not run
# on another machine without editing them.
# NOTE(review): the training and the test frame are read from the SAME file, so the
# accuracy reported below is measured on the data the model was fitted on.
data_train = pd.read_csv("/home/haianh/grad_project/ml-learning/data/iris.csv")
data_test = pd.read_csv("/home/haianh/grad_project/ml-learning/data/iris.csv")

# Wrap both frames in pyarc's transaction representation
txns_train = TransactionDB.from_DataFrame(data_train)
txns_test = TransactionDB.from_DataFrame(data_test)


# presumably support/confidence are the minimum thresholds for mined rules and
# algorithm="m1" selects the M1 classifier builder — TODO confirm against pyarc docs
cba = CBA(support=0.20, confidence=0.5, algorithm="m1")
cba.fit(txns_train)



# Score the fitted model on the test transactions and display the result
accuracy = cba.rule_model_accuracy(txns_test) 
accuracy

0.8962962962962963

In [2]:
from pyarc import TransactionDB
from pyarc.algorithms import (
    top_rules,
    createCARs,
    M1Algorithm,
    M2Algorithm
)
import pandas as pd


# NOTE(review): these are absolute, machine-specific paths; the cell will not run
# on another machine without editing them.
# NOTE(review): the training and the test frame are read from the SAME file, so any
# accuracy computed below is measured on the data the classifier was built from.
data_train = pd.read_csv("/home/haianh/grad_project/ml-learning/data/iris.csv")
data_test = pd.read_csv("/home/haianh/grad_project/ml-learning/data/iris.csv")

# Wrap both frames in pyarc's transaction representation
txns_train = TransactionDB.from_DataFrame(data_train)
txns_test = TransactionDB.from_DataFrame(data_test)

# get the best association rules
rules = top_rules(txns_train.string_representation)

# convert them to class association rules
cars = createCARs(rules)

# Build the classifier with the M1 algorithm; the M2 variant is kept on the next
# line as a commented-out alternative.
classifier = M1Algorithm(cars, txns_train).build()
# classifier = M2Algorithm(cars, txns_train).build()

# NOTE(review): `accuracy` is computed but never displayed; only the number of
# class association rules is printed.
accuracy = classifier.test_transactions(txns_test)

print(len(cars))

983


In [5]:
# Read the data from a CSV file and convert each column to the datatype declared in the scheme file
import csv

def str2numerical(data, value_type):
    """Convert the numerical feature cells of ``data`` from strings to floats, in place.

    The number of columns is taken from the first row, and the last column is
    never converted (in the iris data used in this notebook it holds the class
    label). A cell equal to ``'?'`` is left untouched (presumably the
    missing-value marker of the dataset — confirm against the data files).

    Args:
        data: list of rows, each row a list of string cells. Mutated in place.
        value_type: per-column type names; column ``j`` is converted only when
            ``value_type[j] == 'numerical'``.

    Returns:
        The same ``data`` list that was passed in, with the numerical cells
        replaced by floats. An empty ``data`` is returned unchanged.

    Raises:
        ValueError: if a numerical cell other than ``'?'`` is not a valid float.
        IndexError: if ``value_type`` has fewer entries than there are feature
            columns, or a row is shorter than the first row.
    """
    # Nothing to convert; also avoids indexing data[0] on an empty list.
    if not data:
        return data

    # Every column except the last one is a candidate feature column.
    feature_count = len(data[0]) - 1
    # The column types do not depend on the row, so resolve them once.
    numerical_columns = [
        j for j in range(feature_count) if value_type[j] == 'numerical'
    ]

    for row in data:
        for j in numerical_columns:
            if row[j] != '?':
                row[j] = float(row[j])
    return data


def read_data(path):
    """Read a comma-separated file and return its non-empty rows.

    Args:
        path: path of the CSV file to read.

    Returns:
        A list of rows, each row a list of string cells, in file order.
        Blank lines (which the csv reader yields as ``[]``) are dropped.
    """
    # newline='' hands the raw line endings to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted fields
    # are silently rewritten by universal-newline translation.
    with open(path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        # Filter blank lines in the same pass instead of repeatedly scanning
        # and removing from the finished list.
        return [row for row in reader if row]

def read_scheme(path):
    """Read the first two rows of a comma-separated scheme file.

    Args:
        path: path of the scheme file to read.

    Returns:
        A tuple ``(attributes, value_type)``: the first row and the second row
        of the file, each as a list of strings. Any further rows are ignored.

    Raises:
        StopIteration: if the file contains fewer than two rows.
    """
    # newline='' hands the raw line endings to the csv module, as its
    # documentation requires, so quoted fields are read back unmodified.
    with open(path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        attributes = next(reader)
        value_type = next(reader)
    return attributes, value_type

def read(data_path, scheme_path):
    """Load a dataset together with its scheme and convert the numerical columns.

    Args:
        data_path: path of the CSV file holding the data rows.
        scheme_path: path of the scheme file (attribute names, then value types).

    Returns:
        A tuple ``(data, attributes, value_type)`` where ``data`` is the list of
        rows after ``str2numerical`` has been applied to it.
    """
    raw_rows = read_data(data_path)
    scheme = read_scheme(scheme_path)
    attributes, value_type = scheme
    return str2numerical(raw_rows, value_type), attributes, value_type

In [7]:
# NOTE(review): these are absolute, machine-specific paths; the cell will not run
# on another machine without editing them.
test_data_path = "/home/haianh/grad_project/CBA/datasets/iris.data"

test_scheme_path = "/home/haianh/grad_project/CBA/datasets/iris.names"

# Load the rows and the scheme; numerical columns come back as floats
test_data, test_attributes, test_value_type = read(test_data_path, test_scheme_path)

# NOTE(review): this displays the whole dataset; a slice such as test_data[:5]
# would keep the notebook output short.
test_data

[[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
 [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
 [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
 [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
 [5.0, 3.6, 1.4, 0.2, 'Iris-setosa'],
 [5.4, 3.9, 1.7, 0.4, 'Iris-setosa'],
 [4.6, 3.4, 1.4, 0.3, 'Iris-setosa'],
 [5.0, 3.4, 1.5, 0.2, 'Iris-setosa'],
 [4.4, 2.9, 1.4, 0.2, 'Iris-setosa'],
 [4.9, 3.1, 1.5, 0.1, 'Iris-setosa'],
 [5.4, 3.7, 1.5, 0.2, 'Iris-setosa'],
 [4.8, 3.4, 1.6, 0.2, 'Iris-setosa'],
 [4.8, 3.0, 1.4, 0.1, 'Iris-setosa'],
 [4.3, 3.0, 1.1, 0.1, 'Iris-setosa'],
 [5.8, 4.0, 1.2, 0.2, 'Iris-setosa'],
 [5.7, 4.4, 1.5, 0.4, 'Iris-setosa'],
 [5.4, 3.9, 1.3, 0.4, 'Iris-setosa'],
 [5.1, 3.5, 1.4, 0.3, 'Iris-setosa'],
 [5.7, 3.8, 1.7, 0.3, 'Iris-setosa'],
 [5.1, 3.8, 1.5, 0.3, 'Iris-setosa'],
 [5.4, 3.4, 1.7, 0.2, 'Iris-setosa'],
 [5.1, 3.7, 1.5, 0.4, 'Iris-setosa'],
 [4.6, 3.6, 1.0, 0.2, 'Iris-setosa'],
 [5.1, 3.3, 1.7, 0.5, 'Iris-setosa'],
 [4.8, 3.4, 1.9, 0.2, 'Iris-setosa'],
 [5.0, 3.0, 1.6, 0.2, 'Iris-setosa'],
 [5.0, 3.4, 