<a href="https://colab.research.google.com/github/DE-Karpov/comaru/blob/develop/comaru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install apyori

!pip install pyfpgrowth

!pip install mlxtend

In [128]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyfpgrowth
from apyori import apriori
from google.colab import files
from mlxtend.frequent_patterns import apriori as mapriori, association_rules, fpgrowth, fpmax
from mlxtend.preprocessing import TransactionEncoder

In [None]:
# Ask the user to upload files through the Colab helper; the AssocRules class
# below reads "retail_dataset.csv" from the working directory by default.
uploaded = files.upload()

In [None]:
class AssocRules:
    """Load a transactions CSV and mine itemsets/rules with several algorithms.

    Each CSV row is treated as one transaction and each non-null cell as one
    item.  ``dataset`` holds the raw DataFrame; ``transactions`` holds the same
    data as a list of lists of item strings.
    """

    def __init__(self, path="retail_dataset.csv"):
        """Read the CSV at *path* and build ``self.transactions`` from it.

        Args:
            path: CSV file to load.  Defaults to the file name the notebook
                has always used, so ``AssocRules()`` behaves as before.
        """
        self.dataset = pd.read_csv(path)
        self.transactions = []
        self.fill_transactions()

    def fill_transactions(self):
        """Rebuild ``self.transactions`` from every row and column of the dataset.

        Null cells are skipped and every remaining value is converted with
        ``str``.  The dataset's real shape is used instead of a fixed 315x6
        window, so smaller files no longer raise ``IndexError`` and extra
        rows/columns are no longer silently ignored.  The list is rebuilt
        (not appended to), so calling this twice does not duplicate rows.
        """
        self.transactions = [
            [str(value) for value in row if not pd.isnull(value)]
            for row in self.dataset.values
        ]

    class Eclat:
        """Recursive ECLAT frequent-itemset miner over a DataFrame of items.

        Items are tracked as ``item_dict[name] = [count, tid, tid, ...]`` where
        the tids are the DataFrame index labels of the rows containing the
        item.  Found itemsets are stored in ``final_dict`` under a key made of
        the item names joined with ``"|"``.
        """

        def __init__(self, min_support = 0.01, max_items = 5, min_items = 2):
            """Store the mining parameters.

            Args:
                min_support: Minimum support as a fraction of the row count.
                max_items: Largest itemset size to generate; ``None`` means
                    no limit.
                min_items: Smallest itemset size to record.  Itemsets are
                    only ever generated from size 2 upwards.
            """
            # Remember the fraction separately: ``min_support`` is overwritten
            # with an absolute row count once data has been read.
            self._min_support_fraction = min_support
            self.min_support = min_support
            self.max_items = max_items
            self.min_items = min_items
            self.item_lst = list()
            self.item_len = 0
            self.item_dict = dict()
            self.final_dict = dict()
            self.data_size = 0

        def read_data(self, dataset):
            """Index *dataset* by item and convert ``min_support`` to a row count.

            State from any previous call is discarded first, so fitting the
            same model twice gives the same result instead of double-counting
            items and re-scaling the threshold.

            Args:
                dataset: DataFrame whose rows are transactions and whose
                    non-null cells are items.
            """
            self.item_dict = dict()
            self.final_dict = dict()

            for index, row in dataset.iterrows():
                seen_in_row = set()
                # Walk the cells in column order (not via set(row)) so the
                # item order, and therefore the itemset names, is stable.
                for raw_item in row:
                    if pd.isnull(raw_item):
                        continue
                    # str() keeps non-string cells from failing on .strip().
                    item = str(raw_item).strip()
                    if item in seen_in_row:
                        continue
                    seen_in_row.add(item)
                    entry = self.item_dict.setdefault(item, [0])
                    entry[0] += 1
                    entry.append(index)

            self.data_size = dataset.shape[0]
            self.item_lst = list(self.item_dict.keys())
            self.item_len = len(self.item_lst)
            # Kept for backward compatibility: after reading data,
            # ``min_support`` is the absolute number of rows required.
            self.min_support = self._min_support_fraction * self.data_size

        def recur_eclat(self, item_name, tids_array, minsupp, num_items, k_start):
            """Extend *item_name* with every later item and recurse on the hits.

            Args:
                item_name: ``"|"``-joined name of the current itemset.
                tids_array: ``[count, tid, ...]`` for the current itemset.
                minsupp: Minimum number of rows an itemset must appear in.
                num_items: Size of the itemsets this call would create.
                k_start: Position in ``item_lst`` of the last item added;
                    only items after it are tried.
            """
            if tids_array[0] < minsupp:
                return
            if self.max_items is not None and num_items > self.max_items:
                return

            for k in range(k_start + 1, self.item_len):
                candidate = self.item_lst[k]
                candidate_tids = self.item_dict[candidate]
                if candidate_tids[0] < minsupp:
                    continue
                shared_tids = np.intersect1d(tids_array[1:], candidate_tids[1:])
                if shared_tids.size < minsupp:
                    continue
                new_item = item_name + "|" + candidate
                # Keep the [count, tid, ...] layout used by item_dict.
                new_tids = np.insert(shared_tids, 0, shared_tids.size)
                if num_items >= self.min_items:
                    self.final_dict[new_item] = new_tids
                self.recur_eclat(new_item, new_tids, minsupp, num_items + 1, k)

        def fit(self, dataset):
            """Mine *dataset* and store the itemsets in ``final_dict``.

            Returns:
                ``self``, so calls can be chained.
            """
            self.read_data(dataset)
            for position, item in enumerate(self.item_lst):
                # 2 == size of the first itemsets built from a single item.
                self.recur_eclat(item, self.item_dict[item], self.min_support, 2, position)
            return self

        def transform(self):
            """Return the mined itemsets as lists of item names.

            Names are recovered by splitting the stored keys on ``"|"``, so
            an item that itself contains ``"|"`` would be split as well.
            """
            return [itemset_name.split("|") for itemset_name in self.final_dict]

    def get_apriori(self, params):
        """Run apyori's ``apriori`` on ``self.transactions``.

        Args:
            params: Mapping with ``min_support``, ``min_confidence``,
                ``min_lift`` and ``max_length``.

        Returns:
            One list of items per record returned by ``apriori``.
        """
        records = apriori(
            self.transactions,
            min_support=params["min_support"],
            min_confidence=params["min_confidence"],
            min_lift=params["min_lift"],
            max_length=params["max_length"],
        )
        return [list(record.items) for record in records]

    def get_eclat(self, params):
        """Run the nested ``Eclat`` miner on ``self.dataset``.

        Args:
            params: Mapping with ``min_support`` and, optionally,
                ``max_length`` (missing or ``None`` means no size limit).

        Returns:
            The itemsets found, each as a list of item names.
        """
        model = AssocRules.Eclat(
            min_support=params["min_support"],
            max_items=params.get("max_length"),
            min_items=1,
        )
        model.fit(self.dataset)
        return model.transform()

    def get_fpgrowth(self, params):
        """Run pyfpgrowth on ``self.transactions``.

        Args:
            params: Mapping with ``min_support`` (fraction of transactions)
                and ``min_confidence``.

        Returns:
            The two-item entries obtained by iterating the result of
            ``pyfpgrowth.generate_association_rules``, each as a list.
        """
        # pyfpgrowth is given an absolute count, not a fraction.
        support_threshold = int(len(self.transactions) * params["min_support"])
        patterns = pyfpgrowth.find_frequent_patterns(self.transactions, support_threshold)
        cleaned_patterns = {
            itemset: count for itemset, count in patterns.items() if "nan" not in itemset
        }
        generated = pyfpgrowth.generate_association_rules(
            cleaned_patterns, params["min_confidence"]
        )
        # NOTE(review): presumably `generated` is a dict keyed by antecedent
        # tuple, so this keeps only two-item antecedents and drops the
        # consequents - confirm that is intended.
        return [list(entry) for entry in generated if len(entry) == 2]

    def user_input_features(self):
        """Collect mining parameters from sidebar sliders into a one-row DataFrame.

        NOTE(review): ``st`` is not defined anywhere in the visible source
        (presumably ``import streamlit as st``), so this method raises
        ``NameError`` as the notebook stands.
        """
        # NOTE(review): if these are Streamlit sliders, the positional order is
        # (label, min_value, max_value, value); the 4th argument of the
        # support, confidence and length sliders then lies outside the
        # [min, max] range - confirm the intended defaults.
        min_support = st.sidebar.slider("Minimal support", 0.01, 0.03, 0.001)
        min_confidence = st.sidebar.slider("Min confidence", 0.2, 0.6, 0.1)
        min_lift = st.sidebar.slider("Lift", 0.5, 6.0, 1.0)
        min_length = st.sidebar.slider("Min length", 1, 2, 3)
        # NOTE(review): the get_* methods read "max_length", not "min_length".
        data = {"min_support": min_support,
                "min_confidence": min_confidence,
                "min_lift": min_lift,
                "min_length": min_length}
        features = pd.DataFrame(data, index=[0])
        return features

In [None]:
# Shared miner instance; constructing it reads the CSV and builds the
# transaction list.  gen_rules() below uses this module-level name.
rules = AssocRules()

# Minimum-confidence thresholds swept by gen_rules().
confidence = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

def gen_rules(min_support, min_lift = 2.0, max_length = None, alg = 'apriori'):
    """Mine rules at every threshold in the module-level ``confidence`` list.

    Uses the module-level ``rules`` miner.  ``alg`` selects ``'apriori'`` or
    ``'fpgrowth'``; any other value mines nothing and yields empty results.

    Returns:
        A tuple ``(counts, mined)``: ``counts`` is a one-column DataFrame,
        indexed by confidence threshold, holding the number of rules found;
        ``mined`` is the list of rule lists, one per threshold.
    """
    if alg == 'apriori':
        def mine(min_confidence):
            return rules.get_apriori({
                "min_support": min_support,
                "min_confidence": min_confidence,
                "min_lift": min_lift,
                "max_length": max_length,
            })
    elif alg == 'fpgrowth':
        def mine(min_confidence):
            return rules.get_fpgrowth({
                "min_support": min_support,
                "min_confidence": min_confidence,
            })
    else:
        mine = None

    rule_counts = {}
    rules_by_confidence = []
    if mine is not None:
        for min_confidence in confidence:
            mined = mine(min_confidence)
            rule_counts[min_confidence] = len(mined)
            rules_by_confidence.append(mined)

    counts_frame = pd.Series(rule_counts).to_frame("Support: %s"%min_support)
    return counts_frame, rules_by_confidence

# Per-support rule-count frames collected for each algorithm.
apriori_plot = []
fpgrowth_plot = []
# NOTE(review): never filled or read in the visible cells.
common_rules_plot = []
# Sweep the minimum support; gen_rules() returns (count frame, mined rules)
# and only the count frame (element 0) is kept here.
for i in [0.005,0.01,0.05,0.1]:
    apriori_alg = gen_rules(min_support = i)
    fpgrowth_alg = gen_rules(min_support = i, alg='fpgrowth')
    apriori_plot.append(apriori_alg[0])
    fpgrowth_plot.append(fpgrowth_alg[0])

# One column per support value, indexed by confidence threshold.
apriori_all_conf = pd.concat(apriori_plot, axis=1)
fpgrowth_all_conf = pd.concat(fpgrowth_plot, axis=1)

# Side-by-side plots of rule counts for the two algorithms.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 8))
fig.suptitle('Association rules')
ax1.set_title('Apriori')
ax2.set_title('FPGrowth')
ax1.plot(apriori_all_conf)
ax2.plot(fpgrowth_all_conf)

In [None]:
def getCommonRules(leftRules, rightRules):
  """Return the rules that appear in both *leftRules* and *rightRules*.

  Each rule is compared with ``==``, so two item lists match only when they
  hold the same items in the same order.  One entry is produced for every
  matching (left, right) pair, in the order of *leftRules*.

  The previous version compared each left rule against the whole
  *rightRules* list (and appended that whole list), so it never reported
  individual common rules; it also printed *rightRules* as debug output.
  """
  return [
      rightSubList
      for leftSubList in leftRules
      for rightSubList in rightRules
      if leftSubList == rightSubList
  ]

In [125]:
def createOneHotVec(df):
  """One-hot encode a DataFrame of transactions.

  Each row of *df* is one transaction and each non-null cell one item.

  Changes from the previous version:
    * the item universe is taken from every column, not only column ``'0'``,
      so items that never occur in the first column are no longer dropped;
    * null cells are never treated as an item;
    * columns come out in first-seen (row-major) order instead of set order;
    * an empty *df* returns an empty DataFrame instead of raising.

  Returns:
      DataFrame with one row per transaction and one 0/1 column per item.
  """
  cells = df.values
  items = [item for item in pd.unique(cells.ravel()) if not pd.isnull(item)]

  encoded_vals = []
  for row in cells:
      present = {cell for cell in row if not pd.isnull(cell)}
      encoded_vals.append({item: int(item in present) for item in items})

  # Passing `columns` pins the column order and keeps the header when there
  # are no rows.
  return pd.DataFrame(encoded_vals, columns=items)

In [126]:
def getAprioriRules(df, min_support=0.2, min_threshold=0.2, verbose=1):
  """Mine association rules from *df* with mlxtend's Apriori.

  Args:
      df: Transactions DataFrame accepted by ``createOneHotVec``.
      min_support: Minimum itemset support passed to the miner.
      min_threshold: Minimum confidence a rule must reach.
      verbose: Verbosity passed to the miner.

  The defaults reproduce the previously hard-coded values.

  Returns:
      The DataFrame produced by ``association_rules``.
  """
  ohe_df = createOneHotVec(df)
  freq_items = mapriori(ohe_df, min_support=min_support, use_colnames=True, verbose=verbose)
  return association_rules(freq_items, metric="confidence", min_threshold=min_threshold)

In [None]:
def getFPGrowthRules(df, min_support=0.2, min_threshold=0.2, verbose=1):
  """Mine association rules from *df* with mlxtend's FP-Growth.

  Args:
      df: Transactions DataFrame accepted by ``createOneHotVec``.
      min_support: Minimum itemset support passed to the miner.
      min_threshold: Minimum confidence a rule must reach.
      verbose: Verbosity passed to the miner.

  The defaults reproduce the previously hard-coded values.

  Returns:
      The DataFrame produced by ``association_rules``.
  """
  ohe_df = createOneHotVec(df)
  freq_items = fpgrowth(ohe_df, min_support=min_support, use_colnames=True, verbose=verbose)
  return association_rules(freq_items, metric="confidence", min_threshold=min_threshold)

In [None]:
def getFPMaxRules(df, min_support=0.2, min_threshold=0.2, verbose=1):
  """Mine association rules from *df* with mlxtend's FP-Max.

  Args:
      df: Transactions DataFrame accepted by ``createOneHotVec``.
      min_support: Minimum itemset support passed to the miner.
      min_threshold: Threshold passed to ``association_rules``.
      verbose: Verbosity passed to the miner.

  The defaults reproduce the previously hard-coded values.

  Returns:
      The DataFrame produced by ``association_rules``.
  """
  ohe_df = createOneHotVec(df)
  freq_items = fpmax(ohe_df, min_support=min_support, use_colnames=True, verbose=verbose)
  # support_only=True is kept from the original call.
  # NOTE(review): presumably needed because FP-Max returns only maximal
  # itemsets, so the supports of their subsets are not available for the
  # other metrics - confirm against the mlxtend documentation.
  return association_rules(
      freq_items, metric="confidence", min_threshold=min_threshold, support_only=True
  )

In [118]:
# Despite the name, this is the raw transactions DataFrame held by `rules`;
# the timing cells below pass it to the get*Rules helpers.
new_rules = rules.dataset

In [129]:
# time.clock() was removed in Python 3.8; time.process_time() reports the
# CPU seconds used by the current process.
t0 = time.process_time()
# The result is discarded: only the mining time matters in this cell.
getAprioriRules(new_rules).head()
t1 = time.process_time() - t0
print("Apriori time: ", t1) # CPU seconds elapsed (floating point)

Processing 72 combinations | Sampling itemset size 2Processing 189 combinations | Sampling itemset size 3
Apriori time:  0.04781200000000041


In [130]:
# time.clock() was removed in Python 3.8; time.process_time() reports the
# CPU seconds used by the current process.
t0 = time.process_time()
# The result is discarded: only the mining time matters in this cell.
getFPGrowthRules(new_rules).head()
t1 = time.process_time() - t0
print("FPGrowth time: ", t1) # CPU seconds elapsed (floating point)

9 itemset(s) from tree conditioned on items ()
0 itemset(s) from tree conditioned on items (Bread)
1 itemset(s) from tree conditioned on items (Cheese)
3 itemset(s) from tree conditioned on items (Meat)
0 itemset(s) from tree conditioned on items (Meat, Cheese)
0 itemset(s) from tree conditioned on items (Meat, Bread)
1 itemset(s) from tree conditioned on items (Meat, Milk)
3 itemset(s) from tree conditioned on items (Eggs)
0 itemset(s) from tree conditioned on items (Eggs, Cheese)
1 itemset(s) from tree conditioned on items (Eggs, Meat)
0 itemset(s) from tree conditioned on items (Eggs, Milk)
5 itemset(s) from tree conditioned on items (Wine)
0 itemset(s) from tree conditioned on items (Wine, Cheese)
0 itemset(s) from tree conditioned on items (Wine, Meat)
0 itemset(s) from tree conditioned on items (Wine, Bread)
0 itemset(s) from tree conditioned on items (Wine, Eggs)
0 itemset(s) from tree conditioned on items (Wine, Milk)
3 itemset(s) from tree conditioned on item

In [131]:
# time.clock() was removed in Python 3.8; time.process_time() reports the
# CPU seconds used by the current process.
t0 = time.process_time()
# The result is discarded: only the mining time matters in this cell.
getFPMaxRules(new_rules).head()
t1 = time.process_time() - t0
print("FPMax time: ", t1) # CPU seconds elapsed (floating point)

0 itemset(s) from tree conditioned on items ()
0 itemset(s) from tree conditioned on items (Pencil)
1 itemset(s) from tree conditioned on items (Pencil, Bread)
1 itemset(s) from tree conditioned on items (Pencil, Cheese)
1 itemset(s) from tree conditioned on items (Pencil, Wine)
0 itemset(s) from tree conditioned on items (Diaper)
1 itemset(s) from tree conditioned on items (Diaper, Cheese)
1 itemset(s) from tree conditioned on items (Diaper, Bread)
1 itemset(s) from tree conditioned on items (Diaper, Wine)
0 itemset(s) from tree conditioned on items (Bagel)
1 itemset(s) from tree conditioned on items (Bagel, Milk)
1 itemset(s) from tree conditioned on items (Bagel, Bread)
0 itemset(s) from tree conditioned on items (Wine)
1 itemset(s) from tree conditioned on items (Wine, Milk)
1 itemset(s) from tree conditioned on items (Wine, Eggs)
1 itemset(s) from tree conditioned on items (Wine, Bread)
1 itemset(s) from tree conditioned on items (Wine, Meat)
1 itemset(s) from tr