In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy
from sklearn import datasets
from matplotlib.colors import ListedColormap

# ***APRIORI***

In [2]:


class Apriori:
  def __init__(self, T, min_supp = 0.3, min_confidence = 0.75, delimeter =' '):
    self.T = T
    self.Tcount = len(T)
    self.supp = min_supp
    self.conf = min_confidence
    self.delimeter = delimeter
    self.supports = {}
    self.L = [self.freq_items()]
    

  def freq_items(self):
    res = {}
    for i in range(self.Tcount):
      for product in self.T[i]:
        if product in res:
          res[product] += 1
        else:
          res[product] = 1
    freq = []
    for product in res:
      supp = res[product]/self.Tcount
      if supp >= self.supp:
        freq.append(set([product]))
        self.supports[tuple(product)] = supp 
    print(freq)  
    return freq

  def genCk(self, Lk, k):
    res = set()
    for i in range(len(Lk)):
      a = Lk[i]
      for j in range(i+1, len(Lk)):
        b = Lk[j]
        new_set = list(a.union(b))
        new_set.sort()
        if len(new_set) == k:
          res.add(tuple(new_set))
    return list(res)


  def freqs(self, Ck):
    res =[]
    for prod_set in Ck:
      count = 0
      for row in self.T:
        if set(prod_set).issubset(row):
          count+=1
      supp = count/self.Tcount
      if supp >= self.supp:
        res.append(set(prod_set))
        sort = list(prod_set)
        sort.sort()
        self.supports[tuple(sort)] = supp
    return res

  def fit(self):
    k = 1
    while len(self.L[k-1]) != 0:
      Ck = self.genCk(self.L[k-1], k+1)
      self.L.append(self.freqs(Ck))
      k+=1
    self.L  = [item for sublist in self.L[1:] for item in sublist] #flatten and kick L0
  
  def gen_reg_X(self, X):
    res = [[(X, set(), 0)]]
    X = list(X)
    X.sort()
    supp = self.supports[tuple(X)]
    for k in range(len(X)):
      res.append([])
      for reg in res[k]:
        for x in reg[0]:
          A = reg[0].difference(x)
          A = list(A)
          A.sort()
          A = set(A)
          if len(A) == 0:
            continue
          B = reg[1].union(x)
          B = list(B)
          B.sort()
          B = set(B)
          confidence = supp/self.supports[tuple(A)]
          if confidence >= self.conf:
            lift = confidence/self.supports[tuple(B)]
            res[k+1].append((A, B, lift))
      if len(res[k]) == 0:
        break
        
    return [item for sublist in res for item in sublist]
    
  def gen_regs(self):
    res = []
    for zbior in self.L:
      res.append(self.gen_reg_X(zbior))
    res = sorted([item for sublist in res for item in sublist], key=lambda x: x[2])
    res.reverse()
    return res

example = [['a', 'b', 'e'],['a', 'b', 't'],['a', 'b', 'e'],['a', 'b', 't']]
apr = Apriori(example)
apr.fit()
print(apr.L)
print(apr.supports)
print(apr.gen_regs())

[{'a'}, {'b'}, {'e'}, {'t'}]
[{'a', 'b'}, {'e', 'b'}, {'a', 't'}, {'t', 'b'}, {'a', 'e'}, {'a', 't', 'b'}, {'a', 'e', 'b'}]
{('a',): 1.0, ('b',): 1.0, ('e',): 0.5, ('t',): 0.5, ('a', 'b'): 1.0, ('b', 'e'): 0.5, ('a', 't'): 0.5, ('b', 't'): 0.5, ('a', 'e'): 0.5, ('a', 'b', 't'): 0.5, ('a', 'b', 'e'): 0.5}


KeyError: ignored

In [0]:
store_df = pd.read_csv('groceries - groceries.csv', header=None)
store_df = store_df[1:1000]
transactions = []
for i in range(998):
  transactions.append([]) 
  for j in range(1, int(store_df.iloc[i][0])+1):
    transactions[i].append(store_df.iloc[i][j])
print(transactions)
groceries = Apriori(transactions)
groceries.fit()
print(groceries.freq_items())
print(groceries.gen_regs())
