# Association Analysis by Apriori Algorithm
dataset: generated with the IBM Quest Synthetic Data Generator
reference: [用Numpy實現高效的Apriori算法](https://hk.saowen.com/a/1f0411f543fbdb2cb7a27698a55920cf72a41bf9f09bb2af68b714d0fdf43b5b), [唐磊的个人博客](https://www.tanglei.name/blog/apriori-algorithm-in-python.html), [Apriori——python3實現](https://hk.saowen.com/a/eb213191098240d6b33f4b86106457bc099c8bc0782240f1f7afba7a05c32537)

In [45]:
import numpy as np
from pprint import pprint

## Apriori Algorithm
### component
* support_threshold
* confidence_threshold
* total: the total number of transactions in the dataset

### function 
* `count(filename)`
    * read the transaction file, compute the support of every item, and build the boolean transaction-by-item matrix for the frequent items
* `find_rules(filename)`
    * find the association rules in the transactions and return them sorted by confidence, highest first

In [46]:
class Apriori:
    """Mine frequent itemsets and association rules with the Apriori algorithm.

    Both thresholds are strict lower bounds: an itemset is frequent when its
    support is greater than ``support_threshold``, and a rule is reported
    when its confidence is greater than ``confidence_threshold``.

    Attributes set by :meth:`count`:
        total: number of transactions (lines) read from the file.
        items: mapping of each frequent item to its support.
        item2id: mapping of each frequent item to its column index in ``D``.
        D: boolean array of shape ``(total, len(items))``; ``D[t, c]`` is
            True when transaction ``t`` contains the item in column ``c``.
    """

    def __init__(self, support_threshold, confidence_threshold):
        self.support_threshold = support_threshold
        self.confidence_threshold = confidence_threshold

    def count(self, filename):
        """Read the transactions and compute the support of single items.

        Each line of ``filename`` is one transaction whose items are
        separated by commas.  Support is the fraction of transactions that
        contain the item, so an item repeated inside one transaction is
        counted once, which keeps 1-item supports consistent with the
        boolean matrix used for larger itemsets.  Empty tokens (blank
        lines, doubled commas) are not treated as items; a blank line still
        counts as an (empty) transaction.
        """
        with open(filename) as f:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the column order of D is deterministic from run to run.
            transactions = [
                list(dict.fromkeys(
                    token for token in line.strip().split(',') if token
                ))
                for line in f
            ]

        self.total = len(transactions)

        occurrences = {}
        for transaction in transactions:
            for item in transaction:
                occurrences[item] = occurrences.get(item, 0) + 1

        self.items = {
            item: n / self.total
            for item, n in occurrences.items()
            if n / self.total > self.support_threshold
        }
        self.item2id = {item: col for col, item in enumerate(self.items)}

        # Only frequent items ever get a column; infrequent ones can never
        # be part of a frequent itemset.
        self.D = np.zeros((self.total, len(self.items)), dtype=bool)
        for row, transaction in enumerate(transactions):
            for item in transaction:
                col = self.item2id.get(item)
                if col is not None:
                    self.D[row, col] = True

    def find_rules(self, filename):
        """Return the association rules found in ``filename``.

        Returns:
            A list of ``(rule, (confidence, support))`` pairs sorted by
            descending confidence.  ``rule`` is a tuple whose last element
            is the consequent and whose preceding elements are the
            antecedent; ``support`` is the support of the whole itemset.
        """
        self.count(filename)

        # levels[k] maps every frequent itemset of size k + 1 (a sorted
        # tuple of items) to its support.
        levels = [{(item,): support for item, support in self.items.items()}]
        size = 0
        while levels[-1]:
            keys = sorted(levels[-1])
            size += 1
            prefix_len = size - 1
            candidates = {}
            for i, left in enumerate(keys):
                prefix = left[:prefix_len]
                for j in range(i + 1, len(keys)):
                    right = keys[j]
                    if right[:prefix_len] != prefix:
                        # Keys are sorted, so itemsets sharing a prefix are
                        # contiguous; no later key can match either.
                        break
                    itemset = left + (right[prefix_len],)
                    columns = [self.item2id[item] for item in itemset]
                    # A transaction supports the itemset when all of its
                    # item columns are True.
                    hits = int(self.D[:, columns].all(axis=1).sum())
                    support = hits / self.total
                    if support > self.support_threshold:
                        candidates[itemset] = support
            levels.append(candidates)

        result = {}
        for n, level in enumerate(levels[1:]):
            smaller = levels[n]  # itemsets one item shorter than `level`
            for itemset, support in level.items():
                # Try each item as the consequent of the remaining items.
                for i, consequent in enumerate(itemset):
                    antecedent = itemset[:i] + itemset[i + 1:]
                    confidence = support / smaller[antecedent]
                    if confidence > self.confidence_threshold:
                        result[antecedent + (consequent,)] = (
                            confidence,
                            support,
                        )

        return sorted(result.items(), key=lambda x: -x[1][0])

In [47]:
# Mine rules whose support exceeds 0.1 and whose confidence exceeds 0.9,
# then display the five most confident ones.
model = Apriori(support_threshold=0.1, confidence_threshold=0.9)
rules = model.find_rules(filename='dataset.txt')
pprint(rules[:5])

[(('472', '885', '732'), (0.9464285714285714, 0.1083844580777096)),
 (('472', '833', '732'), (0.9439252336448599, 0.1032719836400818)),
 (('732', '833', '472'), (0.9351851851851852, 0.1032719836400818)),
 (('833', '885', '732'), (0.9339622641509434, 0.10122699386503067)),
 (('732', '885', '472'), (0.9298245614035088, 0.1083844580777096))]
