# Get the transactions as a list of lists from the file

In [1]:
# The full file holds close to 80k transactions; only the first
# MAX_TRANSACTIONS are kept so that the code executes faster.
MAX_TRANSACTIONS = 2000

# The context manager closes the file handle as soon as the lines are read.
with open('retail.dat') as data_file:
    lines = data_file.readlines()

# Each transaction is the list of whitespace-separated tokens on one line.
transactions = [line.strip().split() for line in lines[:MAX_TRANSACTIONS]]

### In this question, we are asked to perform Apriori using a hash function, so we define a function called calc_hash, which computes the hash value of a particular itemset. Using this hash value, we can put that itemset in the corresponding bucket.

### This is done in order to remove low-frequency itemsets in one go. For example, if the number of itemset occurrences recorded in a bucket is less than the minimum support, then it is obvious that none of the candidate itemsets in that bucket can be frequent.

In [2]:
import itertools
def calc_hash(itemset,size,val):
    """Hash an ordered itemset to a bucket number.

    The items are treated like the digits of a base-10 positional number:
    the first item is weighted by ``10 ** (size - 1)``, the second by
    ``10 ** (size - 2)`` and so on. The weighted sum is then reduced
    modulo ``val``.

    Args:
        itemset: Ordered iterable of items; each must be accepted by ``int()``.
        size: Itemset length used to pick the weight of the first item.
        val: Modulus applied to the weighted sum.

    Returns:
        The weighted sum of the items modulo ``val``.
    """
    weighted_terms = (
        int(item) * (10 ** (size - 1 - position))
        for position, item in enumerate(itemset)
    )
    return sum(weighted_terms) % val
def getBucket(val,transactions,size,new_itemsets):
    """Distribute candidate itemset occurrences over hash buckets.

    Every time a candidate itemset is contained in a transaction, the sorted
    list of its items is appended to the bucket selected by ``calc_hash``.

    Args:
        val: Modulus handed to ``calc_hash``.
        transactions: Iterable of transactions, each an iterable of hashable
            items.
        size: Candidate itemset length, handed to ``calc_hash``.
        new_itemsets: Iterable of candidate itemsets (objects providing
            ``issubset``, e.g. frozensets).

    Returns:
        A dict mapping each hash value to a list with one sorted item list
        per (transaction, itemset) containment. Each entry is its own list
        object.
    """
    candidates = list(new_itemsets)
    # Sorted form and bucket number of a candidate never change, so they are
    # computed on the first match only and reused for later transactions.
    cached = [None] * len(candidates)
    C_bucket = {}
    for transaction in transactions:
        # A real set gives constant-time membership tests; passing the raw
        # list to issubset would re-scan the transaction for every candidate.
        basket = set(transaction)
        for index, itemset in enumerate(candidates):
            if not itemset.issubset(basket):
                continue
            if cached[index] is None:
                ordered = sorted(itemset)
                cached[index] = (ordered, calc_hash(ordered, size, val))
            ordered, hashval = cached[index]
            # Append a copy so that bucket entries do not alias each other.
            C_bucket.setdefault(hashval, []).append(list(ordered))
    return C_bucket

## Now we have to generate C and L for every step of the Algorithm

### Initially, we get all the candidate 1-length itemsets from the transactions and calculate their frequency. Then, by checking each itemset against the minimum support, we get L (the frequent itemsets).

### Then, in each subsequent step, we generate itemsets of length len from the union of itemsets of length len-1. Apriori then applies a pruning step to remove the generated itemsets for which not all subsets are frequent; this saves computation afterwards, as we have to scan the transaction set again to get the frequency of the generated itemsets. (Note: in the `generateL` code below there is no explicit subset check; infrequent candidates are discarded by the hash-bucket filter and the final support count.) Finally, we generate L using the minimum support value.

In [3]:
def generateL(min_support, L_old, size, transactions,val):
    """Return the frequent itemsets of length ``size`` with their counts.

    For ``size > 1`` the candidates are the unions of two itemsets from
    ``L_old`` that have exactly ``size`` items. Their occurrences are placed
    into hash buckets by ``getBucket``; buckets holding fewer than
    ``min_support`` occurrences are skipped as a whole, and the remaining
    candidates are counted individually. No explicit subset pruning is done.

    Otherwise (first iteration) every distinct item is counted once per
    transaction that contains it.

    Args:
        min_support: Minimum number of containing transactions.
        L_old: Mapping whose keys are the frequent frozensets of length
            ``size - 1``; only read when ``size > 1``.
        size: Length of the itemsets to produce.
        transactions: Iterable of transactions, each an iterable of hashable
            items.
        val: Modulus forwarded to ``getBucket`` (unused when ``size <= 1``).

    Returns:
        A dict mapping each frequent itemset (frozenset) to the number of
        transactions containing it.
    """
    from collections import Counter

    if size > 1:
        # Each unordered pair is visited once; the union is symmetric, so
        # visiting both (a, b) and (b, a) would only repeat the same work.
        new_itemsets = set()
        for first, second in itertools.combinations(L_old, 2):
            generated_itemset = first.union(second)
            if len(generated_itemset) == size:
                new_itemsets.add(generated_itemset)

        C_bucket = getBucket(val, transactions, size, new_itemsets)
        L = {}
        for bucket in C_bucket.values():
            # A bucket with fewer than min_support entries cannot contain a
            # frequent itemset, so it is dropped without counting.
            if len(bucket) < min_support:
                continue
            # Count once per bucket, then apply the threshold once.
            counts = Counter(frozenset(items) for items in bucket)
            for itemset, occurrences in counts.items():
                if occurrences >= min_support:
                    L[itemset] = occurrences
        return L

    # For 1st iteration: count each distinct item once per transaction.
    counts = Counter()
    for transaction in transactions:
        for item in set(transaction):
            counts[frozenset([item])] += 1
    return {
        itemset: occurrences
        for itemset, occurrences in counts.items()
        if occurrences >= min_support
    }

In [4]:
# Example inputs: min_support=25, min_confidence=0.3, val=19
min_support = int(input('min_support='))
min_confidence = float(input('min_confidence='))
val = int(input('buckethash value='))

# Level-wise search: TL keeps the frequent itemsets of every length, while L
# always refers to the most recent non-empty level.
size = 1
TL = {}
L = {}
new_L = generateL(min_support, L, size, transactions, val)
while new_L:
    TL[size] = new_L
    L = new_L
    size += 1
    new_L = generateL(min_support, L, size, transactions, val)

# Show the itemsets of the last non-empty level with their counts.
for itemset, support in L.items():
    print(itemset, ' ', support, '\n')

min_support=25
min_confidence=0.3
buckethash value=25
frozenset({'41', '38', '110', '39'})   29 

frozenset({'41', '36', '38', '39'})   27 

frozenset({'32', '38', '48', '39'})   30 

frozenset({'41', '38', '48', '39'})   70 

frozenset({'32', '41', '48', '39'})   33 



In [5]:
# Build association rules A -> B from every itemset in L, where A is a
# non-empty proper subset of the itemset and B is the rest of it.
# confidence(A -> B) = count(itemset) / count(A); the counts come from TL,
# which is indexed first by itemset length and then by itemset.
association_rules = {}
for itemset in L:
    # The itemset's own count is the same for all of its rules.
    itemset_support = TL[len(itemset)][itemset]
    for i in range(1, len(itemset)):
        for a_set in itertools.combinations(itemset, i):
            a_set = frozenset(a_set)
            b_set = itemset - a_set
            confidence = itemset_support / TL[len(a_set)][a_set]
            if confidence >= min_confidence:
                association_rules[f"{set(a_set)}->{set(b_set)}"] = confidence
print('association_rule', '\t','\t','confidence','\n')
for rule, rule_confidence in association_rules.items():
    print(rule, ' ', rule_confidence, '\n')

association_rule 	 	 confidence 

{'110'}->{'38', '39', '41'}   0.3625 

{'110', '41'}->{'38', '39'}   0.7631578947368421 

{'38', '110'}->{'39', '41'}   0.3815789473684211 

{'110', '39'}->{'38', '41'}   0.6904761904761905 

{'38', '110', '41'}->{'39'}   0.7837837837837838 

{'110', '39', '41'}->{'38'}   1.0 

{'38', '110', '39'}->{'41'}   0.7073170731707317 

{'36'}->{'38', '39', '41'}   0.32926829268292684 

{'36', '41'}->{'38', '39'}   0.75 

{'36', '38'}->{'39', '41'}   0.35526315789473684 

{'36', '39'}->{'38', '41'}   0.48214285714285715 

{'36', '38', '41'}->{'39'}   0.8181818181818182 

{'36', '39', '41'}->{'38'}   0.9642857142857143 

{'36', '38', '39'}->{'41'}   0.5192307692307693 

{'32', '38'}->{'48', '39'}   0.410958904109589 

{'32', '38', '48'}->{'39'}   0.8108108108108109 

{'32', '38', '39'}->{'48'}   0.5769230769230769 

{'32', '48', '39'}->{'38'}   0.30612244897959184 

{'38', '41'}->{'48', '39'}   0.42424242424242425 

{'48', '38'}->{'39', '41'}   0.360824742268041

## 2nd set of input values

In [6]:
# Example inputs: min_support=25, min_confidence=0.3, val=19
min_support = int(input('min_support='))
min_confidence = float(input('min_confidence='))
val = int(input('buckethash value='))

# Level-wise search: TL keeps the frequent itemsets of every length, while L
# always refers to the most recent non-empty level.
size = 1
TL = {}
L = {}
new_L = generateL(min_support, L, size, transactions, val)
while new_L:
    TL[size] = new_L
    L = new_L
    size += 1
    new_L = generateL(min_support, L, size, transactions, val)

# Show the itemsets of the last non-empty level with their counts.
for itemset, support in L.items():
    print(itemset, ' ', support, '\n')

min_support=40
min_confidence=0.5
buckethash value=35
frozenset({'41', '38', '48', '39'})   70 



In [7]:
# Build association rules A -> B from every itemset in L, where A is a
# non-empty proper subset of the itemset and B is the rest of it.
# confidence(A -> B) = count(itemset) / count(A); the counts come from TL,
# which is indexed first by itemset length and then by itemset.
association_rules = {}
for itemset in L:
    # The itemset's own count is the same for all of its rules.
    itemset_support = TL[len(itemset)][itemset]
    for i in range(1, len(itemset)):
        for a_set in itertools.combinations(itemset, i):
            a_set = frozenset(a_set)
            b_set = itemset - a_set
            confidence = itemset_support / TL[len(a_set)][a_set]
            if confidence >= min_confidence:
                association_rules[f"{set(a_set)}->{set(b_set)}"] = confidence
print('association_rule', '\t','\t','confidence','\n')
for rule, rule_confidence in association_rules.items():
    print(rule, ' ', rule_confidence, '\n')

association_rule 	 	 confidence 

{'48', '38', '41'}->{'39'}   0.8235294117647058 

{'38', '39', '41'}->{'48'}   0.5426356589147286 

