# Zadanie 1.

## Implementacja APRIORI i jego komponentów

In [15]:
import functools
import itertools

import numpy as np
import pandas as pd
import scipy.sparse

In [2]:
def load_dataset(filename):
    rows = []
    with open(filename) as dataset:
        for line in dataset:
            rows.append([int(item) for item in line.split()])
    # wiersz 0 i kolumna 0 być może pozostaną puste, trudno
    matrix = scipy.sparse.lil_matrix(
        (1+len(rows), 1+max([max([v for v in r]) for r in rows])), dtype='?')
    print(matrix.shape)
    for idx, r in enumerate(rows):
        matrix[idx, r] = True
    return matrix.tocsc()

In [3]:
retail_matrix = load_dataset('retail.dat')

(88163, 16470)


In [4]:
retail_matrix[:20]

<20x16470 sparse matrix of type '<class 'numpy.bool_'>'
	with 152 stored elements in Compressed Sparse Column format>

In [5]:
@functools.lru_cache(maxsize=42_000_000)
def indices_containing(matrix_name, item):
    # TODO
    return set(retail_matrix[:, item].nonzero()[0].tolist())

In [89]:
def support(matrix, itemset):
    first_element = True
    indices = set()
    for item in itemset:
        this_item_indices = indices_containing('retail', item)
        indices = this_item_indices if first_element else indices & this_item_indices
        first_element = False
    return len(indices)

In [90]:
def gen_frequent_itemsets(matrix, min_supp):
    min_supp = min_supp if min_supp == round(min_supp) else min_supp * matrix.shape[1]
    sets = {}
    sets[1] = [
        {col}
        for col in range(matrix.shape[1]) if support(matrix, {col}) > min_supp
    ]
    K = 2
    while sets[K-1]:
        print(f'doing K = {K}')
        print(len(sets[K-1]))
        sets[K] = set([
            frozenset(oldset | {newelem})
            for newelem in
                functools.reduce(lambda s1, s2: s1 | s2, sets[K-1])  # flatten
            for oldset in sets[K-1]
        ])
        sets[K] = list(filter(
            lambda newset: 
                support(matrix, newset) > min_supp
                and len(newset) == K,
            sets[K]))
        K += 1
    
    return functools.reduce(lambda s1, s2: s1 + s2, sets.values())

In [91]:
retail_frequent_itemsets = gen_frequent_itemsets(retail_matrix, 0.05)

doing K = 2
79
doing K = 3
65
doing K = 4
26
doing K = 5
6


In [92]:
retail_frequent_itemsets

[{9},
 {18},
 {19},
 {31},
 {32},
 {36},
 {37},
 {38},
 {39},
 {41},
 {45},
 {48},
 {49},
 {60},
 {65},
 {76},
 {78},
 {79},
 {89},
 {101},
 {110},
 {117},
 {123},
 {147},
 {156},
 {161},
 {170},
 {175},
 {179},
 {185},
 {201},
 {225},
 {237},
 {242},
 {249},
 {255},
 {258},
 {264},
 {270},
 {271},
 {286},
 {301},
 {310},
 {338},
 {389},
 {405},
 {413},
 {438},
 {475},
 {479},
 {522},
 {533},
 {544},
 {548},
 {549},
 {589},
 {592},
 {604},
 {649},
 {677},
 {740},
 {783},
 {824},
 {956},
 {1004},
 {1146},
 {1327},
 {1393},
 {1578},
 {2238},
 {2958},
 {3270},
 {10515},
 {12925},
 {13041},
 {14098},
 {15832},
 {16010},
 {16217},
 frozenset({39, 438}),
 frozenset({48, 101}),
 frozenset({48, 270}),
 frozenset({39, 185}),
 frozenset({39, 12925}),
 frozenset({39, 533}),
 frozenset({36, 48}),
 frozenset({48, 475}),
 frozenset({36, 38}),
 frozenset({48, 271}),
 frozenset({48, 78}),
 frozenset({41, 65}),
 frozenset({32, 48}),
 frozenset({48, 79}),
 frozenset({48, 255}),
 frozenset({39, 225}),
 f

In [143]:
def gen_association_rules(matrix, itemsets, min_confidence):
    rules = []
    matrix_size = matrix.shape[0]

    for itemset in itemsets:
        itemset = list(itemset)
        antedecent_count = 1
        antedecents = set(itemset[:antedecent_count])
        consequents = set(itemset[antedecent_count:])
        while (confidence := \
            support(retail_matrix, full_set := antedecents | consequents) / \
            support(retail_matrix, antedecents)) >= min_confidence \
            and antedecent_count < len(full_set):
                
            antedecents = set(itemset[:antedecent_count])
            consequents = set(itemset[antedecent_count:])
            rules.append({
                'rule': f'{antedecents} -> {consequents}',
                'confidence': confidence,
                # TODO
                'lift': confidence / (support(retail_matrix, consequents) / matrix_size),
            })
            antedecent_count += 1
            
    return pd.DataFrame(rules)

In [145]:
rules = gen_association_rules(retail_matrix, retail_frequent_itemsets, 0.3)

In [152]:
rules.sort_values('lift', ascending=False)

Unnamed: 0,rule,confidence,lift
30,"{36} -> {38, 39}",0.662466,5.645721
32,"{170} -> {38, 39}",0.6515,5.55227
9,{170} -> {38},0.978057,5.528884
15,{37} -> {38},0.973929,5.505548
4,{36} -> {38},0.950272,5.371818
22,{41} -> {39},0.763734,1.328723
6,{2238} -> {39},0.750437,1.305591
5,{225} -> {39},0.72183,1.25582
14,{89} -> {39},0.716445,1.246452
27,{310} -> {39},0.713955,1.24212
