In [68]:
# Read the sample file, parse every comma-separated line into a list of ints,
# and key each parsed sequence by its zero-based line number: (index, sequence).
data = (
    sc.textFile('../../data/msnbc/data_sample.csv')
      .map(lambda line: [int(field) for field in line.split(',')])
      .zipWithIndex()
      .map(lambda pair: (pair[1], pair[0]))
)

In [69]:
# Pull every (index, sequence) pair to the driver and broadcast the resulting list
# so worker-side functions can read it through `dataset.value`.
dataset = sc.broadcast(data.collect())

In [70]:
# Spot-check one broadcast entry; each element is an (index, sequence) pair.
dataset.value[498]

(498, [1, 10, 4])

## getMinSupItems

### itemlocmap

In [132]:
def get_indices_for(x):
    """Find every position at which an item occurs in one broadcast sequence.

    Parameters
    ----------
    x : tuple
        ``(item, seq_id)``; ``seq_id`` is used as a position into
        ``dataset.value``.

    Returns
    -------
    tuple
        ``(item, seq_id, positions)`` where ``positions`` holds, in ascending
        order, each index whose element equals ``item`` (empty if none).
    """
    item, seq_id = x[0], x[1]
    # Each broadcast entry is a 2-tuple; only its second element (the sequence) is scanned.
    _, sequence = dataset.value[seq_id]
    positions = [pos for pos, element in enumerate(sequence) if element == item]
    return item, seq_id, positions

In [133]:
def get_first_last_index(x):
    """Reduce an item's position list to its first and last position.

    Parameters
    ----------
    x : tuple
        ``(item, seq_id, positions)``; ``positions`` must be non-empty,
        otherwise indexing it raises ``IndexError``.

    Returns
    -------
    tuple
        ``(item, (seq_id, first_position, last_position))``.
    """
    item, seq_id, positions = x[0], x[1], x[2]
    first, last = positions[0], positions[-1]
    return item, (seq_id, first, last)

In [148]:
def get_freq(x):
    """Attach the number of grouped entries to a ``(key, entries)`` record.

    NOTE: a later cell of this notebook defines another ``get_freq`` that
    returns ``(key, (values, frequency))`` instead; whichever definition ran
    last is the one in effect.

    Parameters
    ----------
    x : tuple
        ``(key, entries)`` where ``entries`` supports ``len``.

    Returns
    -------
    tuple
        ``(key, len(entries), entries)``.
    """
    item = x[0]
    occurrences = x[1]
    return item, len(occurrences), occurrences

In [149]:
# For every distinct item of every sequence, record where it first and last occurs,
# then group those (seq_id, first, last) triples per item and attach their count.
# Resulting records have the shape (item, freq, [(seq_id, first, last), ...]).
itemlocmap = (
    data.map(lambda row: (row[0], set(row[1])))
        .flatMapValues(lambda distinct_items: distinct_items)
        .map(lambda pair: (pair[1], pair[0]))
        .map(get_indices_for)
        .map(get_first_last_index)
        .groupByKey()
        .mapValues(list)
        .map(get_freq)
)

In [150]:
# Materialize the item-location records on the driver for inspection.
result = itemlocmap.collect()

### above_threshold

In [187]:
# Frequency cut-off for items: above_threshold_filter keeps a record only when
# its frequency is strictly greater than this value.
THRESHOLD = 50

In [188]:
def above_threshold_filter(x, threshold=THRESHOLD):
    """Tell whether a record's frequency strictly exceeds ``threshold``.

    Parameters
    ----------
    x : sequence
        Record whose element at position 1 is the frequency to test.
    threshold : number, optional
        Cut-off; defaults to the value ``THRESHOLD`` had when this function
        was defined.

    Returns
    -------
    bool
        ``True`` when ``x[1] > threshold``.
    """
    frequency = x[1]
    return frequency > threshold

In [189]:
# Keep only the item records whose frequency is strictly greater than THRESHOLD.
above_threshold = itemlocmap.filter(lambda x: above_threshold_filter(x, THRESHOLD))

In [190]:
# Broadcast the surviving item records as a list; later cells look records up
# by their position in this list (`items.value[i]`).
items = sc.broadcast(above_threshold.collect())

In [191]:
# Collect the filtered records on the driver for inspection.
# NOTE(review): `above_threshold` was already collected to build `items`; unless the
# RDD is cached elsewhere this recomputes the pipeline — `items.value` holds the same records.
result = above_threshold.collect()

In [414]:
# result[0]

## genRules

In [218]:
# Pair every frequent item with a running index: (item, index).
# NOTE(review): later cells use this index as a position into `items.value`, which
# assumes zipWithIndex numbers records in the same order collect() returned them — confirm.
items_ids = (
    above_threshold
    .map(lambda record: record[0])
    .zipWithIndex()
)
                           

In [219]:
# Number of frequent items; prepare_join uses it as the exclusive upper bound
# when enumerating partner indices.
count_items = items_ids.count()

In [220]:
# Bring the (item, index) pairs to the driver for inspection.
result = items_ids.collect()

In [221]:
# Display the collected (item, index) pairs.
result

[(2, 0), (4, 1), (6, 2), (8, 3), (12, 4), (14, 5), (1, 6), (3, 7), (9, 8)]

### Join

In [284]:
def prepare_join(x, count):
    """List every index that follows this record's own index.

    Parameters
    ----------
    x : tuple
        Record whose element at position 1 is its integer index.
    count : int
        Exclusive upper bound for the generated indices.

    Returns
    -------
    tuple
        ``(x, partners)`` where ``partners`` is the list
        ``[x[1] + 1, ..., count - 1]`` (empty for the last index).
    """
    position = x[1]
    partners = [other for other in range(position + 1, count)]
    return x, partners

In [286]:
# Enumerate every unordered pair of item indices exactly once as (i, j) with i < j.
items_join = (
    items_ids.map(lambda pair: prepare_join(pair, count_items))
             .flatMapValues(lambda partners: partners)
             .map(lambda keyed: (keyed[0][1], keyed[1]))
)


In [287]:
# Collect all (i, j) index pairs on the driver and preview the first five.
result = items_join.collect()
result[:5]

[(0, 1), (0, 2), (0, 3), (0, 4), (0, 5)]

### I => J combination with no repeats

In [295]:
def prepare_common_sequences(x):
    """Collect the sequences in which both items of an index pair occur.

    Parameters
    ----------
    x : tuple
        ``(i, j)`` positions into ``items.value``. Each referenced record
        carries, at position 2, a list of ``(seq_id, first, last)`` triples.

    Returns
    -------
    tuple
        ``(x, matches)`` where every match is
        ``(seq_id, (first_i, last_i), (first_j, last_j))`` for a ``seq_id``
        present in both occurrence lists. Matches follow the order of item
        I's occurrences (and, within one ``seq_id``, the order of item J's).
    """
    i = x[0]
    j = x[1]
    occurrences_i = items.value[i][2]
    occurrences_j = items.value[j][2]

    # Index J's occurrences by sequence id once, so the join below costs
    # O(|I| + |J|) instead of comparing every I occurrence with every J occurrence.
    spans_j_by_seq = {}
    for seq_id_j, first_j, last_j in occurrences_j:
        spans_j_by_seq.setdefault(seq_id_j, []).append((first_j, last_j))

    allseqboth = []
    for seq_id_i, first_i, last_i in occurrences_i:
        for span_j in spans_j_by_seq.get(seq_id_i, ()):
            allseqboth.append((seq_id_i, (first_i, last_i), span_j))

    return x, allseqboth

In [316]:
def count_IJ_JI_rules(x):
    """Decide, for one shared sequence, which rule directions it supports.

    Parameters
    ----------
    x : tuple
        ``(key, (seq_id, span_i, span_j))`` where each span is indexable as
        ``(first_position, last_position)``.

    Returns
    -------
    tuple
        ``(key, [ij, ji])``. ``ij`` is ``[seq_id]`` when item I's first
        position is before item J's last position, else ``[]``; ``ji`` is
        ``[seq_id]`` when item J's first position is before item I's last
        position, else ``[]``.
    """
    pair = x[0]
    seq_id, span_i, span_j = x[1]

    first_i, last_i = span_i[0], span_i[1]
    first_j, last_j = span_j[0], span_j[1]

    ij_hits = [seq_id] if first_i < last_j else []
    ji_hits = [seq_id] if first_j < last_i else []

    return pair, [ij_hits, ji_hits]

In [317]:
def group_IJ_JI_rules(a, b):
    """Merge two ``[IJ, JI]`` accumulators, growing ``a`` in place.

    Parameters
    ----------
    a : list
        ``[ij_list, ji_list]``; both inner lists are extended in place.
    b : list
        ``[ij_list, ji_list]`` whose contents are appended to ``a``'s.

    Returns
    -------
    list
        The same object ``a``, now holding the combined lists.
    """
    for slot in (0, 1):
        # Extending in place keeps the accumulator object (and its inner lists) intact.
        a[slot].extend(b[slot])
    return a

In [342]:
def add_index_info(x):
    """Label each direction's sequence list with its ordered index pair.

    Parameters
    ----------
    x : tuple
        ``(key, (values_i, values_j))`` where ``key`` is the index pair.

    Returns
    -------
    tuple
        ``(key, ((values_i, key), (values_j, reversed_key)))`` with
        ``reversed_key`` being ``key`` reversed as a tuple.
    """
    pair = x[0]
    forward_values, backward_values = x[1]
    flipped = tuple(pair[::-1])
    forward = (forward_values, pair)
    backward = (backward_values, flipped)
    return pair, (forward, backward)

In [343]:
# For every index pair (i, j): find the sequences containing both items, classify each
# as supporting I=>J and/or J=>I, merge the per-sequence results per pair, and tag each
# direction's sequence list with its ordered index pair.
reduced_IJ_JI = (
    items_join.map(prepare_common_sequences)
              .flatMapValues(lambda shared: shared)
              .map(count_IJ_JI_rules)
              .reduceByKey(group_IJ_JI_rules)
              .map(add_index_info)
)


In [344]:
# Collect the per-pair I=>J / J=>I sequence lists on the driver and preview the first two.
result = reduced_IJ_JI.collect()
result[:2]

[((0, 2),
  (([24, 77, 147, 153, 157, 180, 189, 230, 260, 308, 323, 339, 349, 355, 493],
    (0, 2)),
   ([136, 157, 180, 185, 189, 230, 260, 319, 320, 323, 349, 355, 385],
    (2, 0)))),
 ((0, 4),
  (([63, 106, 157, 177, 180, 260, 319, 324, 339, 349, 402], (0, 4)),
   ([80, 116, 157, 180, 260, 319, 340, 349], (4, 0))))]

### Accumulate rules

In [388]:
# Support cut-off expressed as a sequence count: 1% of the number of broadcast sequences.
# is_above_minsup_relative keeps a candidate only when its count is strictly greater.
MIN_SUP_REL = 0.01 * len(dataset.value)
# Minimum confidence a candidate needs for generate_rule_and_expands to emit it as a rule.
MIN_CONF = 0.1

In [389]:
def get_freq(x):
    """Pair a record's values with how many of them there are.

    NOTE: this reuses the name of the ``get_freq`` defined in the
    ``itemlocmap`` section, which returns ``(key, freq, sequences)``; once
    this cell has run, this definition replaces the earlier one.

    Parameters
    ----------
    x : tuple
        ``(key, values)`` where ``values`` supports ``len``.

    Returns
    -------
    tuple
        ``(key, (values, len(values)))``.
    """
    rule_key, supporting = x[0], x[1]
    return rule_key, (supporting, len(supporting))

def is_above_minsup_relative(x, minsup_relative):
    """Tell whether a candidate's count strictly exceeds the support cut-off.

    Parameters
    ----------
    x : tuple
        ``(key, (values, frequency))``.
    minsup_relative : number
        Cut-off the frequency must be strictly greater than.

    Returns
    -------
    bool
        ``True`` when ``frequency > minsup_relative``.
    """
    _values, frequency = x[1][0], x[1][1]
    return frequency > minsup_relative

In [390]:
def generate_rule_and_expands(x, min_conf=MIN_CONF):
    """Build the 1=>1 rule for an index pair plus its two expansion seeds.

    Parameters
    ----------
    x : tuple
        ``((i, j), (allseqIJ, length))`` where ``i`` and ``j`` are positions
        into ``items.value``, ``allseqIJ`` is the list of supporting sequence
        ids and ``length`` is their count.
    min_conf : float, optional
        Minimum confidence for the rule to be emitted; defaults to the value
        ``MIN_CONF`` had when this function was defined.

    Returns
    -------
    tuple
        ``(rules, expand_left, expand_right)``:

        * ``rules`` is ``[({i}, {j}, support, confidence)]`` when the
          confidence is at least ``min_conf``, otherwise ``[]``. Support is
          ``length / len(dataset.value)``; confidence is ``length`` divided by
          the number of occurrence records of item ``i``.
        * ``expand_left`` is ``(0, {i}, {j}, i, j, allseqIJ)``.
        * ``expand_right`` is ``(1, {i}, {j}, i, j, allseqIJ)``.

        The antecedent set, consequent set and ``allseqIJ`` are the same
        objects in all three results.
    """
    i, j = x[0]
    allseqIJ, length = x[1]

    # Only item I's occurrence list is needed (it is the confidence denominator);
    # the previous lookups of item J were never used.
    occurrences_i = items.value[i][2]
    confIJ = length / len(occurrences_i)

    antecedentSet = {i}
    consequentSet = {j}

    rules = []
    if confIJ >= min_conf:
        rules.append((antecedentSet, consequentSet, length / len(dataset.value), confIJ))

    # Leading 0 / 1 tags distinguish the left-expansion seed from the right-expansion seed.
    expand_left = (0, antecedentSet, consequentSet, i, j, allseqIJ)
    expand_right = (1, antecedentSet, consequentSet, i, j, allseqIJ)
    return rules, expand_left, expand_right

In [395]:
# Split every pair into its two directed candidates keyed by ordered index pair,
# attach each candidate's sequence count, drop those not above MIN_SUP_REL, and
# produce (rules, expand_left, expand_right) for the survivors.
rules_and_expands = (
    reduced_IJ_JI.flatMapValues(lambda directions: directions)
                 .map(lambda keyed: (keyed[1][1], keyed[1][0]))
                 .map(lambda candidate: get_freq(candidate))
                 .filter(lambda candidate: is_above_minsup_relative(candidate, MIN_SUP_REL))
                 .map(lambda candidate: generate_rule_and_expands(candidate, MIN_CONF))
)


In [396]:
# Collect the (rules, expand_left, expand_right) triples and preview the first one.
# This must read `rules_and_expands`: `rules` is only assigned in a later cell, where
# it is a plain list with no .collect(), so the old form fails on a fresh kernel.
result = rules_and_expands.collect()
result[:1]

[([({0}, {2}, 0.03, 0.22727272727272727)],
  (0,
   {0},
   {2},
   0,
   2,
   [24, 77, 147, 153, 157, 180, 189, 230, 260, 308, 323, 339, 349, 355, 493]),
  (1,
   {0},
   {2},
   0,
   2,
   [24, 77, 147, 153, 157, 180, 189, 230, 260, 308, 323, 339, 349, 355, 493]))]

In [399]:
# Keep only the rule lists that actually contain a rule and bring them to the driver.
# Each collected element is the rule list produced for one candidate.
rules = (
    rules_and_expands
    .map(lambda triple: triple[0])
    .filter(lambda found: len(found) > 0)
    .collect()
)

In [411]:
# Flatten the (expand_left, expand_right) seeds of every candidate into one driver-side list.
expands = (
    rules_and_expands
    .map(lambda triple: (triple[1], triple[2]))
    .flatMap(lambda seeds: seeds)
    .collect()
)

In [413]:
# Show the first expansion seed: (side_tag, antecedent, consequent, i, j, sequence_ids).
expands[0]

(0,
 {0},
 {2},
 0,
 2,
 [24, 77, 147, 153, 157, 180, 189, 230, 260, 308, 323, 339, 349, 355, 493])