In [96]:
import glob
import os.path
import shutil
import time

from pyspark import SparkConf, SparkContext

# Configure Spark with the "local" master and the application name "Apriori".
conf = SparkConf().setMaster("local").setAppName("Apriori")
# getOrCreate() is used so that re-running this cell does not try to build a
# second context in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)

In [97]:
# Parse every CSV line into a list of ints and key it by its line position,
# producing (seq_id, sequence) pairs.
data = (
    sc.textFile('../data/msnbc/data50k.csv')
    .map(lambda row: [int(token) for token in row.split(',')])
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0]))
)

In [98]:
# Materialise the (seq_id, sequence) pairs on the driver, ordered by seq_id,
# so that workers can fetch a sequence by position: dataset.value[seq_id].
data.sortByKey().saveAsTextFile("./dataset")

dataset = []
# The part files must be read in name order (part-00000, part-00001, ...):
# glob.glob() makes no ordering promise, and a shuffled read would break the
# positional lookup described above.
for file in sorted(glob.glob("./dataset/part-*")):
    with open(file, "r") as part:  # close the handle deterministically
        for line in part:
            # NOTE(review): eval() executes whatever is in the file; tolerable
            # only because the file was written by this very cell.
            dataset.append(eval(line))

shutil.rmtree("./dataset")

dataset = sc.broadcast(dataset)

In [99]:
# Spot check: the entry stored at position 1 should carry seq_id 1.
dataset.value[1]

(1, [2])

In [100]:
# Create the truncated input file by keeping the first 100000 lines of the
# full data set.
# NOTE(review): the loading cell further up reads data50k.csv, so on a fresh
# checkout this cell has to run first; also the file name says "50k" while the
# slice keeps up to 100000 lines -- confirm which one is intended.
with open("../data/msnbc/data.csv", 'r') as source_file:
    lines = source_file.readlines()[:100000]
with open("../data/msnbc/data50k.csv", 'w') as target_file:
    target_file.writelines(lines)

## getMinSupItems

### itemlocmap

In [101]:
def get_indices_for(x):
    """Find every position at which an item occurs inside one sequence.

    ``x`` is an ``(item, seq_id)`` pair. The sequence is looked up by
    position in the broadcast ``dataset``.

    Returns ``(item, seq_id, positions)`` where ``positions`` is the
    ascending list of indices at which ``item`` appears.
    """
    key, seq_id = x[0], x[1]
    _, seq = dataset.value[seq_id]
    indices = [position for position, element in enumerate(seq) if element == key]
    return key, seq_id, indices

In [102]:
def get_first_last_index(x):
    """Reduce an item's position list to its first and last occurrence.

    ``x`` is ``(item, seq_id, positions)``; ``positions`` must be non-empty.
    Returns ``(item, (seq_id, first_position, last_position))``.
    """
    key, seq_id, indices = x[0], x[1], x[2]
    first, last = indices[0], indices[-1]
    return key, (seq_id, first, last)

In [103]:
def get_freq(x):
    """Attach the number of grouped entries to a ``(key, entries)`` pair.

    Returns ``(key, len(entries), entries)``.

    NOTE: a later cell of this notebook rebinds the name ``get_freq`` to a
    function with a different return shape.
    """
    key, sequences = x[0], x[1]
    return key, len(sequences), sequences

In [104]:
# For each item: how many sequences contain it, plus the
# (seq_id, first_position, last_position) triple for each of them.
itemlocmap = (
    data.map(lambda record: (record[0], set(record[1])))      # distinct items per sequence
    .flatMapValues(lambda distinct_items: distinct_items)      # (seq_id, item)
    .map(lambda pair: (pair[1], pair[0]))                      # (item, seq_id)
    .map(lambda pair: get_first_last_index(get_indices_for(pair)))
    .groupByKey()
    .mapValues(list)
    .map(get_freq)
)


### above_threshold

In [105]:
# Cut-off on the number of sequences an item occurs in; above_threshold_filter
# keeps an item only when its count is strictly greater than this value.
THRESHOLD = 50

In [106]:
def above_threshold_filter(x, threshold=THRESHOLD):
    """Tell whether the count stored at ``x[1]`` is strictly above ``threshold``."""
    frequency = x[1]
    return frequency > threshold

In [107]:
# Keep the frequent items only and reshape each record to (item, (freq, occurrences)).
above_threshold = (
    itemlocmap
    .filter(lambda entry: above_threshold_filter(entry, THRESHOLD))
    .map(lambda entry: (entry[0], entry[1:]))
)

In [108]:
# Bring the frequent items to the driver, ordered by item id, and broadcast
# them. Downstream code addresses entries by position (items.value[i]).
above_threshold.\
    sortByKey().\
    saveAsTextFile("./above_threshold")


items = []
# Read the part files in name order: glob.glob() returns them in arbitrary
# order, which would scramble the positions the rest of the notebook relies on.
for file in sorted(glob.glob("./above_threshold/part-*")):
    with open(file, "r") as part:  # close the handle deterministically
        for line in part:
            # NOTE(review): eval() executes whatever is in the file; tolerable
            # only because the file was written by this very cell.
            items.append(eval(line))


shutil.rmtree("./above_threshold")

items = sc.broadcast(items)

In [109]:
# items.value[5]

In [110]:
def extract_seq(x):
    """Project ``(item, (freq, occurrences))`` down to ``(item, seq_ids)``.

    The sequence id is the first field of every occurrence entry.
    """
    item, (_freq, occurrences) = x
    seq_ids = []
    for occurrence in occurrences:
        seq_ids.append(occurrence[0])
    return item, seq_ids

In [111]:
# For every frequent item, the list of sequence ids that contain it; ordered
# by item id so that positions line up with the other broadcast item lists.
above_threshold.\
    map(extract_seq).\
    sortByKey().\
    saveAsTextFile("./item_sequences")

item_sequences = []

# Read the part files in name order: glob.glob() returns them in arbitrary
# order, which would scramble the positional index.
for file in sorted(glob.glob("./item_sequences/part-*")):
    with open(file, "r") as part:  # close the handle deterministically
        for line in part:
            # NOTE(review): eval() executes whatever is in the file; tolerable
            # only because the file was written by this very cell.
            item_sequences.append(eval(line))

shutil.rmtree("./item_sequences")
item_sequences = sc.broadcast(item_sequences)

In [112]:
# item_sequences.value[0]

In [113]:
# Sorted list of the frequent item ids; index_of_item() maps an item id to its
# position in this list.
above_threshold.\
    map(lambda x:x[0]).\
    sortBy(lambda x:x).\
    saveAsTextFile("./items_ids_above_threshold")

items_ids_above_threshold = []
# Read the part files in name order: glob.glob() returns them in arbitrary
# order, which would destroy the sorted order established above.
for file in sorted(glob.glob("./items_ids_above_threshold/part-*")):
    with open(file, "r") as part:  # close the handle deterministically
        for line in part:
            # NOTE(review): eval() executes whatever is in the file; tolerable
            # only because the file was written by this very cell.
            items_ids_above_threshold.append(eval(line))

shutil.rmtree("./items_ids_above_threshold")
items_ids_above_threshold = sc.broadcast(items_ids_above_threshold)

In [114]:
# Show the ids of all items that passed the frequency threshold.
items_ids_above_threshold.value

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

## genRules

In [115]:
# Pair every frequent item id with a running index: (item_id, index).
items_ids = (
    above_threshold
    .map(lambda entry: entry[0])
    .zipWithIndex()
)


In [116]:
# Number of frequent items; used as the exclusive upper bound when the
# index pairs are generated by prepare_join.
count_items = items_ids.count()
count_items

17

### Join

In [117]:
def prepare_join(x, count):
    """Pair a ``(item_id, index)`` record with every larger index below ``count``.

    Returns ``(x, (index + 1, ..., count - 1))``.
    """
    position = x[1]
    partners = tuple(range(position + 1, count))
    return x, partners

In [118]:
# All index pairs (i, j) with i < j over the frequent items.
items_join = (
    items_ids
    .map(lambda record: prepare_join(record, count_items))
    .flatMapValues(lambda partners: partners)
    .map(lambda joined: (joined[0][1], joined[1]))
)


### I => J combination with no repeats

In [119]:
def prepare_common_sequences(x):
    """List the sequences shared by the two items of an index pair.

    ``x`` is ``(i, j)``, two positions in the broadcast ``items`` list whose
    entries look like ``(item, (freq, [(seq_id, first, last), ...]))``.

    Returns ``(x, shared)`` where ``shared`` holds one
    ``(seq_id, (firstI, lastI), (firstJ, lastJ))`` entry per sequence that
    contains both items, in the order of item J's occurrences.
    """
    i, j = x[0], x[1]
    occurancesI = items.value[i][1][1]
    occurancesJ = items.value[j][1][1]

    # Index I's occurrences by sequence id so each lookup for J is O(1).
    spans_of_I = {seq_id: (first, last) for seq_id, first, last in occurancesI}

    allseqboth = [
        (seq_id, spans_of_I[seq_id], (first, last))
        for seq_id, first, last in occurancesJ
        if seq_id in spans_of_I
    ]
    return x, allseqboth

In [120]:
def count_IJ_JI_rules(x):
    """Decide which rule directions one shared sequence supports.

    ``x`` is ``(key, (seq_id, (firstI, lastI), (firstJ, lastJ)))``.

    Returns ``(key, [IJ, JI])``: ``IJ`` is ``[seq_id]`` when I's first
    position lies before J's last position (else ``[]``), and ``JI`` is
    ``[seq_id]`` when J's first position lies before I's last position.
    """
    key = x[0]
    seq_id, itemI, itemJ = x[1]

    IJ = [seq_id] if itemI[0] < itemJ[1] else []
    JI = [seq_id] if itemJ[0] < itemI[1] else []
    return key, [IJ, JI]

In [121]:
def group_IJ_JI_rules(a, b):
    """Merge two ``[IJ, JI]`` accumulators.

    ``a`` is extended in place with the contents of ``b`` and returned.
    """
    for slot in (0, 1):
        a[slot] += b[slot]
    return a

In [122]:
def add_index_info(x):
    """Label both rule directions of a pair with their index pair.

    ``x`` is ``((i, j), [seq_ids_IJ, seq_ids_JI])``.

    Returns ``((i, j), ((seq_ids_IJ, (i, j)), (seq_ids_JI, (j, i))))``.
    """
    key = x[0]
    valuesI, valuesJ = x[1]
    forward = (valuesI, key)
    backward = (valuesJ, tuple(key[::-1]))
    return key, (forward, backward)

In [123]:
# Per index pair (i, j): the sequences supporting I => J and those supporting
# J => I, each tagged with the index pair in rule direction.
reduced_IJ_JI = (
    items_join
    .map(prepare_common_sequences)
    .flatMapValues(lambda shared: shared)
    .map(count_IJ_JI_rules)
    .reduceByKey(group_IJ_JI_rules)
    .map(add_index_info)
)



### Accumulate rules

In [124]:
# Minimum number of supporting sequences for a rule: 1% of the data set size.
MIN_SUP_REL = 0.01 * len(dataset.value)
# Minimum confidence a rule needs in order to be reported.
MIN_CONF = 0.1

In [125]:
def get_freq(x):
    """Attach the number of values to a ``(key, values)`` pair.

    Returns ``(key, (values, len(values)))``.

    NOTE: this rebinds the name ``get_freq`` used by an earlier cell for a
    function with a different return shape.
    """
    key, values = x[0], x[1]
    return key, (values, len(values))

def is_above_minsup_relative(x, minsup_relative):
    """Tell whether the count at ``x[1][1]`` is strictly above ``minsup_relative``."""
    support_count = x[1][1]
    return support_count > minsup_relative

In [126]:
def generate_rule_and_expands(x, min_conf=MIN_CONF):
    """Turn one supported direction I => J into a rule plus two expansion tasks.

    ``x`` is ``((i, j), (seq_ids, count))``: ``i``/``j`` are positions in the
    broadcast ``items`` list, ``seq_ids`` are the sequences supporting
    I => J and ``count`` is their number.

    Returns ``(rules, left_task, right_task)``:
      * ``rules`` holds ``({i}, {j}, support, confidence)`` when the
        confidence reaches ``min_conf`` and is empty otherwise;
      * ``left_task`` is tagged ``0`` and ``right_task`` is tagged ``1``, the
        tags ``expand`` dispatches on.
    """
    i, j = x[0]
    allseqIJ, length = x[1]

    # Every occurrence entry has the form (seq_id, first, last).
    occurancesI = items.value[i][1][1]
    occurancesJ = items.value[j][1][1]

    allseqI = [occurrence[0] for occurrence in occurancesI]
    allseqJ = [occurrence[0] for occurrence in occurancesJ]

    antecedentSet = {i}
    consequentSet = {j}

    confIJ = length / len(occurancesI)
    rules = []
    if confIJ >= min_conf:
        support = length / len(dataset.value)
        rules.append((antecedentSet, consequentSet, support, confIJ))

    left_task = (0, antecedentSet, consequentSet, allseqI, occurancesJ, allseqIJ)
    right_task = (
        1, antecedentSet, consequentSet, allseqI, allseqJ,
        occurancesI, occurancesJ, allseqIJ,
    )
    return rules, left_task, right_task

## Expand Left - Right

In [127]:
def expand(x, min_sup_rel=MIN_SUP_REL, min_conf=MIN_CONF):
    """Dispatch an expansion task on its leading tag.

    Tag ``0`` goes to ``expand_left``, any other tag to ``expand_right``; the
    tag itself is stripped before the task is handed over.
    """
    handler = expand_left if x[0] == 0 else expand_right
    return handler(x[1:], min_sup_rel, min_conf)


def item_suitable_for_antecedentSet(item, index_item, antecedentSet, consequentSet):
    """Check whether a candidate may be added to ``antecedentSet``.

    The candidate qualifies when its index is strictly greater than every
    index already in ``antecedentSet``, it is not part of ``consequentSet``
    and its item id is among the frequent items.
    """
    if any(member >= index_item for member in antecedentSet):
        return False
    if index_item in consequentSet:
        return False
    return item in items_ids_above_threshold.value

In [128]:
def index_of_item(item):
    """Return the position of ``item`` in the frequent item id list.

    An unknown item yields the length of that list, i.e. an index one past
    the last valid position.
    """
    known_ids = items_ids_above_threshold.value
    matches = (position for position, candidate in enumerate(known_ids) if candidate == item)
    return next(matches, len(known_ids))

In [129]:
def find_first_last(occurances, seq_id):
    """Look up the ``(first, last)`` positions recorded for ``seq_id``.

    ``occurances`` is an iterable of ``(seq_id, first, last)`` entries; the
    first matching entry wins. Returns ``None`` when no entry matches.
    """
    hits = (
        (first, last)
        for entry_id, first, last in occurances
        if entry_id == seq_id
    )
    return next(hits, None)

In [130]:
def expand_left(x, min_sup_rel, min_conf):
    """Try to grow the antecedent of a rule I => J by one more item C.

    ``x`` is an expansion task with its tag already removed:
    ``(antecedentSet, consequentSet, allseqI, occurancesJ, allseqIJ)``.
    ``allseqIJ`` is the collection of sequence ids supporting the current
    rule and ``occurancesJ`` holds ``(seq_id, first, last)`` entries for the
    consequent.

    Returns ``(rules, expand_lefts)``:
      * ``rules`` -- ``(antecedent + C, consequent, support, confidence)``
        tuples whose confidence reaches ``min_conf``;
      * ``expand_lefts`` -- follow-up tasks tagged ``0``, one for every C
        that reaches ``min_sup_rel``, whether or not it produced a rule.

    Candidates are tracked by their index in the frequent item list (see
    ``index_of_item``), not by raw item id.
    """
    # Try to optimize
    antecedentSet, consequentSet, allseqI, occurancesJ, allseqIJ = x

    # candidate index -> set of sequence ids in which it was seen
    possibleC = dict()
    rules = []
    expand_lefts = []
    # Counter used for pruning; decremented once per processed sequence.
    seqsLeft = len(allseqIJ)

    for seqID in allseqIJ:
        _, seq = dataset.value[seqID] # sequence stored at this position
        # NOTE(review): find_first_last returns None when seqID is missing from
        # occurancesJ, which would make this unpacking raise -- callers are
        # presumably expected to guarantee the entry exists.
        firstJ, lastJ = find_first_last(occurancesJ, seqID) # first/last position recorded for J in this sequence

        # Only items located before position lastJ are considered.
        for item in seq[:lastJ]:
            index_item = index_of_item(item)
            if not item_suitable_for_antecedentSet(item, index_item, antecedentSet, consequentSet):
                continue

            if index_item not in possibleC: # candidate not tracked yet
                if seqsLeft >= min_sup_rel: # only start tracking if min_sup_rel is still reachable
                    possibleC[index_item] = set([seqID])
            elif len(possibleC[index_item]) + seqsLeft < min_sup_rel: # not enough sequences left to reach min_sup_rel
                del possibleC[index_item]
            else:
                possibleC[index_item].add(seqID)

        seqsLeft -= 1

    # Turn the surviving candidates into rules and follow-up tasks.
    for itemC, seqIDs in possibleC.items():
        # Skip candidates that did not reach the minimum support.
        if len(seqIDs) >= min_sup_rel:
            # Sequence ids containing both the current antecedent and C.
            item, seqC = item_sequences.value[itemC]
            allseqIC = set.intersection(set(seqC),allseqI)

            confIC_J = len(seqIDs) / len(allseqIC)

            # Copy so the antecedent of the incoming task is left untouched.
            itemsIC = antecedentSet.copy()
            itemsIC.add(itemC)

            if confIC_J >= min_conf:
                rules.append((itemsIC,consequentSet,len(seqIDs)/len(dataset.value),confIC_J))

            expand_lefts.append((0, itemsIC,consequentSet, allseqIC, occurancesJ, seqIDs))
    return rules, expand_lefts


In [131]:
def expand_right(x, min_sup_rel, min_conf):
    """Try to grow the consequent of a rule I => J by one more item C.

    ``x`` is an expansion task with its tag already removed:
    ``(antecedentSet, consequentSet, allseqI, allseqJ, occurancesI,
    occurancesJ, allseqIJ)``. The occurrence collections hold
    ``(seq_id, first, last)`` entries.

    Returns ``(rules, tasks)``:
      * ``rules`` -- ``(antecedent, consequent + C, support, confidence)``
        tuples whose confidence reaches ``min_conf``;
      * ``tasks`` -- follow-up tasks for every C that reaches
        ``min_sup_rel``: all right expansions (tag ``1``) followed by all
        left expansions (tag ``0``).

    Candidates are tracked by their index in the frequent item list (see
    ``index_of_item``), not by raw item id.
    """
    antecedentSet, consequentSet, allseqI, allseqJ, occurancesI, occurancesJ, allseqIJ = x

    # candidate index -> set of sequence ids in which it was seen
    possibleC = dict()
    rules = []
    expand_lefts = []
    expand_rights = []
    # Counter used for pruning; decremented once per processed sequence.
    seqsLeft = len(allseqIJ)

    for seqID in allseqIJ:
        _, seq = dataset.value[seqID] # sequence stored at this position
        # NOTE(review): find_first_last returns None when seqID is missing from
        # occurancesI, which would make this unpacking raise.
        firstI, lastI = find_first_last(occurancesI, seqID) # first/last position recorded for I in this sequence

        # Only items located after position firstI are considered.
        for item in seq[firstI+1:]:
            index_item = index_of_item(item)
            # The two sets are passed in swapped roles: the candidate's index
            # must exceed every consequent index and must not be in the antecedent.
            if not item_suitable_for_antecedentSet(item, index_item, consequentSet, antecedentSet):
                continue
            if index_item not in possibleC: # candidate not tracked yet
                if seqsLeft >= min_sup_rel: # only start tracking if min_sup_rel is still reachable
                    possibleC[index_item] = set([seqID])
            elif len(possibleC[index_item]) + seqsLeft < min_sup_rel: # not enough sequences left to reach min_sup_rel
                del possibleC[index_item]
            else:
                possibleC[index_item].add(seqID)

        seqsLeft -= 1

    for itemC, seqIDs in possibleC.items():
        if len(seqIDs) >= min_sup_rel:

            # Sequence ids containing both J and C.
            allseqJC = set()
            # Occurrence entries for the enlarged consequent, keyed by sequence id.
            occurancesJC = dict()

            for seqID_J in allseqJ:
                # NOTE(review): this lookup does not depend on seqID_J, and the
                # membership test below scans a list -- candidate for hoisting.
                item, seqC = item_sequences.value[itemC]
                if seqID_J in seqC:
                    firstC, lastC = find_first_last(items.value[itemC][1][1], seqID_J)
                    allseqJC.add(seqID_J)
                    firstJ, lastJ = find_first_last(occurancesJ,seqID_J)
                    # Keep the entry of whichever item has the smaller last position.
                    if lastC < lastJ:
                        occurancesJC[seqID_J] = [seqID_J,firstC,lastC]
                    else:
                        occurancesJC[seqID_J] = [seqID_J,firstJ,lastJ]

            occurancesJC = list(occurancesJC.values())
            confI_JC = len(seqIDs) / len(allseqI)
            # Copy so the consequent of the incoming task is left untouched.
            itemsJC = consequentSet.copy()
            itemsJC.add(itemC)

            if confI_JC >= min_conf:
                rules.append((antecedentSet,itemsJC,len(seqIDs)/len(dataset.value),confI_JC))

            expand_lefts.append((0, antecedentSet, itemsJC, allseqI,occurancesJC, seqIDs))
            expand_rights.append((1, antecedentSet, itemsJC, allseqI, allseqJC, occurancesI, occurancesJC, seqIDs))

    return rules,  expand_rights + expand_lefts



## Compute Rules

In [132]:
# Thresholds handed explicitly to the rule-generation calls below.
# Minimum number of supporting sequences: 1% of the data set size.
MIN_SUP_REL = 0.01 * len(dataset.value)
# Minimum confidence a rule needs in order to be reported.
MIN_CONF = 0.1

In [133]:
# One record per supported rule direction: re-key by the directed index pair,
# drop directions below the minimum support, then derive the size-1 rule and
# its two expansion tasks.
rules_and_expands = (
    reduced_IJ_JI
    .flatMapValues(lambda directions: directions)
    .map(lambda record: (record[1][1], record[1][0]))
    .map(get_freq)
    .filter(lambda record: is_above_minsup_relative(record, MIN_SUP_REL))
    .map(lambda record: generate_rule_and_expands(record, MIN_CONF))
)
rules_and_expands.count()

47

In [134]:
# Persist the non-empty rule lists; read_rules() collects them from ./rules.
(
    rules_and_expands
    .map(lambda result: result[0])
    .filter(lambda found: len(found) > 0)
    .saveAsTextFile("./rules")
)

In [135]:
# Flatten the (left_task, right_task) pairs into one stream of expansion tasks.
expands = (
    rules_and_expands
    .map(lambda result: (result[1], result[2]))
    .flatMap(lambda tasks: tasks)
)

In [136]:
rules = []

def read_rules():
    """Append the rules stored under ``./rules`` to the global ``rules`` list.

    Every line of every ``part-*`` file is expected to hold the repr of a
    list of rules. Files are read in name order so that the resulting rule
    order is reproducible. The directory is removed afterwards, which lets
    the next ``saveAsTextFile("./rules")`` succeed.

    Prints ``Not dir`` and returns without changes when ``./rules`` is missing.
    """
    if not os.path.isdir('./rules'):
        print("Not dir")
        return
    global rules
    # sorted(): glob.glob() gives no ordering guarantee.
    for file in sorted(glob.glob('./rules/part-*')):
        with open(file, "r") as part:  # close the handle deterministically
            for line in part:
                # NOTE(review): eval() executes whatever is in the file;
                # tolerable only because this notebook wrote the file itself.
                rules.extend(eval(line))
    shutil.rmtree("./rules")

read_rules()

In [137]:
# Number of rules collected so far, i.e. before the expansion loop has run.
print(f'Found {len(rules)} rules')
# print(len(expands.collect()))

Found 29 rules


In [138]:
# Repeatedly expand the pending tasks until no task is left. Every round
# yields (rules, follow_up_tasks) records.
start = time.time()
_previous_round = None
while True:
    # cache(): this RDD is consumed by the save below, by `expands` (and thus
    # by isEmpty() and by the next round). Without caching each consumer
    # re-runs the whole lineage, which grows by one expansion per round.
    rules_and_expands = expands.map(lambda x: expand(x, MIN_SUP_REL, MIN_CONF)).cache()
    rules_and_expands.map(lambda x: x[0]) \
                          .filter(lambda x: len(x) > 0) \
                          .saveAsTextFile("./rules")
    # The save above evaluated every partition of the current round, so the
    # cached data of the round before is no longer needed.
    if _previous_round is not None:
        _previous_round.unpersist()
    _previous_round = rules_and_expands
    expands = rules_and_expands.map(lambda x: x[1]) \
                           .filter(lambda x: len(x) > 0) \
                           .flatMap(lambda x: x)
    read_rules()
    print(f'Found {len(rules)} rules')
    print("Time:", time.time() - start)
    if expands.isEmpty():
        break

print("Time:", time.time() - start)

Found 35 rules
Time: 82.66924667358398
Found 35 rules
Time: 170.87187314033508
Time: 256.5634446144104


In [139]:
# Dump the repr of the complete rule list to disk; the context manager makes
# sure the file is flushed and closed before the cell finishes.
with open("all_rules.txt", "w") as rules_file:
    rules_file.write(str(rules))
rules


[({0}, {1}, 0.07204, 0.22549142356329036),
 ({1}, {0}, 0.04277, 0.23892519970951343),
 ({2}, {0}, 0.01895, 0.15461814621409922),
 ({0}, {3}, 0.03697, 0.11571929385251033),
 ({3}, {0}, 0.0235, 0.19019100032372938),
 ({0}, {5}, 0.0362, 0.11330912733191437),
 ({0}, {6}, 0.03488, 0.10917741329660699),
 ({6}, {0}, 0.02606, 0.3220862686936102),
 ({8}, {0}, 0.01046, 0.11636444543330737),
 ({9}, {0}, 0.0141, 0.27920792079207923),
 ({10}, {0}, 0.02234, 0.3841127922971114),
 ({0}, {11}, 0.04127, 0.12917866533116315),
 ({11}, {0}, 0.02591, 0.23055703861897134),
 ({0}, {13}, 0.0379, 0.11863027419556779),
 ({13}, {0}, 0.02294, 0.19275691118393412),
 ({2}, {1}, 0.01416, 0.11553524804177545),
 ({1}, {3}, 0.0199, 0.11116697391207195),
 ({3}, {1}, 0.01733, 0.14025574619618),
 ({6}, {1}, 0.01215, 0.15016685205784205),
 ({9}, {1}, 0.01127, 0.22316831683168317),
 ({10}, {1}, 0.01211, 0.20821870701513068),
 ({11}, {1}, 0.01483, 0.13196298273714185),
 ({3}, {6}, 0.01789, 0.14478795726772417),
 ({6}, {3}, 0.