In [2]:
import numpy
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from tqdm import tqdm

In [3]:
# read the text file
with open('data/apriori/docword.enron.txt') as f:
    data = f.read().splitlines()
    # skip the first 3 lines
    data = data[3:]
    # skip the last column 
    data = [line.split()[:2] for line in data]
    # convert to numpy array
    data = numpy.array(data, dtype=int)

In [4]:
data

array([[    1,   118],
       [    1,   285],
       [    1,  1229],
       ...,
       [39861, 23449],
       [39861, 25721],
       [39861, 27196]])

In [5]:
# group by document
transactions = defaultdict(list)
for doc_id, word_id in data:
    transactions[doc_id].append(word_id)

## Apriori Algorithm
### Pass 1

In [6]:
min_support = 0.02
min_confidence = 0.5

In [7]:
# num_transactions = df['docID'].nunique()
# frequency = df.groupby('wordID').count()
# frequency.columns = ['count']
# frequency['support'] = frequency['count'] / num_transactions
# frequent_items = frequency[frequency['support'] >= min_support]
# frequent_items = frequent_items.reset_index()
# frequent_items = frequent_items.drop('count', axis=1)
# frequent_items

num_transactions = len(transactions)

# count the frequency of each word
word_frequency = Counter()
for words in transactions.values():
    word_frequency.update(words)

# filter out the words that have frequency less than the minimum support
frequent_words = {word for word, freq in word_frequency.items() if freq/num_transactions >= min_support}
frequent_words

{20480,
 16385,
 14338,
 20481,
 16386,
 2053,
 20486,
 12295,
 22535,
 8201,
 16394,
 10251,
 20485,
 22541,
 18448,
 22546,
 22547,
 2068,
 4115,
 22555,
 22558,
 12321,
 6182,
 44,
 20525,
 46,
 20526,
 8243,
 6196,
 18483,
 20548,
 10309,
 18500,
 10312,
 6217,
 10313,
 2050,
 18511,
 4183,
 24665,
 12379,
 12380,
 12381,
 12382,
 14431,
 6236,
 18530,
 6243,
 6244,
 6251,
 112,
 14449,
 10354,
 114,
 12405,
 118,
 12406,
 12407,
 18551,
 18549,
 22648,
 22653,
 4222,
 18558,
 14468,
 14470,
 4230,
 24718,
 24719,
 145,
 147,
 2195,
 2198,
 24727,
 24728,
 24729,
 2201,
 20635,
 4248,
 10397,
 14494,
 153,
 20639,
 12440,
 2203,
 10405,
 10406,
 10409,
 14505,
 8364,
 26797,
 14514,
 8372,
 12469,
 2233,
 20670,
 12484,
 18629,
 8392,
 12489,
 10447,
 26833,
 20690,
 24788,
 24793,
 16605,
 14559,
 16607,
 14561,
 226,
 232,
 8426,
 10474,
 2284,
 8427,
 238,
 26868,
 18677,
 246,
 24821,
 245,
 250,
 12539,
 20732,
 251,
 20734,
 14591,
 6396,
 4355,
 6404,
 22792,
 20744,
 20745,

### Pass 2

In [8]:
import time

In [9]:
# count the frequency of only those pairs that contain frequent items
num_items = data[:, 1].max() + 1    
candidate_pairs = defaultdict(int)
# time taken 
start_time = time.time()
for words in tqdm(transactions.values()):
    for i, word1 in enumerate(words):
        if word1 not in frequent_words:
            continue
        for word2 in words[i+1:]:
            if word2 not in frequent_words:
                continue
            candidate_pairs[(word1, word2)] += 1
end_time = time.time()
candidate_gen_apriori_algorithm_time = end_time - start_time
print('Time taken:', end_time - start_time)

  0%|          | 0/39861 [00:00<?, ?it/s]

100%|██████████| 39861/39861 [02:09<00:00, 307.72it/s] 

Time taken: 129.5605926513672





In [20]:
# filter out the pairs that have frequency less than the minimum support
frequent_pairs = [pair for pair, freq in candidate_pairs.items() if freq/num_transactions >= min_support]
frequent_pairs = [set(list(pair)) for pair in list(frequent_pairs)]

In [21]:
len(frequent_pairs)

7780

In [22]:
type(frequent_pairs)

list

In [23]:
frequent_pairs[0]

{118, 5299}

In [13]:
len(candidate_pairs)

559127

### Pass 3

In [26]:
# find the candidate triplets
candidate_triplets = defaultdict(int)
# time taken
start_time = time.time()
# iterate over the frequent pairs
for pair in tqdm(frequent_pairs):
    pair = list(pair)
    for word in frequent_words:
        if word not in pair:
            if {pair[0], word} in frequent_pairs and {pair[1], word} in frequent_pairs:
                candidate_triplets[tuple(sorted(list(pair) + [word]))] +=1 
end_time = time.time()

candidate_gen_triplets_apriori_algorithm_time = end_time - start_time
print('Time taken:', end_time - start_time)

  4%|▍         | 335/7780 [00:56<20:52,  5.94it/s]