This code uses the [FP-growth algorithm](https://spark.apache.org/docs/2.3.0/mllib-frequent-pattern-mining.html) from Spark MLlib to extract frequent itemsets from a file of transactions.

In [2]:
#Run this cell the first time you use this notebook
#!pip install FPgrowth

In [3]:
from pyspark.mllib.fpm import FPGrowth

In [4]:
#Point this path at your own data store / data file as needed
input_path = "/FileStore/tables/retail_25k.txt"

#Read the text file through the existing SparkContext `sc`
data = sc.textFile(input_path)

In [5]:
#Looking at the first 5 lines in data (raw text, before any splitting) as a quick sanity check
data.take(5)

In [6]:
#Splitting each line into a list of items.
#str.split() with no argument splits on any run of whitespace and ignores leading and
#trailing whitespace, so a trailing space, a doubled space, a tab or a stray "\r" can
#never leave an empty string or a whitespace-polluted id among the items.
#A blank line becomes an empty list rather than a list holding one empty-string item.
transactions = data.map(lambda line: line.split())

In [7]:
#Looking at the first 5 lists of items to confirm each line was split as expected
transactions.take(5)

In [8]:
#Training the FP-growth model
#minSupport is expressed as a fraction of all transactions: to extract itemsets with
#frequency (sigma) >= 4 out of 25,000 total transactions, minSupport = 4/25000 = 0.00016
min_support = 0.00016
num_partitions = 10
model = FPGrowth.train(transactions, minSupport=min_support, numPartitions=num_partitions)

In [9]:
#Ask the model for its frequent itemsets, then collect all of them into one local variable
freq_itemsets_rdd = model.freqItemsets()
freq_itemsets = freq_itemsets_rdd.collect()

In [10]:
# Print every frequent itemset on its own line for inspection
for found_itemset in freq_itemsets:
    print(found_itemset)

In [11]:
#Total number of frequent itemsets collected from the model
len(freq_itemsets)

In [12]:
#Transforming results from fpgrowth model to the below format to get ready to save into a file
#<item set size (N)>, <co-occurrence frequency>, <item 1 id >, <item 2 id>, …. <item N id>

#Only itemsets with at least this many items are kept in the output
min_itemset_size = 3

#One formatted line per kept itemset. The lines are joined once at the end instead of
#being appended to an ever-growing string, which re-copies the whole string each time.
output_lines = []

for itemset in freq_itemsets:

    #itemset[0] holds the items and itemset[1] the frequency of the itemset
    items = itemset[0]
    sigma = itemset[1]

    #Skip itemsets that are too small
    if len(items) < min_itemset_size:
        continue

    #Header part: itemset size and co-occurrence frequency
    header = "<item set size (" + str(len(items)) + ")>, <co-occuring freq=" + str(sigma) + ">, "

    #Each item is wrapped in <> and separated by "," with no separator after the last one
    item_fields = ",".join("<" + item + ">" for item in items)

    #Every itemset ends with a newline so each one sits on its own line
    output_lines.append(header + item_fields + "\n")

#Single string holding all the lines, ready to be printed or written to a file
output_itemsets = "".join(output_lines)

In [13]:
#Preview the formatted itemset lines before saving them
print (output_itemsets)

In [14]:
#Save the formatted itemsets to the data store (the file is overwritten if it already exists)
output_path = "/dbfs/FileStore/output.txt"
with open(output_path, 'w') as out_file:
    out_file.write(output_itemsets)