#### **1. Importing Libraries**

In [48]:
import pandas as pd                                                 # Importing for panel data analysis
#-------------------------------------------------------------------------------------------------------------------------------
import numpy as np                                                  # Importing package numpys (For Numerical Python)
#-------------------------------------------------------------------------------------------------------------------------------
import matplotlib.pyplot as plt                                     # Importing pyplot interface of matplotlib
import seaborn as sns                                               # Importing seaborn library for interactive visualization
%matplotlib inline
import random
import math
import time
import os
#--------------------~-----------------------------------------------------------------------------------------------------------
import pyfpgrowth                                                   # For testing the scratch implementation
#-------------------------------------------------------------------------------------------------------------------------------
import warnings                                                     # Importing warning to disable runtime warnings
warnings.filterwarnings("ignore")                                   # Warnings will appear only once

- Divide the data set into 80% training set and 20% test set. Remove 20% of
movies watched from each user and create a test set using the removed
movies

In [49]:
# dummy data
dummy_df = pd.DataFrame({'userId':[1,2,4,6,8], 'movies_rated_above_2':[[100,200,300,400,500,600,700], [100,200,300,400], [100,300,400,500,600,700,800], [100,400, 500,600,700,800,900,1000,1100,1200], [700,800,900]]})
dummy_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[100, 200, 300, 400, 500, 600, 700]"
1,2,"[100, 200, 300, 400]"
2,4,"[100, 300, 400, 500, 600, 700, 800]"
3,6,"[100, 400, 500, 600, 700, 800, 900, 1000, 1100..."
4,8,"[700, 800, 900]"


In [50]:
# dividing dummy df into 80-20 train-test, such that 20% of movies watched from each user is test set.
'''
- Parse through each user
- Randomly shuffle the items in the list and split into 80-20
- extract 20 of each user and make a separate df
'''

'\n- Parse through each user\n- Randomly shuffle the items in the list and split into 80-20\n- extract 20 of each user and make a separate df\n'

In [51]:
cols = ['userId', 'movies_rated_above_2']
train_df = pd.DataFrame(columns=cols)
test_df = pd.DataFrame(columns=cols)


# loop through the rows using iterrows()
for index, row in dummy_df.iterrows():
    # print(row['userId'], row['movies_rated_above_2'])
    print(row['movies_rated_above_2'])
    print("-----")
    n = int(np.ceil(0.2 * len(row['movies_rated_above_2'])))  # initialize a value that represents 20% of the total items in the list.
    test_list = random.sample(row['movies_rated_above_2'], n)  # randomly choose 20% of the values (n) from list and make a sublist.
    print("test_list", test_list)
    print("-----")
    train_list = [i for i in row['movies_rated_above_2'] if i not in test_list] # rest 80% values of list is in train/-list
    print("train_list", train_list) # randomly choose 20% of the values from list and make a sublist
    print("******************************************************")
    
    df_1 = pd.DataFrame({
    'userId': [row['userId']],
    'movies_rated_above_2': [train_list]
    })

    df_2 = pd.DataFrame({
    'userId': [row['userId']],
    'movies_rated_above_2': [test_list]
    })

    train_df = pd.concat([train_df, df_1])
    test_df = pd.concat([test_df, df_2])
    # print("index", index)
    # train_df.loc[index].userId = row['userId']
    # train_df.loc[index].movies_rated_above_2 = train_list

    # test_df.loc[index].userId = row['userId']
    # test_df.loc[index].movies_rated_above_2 = test_list


[100, 200, 300, 400, 500, 600, 700]
-----
test_list [100, 200]
-----
train_list [300, 400, 500, 600, 700]
******************************************************
[100, 200, 300, 400]
-----
test_list [100]
-----
train_list [200, 300, 400]
******************************************************
[100, 300, 400, 500, 600, 700, 800]
-----
test_list [500, 400]
-----
train_list [100, 300, 600, 700, 800]
******************************************************
[100, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]
-----
test_list [100, 400]
-----
train_list [500, 600, 700, 800, 900, 1000, 1100, 1200]
******************************************************
[700, 800, 900]
-----
test_list [800]
-----
train_list [700, 900]
******************************************************


In [52]:
dummy_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[100, 200, 300, 400, 500, 600, 700]"
1,2,"[100, 200, 300, 400]"
2,4,"[100, 300, 400, 500, 600, 700, 800]"
3,6,"[100, 400, 500, 600, 700, 800, 900, 1000, 1100..."
4,8,"[700, 800, 900]"


In [53]:
train_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[300, 400, 500, 600, 700]"
0,2,"[200, 300, 400]"
0,4,"[100, 300, 600, 700, 800]"
0,6,"[500, 600, 700, 800, 900, 1000, 1100, 1200]"
0,8,"[700, 900]"


In [54]:
test_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[100, 200]"
0,2,[100]
0,4,"[500, 400]"
0,6,"[100, 400]"
0,8,[800]


----

#### **3. Association rule mining**

- From the training set, extract the set of all association rules of form X→Y, <br />
where X contains a single movie and Y contains the set of movies from the training set <br />
by employing the apriori or FPgrowth approach and set some minsup and minconf (eg : 50 and 0.1 respectively) <br />

In [55]:
# reading the training transactional data
print('Shape of the dataset:', train_df.shape)
train_df

Shape of the dataset: (5, 2)


Unnamed: 0,userId,movies_rated_above_2
0,1,"[300, 400, 500, 600, 700]"
0,2,"[200, 300, 400]"
0,4,"[100, 300, 600, 700, 800]"
0,6,"[500, 600, 700, 800, 900, 1000, 1100, 1200]"
0,8,"[700, 900]"


In [56]:
# the type of the rows in the second column of the transactional dataframe 'train_df'
type(train_df.movies_rated_above_2.iloc[0])

list

In [57]:
train_df.movies_rated_above_2.iloc[0] 

[300, 400, 500, 600, 700]

In [58]:
train_df_key_value = dict(zip(train_df['userId'], train_df['movies_rated_above_2']))
train_df_key_value

{1: [300, 400, 500, 600, 700],
 2: [200, 300, 400],
 4: [100, 300, 600, 700, 800],
 6: [500, 600, 700, 800, 900, 1000, 1100, 1200],
 8: [700, 900]}

In [59]:
train_df_key_value[1]

[300, 400, 500, 600, 700]

----

In [61]:
from collections import deque 

def traversetree(root):
    queue = deque([(root, root, 0)])
    while queue:
        parent_node, node, level = queue.popleft()
        print(f"{level = }")
        print(f"Parent: {parent_node.item}, Parent count: {parent_node.count}, Data: {node.item}, Count: {node.count}")
        for node_name in node.children:
            queue.append((node, node.children[node_name], level + 1))

def traverseheader(header_table):
    for key in header_table.keys():
        node = header_table[key]
        while node is not None:
            print(f"Header item: {key}, Link data: {node.item}, Link count: {node.count}")
            node = node.link 

In [63]:
#Global variable
id = 0
class Node:
    def __init__(self, item, count, parent):
        self.item = item           # Item value
        self.count = count         # Support count of the itemset
        self.parent = parent       # Parent node
        self.children = {}         # Children nodes (item: Node)
        self.link = None 

class FPGrowth:
    def __init__(self, data, minsup):
        self.data = data

    
    def find_frequent_items(self,data, minsup):
        header_table = {}
        for _, item_ls in data.items():
            for item in item_ls:
                header_table[item] = header_table.get(item, 0) + 1
        
        #Sort the dictionary
        # print(f"Before sorting {header_table = }")
        header_table = {k: v for k, v in sorted(header_table.items(), key=lambda item: (item[1], item[0]), reverse=True)}
        # print(f"After sorting {header_table = }")
        header_table = {k:-1 for k,v in header_table.items() if v>minsup}
        self.l = [*header_table.keys()]
        return header_table 
    
    #Constructing an FPTree
    def construct_fptree(self, data, header_table):
        root = Node(None,0,None)
        for _, transaction in data.items():
            ordered_transaction = [item for item in transaction if item in self.l]
            ordered_transaction.sort(key = lambda x:self.l.index(x))
            current_node = root
            # print(f"{ordered_transaction = }")
            for item in ordered_transaction:
                if item in current_node.children:
                    #Update the count of the already existing node
                    child_node = current_node.children[item]
                    child_node.count += 1
                else:
                    #Create a new node 
                    child_node = Node(item, 1, current_node)
                    current_node.children[item] = child_node
                    #Update header table
                    if item in header_table: #Why does this exist?
                        if header_table[item] == -1:
                            header_table[item] = child_node
                        else:
                            header_node = header_table[item]
                            while header_node.link is not None:
                                header_node =  header_node.link
                            header_node.link = child_node 
                current_node = child_node 
        return root, header_table

    #Mining an FPTree
    def mine_frequent_patterns(self, header_table, min_support, prefix=[]):
        global id
        frequent_patterns = []
        # Sort items in header table in descending order of frequency
        sorted_items = [item for item in header_table.keys()]
        sorted_items.sort(key=lambda x: (header_table[x].count, x))
        for item in sorted_items:
            new_prefix = prefix + [item]
            support = 0
            # Build the conditional pattern base
            conditional_dataset = {}
            node = header_table[item]
            while node is not None:
                count = node.count
                support += count 
                path = []
                current_node = node.parent
                while current_node.parent is not None:
                    path.append(current_node.item)
                    current_node = current_node.parent
                for _ in range(count):
                    conditional_dataset[id] = path
                    id += 1
                node = node.link
            frequent_patterns.append((new_prefix, support))
 
            
            # Recursively mine the conditional FP-tree
            conditional_header_table = self.find_frequent_items(conditional_dataset, min_support)
            root, conditional_header_table = self.construct_fptree(conditional_dataset, conditional_header_table)
            # print(f"Conditional prefix tree for prefix: {new_prefix}")
            # traversetree(root)
            # print()
            if conditional_header_table:
                frequent_patterns.extend(self.mine_frequent_patterns(conditional_header_table, min_support, new_prefix))
  
        return frequent_patterns
        

minsup = 2 # actual - 10
FPGrowth_obj = FPGrowth(train_df_key_value, minsup)
header_table = FPGrowth_obj.find_frequent_items(train_df_key_value,minsup)
root, header_table = FPGrowth_obj.construct_fptree(train_df_key_value, header_table)
frequent_patterns = FPGrowth_obj.mine_frequent_patterns(header_table, minsup, [])
print(f"{frequent_patterns = }")
#For debugging
# traversetree(root)
# traverseheader(header_table)

frequent_patterns = [([300], 3), ([600], 3), ([600, 700], 3), ([700], 4)]


In [65]:
#Testing with in-built python package
transactions = [[300, 400, 500, 600, 700], [200, 300, 400], [100, 300, 600, 700, 800], [500, 600, 700, 800, 900, 1000, 1100, 1200], [700, 900]]
patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
print(f"{patterns = }")

patterns = {(300,): 3, (600,): 3, (600, 700): 3, (700,): 4}


Cool! Frequent patterns are accurately computed.

----

In [66]:
import itertools

def calc_confidence(data, antecedant, consequent):
    item_ls = [*data.values()]
    antecedant_union_consequent = set([antecedant] + list(consequent))
    support_antecedant = 0
    support_antecedant_union_consequent = 0
    for item in item_ls:
        if set([antecedant]).issubset(set(item)):
            support_antecedant += 1
        if set(antecedant_union_consequent).issubset(set(item)):
            support_antecedant_union_consequent += 1
    conf = support_antecedant_union_consequent / support_antecedant
    return conf  

def mine_association_rules(data, frequent_patterns, minconf):
    association_rules_ls = []
    for i_iter, frequent_pattern in enumerate(frequent_patterns):
        print(f"Processing  pattern {i_iter} out of {len(frequent_patterns)}")
        support = frequent_pattern[1]
        freq_itemset = frequent_pattern[0]
        if len(freq_itemset) > 1:
            for antecedant in freq_itemset:
                consequent_superset = [x for x in freq_itemset if x != antecedant]
                for i_iter in range(1, len(consequent_superset)+1):
                    consequent_ls = list(itertools.combinations(consequent_superset, i_iter))
                    for consequent in consequent_ls:
                        conf = calc_confidence(data, antecedant, consequent)
                        if conf > minconf:
                            association_rule = [antecedant] + list(consequent)
                            flag = True  
                            for x in association_rules_ls:
                                if set(association_rule) == set(x[0]):
                                    flag = False 
                                    break 
                            if flag == True: 
                                association_rules_ls.append([association_rule, support, conf])
    return association_rules_ls 

minconf = 0.1
association_rules_ls = mine_association_rules(train_df_key_value, frequent_patterns, minconf)
print(f"{association_rules_ls = }")

Processing  pattern 0 out of 4
Processing  pattern 1 out of 4
Processing  pattern 2 out of 4
Processing  pattern 3 out of 4
association_rules_ls = [[[600, 700], 3, 1.0]]


In [68]:
# Testing with in-built python package
rules = pyfpgrowth.generate_association_rules(patterns, 0.1)
rules

{(600,): ((700,), 1.0), (700,): ((600,), 0.75)}