# Implement Probabilistic maximal frequent itemset mining methods over uncertain databases

In [10]:
import numpy as np
import random
import time


# 1. Preliminary

In [11]:
def get_probability_in_transaction(X, T):
    """get the probabilistic of itemset in transaction T

    Args:
        X (list<string>): itemset X
        T (dict<string, double>): transaction T

    Returns:
        prob: the probabilistic of itemset in transaction T
    """
    prob = 1.0
    for i in X:
        prob *= T.get(i, 0.0)
    return prob

def get_weighted_itemset(X, W):
    """get the weight of itemset in weighted table

    Args:
        X (list<string>): itemset X
        W (dict<string, double>): weighted table W

    Returns:
        weight of itemset: the weight of itemset in weighted table
    """
    weighted = 0

    for i in X:
        weighted += W.get(i, 0.0)

    return weighted / len(X)

def compute_support(X, UD):
    """compute the support of itemset X

    Args:
        X (list<string>): itemset X
        UD (list<dict<string, double>>): Uncertain database

    Returns:
        support: the support of itemset X
    """
    support = 0

    for Ti in UD:
        probX= get_probability_in_transaction(X, Ti)
        
        if probX > 0:
            support += 1

    return support

def compute_expected_support(X, UD):
    """compute the expected support of itemset X

    Args:
        X (list<string>): itemset X
        UD (list<dict<string, double>>): Uncertain database

    Returns:
        expected_support: the expected support of itemset X
    """
    expected_support = 0

    for Ti in UD:
        expected_support += get_probability_in_transaction(X, Ti)

    return expected_support

In [12]:
def compute_prWF(X, UD, W, min_conf):
    """compute the probabilistic support of itemset X

    Args:
        X (list<string>): itemset X
        UD (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_conf (double): min confidence

    Returns:
        probabilistic support vector: the probabilistic support vector of itemset X
    """
    convolution_vector = compute_support_probabilistic_vector(X, UD)
    weighted = get_weighted_itemset(X, W)
    prWF = 0
    
    for i in range(len(UD), 0, -1):
        prWF += convolution_vector[i]*weighted

        if prWF > min_conf:
            return i

    return -1

def compute_support_probabilistic_vector(X, UD):
    """calculate support probabilistic vector of itemset X in UD 

    Args:
        X (list<string>): itemset X
        UD (list<dict<string, double>>): Uncertain database

    Returns:
        support_probabilistic_vector: support probabilistic vector of itemset X in UD 
    """
    n = len(UD)
    
    # Base case
    if n == 1:
        prob = get_probability_in_transaction(X, UD[0])
        f_X = np.zeros(2)
        f_X[0] = 1 - prob
        f_X[1] = prob
        return f_X

    # Recursive case: Partition PDB
    mid = n // 2
    D1 = UD[:mid]
    D2 = UD[mid:]

    # Recursively call DC on partitions
    f_X1 = compute_support_probabilistic_vector(X, D1)
    f_X2 = compute_support_probabilistic_vector(X, D2)

    # Convolution 2 array
    # Adjust length of f_X1 and f_X2 for convolution
    len_total = len(f_X1) + len(f_X2) - 1
    f_X1 = np.pad(f_X1, (0, len_total - len(f_X1)), mode='constant')
    f_X2 = np.pad(f_X2, (0, len_total - len(f_X2)), mode='constant')

    # Convolution using FFT
    f_X = np.fft.ifft(np.fft.fft(f_X1) * np.fft.fft(f_X2)).real
    f_X = f_X[:n+1]  # Take only the first n+1 elements
    f_X = np.round(f_X, decimals=10)  # Fix numerical inaccuracies

    return f_X

In [13]:

def compute_lower_bound(expected_support, min_conf, weighted_itemset):
    """compute the lower bound of expected support itemset X

    Args:
        expected_support (double): expected support of itemset X
        min_conf (double): minimum confidence
        weighted_itemset (double): weight of itemset X

    Returns:
        lower_bound: the lower bound of expected support itemset X
    """
    quotient_conf_weighted = min_conf / weighted_itemset
    return expected_support - np.sqrt(-2 * expected_support * np.log(1 - quotient_conf_weighted))

def compute_upper_bound(expected_support, min_conf, weighted_itemset):
    """compute the upper bound of expected support itemset X

    Args:
        expected_support (double): expected support of itemset X
        min_conf (double): minimum confidence
        weighted_itemset (double): weight of itemset X

    Returns:
        upper_bound: the upper bound of expected support itemset X
    """
    quotient_conf_weighted = min_conf / weighted_itemset
    return (2 * expected_support - np.log(quotient_conf_weighted) + np.sqrt(np.log(quotient_conf_weighted)**2 - 8 * expected_support * np.log(quotient_conf_weighted))) / 2


# 2. Implement algorithms

In [14]:
def algorithms(UD, W, min_sup, min_conf):
    """implement the WPMFI algorithms

    Args:
        UD (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_sup (double): min support
        min_conf (double): min confidence

    Returns:
        list: list of weighted probabilistic maximal frequent itemset
    """
    list_result = []
    
    sorted_item_list = get_sorted_list_item(UD, W, min_sup, min_conf)
    
    map_sorted_item_list = {}
    
    for item in sorted_item_list:
        map_sorted_item_list[item[0]] = [item[1], item[2], item[3], item[4], item[5]]
    
    print("sorted list: ", sorted_item_list)
    
    root = [[], 0, 0, 0, 0, 0, []]
    
    WPMFIM(root, list_result, sorted_item_list, UD, W, min_sup, min_conf, map_sorted_item_list)
    
    return list_result
    
    
def WPMFIM(node, list_result, sorted_item_list, UD, W, min_sup, min_conf, map_sorted_item_list):
    """implement the WPMFIM method

    Args:
        node (list): current node of itemset 
        list_result (list): list of weighted probabilistic maximal frequent itemset
        sorted_item_list (list): sorted item list in the UD
        UD (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_sup (double): min support
        min_conf (double): min confidence
    """
    itemset_of_node = node[0]
    
    item_J_order_larger_than_I_List = get_itemset_j_order_larger_than_i(itemset_of_node, sorted_item_list)
    
    for index in range(0, len(item_J_order_larger_than_I_List)):
        itemJ = item_J_order_larger_than_I_List[index]
        
        tempNode = [itemJ, 0, 0, 0, 0, 0, []]
        
        tempChild = node[6]
        # tempChild.append(tempNode)
        
        support = compute_support(itemJ, UD)
        expected_support = compute_expected_support(itemJ, UD)
        weighted_J = get_weighted_itemset(itemJ, W)
        lower_bound = compute_lower_bound(expected_support, min_conf, weighted_J)
        upper_bound = compute_upper_bound(expected_support, min_conf, weighted_J)
        
        if len(itemJ) == 1:
            probabilistic_support = map_sorted_item_list[itemJ[0]][4]
            
            if probabilistic_support >= min_sup:
                
                tempChild.append(tempNode)
                WPMFIM(tempNode, list_result, sorted_item_list, UD, W, min_sup, min_conf, map_sorted_item_list)
        else:
        
            if min(support, upper_bound) < min_sup :
                continue
            
            if lower_bound >= min_sup:
                tempChild.append(tempNode)
                WPMFIM(tempNode, list_result, sorted_item_list, UD, W, min_sup, min_conf, map_sorted_item_list)
            else:
                probabilistic_support = compute_prWF(itemJ, UD, W, min_conf)
                
                if probabilistic_support >= min_sup:
                    tempChild.append(tempNode)
                    WPMFIM(tempNode, list_result, sorted_item_list, UD, W, min_sup, min_conf, map_sorted_item_list)

    isContain = False
    
    for item in list_result:
        if set(itemset_of_node).issubset(set(item)):
            isContain = True
            break
        
    if len(node[6]) == 0 and not isContain:
        list_result.append(itemset_of_node)

def get_itemset_j_order_larger_than_i(itemset_i, sorted_item_list):
    """get itemsets J have order lager than itemset I

    Args:
        itemset_i (list<item>): itemset I
        sorted_item_list (list): sorted item list in the UD

    Returns:
        list_itemset_j_order_larger_than_i: list of itemsets J have order lager than itemset I
    """
    itemset_j_order_larger_than_i = []

    sorted_item_list_size = len(sorted_item_list)
    item_i_size = len(itemset_i)
    index_of_last_item_i = -1

    # Get index of the last item of itemset in Sorted Item List
    if item_i_size > 0:
        for index in range(0, sorted_item_list_size):
            if itemset_i[-1] == sorted_item_list[index][0]:
                index_of_last_item_i = index
                break
        # index_of_last_item_i = len(sorted_item_list) - 1 - sorted_item_list[::-1].index(itemset_i[-1])

    # Return [] if the last item in itemset is the last item in Sorted Item List
    if index_of_last_item_i == len(sorted_item_list) - 1:
        return []

    for i in range(index_of_last_item_i + 1, sorted_item_list_size):
        temp_itemset = itemset_i.copy()
        temp_itemset.append(sorted_item_list[i][0])
        itemset_j_order_larger_than_i.append(temp_itemset)

    return itemset_j_order_larger_than_i

def discover_single_items(UD):
    """get all single item in uncertain database

    Args:
        UD (list<dict<string, double>>): Uncertain database

    Returns:
        list<string>: all single item in uncertain database
    """
    single_items = set()

    for Ti in UD:
        for item in Ti.keys():
            single_items.add(item)

    return single_items

def get_sorted_list_item(UD, W, min_sup, min_conf):
    """get all single item by order decrease of probabilistic support in uncertain database

    Args:
        UD (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_sup (double): min support
        min_conf (double): min confidence

    Returns:
        list: list of single item by order decrease of probabilistic support
    """
    single_items = discover_single_items(UD)
    list_single_items = []
    
    for item in single_items:
        support = compute_support([item], UD)
        
        if support < min_sup :
            continue
        
        expected_support = compute_expected_support([item], UD)
        weighted = get_weighted_itemset([item], W)
        lower_bound = compute_lower_bound(expected_support, min_conf, weighted)
        upper_bound = compute_upper_bound(expected_support, min_conf, weighted)
        
        probabilistic_support = compute_prWF([item], UD, W, min_conf)
        list_single_items.append([item, support, expected_support, lower_bound, upper_bound, probabilistic_support])
        
    return sorted(list_single_items, key=lambda x: x[5], reverse=False)


# Example

In [15]:
# Example 1
UD = [
    {'A': 0.6, 'B': 0.7},
    {'A': 0.2, 'C': 0.3},
]

weighted = {
    'A': 1,
    'B': 1,
    'C': 1
}

min_sup = 1
min_conf = 0.1

# #Example 2
UD = [
    {"A": 0.5, "B": 0.7, "D": 0.8, "E": 0.9},
    {"B": 0.6, "C": 0.8, "D": 0.6, "E": 0.8},
    {"C": 0.6, "D": 0.9, "E": 0.5},
    {"A": 0.6, "C": 0.7, "D": 0.8, "E": 0.8},
    {"A": 0.8, "B": 0.9, "C": 0.5, "D": 0.6, "E": 0.7},
    {"B": 0.6, "D": 0.9, "E": 0.8},
]

weighted = {
    "A": 0.3,
    "B": 0.9,
    "C": 0.5,
    "D": 0.6,
    "E": 0.9
}

min_sup = 2
min_conf = 0.2

print(algorithms(UD, weighted, min_sup, min_conf))

sorted list:  [['A', 3, 1.9000000000000001, np.float64(-0.14321479461627296), np.float64(3.3604558540228453), 2], ['C', 4, 2.5999999999999996, np.float64(0.970186132227624), np.float64(5.288528512859422), 3], ['B', 4, 2.8, np.float64(1.6136776161712727), np.float64(6.550104012189765), 4], ['D', 6, 4.6, np.float64(2.6686069808050163), np.float64(8.375599729231398), 5], ['E', 6, 4.5, np.float64(2.9960618847412124), np.float64(9.007336126669923), 5]]
[['A', 'E'], ['C', 'D', 'E'], ['B', 'D', 'E']]


# Real dataset

In [16]:
def read_dataset(file_path):
    uncertain_database = []

    # Initialize normal distribution
    random.seed(12345)

    try:
        with open(file_path, 'r') as file:
            curr_id_transaction = 0
            count_num_line = 1

            for line in file:
                data_line_transaction = line.strip().split(" ")
                cur_transaction = {}

                for item in data_line_transaction:
                    data_item = item.split("-")
                    value = data_item[0]
                    prob = float(data_item[1])
                    cur_transaction[value] = prob

                uncertain_database.append(cur_transaction)
                curr_id_transaction += 1
                count_num_line += 1

    except IOError as e:
        print(e)

    print(uncertain_database)

    return uncertain_database

def generate_weighted_table(uncertain_database):
    weighted_table = {}
    distinct_itemset_database = discover_single_items(uncertain_database)

    random.seed(12345)

    for item in distinct_itemset_database:
        weighted = round(random.random(), 1)
        if weighted <= 0:
            weighted = 0.1
        else:
            if weighted >= 1:
                weighted = 0.9
        weighted_table[item] = weighted

    return weighted_table

In [17]:
path = './data/connect4_10K.data'

UD = read_dataset(path)
W = generate_weighted_table(UD)

for i in range(0, 5):
    print(UD[i])

[{'22': 0.9, '88': 0.5, '67': 0.4, '46': 0.4, '25': 0.3, '49': 0.2, '28': 0.7, '112': 0.5, '115': 0.8, '91': 0.6, '118': 0.9, '70': 0.7, '94': 0.8, '73': 0.3, '52': 0.5, '31': 1.0, '97': 0.5, '10': 0.3, '76': 0.0, '55': 0.2, '34': 0.5, '13': 0.1, '79': 0.2, '58': 0.3, '37': 0.1, '16': 0.3, '19': 0.2, '121': 0.6, '1': 0.4, '100': 0.7, '124': 0.2, '4': 0.7, '103': 0.1, '127': 0.4, '7': 0.8, '106': 0.9, '82': 0.6, '109': 1.0, '61': 0.5, '40': 0.9, '85': 0.1, '64': 0.4, '43': 0.7}, {'22': 0.9, '44': 0.6, '88': 0.1, '67': 0.1, '46': 0.4, '25': 1.0, '49': 0.8, '28': 0.9, '112': 0.1, '115': 0.4, '91': 0.9, '118': 0.4, '70': 0.3, '94': 0.2, '73': 0.1, '52': 0.6, '31': 0.8, '97': 0.5, '10': 0.9, '55': 0.7, '77': 0.5, '34': 0.6, '13': 0.9, '79': 0.3, '58': 0.9, '37': 0.9, '16': 0.1, '19': 0.7, '121': 0.7, '1': 0.1, '100': 0.5, '124': 0.5, '4': 0.2, '103': 0.3, '127': 0.4, '7': 0.1, '106': 0.5, '82': 0.8, '109': 0.7, '61': 0.9, '40': 1.0, '85': 0.6, '64': 0.1}, {'44': 0.5, '88': 0.4, '23': 0.1, '

In [18]:

# Start the timer
start_time = time.time()

min_sup = 0.1*10000
min_conf = 0.6
# Your code here
result = algorithms(UD, W, min_sup, min_conf)
print(result)
print(len(result))

# End the timer
end_time = time.time()

# Calculate the runtime
runtime = end_time - start_time
print(f"Runtime: {runtime} seconds")


  return expected_support - np.sqrt(-2 * expected_support * np.log(1 - quotient_conf_weighted))
  return (2 * expected_support - np.log(quotient_conf_weighted) + np.sqrt(np.log(quotient_conf_weighted)**2 - 8 * expected_support * np.log(quotient_conf_weighted))) / 2


KeyboardInterrupt: 