# Implement Probabilistic maximal frequent itemset mining methods over uncertain databases

In [1]:
import numpy as np
import random
import time


# 1. Preliminary

In [2]:
def get_probability_in_transaction(X, T):
    """get the probabilistic of itemset in transaction T

    Args:
        X (list<string>): itemset X
        T (dict<string, double>): transaction T

    Returns:
        prob: the probabilistic of itemset in transaction T
    """
    prob = 1.0
    for i in X:
        prob *= T.get(i, 0.0)
    return prob

def get_weighted_itemset(X, W):
    """get the weight of itemset in weighted table

    Args:
        X (list<string>): itemset X
        W (dict<string, double>): weighted table W

    Returns:
        weight of itemset: the weight of itemset in weighted table
    """
    weighted = 0

    for i in X:
        weighted += W.get(i, 0.0)

    return weighted / len(X)

def compute_support(X, D):
    """compute the support of itemset X

    Args:
        X (list<string>): itemset X
        D (list<dict<string, double>>): Uncertain database

    Returns:
        support: the support of itemset X
    """
    support = 0

    for Ti in D:
        probX= get_probability_in_transaction(X, Ti)
        
        if probX > 0:
            support += 1

    return support

def compute_expected_support(X, D):
    """compute the expected support of itemset X

    Args:
        X (list<string>): itemset X
        D (list<dict<string, double>>): Uncertain database

    Returns:
        expected_support: the expected support of itemset X
    """
    expected_support = 0

    for Ti in D:
        expected_support += get_probability_in_transaction(X, Ti)

    return expected_support

In [3]:
def compute_prWF(X, D, W, min_conf):
    """compute the probabilistic support of itemset X

    Args:
        X (list<string>): itemset X
        D (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_conf (double): min confidence

    Returns:
        probabilistic support vector: the probabilistic support vector of itemset X
    """
    convolution_vector = compute_support_probabilistic_vector(X, D)
    weighted = get_weighted_itemset(X, W)
    prWF = 0
    
    for i in range(len(D), 0, -1):
        prWF += convolution_vector[i]*weighted

        if prWF > min_conf:
            return i

    return -1

def compute_support_probabilistic_vector(X, D):
    """calculate support probabilistic vector of itemset X in D 

    Args:
        X (list<string>): itemset X
        D (list<dict<string, double>>): Uncertain database

    Returns:
        support_probabilistic_vector: support probabilistic vector of itemset X in D 
    """
    n = len(D)
    
    # Base case
    if n == 1:
        prob = get_probability_in_transaction(X, D[0])
        f_X = np.zeros(2)
        f_X[0] = 1 - prob
        f_X[1] = prob
        return f_X

    # Recursive case: Partition PDB
    mid = n // 2
    D1 = D[:mid]
    D2 = D[mid:]

    # Recursively call DC on partitions
    f_X1 = compute_support_probabilistic_vector(X, D1)
    f_X2 = compute_support_probabilistic_vector(X, D2)

    # Convolution 2 array
    # Adjust length of f_X1 and f_X2 for convolution
    len_total = len(f_X1) + len(f_X2) - 1
    f_X1 = np.pad(f_X1, (0, len_total - len(f_X1)), mode='constant')
    f_X2 = np.pad(f_X2, (0, len_total - len(f_X2)), mode='constant')

    # Convolution using FFT
    f_X = np.fft.ifft(np.fft.fft(f_X1) * np.fft.fft(f_X2)).real
    f_X = f_X[:n+1]  # Take only the first n+1 elements
    f_X = np.round(f_X, decimals=10)  # Fix numerical inaccuracies

    return f_X

In [4]:

def compute_lower_bound(expected_support, min_conf, weighted_itemset):
    """compute the lower bound of expected support itemset X

    Args:
        expected_support (double): expected support of itemset X
        min_conf (double): minimum confidence
        weighted_itemset (double): weight of itemset X

    Returns:
        lower_bound: the lower bound of expected support itemset X
    """
    quotient_conf_weighted = min_conf / weighted_itemset
    return expected_support - np.sqrt(-2 * expected_support * np.log(1 - quotient_conf_weighted))

def compute_upper_bound(expected_support, min_conf, weighted_itemset):
    """compute the upper bound of expected support itemset X

    Args:
        expected_support (double): expected support of itemset X
        min_conf (double): minimum confidence
        weighted_itemset (double): weight of itemset X

    Returns:
        upper_bound: the upper bound of expected support itemset X
    """
    quotient_conf_weighted = min_conf / weighted_itemset
    return (2 * expected_support - np.log(quotient_conf_weighted) + np.sqrt(np.log(quotient_conf_weighted)**2 - 8 * expected_support * np.log(quotient_conf_weighted))) / 2


# 2. Implement algorithms

In [5]:
def algorithms(D, W, min_sup, min_conf):
    """implement the WPMFI algorithms

    Args:
        D (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_sup (double): min support
        min_conf (double): min confidence

    Returns:
        list_result: list of weighted probabilistic maximal frequent itemset
    """
    list_result = []
    
    sorted_item_list = get_sorted_list_item(D, W, min_sup, min_conf)
    
    map_sorted_item_list = {}
    for item in sorted_item_list:
        map_sorted_item_list[item[0]] = [item[1], item[2], item[3], item[4], item[5]]
        
    root = [[], 0, 0, 0, 0, 0, []]
    WPMFIM(root, list_result, sorted_item_list, D, W, min_sup, min_conf, map_sorted_item_list)
    
    return list_result
    
    
def WPMFIM(node, list_result, sorted_item_list, D, W, min_sup, min_conf, map_sorted_item_list):
    """implement the WPMFIM method

    Args:
        node (list): current node of itemset 
        list_result (list): list of weighted probabilistic maximal frequent itemset
        sorted_item_list (list): sorted item list in the D
        D (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_sup (double): min support
        min_conf (double): min confidence
    """
    itemset_of_node = node[0]
    
    item_J_order_larger_than_I_List = get_itemset_j_order_larger_than_i(itemset_of_node, sorted_item_list)
    
    for index in range(0, len(item_J_order_larger_than_I_List)):
        itemJ = item_J_order_larger_than_I_List[index]
        
        tempNode = [itemJ, 0, 0, 0, 0, 0, []]
        
        tempChild = node[6]
        # tempChild.append(tempNode)
        
        support = compute_support(itemJ, D)
        expected_support = compute_expected_support(itemJ, D)
        weighted_J = get_weighted_itemset(itemJ, W)
        lower_bound = compute_lower_bound(expected_support, min_conf, weighted_J)
        upper_bound = compute_upper_bound(expected_support, min_conf, weighted_J)
        
        if len(itemJ) == 1:
            probabilistic_support = map_sorted_item_list[itemJ[0]][4]
            
            if probabilistic_support >= min_sup:
                
                tempChild.append(tempNode)
                WPMFIM(tempNode, list_result, sorted_item_list, D, W, min_sup, min_conf, map_sorted_item_list)
        else:
        
            if min(support, upper_bound) < min_sup :
                continue
            
            if lower_bound >= min_sup:
                tempChild.append(tempNode)
                WPMFIM(tempNode, list_result, sorted_item_list, D, W, min_sup, min_conf, map_sorted_item_list)
            else:
                probabilistic_support = compute_prWF(itemJ, D, W, min_conf)
                
                if probabilistic_support >= min_sup:
                    tempChild.append(tempNode)
                    WPMFIM(tempNode, list_result, sorted_item_list, D, W, min_sup, min_conf, map_sorted_item_list)

    isContain = False
    
    for item in list_result:
        if set(itemset_of_node).issubset(set(item)):
            isContain = True
            break
        
    if len(node[6]) == 0 and not isContain:
        list_result.append(itemset_of_node)

def get_itemset_j_order_larger_than_i(itemset_i, sorted_item_list):
    """get itemsets J have order lager than itemset I

    Args:
        itemset_i (list<item>): itemset I
        sorted_item_list (list): sorted item list in the D

    Returns:
        list_itemset_j_order_larger_than_i: list of itemsets J have order lager than itemset I
    """
    itemset_j_order_larger_than_i = []

    sorted_item_list_size = len(sorted_item_list)
    item_i_size = len(itemset_i)
    index_of_last_item_i = -1

    # Get index of the last item of itemset in Sorted Item List
    if item_i_size > 0:
        for index in range(0, sorted_item_list_size):
            if itemset_i[-1] == sorted_item_list[index][0]:
                index_of_last_item_i = index
                break
        # index_of_last_item_i = len(sorted_item_list) - 1 - sorted_item_list[::-1].index(itemset_i[-1])

    # Return [] if the last item in itemset is the last item in Sorted Item List
    if index_of_last_item_i == len(sorted_item_list) - 1:
        return []

    for i in range(index_of_last_item_i + 1, sorted_item_list_size):
        temp_itemset = itemset_i.copy()
        temp_itemset.append(sorted_item_list[i][0])
        itemset_j_order_larger_than_i.append(temp_itemset)

    return itemset_j_order_larger_than_i

def discover_single_items(D):
    """get all single item in uncertain database

    Args:
        D (list<dict<string, double>>): Uncertain database

    Returns:
        list<string>: all single item in uncertain database
    """
    single_items = set()

    for Ti in D:
        for item in Ti.keys():
            single_items.add(item)

    return single_items

def get_sorted_list_item(D, W, min_sup, min_conf):
    """get all single item by order decrease of probabilistic support in uncertain database

    Args:
        D (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_sup (double): min support
        min_conf (double): min confidence

    Returns:
        list: list of single item by order decrease of probabilistic support
    """
    single_items = discover_single_items(D)
    list_single_items = []
    
    for item in single_items:
        support = compute_support([item], D)
        
        if support < min_sup :
            continue
        
        expected_support = compute_expected_support([item], D)
        weighted = get_weighted_itemset([item], W)
        lower_bound = compute_lower_bound(expected_support, min_conf, weighted)
        upper_bound = compute_upper_bound(expected_support, min_conf, weighted)
        
        probabilistic_support = compute_prWF([item], D, W, min_conf)
        list_single_items.append([item, support, expected_support, lower_bound, upper_bound, probabilistic_support])
        
    return sorted(list_single_items, key=lambda x: x[5], reverse=False)


# 3. Example

### 3.1 Example 1

In [6]:
D = [
    {'A': 0.6, 'B': 0.7},
    {'A': 0.2, 'C': 0.3},
]

W = {
    'A': 1,
    'B': 1,
    'C': 1
}

min_sup = 1
min_conf = 0.1

print(algorithms(D, W, min_sup, min_conf))

[['B', 'A'], ['C']]


### 3.2 Example 2

In [7]:
D = [
    {"1": 0.5, "2": 0.7, "4": 0.8, "5": 0.9},
    {"2": 0.6, "3": 0.8, "4": 0.6, "5": 0.8},
    {"3": 0.6, "4": 0.9, "5": 0.5},
    {"1": 0.6, "3": 0.7, "4": 0.8, "5": 0.8},
    {"1": 0.8, "2": 0.9, "3": 0.5, "4": 0.6, "5": 0.7},
    {"2": 0.6, "4": 0.9, "5": 0.8},
]

W = {
    "1": 0.3,
    "2": 0.9,
    "3": 0.5,
    "4": 0.6,
    "5": 0.9
}
min_sup = 2
min_conf = 0.2

print(algorithms(D, W, min_sup, min_conf))

[['1', '5'], ['3', '5', '4'], ['2', '5', '4']]


# Read dataset

In [8]:
def read_dataset(file_path, mean, variance):
    uncertain_database = []

    # Initialize normal distribution
    random.seed(12345)

    try:
        with open(file_path, 'r') as file:
            curr_id_transaction = 0
            count_num_line = 1

            for line in file:
                data_line_transaction = line.strip().split(" ")
                cur_transaction = {}

                for item in data_line_transaction:
                    data_item = item.split("-")
                    value = data_item[0]
                    prob = float(data_item[1])
                    cur_transaction[value] = prob

                uncertain_database.append(cur_transaction)
                curr_id_transaction += 1
                count_num_line += 1
    except IOError as e:
        print(e)

    print(uncertain_database)

    return uncertain_database

def generate_weighted_table(uncertain_database):
    weighted_table = {}
    distinct_itemset_database = discover_single_items(uncertain_database)

    random.seed(12345)

    for item in distinct_itemset_database:
        weighted = round(random.random(), 1)
        if weighted <= 0:
            weighted = 0.1
        else:
            if weighted >= 1:
                weighted = 0.9
        weighted_table[item] = weighted

    return weighted_table

# 4. Dataset

In [9]:
def read_dataset(file_path, mean, variance):
    uncertain_database = []

    # Initialize normal distribution
    random.seed(12345)

    try:
        with open(file_path, 'r') as file:
            curr_id_transaction = 0
            count_num_line = 1

            for line in file:
                data_line_transaction = line.strip().split(" ")
                cur_transaction = {}

                for item in data_line_transaction:
                    data_item = item.split("-")
                    value = data_item[0]
                    prob = float(data_item[1])
                    cur_transaction[value] = prob

                uncertain_database.append(cur_transaction)
                curr_id_transaction += 1
                count_num_line += 1
    except IOError as e:
        print(e)

    print(uncertain_database)

    return uncertain_database

def generate_weighted_table(uncertain_database):
    weighted_table = {}
    distinct_itemset_database = discover_single_items(uncertain_database)

    random.seed(12345)

    for item in distinct_itemset_database:
        weighted = round(random.random(), 1)
        if weighted <= 0:
            weighted = 0.1
        else:
            if weighted >= 1:
                weighted = 0.9
        weighted_table[item] = weighted

    return weighted_table

## Dataset

In [10]:
path = './data/accidents_10K.data'
path = './data/connect4_10K.data'
path = './data/T40I10D100K_10K.data'
path = './data/us_10K.data'
D = read_dataset(path, 0.78, 0.65)
W = generate_weighted_table(D)

for i in range(0, 5):
    print(D[i])

[{'7': 0.4, '10': 0.7, '21': 0.8, '50': 0.3, '58': 0.1, '61': 0.3, '64': 0.2, '65': 0.9, '75': 0.3, '97': 0.7, '103': 1.0, '120': 0.5, '128': 0.1, '130': 0.9, '142': 0.7, '155': 0.4, '156': 0.2, '159': 0.5, '167': 0.2, '183': 0.3, '185': 0.5, '189': 0.4, '192': 0.4, '203': 0.7, '213': 0.3, '216': 0.0, '222': 0.2, '225': 0.6, '230': 0.1, '253': 0.5, '270': 0.6, '279': 0.8, '285': 0.5, '289': 0.7, '293': 0.1, '307': 0.9, '310': 1.0, '316': 0.5, '329': 0.8, '335': 0.9, '344': 0.6, '349': 0.2, '359': 0.4, '361': 0.1, '364': 0.2, '373': 0.1, '387': 0.9, '394': 0.9}, {'5': 0.1, '10': 0.7, '22': 0.9, '44': 1.0, '56': 0.9, '61': 0.8, '64': 0.6, '65': 0.9, '75': 1.0, '99': 0.6, '105': 0.4, '120': 0.8, '128': 0.6, '130': 0.7, '142': 0.9, '155': 0.9, '156': 0.1, '161': 0.1, '167': 0.3, '183': 0.1, '185': 0.5, '189': 0.3, '192': 0.8, '203': 0.6, '213': 0.1, '215': 0.9, '220': 0.2, '225': 0.5, '229': 0.5, '253': 0.3, '270': 0.5, '279': 0.7, '285': 0.1, '289': 0.4, '293': 0.4, '307': 0.7, '313': 0.5

## Min support and minconfidence

In [11]:
min_sup = 0.1*10000
min_conf = 0.7

# 5. Run algorithm

In [12]:
# Start the timer
start_time = time.time()

# Your code here
result = algorithms(D, W, min_sup, min_conf)
print(result)
print(len(result))

# End the timer
end_time = time.time()

# Calculate the runtime
runtime = end_time - start_time
print(f"Runtime: {runtime} seconds")


  return expected_support - np.sqrt(-2 * expected_support * np.log(1 - quotient_conf_weighted))
  return (2 * expected_support - np.log(quotient_conf_weighted) + np.sqrt(np.log(quotient_conf_weighted)**2 - 8 * expected_support * np.log(quotient_conf_weighted))) / 2
  return expected_support - np.sqrt(-2 * expected_support * np.log(1 - quotient_conf_weighted))


[['330'], ['201'], ['120'], ['360'], ['386'], ['158'], ['347'], ['308'], ['365'], ['159', '61'], ['159', '307'], ['159', '213'], ['279', '61'], ['279', '156'], ['279', '307'], ['279', '213'], ['223', '269'], ['223', '128'], ['223', '394'], ['334', '213'], ['185', '307'], ['185', '213'], ['185', '394'], ['10', '307'], ['10', '213'], ['10', '394'], ['215', '128'], ['215', '213'], ['215', '394'], ['269', '128'], ['269', '213'], ['269', '394'], ['61', '156'], ['61', '307'], ['61', '128'], ['61', '213'], ['61', '394'], ['156', '307'], ['156', '128'], ['156', '213'], ['156', '394'], ['307', '128'], ['307', '213'], ['307', '394'], ['128', '213'], ['128', '394'], ['213', '394']]
47
Runtime: 139.40361762046814 seconds
