# Implement Approximation of Probabilistic Maximal Frequent Itemset Mining Over Uncertain Sensed Data

In [45]:
import numpy as np
import random
from itertools import combinations
import time


# 1. Preliminary

In [46]:
def get_probability_in_transaction(X, T):
    """get the probabilistic of itemset in transaction T

    Args:
        X (list<string>): itemset X
        T (dict<string, double>): transaction T

    Returns:
        prob: the probabilistic of itemset in transaction T
    """
    prob = 1.0
    for i in X:
        prob *= T.get(i, 0.0)
    return prob

def get_weighted_itemset(X, W):
    """get the weight of itemset in weighted table

    Args:
        X (list<string>): itemset X
        W (dict<string, double>): weighted table W

    Returns:
        weight of itemset: the weight of itemset in weighted table
    """
    weighted = 0

    for i in X:
        weighted += W.get(i, 0.0)

    return weighted / len(X)

def compute_support(X, D):
    """compute the support of itemset X

    Args:
        X (list<string>): itemset X
        D (list<dict<string, double>>): Uncertain database

    Returns:
        support: the support of itemset X
    """
    support = 0

    for Ti in D:
        probX= get_probability_in_transaction(X, Ti)
        
        if probX > 0:
            support += 1

    return support

def compute_expected_support(X, D):
    """compute the expected support of itemset X

    Args:
        X (list<string>): itemset X
        D (list<dict<string, double>>): Uncertain database

    Returns:
        expected_support: the expected support of itemset X
    """
    expected_support = 0

    for Ti in D:
        expected_support += get_probability_in_transaction(X, Ti)

    return expected_support

In [47]:
def compute_prWF(X, D, W, min_conf):
    """compute the probabilistic support of itemset X

    Args:
        X (list<string>): itemset X
        D (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_conf (double): min confidence

    Returns:
        probabilistic support vector: the probabilistic support vector of itemset X
    """
    convolution_vector = compute_support_probabilistic_vector(X, D)
    weighted = get_weighted_itemset(X, W)
    prWF = 0
    
    for i in range(len(D), 0, -1):
        prWF += convolution_vector[i]*weighted

        if prWF > min_conf:
            return i

    return -1

def compute_support_probabilistic_vector(X, D):
    """calculate support probabilistic vector of itemset X in D 

    Args:
        X (list<string>): itemset X
        D (list<dict<string, double>>): Uncertain database

    Returns:
        support_probabilistic_vector: support probabilistic vector of itemset X in D 
    """
    n = len(D)
    
    # Base case
    if n == 1:
        prob = get_probability_in_transaction(X, D[0])
        f_X = np.zeros(2)
        f_X[0] = 1 - prob
        f_X[1] = prob
        return f_X

    # Recursive case: Partition PDB
    mid = n // 2
    D1 = D[:mid]
    D2 = D[mid:]

    # Recursively call DC on partitions
    f_X1 = compute_support_probabilistic_vector(X, D1)
    f_X2 = compute_support_probabilistic_vector(X, D2)

    # Convolution 2 array
    # Adjust length of f_X1 and f_X2 for convolution
    len_total = len(f_X1) + len(f_X2) - 1
    f_X1 = np.pad(f_X1, (0, len_total - len(f_X1)), mode='constant')
    f_X2 = np.pad(f_X2, (0, len_total - len(f_X2)), mode='constant')

    # Convolution using FFT
    f_X = np.fft.ifft(np.fft.fft(f_X1) * np.fft.fft(f_X2)).real
    f_X = f_X[:n+1]  # Take only the first n+1 elements
    f_X = np.round(f_X, decimals=10)  # Fix numerical inaccuracies

    return f_X

In [48]:
def lower_bound_expected_support(min_sup, min_conf):
    return (2*min_sup - np.log(min_conf) - np.sqrt((np.log(min_conf))**2 - 8*min_sup*np.log(min_conf))) / 2

def upper_bound_expected_support(min_sup, min_conf):
    return min_sup - np.log(1 - min_conf) + np.sqrt(np.log(1 - min_conf)**2 - 2*min_sup*np.log(1 - min_conf))

# 2. Algorithms

In [49]:
def discover_single_items(D):
    # Implement discovery of single items using Chernoff-Hoeffding bound
    single_items = set()

    for Ti in D:
        for item in Ti.keys():
            single_items.add(item)

    return single_items

def candidate_generate_expected_bound(D, W, min_sup, min_conf):
    """generate candidate from expected bound

    Args:
        D (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_sup (double): min support
        min_conf (double): min confidence

    Returns:
        list_candidate: list of candidate is generated from expected bound
    """
    list_candidate = []
    i = 1
    L = discover_single_items(D)

    while True:
        Ci = []

        combinations_list = list(combinations(L, i))

        for item in combinations_list:
            E = 0
            Var = 0
            count = 0

            for j in range(0, len(D)):
                Tj = D[j]
                # item = list(item)
                probX = get_probability_in_transaction(item, Tj)
                weightedX = get_weighted_itemset(item, W)

                if probX > 0:
                    E += probX
                    Var += probX * (1 - probX)
                    count += 1

                    if E  >= lower_bound_expected_support(min_sup, min_conf / weightedX) and count >= min_sup:
                        Ci.append((item, E, Var, j))
                        
                        break

        i += 1
        list_candidate.append(Ci)

        L = set()

        for c in Ci:
            for item in c[0]:
              L.add(item)

        if len(L) == 0:
            return list_candidate


In [50]:
def algorithms(D, W, min_sup, min_conf):
    """implement the wPMFI-MAX algorithm

    Args:
        D (list<dict<string, double>>): Uncertain database
        W (dict<string, double): Weighted table
        min_sup (double): min support
        min_conf (double): min confidence

    Returns:
        RES: list of weighted probabilistic maximal frequent itemset
    """

    # Step to obtain candidates with algorithm 1 is assumed to be done separately
    candidate = candidate_generate_expected_bound(D, W, min_sup, min_conf)

    Fre_Pre = []
    Fre_Cur = []
    RES = []

    n = len(candidate)

    for i in range(n, 0, -1):
        for j in range(len(candidate[i-1])):
            X = candidate[i-1][j]

            if X in Fre_Pre and all(item in X for item in Fre_Pre):
                Fre_Cur.append(X)
                continue
            
            frequent = False
            
            # if X[1] >= upper_bound_expected_support(min_sup, min_conf):
            #     frequent = True
            #     # isContain = False
            #     # for y in RES:
            #     #     if set(X[0]).issubset(set(y)):
            #     #         isContain = True
            #     #         break
            #     # if not isContain:
            #     #     RES.append(list(X[0]))
            #     # Fre_Cur.append(X)

            # else: 
            if compute_prWF(X[0], D, W, min_conf) >= min_sup:
                frequent = True
                # isContain = False
                # for y in RES:
                #     if set(X[0]).issubset(set(y)):
                #         isContain = True
                #         break
                # if not isContain:
                #     RES.append(list(X[0]))
                # Fre_Cur.append(X)
                
            if frequent:
                isContain = False
                for y in RES:
                    if set(X[0]).issubset(set(y)):
                        isContain = True
                        break
                if not isContain:
                    RES.append(list(X[0]))
                Fre_Cur.append(X)

        Fre_Pre = Fre_Cur
        Fre_Cur = []

    return RES


# 3. Example

### 3.1 Example 1

In [51]:
D = [
    {'A': 0.6, 'B': 0.7},
    {'A': 0.2, 'C': 0.3},
]

weighted = {
    'A': 1,
    'B': 1,
    'C': 1
}

min_sup = 1
min_conf = 0.1

print(algorithms(D, weighted, min_sup, min_conf))

[['B', 'A'], ['C']]


### 3.2 Example 2

In [52]:
D = [
    {"A": 0.5, "B": 0.7, "D": 0.8, "E": 0.9},
    {"B": 0.6, "C": 0.8, "D": 0.6, "E": 0.8},
    {"C": 0.6, "D": 0.9, "E": 0.5},
    {"A": 0.6, "C": 0.7, "D": 0.8, "E": 0.8},
    {"A": 0.8, "B": 0.9, "C": 0.5, "D": 0.6, "E": 0.7},
    {"B": 0.6, "D": 0.9, "E": 0.8},
]

weighted = {
    "A": 0.3,
    "B": 0.9,
    "C": 0.5,
    "D": 0.6,
    "E": 0.9
}

min_sup = 2
min_conf = 0.2

print(algorithms(D, weighted, min_sup, min_conf))

[['C', 'D', 'E'], ['D', 'E', 'B'], ['E', 'A']]


# Real dataset

In [53]:
def read_dataset(file_path, mean, variance):
    uncertain_database = []

    # Initialize normal distribution
    random.seed(12345)

    try:
        with open(file_path, 'r') as file:
            curr_id_transaction = 0
            count_num_line = 1

            for line in file:
                data_line_transaction = line.strip().split(" ")
                cur_transaction = {}

                for item in data_line_transaction:
                    data_item = item.split("-")
                    value = data_item[0]
                    prob = float(data_item[1])
                    cur_transaction[value] = prob

                uncertain_database.append(cur_transaction)
                curr_id_transaction += 1
                count_num_line += 1
    except IOError as e:
        print(e)

    print(uncertain_database)

    return uncertain_database

def generate_weighted_table(uncertain_database):
    weighted_table = {}
    distinct_itemset_database = discover_single_items(uncertain_database)

    random.seed(12345)

    for item in distinct_itemset_database:
        weighted = round(random.random(), 1)
        if weighted <= 0:
            weighted = 0.1
        else:
            if weighted >= 1:
                weighted = 0.9
        weighted_table[item] = weighted

    return weighted_table

In [54]:
path = './data/us_10K.data'

D = read_dataset(path, 0.78, 0.65)
W = generate_weighted_table(D)

for i in range(0, 5):
    print(D[i])

[{'22': 0.9, '88': 0.5, '67': 0.4, '46': 0.4, '25': 0.3, '49': 0.2, '28': 0.7, '112': 0.5, '115': 0.8, '91': 0.6, '118': 0.9, '70': 0.7, '94': 0.8, '73': 0.3, '52': 0.5, '31': 1.0, '97': 0.5, '10': 0.3, '76': 0.0, '55': 0.2, '34': 0.5, '13': 0.1, '79': 0.2, '58': 0.3, '37': 0.1, '16': 0.3, '19': 0.2, '121': 0.6, '1': 0.4, '100': 0.7, '124': 0.2, '4': 0.7, '103': 0.1, '127': 0.4, '7': 0.8, '106': 0.9, '82': 0.6, '109': 1.0, '61': 0.5, '40': 0.9, '85': 0.1, '64': 0.4, '43': 0.7}, {'22': 0.9, '44': 0.6, '88': 0.1, '67': 0.1, '46': 0.4, '25': 1.0, '49': 0.8, '28': 0.9, '112': 0.1, '115': 0.4, '91': 0.9, '118': 0.4, '70': 0.3, '94': 0.2, '73': 0.1, '52': 0.6, '31': 0.8, '97': 0.5, '10': 0.9, '55': 0.7, '77': 0.5, '34': 0.6, '13': 0.9, '79': 0.3, '58': 0.9, '37': 0.9, '16': 0.1, '19': 0.7, '121': 0.7, '1': 0.1, '100': 0.5, '124': 0.5, '4': 0.2, '103': 0.3, '127': 0.4, '7': 0.1, '106': 0.5, '82': 0.8, '109': 0.7, '61': 0.9, '40': 1.0, '85': 0.6, '64': 0.1}, {'44': 0.5, '88': 0.4, '23': 0.1, '

In [55]:
# Start the timer
start_time = time.time()

min_sup = 0.1*10000
wfpt = 0.7
# Your code here
result = algorithms(D, W, min_sup, wfpt)
print(result)
print(len(result))

# End the timer
end_time = time.time()

# Calculate the runtime
runtime = end_time - start_time
print(f"Runtime: {runtime} seconds")


  return (2*min_sup - np.log(min_conf) - np.sqrt((np.log(min_conf))**2 - 8*min_sup*np.log(min_conf))) / 2


[]
0
Runtime: 4.850615739822388 seconds
