In [None]:
from collections import defaultdict

def get_frequent_itemsets(baskets, support_threshold, num_buckets=1_000_003):
    """Find frequent item pairs with the two-pass PCY (Park-Chen-Yu) scheme.

    Pass 1 counts every single item and, for every pair in every basket,
    increments the counter of the hash bucket the pair falls into.  Pass 2
    counts only the pairs whose two items are both frequent and whose bucket
    reached the threshold; pairs failing either test cannot be frequent.

    Args:
        baskets: Iterable of baskets; each basket is an iterable of hashable,
            mutually orderable items.  An item repeated inside one basket is
            counted once for that basket.
        support_threshold: Minimum number of baskets an item or pair must
            occur in to be considered frequent.
        num_buckets: Size of the pair hash table.  It must be the same for
            every basket so that a given pair always lands in the same bucket.

    Returns:
        dict mapping each frequent pair, as a sorted 2-tuple, to the number
        of baskets that contain it.

    Raises:
        ValueError: If ``num_buckets`` is not positive.
    """
    from itertools import combinations

    if num_buckets <= 0:
        raise ValueError("num_buckets must be a positive integer")

    # Normalise once: de-duplicate and sort every basket so that each pair is
    # always produced as the same (smaller, larger) tuple, and so that the
    # input can be walked twice even when it is a one-shot iterator.
    baskets = [sorted(set(basket)) for basket in baskets]

    # Pass 1: count individual items and hash every pair to a bucket.
    item_count = defaultdict(int)
    bucket_count = defaultdict(int)
    for basket in baskets:
        for item in basket:
            item_count[item] += 1
        for pair in combinations(basket, 2):
            bucket_count[hash(pair) % num_buckets] += 1

    # Between the passes: keep only what can still contribute to a frequent
    # pair.  The set of frequent buckets plays the role of the PCY bitmap.
    frequent_items = {
        item for item, count in item_count.items() if count >= support_threshold
    }
    frequent_buckets = {
        bucket for bucket, count in bucket_count.items() if count >= support_threshold
    }

    # Pass 2: count the surviving candidate pairs exactly.
    pair_count = defaultdict(int)
    for basket in baskets:
        candidates = [item for item in basket if item in frequent_items]
        for pair in combinations(candidates, 2):
            if hash(pair) % num_buckets in frequent_buckets:
                pair_count[pair] += 1

    # A frequent bucket may still contain infrequent pairs (hash collisions),
    # so the exact counts are filtered one last time.
    return {
        pair: count for pair, count in pair_count.items() if count >= support_threshold
    }

# Example usage: six small baskets over the items 1-4.
baskets = [[1, 2, 3], [1, 2, 4], [1, 3, 4], [2, 3], [2, 4], [3, 4]]
support_threshold = 2  # counts below this value are treated as infrequent

frequent_pairs = get_frequent_itemsets(
    baskets=baskets,
    support_threshold=support_threshold,
)
print(f"Frequent pairs: {frequent_pairs}")

Frequent pairs: {(1, 2): 2, (1, 3): 2, (2, 3): 2, (1, 4): 2, (2, 4): 2, (3, 4): 2}


In [None]:
import pandas as pd

# Load the dataset from the uploaded CSV file into a pandas DataFrame.
# NOTE(review): the '/content/' prefix looks like a Google Colab upload
# location -- adjust the path when running elsewhere.
file_path = '/content/Datasetcutted.csv'
dataset = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure,
# together with the column labels.  A notebook only renders this bare
# expression when it is the last statement of its cell; in a plain script
# it has no visible effect.
dataset.head(), dataset.columns


import numpy as np
from collections import defaultdict

def pcy_algorithm(data, support_threshold, num_buckets=1_000_003):
    """
    PCY Algorithm for Frequent Pattern Mining (frequent pairs).

    Every row of ``data`` is one transaction; a cell equal to 1 means the
    item of that column is present in the transaction.

    Args:
    data: Pandas DataFrame containing binary data for items
    support_threshold: Minimum number of transactions an item or pair must
        occur in to be considered frequent
    num_buckets: Size of the hash table used to pre-filter pairs.  It only
        affects how many candidate pairs are counted exactly, never the
        result.

    Returns:
    frequent_itemsets: List of ``((i, j), count)`` tuples, where ``i < j``
        are column *positions* (not labels) in ``data`` and ``count`` is the
        number of transactions containing both items.  Pairs appear in the
        order in which they are first encountered.

    Raises:
    ValueError: If ``num_buckets`` is not positive.
    """
    from itertools import combinations

    if num_buckets <= 0:
        raise ValueError("num_buckets must be a positive integer")

    # One vectorised comparison replaces two slow ``iterrows()`` sweeps.  Each
    # transaction becomes the ascending list of column positions holding a 1,
    # as plain Python ints so the returned pairs are ordinary int tuples.
    present = (data == 1).to_numpy()
    transactions = [np.flatnonzero(row).tolist() for row in present]

    # First pass: count item frequencies and hash bucket counts
    item_counts = defaultdict(int)
    bucket_counts = defaultdict(int)
    for items in transactions:
        for item in items:
            item_counts[item] += 1
        for pair in combinations(items, 2):
            bucket_counts[hash(pair) % num_buckets] += 1

    # Determine frequent items and the set of frequent buckets (the "bitmap")
    frequent_items = {
        item for item, count in item_counts.items() if count >= support_threshold
    }
    frequent_buckets = {
        bucket for bucket, count in bucket_counts.items() if count >= support_threshold
    }

    # Second pass: count only pairs of frequent items that fall in a
    # frequent bucket; every other pair cannot reach the threshold.
    candidate_pairs = defaultdict(int)
    for items in transactions:
        kept = [item for item in items if item in frequent_items]
        for pair in combinations(kept, 2):
            if hash(pair) % num_buckets in frequent_buckets:
                candidate_pairs[pair] += 1

    # Filter to retain only frequent pairs (a frequent bucket can still hold
    # infrequent pairs because of hash collisions)
    return [
        (pair, count)
        for pair, count in candidate_pairs.items()
        if count >= support_threshold
    ]

# Minimum count an item or pair needs in order to be reported as frequent.
support_threshold = 2

# Mine the loaded dataset for frequent pairs with the PCY algorithm.
frequent_itemsets = pcy_algorithm(data=dataset, support_threshold=support_threshold)

# Show only the first ten results to keep the output short.
frequent_itemsets[:10]

[((13, 16), 4),
 ((11, 13), 2),
 ((2, 10), 2),
 ((13, 17), 2),
 ((7, 15), 2),
 ((12, 13), 4),
 ((9, 10), 3),
 ((9, 15), 2),
 ((10, 15), 2),
 ((11, 15), 2)]