In [1]:
import numpy as np
import pandas as pd
from pyECLAT import ECLAT

# Load the transactions from data.csv. header=None makes pandas treat the first
# line as data and label the columns with integers; absent items show up as NaN.
dataframe = pd.read_csv('data.csv', header=None)
print(dataframe)

       0      1      2       3     4      5
0   Wine  Chips  Bread  Butter  Milk  Apple
1   Wine    NaN  Bread  Butter  Milk    NaN
2    NaN    NaN  Bread  Butter  Milk    NaN
3    NaN  Chips    NaN     NaN   NaN  Apple
4   Wine  Chips  Bread  Butter  Milk  Apple
5   Wine  Chips    NaN     NaN  Milk    NaN
6   Wine  Chips  Bread  Butter   NaN  Apple
7   Wine  Chips    NaN     NaN  Milk    NaN
8   Wine    NaN  Bread     NaN   NaN  Apple
9   Wine    NaN  Bread  Butter  Milk    NaN
10   NaN  Chips  Bread  Butter   NaN  Apple
11  Wine    NaN    NaN  Butter  Milk  Apple
12  Wine  Chips  Bread  Butter  Milk    NaN
13  Wine    NaN  Bread     NaN  Milk  Apple
14  Wine    NaN  Bread  Butter  Milk  Apple
15  Wine  Chips  Bread  Butter  Milk  Apple
16   NaN  Chips  Bread  Butter  Milk  Apple
17   NaN  Chips    NaN  Butter  Milk  Apple
18  Wine  Chips  Bread  Butter  Milk  Apple
19  Wine    NaN  Bread  Butter  Milk  Apple
20  Wine  Chips  Bread     NaN  Milk  Apple
21   NaN  Chips    NaN     NaN  

In [2]:
# Build the pyECLAT model from the raw transactions. Its df_bin attribute is
# printed below: a 0/1 matrix with one column per item and one row per transaction.
eclat_instance = ECLAT(data=dataframe, verbose=True)
print(eclat_instance.df_bin)

100%|██████████| 6/6 [00:00<00:00, 966.39it/s]
100%|██████████| 6/6 [00:00<?, ?it/s]
100%|██████████| 6/6 [00:00<00:00, 2383.35it/s]

    Butter  Milk  Chips  Apple  Wine  Bread
0        1     1      1      1     1      1
1        1     1      0      0     1      1
2        1     1      0      0     0      1
3        0     0      1      1     0      0
4        1     1      1      1     1      1
5        0     1      1      0     1      0
6        1     0      1      1     1      1
7        0     1      1      0     1      0
8        0     0      0      1     1      1
9        1     1      0      0     1      1
10       1     0      1      1     0      1
11       1     1      0      1     1      0
12       1     1      1      0     1      1
13       0     1      0      1     1      1
14       1     1      0      1     1      1
15       1     1      1      1     1      1
16       1     1      1      1     0      1
17       1     1      1      1     0      0
18       1     1      1      1     1      1
19       1     1      0      1     1      1
20       0     1      1      1     1      1
21       0     0      1      0  




In [3]:
# Count the items in each transaction (row sums of the binary matrix).
# NOTE: the name items_per_transaction is rebound to a function in cell In [5].
items_per_transaction = eclat_instance.df_bin.astype(int).sum(axis=1)
# Minimum support threshold: an itemset must occur in at least 60% of the transactions.
min_support = 0.6
# Smallest itemset size to evaluate: start from pairs of items.
min_combination = 2
# Largest itemset size to evaluate: the size of the largest transaction.
max_combination = max(items_per_transaction)
rule_indices, rule_supports = eclat_instance.fit(min_support=min_support,
                                                 min_combination=min_combination,
                                                 max_combination=max_combination,
                                                 separator=' & ',
                                                 verbose=True)
# One row per entry of rule_supports, sorted from highest to lowest support.
result = pd.DataFrame(rule_supports.items(), columns=['Item', 'Support'])
result1 = result.sort_values(by=['Support'], ascending=False)
print(result1)

Combination 2 by 2


15it [00:00, 420.64it/s]


Combination 3 by 3


20it [00:00, 424.20it/s]


Combination 4 by 4


15it [00:00, 409.94it/s]


Combination 5 by 5


6it [00:00, 443.84it/s]


Combination 6 by 6


1it [00:00, 332.51it/s]

          Item   Support
0  Milk & Wine  0.636364





# Vertical Apriori

In [4]:
def count_items_in_transactions(df, items=None):
    """One-hot encode a frame of transactions.

    Parameters
    ----------
    df : pandas.DataFrame
        One transaction per row; each cell holds an item label or a missing value.
    items : iterable of str, optional
        Item labels to encode, in the desired column order. When omitted, the
        six grocery items this notebook works with are used, so existing calls
        keep producing the same columns in the same order.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``, one int64 column per item; a cell is 1 when the
        item occurs anywhere in that row and 0 otherwise.
    """
    if items is None:
        items = ['Butter', 'Chips', 'Wine', 'Bread', 'Apple', 'Milk']
    items = list(items)

    encoded = pd.DataFrame(0, index=df.index, columns=items)
    for item in items:
        # Whole-frame comparison instead of a Python loop over every cell;
        # missing values never compare equal to a label, so they stay 0.
        present = df.eq(item).any(axis=1).to_numpy()
        encoded[item] = present.astype('int64')

    return encoded

# One-hot encode the raw transactions with count_items_in_transactions and show the result.
df1 = count_items_in_transactions(dataframe)
print(df1)

    Butter  Chips  Wine  Bread  Apple  Milk
0        1      1     1      1      1     1
1        1      0     1      1      0     1
2        1      0     0      1      0     1
3        0      1     0      0      1     0
4        1      1     1      1      1     1
5        0      1     1      0      0     1
6        1      1     1      1      1     0
7        0      1     1      0      0     1
8        0      0     1      1      1     0
9        1      0     1      1      0     1
10       1      1     0      1      1     0
11       1      0     1      0      1     1
12       1      1     1      1      0     1
13       0      0     1      1      1     1
14       1      0     1      1      1     1
15       1      1     1      1      1     1
16       1      1     0      1      1     1
17       1      1     0      0      1     1
18       1      1     1      1      1     1
19       1      0     1      1      1     1
20       0      1     1      1      1     1
21       0      1     0      0  

In [5]:
def items_per_transaction(df):
    """Return, for every row of ``df``, how many of its cells are not missing."""
    filled = df.notna()
    counts = filled.astype(int).sum(axis=1)
    return counts
# Number of non-missing items in each transaction of the raw data.
df2 = items_per_transaction(dataframe)
print(df2)

0     6
1     4
2     3
3     2
4     6
5     3
6     5
7     3
8     3
9     4
10    4
11    4
12    5
13    4
14    5
15    6
16    5
17    4
18    6
19    5
20    5
21    1
dtype: int64


In [6]:
# Size of the largest transaction (df2 holds the per-transaction item counts).
max_combination = max(df2)

In [7]:
# Find the indices of the transactions that contain both items (here: Milk and Wine)
def find_rule_indices(df, item1, item2):
    """Return the index labels of the rows of ``df`` that contain both items.

    A row qualifies when ``item1`` and ``item2`` each appear among its values.
    Labels are returned in the frame's row order.
    """
    return [
        label
        for label, transaction in df.iterrows()
        if item1 in transaction.values and item2 in transaction.values
    ]
# Look up every transaction that holds both Milk and Wine.
item1, item2 = 'Milk', 'Wine'
indices = find_rule_indices(dataframe, item1, item2)
print(f"Indices for {item1} and {item2}: {indices}")

Indices for Milk and Wine: [0, 1, 4, 5, 7, 9, 11, 12, 13, 14, 15, 18, 19, 20]


In [8]:
import itertools
def verticalApriori(df, min_support, min_combination, max_combination):
    """Find the frequent itemsets of a one-hot encoded transaction frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transaction, one column per item; a truthy cell means the
        item is part of the transaction.
    min_support : float
        Minimum fraction of transactions an itemset must appear in.
    min_combination : int
        Smallest itemset size to evaluate.
    max_combination : int
        Largest itemset size to evaluate (inclusive).

    Returns
    -------
    list of (tuple, float)
        ``(itemset, support)`` pairs, ordered by itemset size and then by the
        column order of ``df``. The result is the same as testing every
        combination exhaustively; the pruning below only skips candidates that
        cannot reach ``min_support``.
    """
    frequent_itemsets = []

    n_transactions = len(df)
    if n_transactions == 0:
        # No transactions means nothing can be frequent; also avoids 0 / 0.
        return frequent_itemsets

    # Items still worth combining. Shrinks as levels are processed.
    candidates = list(df.columns)

    for size in range(min_combination, max_combination + 1):
        level = []
        for combination in itertools.combinations(candidates, size):
            # A transaction supports the itemset when all of its items are present.
            count = df[list(combination)].all(axis=1).sum()
            support = count / n_transactions

            if support >= min_support:
                level.append((combination, support))

        if not level:
            # Support never grows when items are added, so if nothing of this
            # size is frequent, nothing larger can be either.
            break

        frequent_itemsets.extend(level)

        if size >= 1:
            # Downward closure: every item of a frequent (size + 1)-itemset
            # also occurs in some frequent size-itemset, so the rest can go.
            surviving = {item for combination, _ in level for item in combination}
            candidates = [item for item in candidates if item in surviving]

    return frequent_itemsets
# Mine the one-hot frame df1 with a 0.6 support threshold, from pairs up to
# the size of the largest transaction.
min_support, min_combination = 0.6, 2
max_combination = max(df2)
frequent_itemsets = verticalApriori(
    df=df1,
    min_support=min_support,
    min_combination=min_combination,
    max_combination=max_combination,
)
print("Frequent itemsets:")
for itemset, support in frequent_itemsets:
    print(f"Itemset: {itemset}, Support: {support}")

Frequent itemsets:
Itemset: ('Wine', 'Milk'), Support: 0.6363636363636364
