In [9]:
import pandas as pd
import tqdm
from apyori import apriori

### References

    1. https://www.geeksforgeeks.org/association-rule/
    2. https://towardsdatascience.com/association-rule-mining-be4122fc1793
    3. https://stackabuse.com/association-rule-mining-via-apriori-algorithm-in-python/
    4. https://www.kaggle.com/datatheque/association-rules-mining-market-basket-analysis
    5. https://www.kaggle.com/roshansharma/market-basket-analysis
    6. https://www.kaggle.com/yugagrawal95/market-basket-analysis-apriori-in-python
    7. https://www.kaggle.com/kalash04/assoc-rule-discovery-cookies-hot-chocolate
    8. https://www.kaggle.com/ostrowski/market-basket-analysis-exploring-e-commerce-data
    9. https://en.wikipedia.org/wiki/Association_rule_learning
    10. http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
    11. https://towardsdatascience.com/association-rules-2-aa9a77241654
    12. https://towardsdatascience.com/complete-guide-to-association-rules-2-2-c92072b56c84
    13. https://paginas.fe.up.pt/~ec/files_0506/slides/04_AssociationRules.pdf

In [2]:
data_file = 'groceries.csv'

In [3]:
def get_transactions(data_file, sep=','):
    
    # Delimiter
    data_file_delimiter = sep
    transactions = []
    
    with open(data_file, 'r') as temp_f:
        
        # Read the lines
        lines = temp_f.readlines()

        for l in lines:
            
            columns = l.split(data_file_delimiter)
            columns = [col.strip('\n') for col in columns]
            transactions.append(columns)
            
    # Close file
    temp_f.close()
    
    return transactions

In [5]:
transactions = get_transactions(data_file, sep=',')

In [6]:
len(transactions)

9835

In [7]:
unique_items = set()
for i in range(len(transactions)):
    unique_items = unique_items.union(set(transactions[i]))
    if not len(set(transactions[i]))==len(transactions[i]):
        print('duplicate')

In [8]:
unique_items

{'Instant food products',
 'UHT-milk',
 'abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'baby food',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer',
 'bottled water',
 'brandy',
 'brown bread',
 'butter',
 'butter milk',
 'cake bar',
 'candles',
 'candy',
 'canned beer',
 'canned fish',
 'canned fruit',
 'canned vegetables',
 'cat food',
 'cereals',
 'chewing gum',
 'chicken',
 'chocolate',
 'chocolate marshmallow',
 'citrus fruit',
 'cleaner',
 'cling film/bags',
 'cocoa drinks',
 'coffee',
 'condensed milk',
 'cooking chocolate',
 'cookware',
 'cream',
 'cream cheese ',
 'curd',
 'curd cheese',
 'decalcifier',
 'dental care',
 'dessert',
 'detergent',
 'dish cleaner',
 'dishes',
 'dog food',
 'domestic eggs',
 'female sanitary products',
 'finished products',
 'fish',
 'flour',
 'flower (seeds)',
 'flower soil/fertilizer',
 'frankfurter',
 'frozen chicken',
 'frozen dessert',
 'frozen fish',
 'frozen fruits',
 'frozen m

In [11]:
THRESHOLD = 0.5
MIN_COUNT = 10
min_support = MIN_COUNT/len(transactions)
min_confidance = THRESHOLD

In [23]:
rules = apriori(transactions, min_support=min_support, min_confidance=min_confidance, min_lift=2, min_length=2)

In [24]:
rules

<generator object apriori at 0x10f072b50>

In [15]:
association_results = list(rules)

In [47]:
print(association_results[1000])

RelationRecord(items=frozenset({'newspapers', 'beef', 'citrus fruit'}), support=0.0010167768174885613, ordered_statistics=[OrderedStatistic(items_base=frozenset({'beef'}), items_add=frozenset({'newspapers', 'citrus fruit'}), confidence=0.01937984496124031, lift=2.324399697485347), OrderedStatistic(items_base=frozenset({'newspapers', 'citrus fruit'}), items_add=frozenset({'beef'}), confidence=0.12195121951219513, lift=2.3243996974853474)])


In [56]:
[x for x in association_results[1000].ordered_statistics[0].items_base]

['beef']

In [57]:
[x for x in association_results[1000].ordered_statistics[0].items_add]

['newspapers', 'citrus fruit']

In [64]:
association_results[1000].ordered_statistics[0].confidence

0.01937984496124031

In [60]:
def list_to_string(ls):
    result = ''
    for l in ls:
        result+=l+' ,'
    return result.strip(',').strip()

In [65]:
final_df = pd.DataFrame({'Rule':[],'Support':[],'Confidence':[],'Lift':[]})


for i in tqdm.tqdm(range(len(association_results))):

    # first index of the inner list
    # Contains base item and add item
    item = association_results[i]
    
    lhs1 = [x for x in item.ordered_statistics[0].items_base]
    rhs1 = [x for x in item.ordered_statistics[0].items_add]
    rule1 = list_to_string(lhs1) + " -> " + list_to_string(rhs1)
    
    lhs2 = [x for x in item.ordered_statistics[1].items_base]
    rhs2 = [x for x in item.ordered_statistics[1].items_add]
    rule2 = list_to_string(lhs2) + " -> " + list_to_string(rhs2)

    support = item[1]
    
    confidence1 = item.ordered_statistics[0].confidence
    confidence2 = item.ordered_statistics[1].confidence
    
    lift = item[2][0][3]
    #print("=====================================")
    final_df = final_df.append(pd.DataFrame({'Rule':[rule1],'Support':[support],'Confidence':[confidence1],'Lift':[lift]}))
    final_df = final_df.append(pd.DataFrame({'Rule':[rule2],'Support':[support],'Confidence':[confidence2],'Lift':[lift]}))

100%|██████████| 9897/9897 [00:19<00:00, 495.25it/s]


In [74]:
final_df.head()

Unnamed: 0,Rule,Support,Confidence,Lift
0,Instant food products -> butter,0.00122,0.151899,2.741145
0,butter -> Instant food products,0.00122,0.022018,2.741145
0,Instant food products -> curd,0.001322,0.164557,3.088583
0,curd -> Instant food products,0.001322,0.024809,3.088583
0,Instant food products -> frankfurter,0.001017,0.126582,2.146443


In [75]:
min_support, min_confidance

(0.0010167768174885613, 0.5)

In [76]:
filtered_final_df = final_df[final_df.Support>=min_support]
filtered_final_df = filtered_final_df[filtered_final_df.Confidence>=min_confidance]

In [77]:
filtered_final_df.shape

(91, 4)

In [78]:
filtered_final_df

Unnamed: 0,Rule,Support,Confidence,Lift
0,baking powder -> whole milk,0.009253,0.522989,2.046793
0,cereals -> whole milk,0.003660,0.642857,2.515917
0,cocoa drinks -> whole milk,0.001322,0.590909,2.312611
0,cooking chocolate -> whole milk,0.001322,0.520000,2.035097
0,honey -> whole milk,0.001118,0.733333,2.870009
...,...,...,...,...
0,"tropical fruit ,sugar -> whole milk",0.002847,0.595745,2.331536
0,"yogurt ,sugar -> whole milk",0.003660,0.529412,2.071932
0,"brown bread ,tropical fruit ,rolls/buns -> who...",0.001017,0.526316,2.059815
0,"other vegetables ,rolls/buns ,long life bakery...",0.001118,0.523810,2.050007


In [83]:
filtered_final_df.Lift.describe()

count    91.000000
mean      2.267799
std       0.290194
min       2.002332
25%       2.059815
50%       2.162806
75%       2.370320
max       3.445437
Name: Lift, dtype: float64