### Association rule learning 

In [1]:
import numpy as np
import pandas as pd
# Toy market-basket data: each string is one transaction whose products are
# separated by single spaces.
data = {
    'products': [
        'bread eggs',
        'bread eggs milk',
        'milk cheese',
        'bread butter cheese',
        'eggs milk',
        'bread milk butter cheese',
    ],
}
data

{'products': ['bread eggs',
  'bread eggs milk',
  'milk cheese',
  'bread butter cheese',
  'eggs milk',
  'bread milk butter cheese']}

In [2]:
# Wrap the raw baskets in a DataFrame with transaction ids starting at 1.
# The index length is derived from the data, so adding or removing a basket
# no longer requires editing a hard-coded range(1, 7).
# NOTE: the variable name is misspelled ("transcations") but is kept as-is
# because later cells refer to it by this name.
transcations = pd.DataFrame(data=data, index=range(1, len(data['products']) + 1))
transcations

Unnamed: 0,products
1,bread eggs
2,bread eggs milk
3,milk cheese
4,bread butter cheese
5,eggs milk
6,bread milk butter cheese


In [4]:
# Split every basket on whitespace into one column per token position;
# baskets with fewer products than the longest one are padded with missing
# values on the right.
product_strings = transcations['products']
expanded = product_strings.str.split(expand=True)
expanded

Unnamed: 0,0,1,2,3
1,bread,eggs,,
2,bread,eggs,milk,
3,milk,cheese,,
4,bread,butter,cheese,
5,eggs,milk,,
6,bread,milk,butter,cheese


### 74 remove duplicates

In [5]:
# Collect the distinct product names that occur anywhere in the baskets.
# Missing cells (padding from the split) are dropped explicitly with dropna()
# instead of relying on truthiness: NaN is truthy and would be added to the
# set, and pd.NA raises on bool(), so `if product:` only works when the
# padding happens to be None.
products = set()
for col in expanded.columns:
    products.update(expanded[col].dropna().unique())
products

{'bread', 'butter', 'cheese', 'eggs', 'milk'}

In [7]:
# Fix a deterministic (alphabetical) order; the position of each product in
# this list becomes its column index in the encoded matrix.
products = sorted(products)
products

['bread', 'butter', 'cheese', 'eggs', 'milk']

### 75 One-hot encode

In [9]:
# Start from an all-zero matrix: one row per transaction, one column per product.
n_rows, n_cols = len(expanded), len(products)
transactions_encoded = np.zeros((n_rows, n_cols), dtype='int8')
transactions_encoded

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]], dtype=int8)

In [13]:
# Fill the one-hot matrix: cell (row_idx, col_idx) is set to 1 when
# products[col_idx] appears in transaction row_idx. Cells are assigned (not
# incremented), so re-running this cell gives the same matrix.
for row_idx, basket in enumerate(expanded.values):
    # Echo each parsed transaction so the encoded rows can be checked by eye.
    print(row_idx, basket)
    for col_idx, product in enumerate(products):
        if product in basket:
            transactions_encoded[row_idx, col_idx] = 1
transactions_encoded

0 ['bread' 'eggs' None None]
1 ['bread' 'eggs' 'milk' None]
2 ['milk' 'cheese' None None]
3 ['bread' 'butter' 'cheese' None]
4 ['eggs' 'milk' None None]
5 ['bread' 'milk' 'butter' 'cheese']


array([[1, 0, 0, 1, 0],
       [1, 0, 0, 1, 1],
       [0, 0, 1, 0, 1],
       [1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1],
       [1, 1, 1, 0, 1]], dtype=int8)

In [15]:
# Label the encoded matrix with the product names so columns can be
# addressed by name in the queries below.
transactions_encoded_df = pd.DataFrame(
    data=transactions_encoded,
    columns=products,
)
transactions_encoded_df

Unnamed: 0,bread,butter,cheese,eggs,milk
0,1,0,0,1,0
1,1,0,0,1,1
2,0,0,1,0,1
3,1,1,1,0,0
4,0,0,0,1,1
5,1,1,1,0,1


### 76 product support rate

In [17]:
# Support of a single product = (transactions containing it) / (all transactions).
# Column sums count the 1s because the frame is one-hot encoded.
support = transactions_encoded_df.sum().div(len(transactions_encoded_df))
support

bread     0.666667
butter    0.333333
cheese    0.500000
eggs      0.500000
milk      0.666667
dtype: float64

### 77 multi-product support rate

In [18]:
# Support of the itemset {butter, bread}: share of all transactions that
# contain both products.
butter_and_bread = transactions_encoded_df.query('butter==1 and bread==1')
sup_butter_bread = butter_and_bread.shape[0] / transactions_encoded_df.shape[0]
sup_butter_bread

0.3333333333333333

In [20]:
# Support of the itemset {butter, milk}: share of all transactions that
# contain both products.
butter_and_milk = transactions_encoded_df.query('butter==1 and milk==1')
sup_butter_milk = butter_and_milk.shape[0] / transactions_encoded_df.shape[0]
sup_butter_milk

0.16666666666666666

### 78 confidence rate

In [22]:
# Confidence of the rule cheese -> bread: among transactions that contain
# cheese, the fraction that also contain bread.
n_cheese_and_bread = len(transactions_encoded_df.query('cheese==1 and bread==1'))
n_cheese = len(transactions_encoded_df.query('cheese==1'))
conf_cheese_bread = n_cheese_and_bread / n_cheese
conf_cheese_bread

0.6666666666666666

In [23]:
# Confidence of the rule butter -> bread: among transactions that contain
# butter, the fraction that also contain bread.
n_butter_and_bread = len(transactions_encoded_df.query('butter==1 and bread==1'))
n_butter = len(transactions_encoded_df.query('butter==1'))
conf_butter_bread = n_butter_and_bread / n_butter
conf_butter_bread

1.0