In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

# Read the point-of-sale transactions; the first CSV column is used as the row index.
df = pd.read_csv(
    'datasets/POS_TRANSACTIONS_2018 .csv',
    index_col=0,
)

# Report the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 459258 entries, 2 to 3
Data columns (total 4 columns):
Transaction_Id      459258 non-null int64
Transaction_Date    459258 non-null object
Product_Name        459258 non-null object
Quantity            459258 non-null int64
dtypes: int64(2), object(2)
memory usage: 17.5+ MB


In [3]:
# Task 4 : Association Mining

In [4]:
## Preprocessing
## Print summary statistics for each column to identify significant values
for banner, column in [
    ('----Transaction_Id----', 'Transaction_Id'),
    ('-------Transaction_Date-------', 'Transaction_Date'),
    ('----Product_Name----', 'Product_Name'),
    ('----Quantity----', 'Quantity'),
]:
    # Banner line first, then the describe() summary of that column.
    print(banner)
    print(df[column].describe())

----Transaction_Id----
count    4.592580e+05
mean     6.179884e+05
std      3.511578e+05
min      1.235900e+04
25%      3.124370e+05
50%      6.168260e+05
75%      9.254420e+05
max      1.221866e+06
Name: Transaction_Id, dtype: float64
-------Transaction_Date-------
count         459258
unique             7
top       12/28/2017
freq           97866
Name: Transaction_Date, dtype: object
----Product_Name----
count                459258
unique                   17
top       Sketching Markers
freq                  73951
Name: Product_Name, dtype: object
----Quantity----
count    459258.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: Quantity, dtype: float64


In [14]:
# Gather the product names of each Transaction_Id into one list per transaction.
transactions = (
    df.groupby(['Transaction_Id'])['Product_Name']
    .apply(list)
)
# Preview the first five grouped transactions.
print(transactions.head(n=5))

Transaction_Id
12359                               [Exercise book]
12362    [Mini Stationery Set, Mini Stationery Set]
12365                                  [Flash Card]
12371                                [Drink bottle]
12380                                   [DVD media]
Name: Product_Name, dtype: object


In [15]:
from apyori import apriori

# Unpack the pandas Series into a plain Python list of item lists for apriori.
transaction_list = [basket for basket in transactions]
# Run apriori with a support threshold of 0.05 and materialise its output.
results = list(
    apriori(transaction_list, min_support=0.05)
)
# Preview the first five relation records.
print(results[:5])

[RelationRecord(items=frozenset({'DVD media'}), support=0.146885, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'DVD media'}), confidence=0.146885, lift=1.0)]), RelationRecord(items=frozenset({'Digital Clock'}), support=0.06735, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Digital Clock'}), confidence=0.06735, lift=1.0)]), RelationRecord(items=frozenset({'Drink bottle'}), support=0.054645, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Drink bottle'}), confidence=0.054645, lift=1.0)]), RelationRecord(items=frozenset({'Exercise book'}), support=0.171005, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Exercise book'}), confidence=0.171005, lift=1.0)]), RelationRecord(items=frozenset({'Flash Card'}), support=0.160425, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Flash Card'}), confidence=0.160425, lift=1.0)]

In [16]:
def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori relation records into a pandas DataFrame of rules.

    Every entry of ``results`` must expose ``support`` and
    ``ordered_statistics``; each ordered statistic (exposing ``items_base``,
    ``items_add``, ``confidence`` and ``lift``) becomes one output row.

    Args:
        results: iterable of relation records, e.g. ``list(apriori(...))``.

    Returns:
        pandas.DataFrame with the columns ``Left_side``, ``Right_side``,
        ``Support``, ``Confidence`` and ``Lift``. Item sets are rendered as
        comma-separated strings with the items in sorted order; an empty
        item set is rendered as an empty string.
    """
    columns = ['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift']
    rules = []

    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of the rule, items_add = right side.
            # The item collections are sets, whose iteration order for strings
            # can change between interpreter runs; sorting keeps the rendered
            # rule text stable and reproducible.
            rules.append([
                ','.join(sorted(rule.items_base)),
                ','.join(sorted(rule.items_add)),
                rule_set.support,
                rule.confidence,
                rule.lift,
            ])

    # One row per rule; an empty input still yields the five named columns.
    return pd.DataFrame(rules, columns=columns)
# Flatten the apriori output into a rules table and show up to its first 20 rows.
result_df = convert_apriori_results_to_pandas_df(results=results)
print(result_df.head(n=20))

   Left_side         Right_side   Support  Confidence  Lift
0                     DVD media  0.146885    0.146885   1.0
1                 Digital Clock  0.067350    0.067350   1.0
2                  Drink bottle  0.054645    0.054645   1.0
3                 Exercise book  0.171005    0.171005   1.0
4                    Flash Card  0.160425    0.160425   1.0
5                     Laminator  0.089960    0.089960   1.0
6                      Lanyards  0.134925    0.134925   1.0
7                    Power Bank  0.058480    0.058480   1.0
8                        Puzzle  0.050990    0.050990   1.0
9             Sketching Markers  0.241305    0.241305   1.0
10                   Wristbands  0.143575    0.143575   1.0


In [18]:
# Sort all acquired rules by lift in descending order, so the rules with the
# highest lift are listed first.
result_df = result_df.sort_values(by='Lift', ascending=False)
print(result_df.head(10))

  Left_side         Right_side   Support  Confidence  Lift
0                    DVD media  0.146885    0.146885   1.0
1                Digital Clock  0.067350    0.067350   1.0
2                 Drink bottle  0.054645    0.054645   1.0
3                Exercise book  0.171005    0.171005   1.0
4                   Flash Card  0.160425    0.160425   1.0
5                    Laminator  0.089960    0.089960   1.0
6                     Lanyards  0.134925    0.134925   1.0
7                   Power Bank  0.058480    0.058480   1.0
8                       Puzzle  0.050990    0.050990   1.0
9            Sketching Markers  0.241305    0.241305   1.0
