# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 4) Pattern Mining
### *Antonio Strippoli, Valerio Mariani*

In [None]:
from functions import *  # Custom function for the analysis
from gsp import apriori
import datetime
import logging
import time
import os

# Set logging: bare messages are appended to log.txt and echoed to stderr
logging.basicConfig(level=logging.INFO, filename="log.txt", filemode="a+", format="%(message)s")
# Attach the console handler only once: re-running this cell would otherwise
# stack a new StreamHandler on every execution and print each message several
# times. The exact type check is deliberate: FileHandler subclasses
# StreamHandler, so isinstance() would mistake the log.txt handler for it.
root_logger = logging.getLogger()
if not any(type(h) is logging.StreamHandler for h in root_logger.handlers):
    root_logger.addHandler(logging.StreamHandler())

### Apply GSP on sequential data

In [None]:
# Main cycle: apply GSP once for every (min_sup, min_baskets) combination
params = {
    'min_sup': [0.4, 0.35, 0.3, 0.25, 0.2, 0.15],
    'min_baskets': [20, 10, 5, 3, 2],
}
for min_sup in params['min_sup']:
    for min_baskets in params['min_baskets']:
        logging.info(f"MIN_BASKETS: {min_baskets}, MIN_SUP: {min_sup}")

        # Read the dataset and remove some baskets, from scratch at every iteration
        # NOTE(review): read_dataset() does not depend on the loop variables; it could
        # be hoisted if remove_baskets() does not modify its input - confirm
        df = read_dataset()
        df = remove_baskets(df, min_baskets)
        # Convert into seq form
        seq_data = sequentialize(df)

        # Apply GSP; perf_counter() is monotonic, so the measured duration
        # cannot be skewed by system clock adjustments (unlike time.time())
        t0 = time.perf_counter()
        result_set = apriori(seq_data, min_sup, verbose=False)
        t1 = time.perf_counter()

        # Compute n. of sequences holding at least 2 items overall and,
        # among them, n. of sequences containing the same item more than once
        cnt_len_2 = 0
        cnt_duplicates = 0
        for r in result_set:
            # r[0] is iterated as a sequence of elements, each a collection of
            # items: flatten it into a single list of items
            items = [item for element in r[0] for item in element]
            if len(items) >= 2:
                cnt_len_2 += 1
                if len(set(items)) < len(items):
                    cnt_duplicates += 1

        # The label says ">= 2" to match the condition counted above
        # (the previous "> 2" label misreported what was being counted)
        logging.info(
            f"TOTAL TIME:\t{round(t1-t0, 2)} s\n"
            f"LEN RESULT SET:\t{len(result_set)}\n"
            f"LEN SEQ >= 2:\t{cnt_len_2}\nN. DUPLICATES:\t{cnt_duplicates}\n"
        )

        # Save (first argument False here; the analysis cell passes True to read
        # the result back - presumably a read/write switch, verify in functions)
        read_write_result(False, min_baskets, min_sup, result_set)

### Analyze results and collect statistics

In [None]:
# Config: select which saved run we want to analyze
min_baskets = 10
min_sup = 0.25

# Load the stored result, convert its tuples to lists, compute its distribution
result_set = convert_tuples_to_list(read_write_result(True, min_baskets, min_sup))
compute_distribution(result_set)

# Load the dataset and filter it with the same min_baskets
df = remove_baskets(read_dataset(), min_baskets)

# Compute mean qta values first, then convert ProdID to ProdDescr
result_set = prodID_to_prodDescr(compute_patterns_mean_qta(result_set, df), df)

result_set

#### Results, do not execute the following cell

In [None]:
Distribution of lengths: {1: 56, 2: 14, 3: 3, 4: 0, 5: 0}
Sequences containing duplicates: 11 / 73

[[[['REGENCY CAKESTAND 3 TIER']], 0.42, [[10]]],
 [[['JUMBO BAG RED RETROSPOT']], 0.42, [[26]]],
 [[['PACK OF 72 RETROSPOT CAKE CASES']], 0.4, [[30]]],
 [[['PARTY BUNTING']], 0.4, [[12]]],
 [[['LUNCH BAG RED SPOTTY']], 0.39, [[15]]],
 [[['WHITE HANGING HEART T-LIGHT HOLDER']], 0.39, [[27]]],
 [[['SET OF 3 CAKE TINS PANTRY DESIGN']], 0.38, [[9]]],
 [[['JUMBO BAG VINTAGE DOILY']], 0.37, [[20]]],
 [[['LUNCH BAG VINTAGE DOILY']], 0.35, [[16]]],
 [[['SPOTTY BUNTING']], 0.35, [[8]]],
 [[['ASSORTED COLOUR BIRD ORNAMENT']], 0.34, [[51]]],
 [[['REGENCY CAKESTAND 3 TIER'], ['REGENCY CAKESTAND 3 TIER']],
  0.33,
  [[11], [10]]],
 [[['JUMBO BAG RED RETROSPOT'], ['JUMBO BAG RED RETROSPOT']],
  0.33,
  [[29], [26]]],
 [[['WHITE HANGING HEART T-LIGHT HOLDER'],
   ['WHITE HANGING HEART T-LIGHT HOLDER']],
  0.33,
  [[29], [24]]],
 [[['LUNCH BAG  BLACK SKULL.']], 0.32, [[13]]],
 [[['SET OF 3 REGENCY CAKE TINS']], 0.32, [[13]]],
 [[['LUNCH BAG SPACEBOY DESIGN']], 0.31, [[12]]],
 [[['LUNCH BAG PINK POLKADOT']], 0.31, [[11]]],
 [[['SET OF 4 PANTRY JELLY MOULDS']], 0.31, [[17]]],
 [[['LUNCH BAG ALPHABET DESIGN']], 0.31, [[16]]],
 [[['LUNCH BAG CARS BLUE']], 0.3, [[13]]],
 [[['SET/20 RED RETROSPOT PAPER NAPKINS']], 0.3, [[19]]],
 [[["PAPER CHAIN KIT 50'S CHRISTMAS"]], 0.3, [[20]]],
 [[['JAM MAKING SET PRINTED']], 0.3, [[25]]],
 [[['RABBIT NIGHT LIGHT']], 0.3, [[16]]],
 [[['JUMBO BAG VINTAGE LEAF']], 0.3, [[16]]],
 [[['LUNCH BAG RED SPOTTY'], ['LUNCH BAG RED SPOTTY']], 0.3, [[17], [14]]],
 [[['NATURAL SLATE HEART CHALKBOARD']], 0.29, [[9]]],
 [[['JAM MAKING SET WITH JARS']], 0.29, [[12]]],
 [[['JUMBO BAG ALPHABET']], 0.29, [[19]]],
 [[['LUNCH BAG APPLE DESIGN']], 0.29, [[17]]],
 [[['DOORMAT KEEP CALM AND COME IN']], 0.29, [[9]]],
 [[['PARTY BUNTING'], ['PARTY BUNTING']], 0.29, [[11], [16]]],
 [[['LUNCH BAG SUKI DESIGN']], 0.28, [[12]]],
 [[['JUMBO BAG PINK POLKADOT']], 0.28, [[20]]],
 [[['JUMBO SHOPPER VINTAGE RED PAISLEY']], 0.28, [[14]]],
 [[['HEART OF WICKER SMALL']], 0.28, [[17]]],
 [[['RECIPE BOX PANTRY YELLOW DESIGN']], 0.28, [[11]]],
 [[['SET OF 6 SPICE TINS PANTRY DESIGN']], 0.28, [[11]]],
 [[['ALARM CLOCK BAKELIKE RED']], 0.28, [[11]]],
 [[['HOT WATER BOTTLE KEEP CALM']], 0.28, [[9]]],
 [[['60 TEATIME FAIRY CAKE CASES']], 0.28, [[28]]],
 [[['ROSES REGENCY TEACUP AND SAUCER']], 0.27, [[17]]],
 [[['PAPER CHAIN KIT VINTAGE CHRISTMAS']], 0.27, [[12]]],
 [[['JUMBO BAG APPLES']], 0.27, [[21]]],
 [[['LUNCH BAG VINTAGE LEAF DESIGN']], 0.27, [[14]]],
 [[['WOODEN PICTURE FRAME WHITE FINISH']], 0.27, [[11]]],
 [[['JUMBO BAG VINTAGE DOILY'], ['JUMBO BAG VINTAGE DOILY']],
  0.27,
  [[21], [26]]],
 [[['JUMBO BAG RED RETROSPOT'], ['JUMBO BAG VINTAGE DOILY']],
  0.27,
  [[24], [19]]],
 [[['JUMBO BAG RED RETROSPOT'],
   ['JUMBO BAG RED RETROSPOT'],
   ['JUMBO BAG RED RETROSPOT']],
  0.27,
  [[32], [25], [30]]],
 [[['DOORMAT RED RETROSPOT']], 0.26, [[8]]],
 [[['RED RETROSPOT CHARLOTTE BAG']], 0.26, [[15]]],
 [[['LUNCH BAG WOODLAND']], 0.26, [[12]]],
 [[['JUMBO BAG PINK VINTAGE PAISLEY']], 0.26, [[18]]],
 [[['JUMBO STORAGE BAG SUKI']], 0.26, [[15]]],
 [[['PACK OF 60 PINK PAISLEY CAKE CASES']], 0.26, [[33]]],
 [[['GINGERBREAD MAN COOKIE CUTTER']], 0.26, [[14]]],
 [[['SET OF 60 PANTRY DESIGN CAKE CASES']], 0.26, [[32]]],
 [[["JUMBO BAG 50'S CHRISTMAS"]], 0.26, [[16]]],
 [[['JUMBO BAG RED RETROSPOT'], ['LUNCH BAG RED SPOTTY']], 0.26, [[21], [15]]],
 [[['WHITE HANGING HEART T-LIGHT HOLDER'],
   ['WHITE HANGING HEART T-LIGHT HOLDER'],
   ['WHITE HANGING HEART T-LIGHT HOLDER']],
  0.26,
  [[30], [24], [15]]],
 [[['RETROSPOT TEA SET CERAMIC 11 PC']], 0.25, [[6]]],
 [[['HEART OF WICKER LARGE']], 0.25, [[13]]],
 [[['SPACEBOY LUNCH BOX']], 0.25, [[17]]],
 [[['GARDENERS KNEELING PAD CUP OF TEA']], 0.25, [[11]]],
 [[['JUMBO BAG STRAWBERRY']], 0.25, [[24]]],
 [[['JUMBO BAG PINK POLKADOT', 'JUMBO BAG RED RETROSPOT']], 0.25, [[20, 24]]],
 [[['LUNCH BAG RED SPOTTY'], ['JUMBO BAG RED RETROSPOT']], 0.25, [[17], [22]]],
 [[['SET OF 3 CAKE TINS PANTRY DESIGN'], ['SET OF 3 CAKE TINS PANTRY DESIGN']],
  0.25,
  [[11], [10]]],
 [[['JUMBO BAG VINTAGE DOILY'], ['JUMBO BAG RED RETROSPOT']],
  0.25,
  [[18], [19]]],
 [[['LUNCH BAG VINTAGE DOILY'], ['JUMBO BAG VINTAGE DOILY']],
  0.25,
  [[16], [20]]],
 [[['SPOTTY BUNTING'], ['SPOTTY BUNTING']], 0.25, [[9], [13]]],
 [[['REGENCY CAKESTAND 3 TIER'],
   ['REGENCY CAKESTAND 3 TIER'],
   ['REGENCY CAKESTAND 3 TIER']],
  0.25,
  [[13], [11], [13]]]]