# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 4. (Optional) Pattern Mining with Time Constraints
### *Antonio Strippoli, Valerio Mariani*

In [None]:
from functions import *  # Custom function for the analysis
from gsp import apriori
import datetime
import logging
import time
import os

# Set logging
# Make this cell safe to re-run: logging.basicConfig() does nothing when the
# root logger already has handlers, and every extra run would stack one more
# StreamHandler (each message printed several times). Detach and close what is
# attached first; closing also releases the handle on log.txt before deleting it.
root_logger = logging.getLogger()
for old_handler in list(root_logger.handlers):
    root_logger.removeHandler(old_handler)
    old_handler.close()

# Start every run from an empty log file
if os.path.exists('log.txt'):
    os.remove('log.txt')

# INFO-and-above messages are written to log.txt as bare text...
logging.basicConfig(level=logging.INFO, filename="log.txt", filemode="a+", format="%(message)s")
# ...and echoed to the console as well
root_logger.addHandler(logging.StreamHandler())

In [None]:
import matplotlib.pyplot as plt

def plot(ax, folder="pattern_mining", filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, then show and close the current figure.

    Args:
        ax: Axes of the plot. Not read by this function (the figure is
            obtained with ``plt.gcf()``); kept so existing calls keep working.
        folder: Sub-folder of ``../../report/imgs`` where the image is saved.
        filename: Name of the image file; when empty, nothing is saved.
        figsize: Figure size ``(width, height)`` in inches.
    """
    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "..", "report", "imgs", folder)
        # makedirs also creates missing parent folders and, with exist_ok,
        # does not fail when the folder is already there
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

### Apply GSP on sequential data using Time Constraints

In [None]:
# Config (which result do we want to analyze)
min_baskets = 10
min_sup = 0.25
# Each test is (name of the apriori time-constraint argument, values to try in weeks)
tests = [
    ('min_gap', list(range(1000, 1001, 1))), # something looks odd here, '22386', '85099B'
    ('max_gap', list(range(4, 200, 4))), # 4, 72
    ('max_span', list(range(4, 200, 4))),
]

for test in tests:
    print(f"STARTING WITH {test[0]} - {test[1]}")
    constraint, values = test
    tested = []   # values of x actually run (the sweep may stop early)
    lengths = []  # size of the result set for each tested value
    for x in values:
        print(f"TESTING x={x}")
        # Read the dataset
        # NOTE(review): the preprocessing does not depend on x; it stays inside
        # the loop so every run starts from freshly built data. Hoist it only
        # after confirming that apriori leaves its inputs untouched.
        df = read_dataset()
        # Remove some baskets
        df = remove_baskets(df, min_baskets)
        # Convert into seq form
        seq_data, time_stamps = sequentialize(df, return_times=True)

        # Apply GSP, setting only the constraint under test
        kwargs = {constraint: datetime.timedelta(weeks=x)}
        result_set = apriori(seq_data, min_sup, time_stamps, **kwargs)
        print(result_set)
        tested.append(x)
        lengths.append(len(result_set))

        # DEBUG Prints: report the size on the first run and whenever it changes
        if len(lengths) == 1 or lengths[-1] != lengths[-2]:
            print('\tLEN RESULT SET:', lengths[-1])

        # Stop when reaching full set or when no sequence long > 2 remain
        # NOTE(review): 73 and 56 are tied to this dataset/configuration;
        # presumably the unconstrained result-set size and the number of
        # single-item sequences — confirm before reusing elsewhere.
        if lengths[-1] >= 73 or lengths[-1] <= 56:
            break

    # Plot trend
    # Only the tested values are plotted: after an early stop `lengths` is
    # shorter than the full list of values, and x/y must have the same length.
    _, ax = plt.subplots()
    ax.plot([str(x // 4) for x in tested], lengths)  # weeks // 4 -> months
    ax.set(xlabel='Months', ylabel='N. of sequences')
    plot(ax, filename=f"{test[0]}_trend")

Possibly to be discarded

In [None]:
# Parameters of the single run inspected in this cell
min_baskets, min_sup = 10, 0.25
# Both gaps are set to one year; note that only max_gap is passed to apriori below
max_gap = min_gap = datetime.timedelta(days=365)

# Load the dataset, remove some baskets, then build the sequential form
# together with its timestamps
df = remove_baskets(read_dataset(), min_baskets)
seq_data, time_stamps = sequentialize(df, return_times=True)

# Run GSP constraining only the maximum gap
time_constraints = {'max_span': None, 'min_gap': None, 'max_gap': max_gap}
result_set = apriori(seq_data, min_sup, time_stamps, **time_constraints)
print_distribution(result_set)

# Presumably the output recorded from a past run:
# Distribution of lengths: {1: 56, 2: 1, 3: 0, 4: 0, 5: 0}

In [None]:
import itertools


def _count_multi_item_and_duplicates(result_set):
    """Count the sequences with at least two items and those repeating an item.

    Each entry of ``result_set`` is indexed with ``[0]`` to obtain the
    sequence, which is an iterable of elements; the items of all its elements
    are flattened into one list before counting.

    Returns:
        A tuple ``(n_multi, n_duplicates)``: how many sequences hold two or
        more items overall, and how many of those contain the same item more
        than once.
    """
    n_multi = 0
    n_duplicates = 0
    for entry in result_set:
        items = [item for element in entry[0] for item in element]
        if len(items) >= 2:
            n_multi += 1
            if len(set(items)) < len(items):
                n_duplicates += 1
    return n_multi, n_duplicates


# Grid of parameters to explore
params = {
    'min_sup': [0.4, 0.35, 0.3, 0.25, 0.2, 0.15],
    'min_baskets': [20, 10],
    'max_gap': [
        datetime.timedelta(days=1),
        datetime.timedelta(days=2),
        datetime.timedelta(days=3),
        datetime.timedelta(weeks=1),
        datetime.timedelta(weeks=2),
        datetime.timedelta(weeks=3),
        datetime.timedelta(weeks=4),
        datetime.timedelta(weeks=8),
        datetime.timedelta(weeks=12),
    ],
    'max_span': [
        datetime.timedelta(weeks=4),
        datetime.timedelta(weeks=8),
        datetime.timedelta(weeks=12),
        datetime.timedelta(weeks=48),
    ],
}

# product() visits the combinations in the same order as four nested loops
# (min_sup outermost, max_span innermost)
grid = itertools.product(
    params['min_sup'], params['min_baskets'], params['max_gap'], params['max_span']
)
for min_sup, min_baskets, max_gap, max_span in grid:
    logging.info(f"MIN_BASKETS: {min_baskets}, MIN_SUP: {min_sup}, MAX_GAP: {max_gap}, MAX_SPAN: {max_span}")

    # Read the dataset
    # NOTE(review): this preprocessing depends only on min_baskets; it is
    # rebuilt for every combination so each run starts from fresh data. Cache
    # it per min_baskets only after confirming apriori does not modify its inputs.
    df = read_dataset()
    # Remove some baskets
    df = remove_baskets(df, min_baskets)
    # Convert into seq form
    seq_data, time_stamps = sequentialize(df, return_times=True)

    # Apply GSP, timing it with a monotonic clock meant for measuring intervals
    t0 = time.perf_counter()
    result_set = apriori(seq_data, min_sup, time_stamps, max_span=max_span, min_gap=None, max_gap=max_gap)
    t1 = time.perf_counter()

    # Compute n. of sequences with at least 2 items and n. of sequences containing duplicates
    cnt_len_2, cnt_duplicates = _count_multi_item_and_duplicates(result_set)

    logging.info(
        f"TOTAL TIME:\t{round(t1-t0, 2)} s\n"
        f"LEN RESULT SET:\t{len(result_set)}\n"
        f"LEN SEQ > 2:\t{cnt_len_2}\nN. DUPLICATES:\t{cnt_duplicates}\n"
    )

    # Save
    save_to_pickle(result_set, min_baskets, min_sup, max_gap.days, max_span.days)

In [None]:
# Show the distribution for the result set currently in memory; when the
# notebook is run top to bottom this is the last grid-search configuration.
print_distribution(result_set)