# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 4.optional) Pattern Mining with Time Constraints
### *Antonio Strippoli, Valerio Mariani*

In [None]:
import matplotlib.pyplot as plt
from functions import *  # Custom function for the analysis
from gsp import apriori
import datetime
import logging
from natsort import natsorted
import time
import os

# Logging: INFO-level records go to log.txt (append mode, message text only)
# and are additionally echoed through a default StreamHandler.
logging.basicConfig(
    filename="log.txt",
    filemode="a+",
    format="%(message)s",
    level=logging.INFO,
)
console_handler = logging.StreamHandler()
logging.getLogger().addHandler(console_handler)

In [None]:
def plot(ax, folder="pattern_mining", filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, then show and close the current figure.

    Args:
        ax: Object returned by the plotting call. It is not used here: the
            function always operates on the current figure (``plt.gcf()``).
        folder: Sub-directory of ``../../report/imgs`` receiving the image.
        filename: Name of the image file; nothing is saved when it is empty.
        figsize: ``(width, height)`` of the figure, in inches.
    """
    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "..", "report", "imgs", folder)
        # makedirs creates missing parent directories too and, with
        # exist_ok=True, does not fail if the directory is already there.
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

In [None]:
# Configuration: which stored result to inspect
min_baskets = 10
min_sup = 0.25

# Load the stored patterns and convert their tuples to lists
result_set = convert_tuples_to_list(read_write_result(True, min_baskets, min_sup))

# Rebuild the sequential data (with timestamps) and attach times to each pattern
df = remove_baskets(read_dataset(), min_baskets)
seq_data, time_stamps = sequentialize(df, return_times=True)
result_set = compute_patterns_time(result_set, seq_data, time_stamps)

# Convert ProdID to ProdDescr
result_set = prodID_to_prodDescr(result_set, df)

# For every entry: print its first two fields, the ratio between the number of
# values in its last field and the number of sequences, then summarise and plot
# those values (presumably the times added by compute_patterns_time - confirm).
for entry in result_set:
    print(entry[0])
    ratio = len(entry[-1]) / len(seq_data)
    print(entry[1], "-", ratio)
    times = pd.DataFrame(entry[-1])
    print(times.describe())
    histogram = times.hist(bins=20)
    plot(histogram)

result_set

### Apply GSP on sequential data using Time Constraints

In [None]:
# Configuration: which stored result to start from
min_baskets = 10
min_sup = 0.25

# Values to sweep for each time constraint, expressed in the unit given below
min_gap_days = list(range(33, -1, -3))
max_gap_weeks = list(range(16, 73, 4))
max_span_weeks = list(range(16, 93, 4))

# Each entry: (constraint keyword, timedelta unit, values to try)
tests = [
    ('min_gap', 'days', min_gap_days),
    ('max_gap', 'weeks', max_gap_weeks),
    ('max_span', 'weeks', max_span_weeks),
]

In [None]:
# The sequential data depends only on min_baskets, so it is built once here
# instead of re-reading the dataset for every tested value.
# NOTE(review): assumes apriori() does not modify seq_data / time_stamps in
# place - confirm against the gsp module.
df = read_dataset()
df = remove_baskets(df, min_baskets)
seq_data, time_stamps = sequentialize(df, return_times=True)

# Size of the "full set" (dist[2] + dist[3]) at which a sweep stops early
full_set_size = 17

for constraint, unit, values in tests:
    print(f"STARTING WITH {constraint} - {unit} - {values}")
    lengths = []
    for x in values:
        print(f"TESTING x={x}")

        # Apply GSP with only the constraint under test, e.g. min_gap=timedelta(days=x)
        kwargs = {constraint: datetime.timedelta(**{unit: x})}
        result_set = apriori(seq_data, min_sup, time_stamps, **kwargs)

        # Store the result (first argument False, as opposed to True when reading)
        read_write_result(False, min_baskets, min_sup, result_set=result_set, **kwargs)
        dist, _ = compute_distribution(result_set, print_out=False)
        lengths.append(dist[2] + dist[3])

        # DEBUG: print the count for the first value and whenever it changes
        if len(lengths) == 1 or lengths[-1] != lengths[-2]:
            print('\tLEN RESULT SET:', lengths[-1])

        # Stop when reaching full set
        if lengths[-1] == full_set_size:
            break

In [None]:
import re


# Compute plots for each value tested
folder = './gsp_res'
# The tested value is the last "_<digits>" group of the file name, followed
# only by non-digit characters; compiled once since it is reused for every file.
number_pattern = re.compile(r'\_(\d+)\D+$')
for t, x_label, _ in tests:
    # Stored results of this constraint end with e.g. "mingap.pickle"
    suffix = t.replace('_', '') + '.pickle'
    files = natsorted([f for f in os.listdir(folder) if f.endswith(suffix)])

    x, y = [], []
    for f in files:
        number = int(number_pattern.search(f).group(1))
        # The number parsed from the file name is interpreted as days
        kwargs = {t: datetime.timedelta(days=number)}

        result_set = read_write_result(True, min_baskets, min_sup, **kwargs)
        dist, _ = compute_distribution(result_set, print_out=False)

        x.append(number)
        y.append(dist[2] + dist[3])

    # Show week-based sweeps in weeks; labels are strings so that the points
    # are plotted as evenly spaced categories
    if x_label == 'weeks':
        x = [n // 7 for n in x]
    x = [str(n) for n in x]

    _, ax = plt.subplots()
    ax.plot(x, y)
    ax.set(xlabel=x_label.title(), ylabel='N. of sequences')
    plot(ax, filename=f"{t}_trend")

In [None]:
# Configuration: which stored result to inspect
min_baskets = 10
min_sup = 0.25
min_gap = datetime.timedelta(days=30)
max_gap = datetime.timedelta(weeks=20)  # set here but not passed to the read below

# Load the result stored for this min_gap, convert its tuples to lists and
# run compute_distribution on it
result_set = convert_tuples_to_list(
    read_write_result(True, min_baskets, min_sup, min_gap=min_gap)
)
compute_distribution(result_set)

# Convert ProdID to ProdDescr
df = remove_baskets(read_dataset(), min_baskets)
result_set = prodID_to_prodDescr(result_set, df)

result_set

Possibly to be discarded

In [None]:
# Configuration for a single run constrained by a 365-day max_gap
min_baskets = 10
min_sup = 0.25
max_gap = datetime.timedelta(days=365)
min_gap = datetime.timedelta(days=365)  # not used: the call below passes min_gap=None

# Read the dataset, remove some baskets and convert the rest into sequential form
df = remove_baskets(read_dataset(), min_baskets)
seq_data, time_stamps = sequentialize(df, return_times=True)

# Apply GSP with max_gap as the only active time constraint
gsp_options = {"max_span": None, "min_gap": None, "max_gap": max_gap}
result_set = apriori(seq_data, min_sup, time_stamps, **gsp_options)
compute_distribution(result_set)

# Recorded output - distribution of lengths: {1: 56, 2: 1, 3: 0, 4: 0, 5: 0}

In [None]:
# Grid of parameters to explore: every combination is run through GSP
params = {
    'min_sup': [0.4, 0.35, 0.3, 0.25, 0.2, 0.15],
    'min_baskets': [20, 10],
    'max_gap': [
        datetime.timedelta(days=1),
        datetime.timedelta(days=2),
        datetime.timedelta(days=3),
        datetime.timedelta(weeks=1),
        datetime.timedelta(weeks=2),
        datetime.timedelta(weeks=3),
        datetime.timedelta(weeks=4),
        datetime.timedelta(weeks=8),
        datetime.timedelta(weeks=12),
    ],
    'max_span': [
        datetime.timedelta(weeks=4),
        datetime.timedelta(weeks=8),
        datetime.timedelta(weeks=12),
        datetime.timedelta(weeks=48),
    ],
}
for min_sup in params['min_sup']:
    for min_baskets in params['min_baskets']:
        # The sequential data depends only on min_baskets, so it is built once
        # per value instead of once per (max_gap, max_span) combination.
        # NOTE(review): assumes apriori() does not modify seq_data /
        # time_stamps in place - confirm against the gsp module.
        df = read_dataset()
        df = remove_baskets(df, min_baskets)
        seq_data, time_stamps = sequentialize(df, return_times=True)

        for max_gap in params['max_gap']:
            for max_span in params['max_span']:
                logging.info(f"MIN_BASKETS: {min_baskets}, MIN_SUP: {min_sup}, MAX_GAP: {max_gap}, MAX_SPAN: {max_span}")

                # Apply GSP; perf_counter is a monotonic clock meant for
                # measuring elapsed intervals
                t0 = time.perf_counter()
                result_set = apriori(seq_data, min_sup, time_stamps, max_span=max_span, min_gap=None, max_gap=max_gap)
                t1 = time.perf_counter()

                # Count the sequences holding at least 2 items overall and,
                # among those, the ones in which some item occurs more than once.
                # NOTE(review): the log label below reads "> 2" while the check
                # is ">= 2"; the label is left untouched so that log.txt stays
                # consistent with earlier runs.
                cnt_len_2 = 0
                cnt_duplicates = 0
                for entry in result_set:
                    # entry[0] is iterated as a sequence of item collections;
                    # flatten it into one list of items
                    items = [item for element in entry[0] for item in element]
                    if len(items) >= 2:
                        cnt_len_2 += 1
                        if len(set(items)) < len(items):
                            cnt_duplicates += 1

                logging.info(
                    f"TOTAL TIME:\t{round(t1-t0, 2)} s\n"
                    f"LEN RESULT SET:\t{len(result_set)}\n"
                    f"LEN SEQ > 2:\t{cnt_len_2}\nN. DUPLICATES:\t{cnt_duplicates}\n"
                )

                # Save
                save_to_pickle(result_set, min_baskets, min_sup, max_gap.days, max_span.days)

In [None]:
# Run compute_distribution on whatever result_set currently holds, i.e. the
# value left behind by the most recently executed cell (presumably this
# prints the distribution of sequence lengths - confirm in functions).
compute_distribution(result_set)