# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 4 (optional). Pattern Mining with Time Constraints
### *Antonio Strippoli, Valerio Mariani*

In [None]:
import matplotlib.pyplot as plt
from natsort import natsorted
from functions import *  # Custom function for the analysis
from gsp import apriori
import datetime
import logging
import time
import os
import re


# Logging: records of level INFO and above are appended to log.txt as bare
# messages, and are also echoed by a stream handler (stderr by default).
logging.basicConfig(
    format="%(message)s",
    filemode="a+",
    filename="log.txt",
    level=logging.INFO,
)
console_handler = logging.StreamHandler()
logging.getLogger().addHandler(console_handler)

In [None]:
def plot(ax, folder="pattern_mining", filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, then show and close the current figure.

    Args:
        ax: Axes object(s) that were just drawn. Not used by the function
            itself: the figure is retrieved with ``plt.gcf()``.
        folder: Directory in which the image is saved when ``filename`` is
            given. It is created, together with any missing parent
            directory, if it does not exist yet. An empty string saves into
            the current working directory.
        filename: Name of the image file; an empty string disables saving.
        figsize: Figure size in inches, as ``(width, height)``.
    """
    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        if folder:
            # makedirs(exist_ok=True) copes with nested folders and avoids the
            # check-then-create race of os.path.exists() + os.mkdir().
            os.makedirs(folder, exist_ok=True)
        plt.savefig(os.path.join(folder, filename))
    plt.show()
    plt.close()

### Apply GSP on sequential data using Time Constraints

In [None]:
# Config: parameters identifying the result we want to analyze
min_sup = 0.25
min_baskets = 10
# One entry per time constraint:
# (keyword passed to apriori, timedelta unit, values to try in order)
tests = [
    ('min_gap', 'days', [*range(33, -1, -3)]),
    ('max_gap', 'weeks', [*range(16, 73, 4)]),
    ('max_span', 'weeks', [*range(16, 93, 4)]),
]

In [None]:
# Run GSP once for every candidate value of every time constraint listed in
# `tests`, storing each result set through read_write_result.
#
# The input data does not depend on the constraint being tested, so it is
# prepared a single time instead of being re-read at every iteration.
# NOTE(review): this relies on apriori not modifying seq_data / time_stamps
# in place -- confirm against gsp.apriori.
df = read_dataset()
# Remove some baskets
df = remove_baskets(df, min_baskets)
# Convert into seq form
seq_data, time_stamps = sequentialize(df, return_times=True)

# Value of dist[2] + dist[3] at which the sweep stops (the "full set").
full_set_size = 17

for constraint, unit, values in tests:
    print(f"STARTING WITH {constraint} - {unit} - {values}")
    lengths = []
    for x in values:
        print(f"TESTING x={x}")

        # Apply GSP with a single constraint, e.g. max_gap=timedelta(weeks=16)
        kwargs = {constraint: datetime.timedelta(**{unit: x})}
        result_set = apriori(seq_data, min_sup, time_stamps, **kwargs)

        read_write_result(False, min_baskets, min_sup, result_set=result_set, **kwargs)
        dist, _ = compute_distribution(result_set, print_out=False)
        # presumably the counts of length-2 and length-3 sequences -- verify
        # against compute_distribution
        n_sequences = dist[2] + dist[3]
        lengths.append(n_sequences)

        # DEBUG print: report the count only for the first value tested or
        # when it differs from the previous one
        if len(lengths) == 1 or lengths[-1] != lengths[-2]:
            print('\tLEN RESULT SET:', n_sequences)

        # Stop when reaching full set
        if n_sequences == full_set_size:
            break

### Compute plots for each value tested

In [None]:
# For every time constraint, reload the stored result sets and plot how the
# number of sequences changes with the value of the constraint.
folder = './gsp_res'
# Matches "_<digits><non-digits>" at the end of a file name; the digits are
# the tested value. Compiled once because it is the same for every file.
value_pattern = re.compile(r'\_(\d+)\D+$')
for t, x_label, _ in tests:
    suffix = t.replace('_', '') + '.pickle'
    files = natsorted([f for f in os.listdir(folder) if f.endswith(suffix)])

    x, y = [], []
    for f in files:
        match = value_pattern.search(f)
        if match is None:
            # Skip names that carry no value instead of crashing with an
            # AttributeError on .group().
            logging.warning("Skipping unexpected file name: %s", f)
            continue
        number = int(match.group(1))
        # The value read from the file name is interpreted as a number of days
        kwargs = {t: datetime.timedelta(days=number)}

        result_set = read_write_result(True, min_baskets, min_sup, **kwargs)
        dist, _ = compute_distribution(result_set, print_out=False)

        x.append(number)
        y.append(dist[2] + dist[3])

    # Values are in days: show whole weeks for the constraints tested in weeks
    if x_label == 'weeks':
        x = [n // 7 for n in x]
    # String labels are plotted in list order, one tick per tested value
    x = [str(n) for n in x]

    _, ax = plt.subplots()
    ax.plot(x, y)
    ax.set(xlabel=x_label.title(), ylabel='N. of sequences')
    plot(ax, filename=f"{t}_trend")

### Compute time gaps distribution

In [None]:
# Config: parameters identifying the result we want to analyze
min_sup = 0.25
min_baskets = 10

# Load the stored result (no time constraint given) with tuples turned into lists
result_set = convert_tuples_to_list(read_write_result(True, min_baskets, min_sup))

# Rebuild the sequential form of the original dataset
df = remove_baskets(read_dataset(), min_baskets)
seq_data, time_stamps = sequentialize(df, return_times=True)

# Compute the time gaps between each couple of events, then convert
# ProdID to ProdDescr
result_set = prodID_to_prodDescr(
    compute_patterns_time(result_set, seq_data, time_stamps),
    df,
)

# For each pattern: print its first two fields, the size of its last field
# relative to the number of sequences, and the distribution of that last field
n_sequences = len(seq_data)
for res in result_set:
    gaps = res[-1]
    print(res[0])
    print(res[1], "-", len(gaps) / n_sequences)
    times = pd.DataFrame(gaps)
    print(times.describe())
    plot(times.hist(bins=20))

result_set