# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 4) Pattern Mining
### *Antonio Strippoli, Valerio Mariani*

In [None]:
from gsp import apriori
import pandas as pd
import pickle
import time

In [None]:
def read_dataset():
    """Load the supermarket customers CSV into a pandas DataFrame.

    The first CSV column becomes the index and the `PurchaseDate` column is
    parsed as datetimes.
    """
    csv_path = "../DM_25_TASK1/customer_supermarket_2.csv"
    return pd.read_csv(csv_path, index_col=0, parse_dates=["PurchaseDate"])

### Functions

In [None]:
def remove_baskets(df, threshold):
    """Keep only the rows of customers with at least `threshold` distinct baskets.

    Args:
        df: DataFrame with (at least) the columns `CustomerID` and `BasketID`.
        threshold: Minimum number of distinct `BasketID` values a customer
            must have to be kept (inclusive).

    Returns:
        The rows of `df` whose `CustomerID` belongs to a retained customer.
    """
    # Number of distinct baskets for each customer
    baskets_per_customer = df.groupby('CustomerID')['BasketID'].nunique()
    # Inclusive comparison: a customer with exactly `threshold` baskets is kept
    kept_customers = baskets_per_customer.index[baskets_per_customer >= threshold]
    return df[df['CustomerID'].isin(kept_customers)]

In [None]:
# Convert the data into sequential form
def get_sequential_form(df):
    """Turn a purchases DataFrame into a list of per-customer basket sequences.

    The result holds one entry per `CustomerID` group; each entry is a list
    with one element per `BasketID` group of that customer, and each element
    is the list of distinct `ProdID` values found in that basket.
    """
    return [
        [
            list(basket_rows['ProdID'].unique())
            for _, basket_rows in customer_rows.groupby('BasketID')
        ]
        for _, customer_rows in df.groupby('CustomerID')
    ]

In [None]:
def save_to_pickle(result_set, min_baskets, min_sup):
    """Pickle GSP results to `gsp_res/{min_baskets}_{support percent}.pickle`.

    Args:
        result_set: Object to serialize (the output of the GSP run).
        min_baskets: Basket threshold used for the run; first part of the file name.
        min_sup: Minimum support as a fraction (e.g. 0.4); stored in the file
            name as a whole percentage (e.g. 40).

    The `gsp_res` directory is created if it does not exist yet.
    """
    import os  # local import so the function does not rely on a file-level `import os`

    # round() rather than int(): 0.29 * 100 evaluates to 28.999999999999996,
    # which int() would truncate to 28 and so mislabel the file.
    sup_percent = round(min_sup * 100)
    os.makedirs('gsp_res', exist_ok=True)
    with open(f'gsp_res/{min_baskets}_{sup_percent}.pickle', 'wb') as handle:
        pickle.dump(result_set, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Apply GSP on sequential data

In [None]:
# Main cycle: run GSP once for every (min_sup, min_baskets) combination
params = {
    'min_sup': [0.4, 0.3, 0.2],
    'min_baskets': [20, 10, 5],
}

# The dataset and its sequential form depend only on `min_baskets`, so build
# them once up front instead of re-reading the CSV and redoing the conversion
# for every combination.
# NOTE(review): the same sequence lists are reused across runs, which assumes
# `apriori` does not modify its input -- confirm against gsp.py.
df = read_dataset()
seq_data_by_min_baskets = {
    min_baskets: get_sequential_form(remove_baskets(df, min_baskets))
    for min_baskets in params['min_baskets']
}

for min_sup in params['min_sup']:
    for min_baskets in params['min_baskets']:
        seq_data = seq_data_by_min_baskets[min_baskets]

        # Apply GSP; perf_counter is a monotonic clock, so the measured
        # interval is not affected by system clock adjustments
        t0 = time.perf_counter()
        result_set = apriori(seq_data, min_sup, verbose=False)
        t1 = time.perf_counter()
        print(f"MIN BASKETS: {min_baskets}, MIN SUP: {min_sup}\nTOTAL TIME: {round(t1-t0, 2)} s\nLEN RESULT SET: {len(result_set)}\n")

        # Save the result set of this combination
        save_to_pickle(result_set, min_baskets, min_sup)

In [None]:
# Load back the GSP result set stored in 'gsp_res/20_40.pickle'.
# NOTE: only unpickle files you trust; pickle can execute code on load.
with open('gsp_res/20_40.pickle', 'rb') as pickle_file:
    result_set = pickle.load(pickle_file)

# Display the loaded results as the cell output
result_set