# SEQUENTIAL PATTERN MINING

In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import OrderedDict
from gsp import apriori

***

### UTILITY

In [2]:
# utility to transform list of dicts into list of lists of transactions, given a particular customer
def getCustomerTransactions(cust_trans_dates,customer):
    """Return one customer's transactions as a plain list of item lists.

    Parameters
    ----------
    cust_trans_dates : mapping of customer -> mapping whose values are item lists
    customer : key to look up in ``cust_trans_dates``

    Returns
    -------
    list
        The values of the customer's inner mapping, in that mapping's
        iteration order; the inner keys are dropped.

    Raises
    ------
    KeyError
        If ``customer`` is not a key of ``cust_trans_dates``.
    """
    return [items for items in cust_trans_dates[customer].values()]

***

In [3]:
# load dataframe: tab-separated file, first column is used as the row index
df = pd.read_csv('datasets/cleaned_dataframe.csv', sep='\t', index_col=0)
# preview the first rows
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta
0,536365,2010-12-01 08:26:00,2.55,17850,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6
1,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,71053,WHITE METAL LANTERN,6
2,536365,2010-12-01 08:26:00,2.75,17850,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8
3,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6
4,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6


### CUSTOMER ANALYSIS

Not all customers are suitable for this kind of analysis, especially customers with few orders: if the number of orders is low, mining patterns becomes almost infeasible.

In [4]:
# load customer dataframe: tab-separated file, first column is used as the row index
# (the TOrder column read here is what the pruning step below filters on)
dfc = pd.read_csv('datasets/customer_dataframe.csv', sep='\t', index_col=0)
# preview the first rows
dfc.head()

Unnamed: 0_level_0,TProd,DProd,MeanProdOrder,TSale,MinPSale,MaxPSale,MeanSaleOrder,TRProd,MeanPSale,TOrder,SETSaleQta,SESaleQtaOrder,MeanTimeGap,MaxOrderMonth,MaxOrderDay
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
17850,1702,21,48.629,5317.89,6.36,107.25,151.94,31,3.96,34,4.137,3.186,2.088,Dec,Thu
13047,1355,105,84.688,3089.1,6.64,68.0,193.069,35,3.926,9,5.421,3.875,21.8,Aug,Wed
12583,4978,114,292.824,6629.34,6.8,132.8,389.961,50,2.14,15,5.804,4.087,23.188,Sep,Fri
13748,439,24,87.8,948.25,9.36,204.0,189.65,0,3.996,5,4.08,2.322,69.5,Apr,Mon
15100,58,1,9.667,635.1,175.2,350.4,105.85,22,10.95,3,2.252,2.252,8.6,Dec,Wed


In [5]:
# report how many customers fall below each candidate order threshold
print("Total amount of customers:",len(dfc))
print("Total amount of customers with < 5 orders:",(dfc['TOrder'] < 5).sum())
print("Total amount of customers with < 4 orders:",(dfc['TOrder'] < 4).sum())
print("Total amount of customers with < 3 orders:",(dfc['TOrder'] < 3).sum())
# index labels of the customers to discard; the threshold chosen here is < 5 orders
to_prune = dfc.index[dfc['TOrder'] < 5]

Total amount of customers: 4333
Total amount of customers with < 5 orders: 3232
Total amount of customers with < 4 orders: 2837
Total amount of customers with < 3 orders: 2335


In [6]:
# prune away all irrelevant customers (those whose CustomerID is in to_prune)
# .copy() makes the result an independent frame: the BasketDate column is
# reassigned afterwards, and writing into a boolean-filtered slice triggers
# pandas' SettingWithCopyWarning
df = df[~df['CustomerID'].isin(to_prune)].copy()

In [7]:
# BasketDate is stored year-first (e.g. "2010-12-01 08:26:00"), so day-first
# parsing must not be requested. assign() returns a new frame instead of
# writing a column into a filtered slice of the original data.
df = df.assign(BasketDate=pd.to_datetime(df["BasketDate"]))

#### It is easy to notice that all rows of an order are made on the same day (timestamps within one order differ by at most one minute)

In [8]:
# Structures built by this cell:
#   cust_trans_dates = {customer: OrderedDict{date: [item1, item2, ...]}}
#   cust_trans       = {customer: [[item1, item2, ...], ...]}   (one inner list per basket)
cust_trans_dates = {}
cust_trans = {}
# Group once instead of re-filtering the whole frame for every customer and
# every basket; sort=False keeps customers/baskets in first-appearance order,
# which is the order the previous unique()-based loops produced.
customer_groups = df[['CustomerID', 'BasketID', 'BasketDate', 'ProdID']].groupby('CustomerID', sort=False)
for customer, cust_df in tqdm(customer_groups, total=customer_groups.ngroups, desc="Iterating over customers"):
    cust_trans_ord_dict = OrderedDict()
    cust_trans_list = list()
    for basket, basket_df in cust_df.groupby('BasketID', sort=False):
        # TODO: confirm that a transaction should keep only unique ProdIDs (no repetitions)
        prod_list = basket_df['ProdID'].unique().tolist()
        # rows of one order differ by at most one minute, so the first date represents the order
        date = basket_df['BasketDate'].unique()[0]
        if date in cust_trans_ord_dict:
            # Two baskets of the same customer share this timestamp: merge them
            # instead of overwriting, otherwise the earlier basket's items are lost.
            # A new list is built so the per-basket lists in cust_trans stay untouched.
            already_seen = cust_trans_ord_dict[date]
            cust_trans_ord_dict[date] = already_seen + [p for p in prod_list if p not in already_seen]
        else:
            cust_trans_ord_dict[date] = prod_list
        cust_trans_list.append(prod_list)
    cust_trans_dates[customer] = cust_trans_ord_dict
    cust_trans[customer] = cust_trans_list

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=1101.0), HTML(value='')))




In [9]:
# usage example for the date-including structure: list every stored
# transaction (item list) of customer 17850, dropping the date keys
getCustomerTransactions(cust_trans_dates,17850)

[['85123A', '71053', '84406B', '84029G', '84029E', '22752', '21730'],
 ['22633', '22632'],
 ['22632', '22633'],
 ['85123A',
  '71053',
  '84406B',
  '20679',
  '37370',
  '21871',
  '21071',
  '21068',
  '82483',
  '82486',
  '82482',
  '82494L',
  '84029G',
  '84029E',
  '22752',
  '21730'],
 ['85123A',
  '71053',
  '84406B',
  '20679',
  '37370',
  '21871',
  '21071',
  '21068',
  '82483',
  '82486',
  '82482',
  '82494L',
  '84029G',
  '84029E',
  '22752',
  '21730'],
 ['22632', '22633'],
 ['85123A',
  '71053',
  '84406B',
  '15056BL',
  '20679',
  '37370',
  '21871',
  '21071',
  '21068',
  '82483',
  '82486',
  '82482',
  '82494L',
  '84029G',
  '84029E',
  '22752',
  '22803',
  '21730'],
 ['22632', '22633'],
 ['85123A',
  '71053',
  '84406B',
  '20679',
  '37370',
  '21871',
  '21071',
  '21068',
  '82483',
  '82486',
  '82482',
  '82494L',
  '84029G',
  '84029E',
  '22752',
  '22803',
  '21730'],
 ['22632', '22633'],
 ['85123A',
  '71053',
  '82483',
  '82486',
  '82482',
  '824

In [10]:
# show the signature of the apriori function imported from the gsp module
help(apriori)

Help on function apriori in module gsp:

apriori(dataset, minSupport, verbose=False)



In [11]:
# one entry per customer: that customer's list of transactions (item lists)
trans = list(cust_trans.values())
# run the gsp apriori on the transactions at position 16 with minSupport=0
# NOTE(review): why position 16 and a minimum support of 0 were chosen is not stated — confirm
result_set = apriori(trans[16], 0, verbose=True)
# NOTE(review): the disabled loop below passes supp/zmin/target keywords, which do not
# match the gsp signature apriori(dataset, minSupport, verbose=False) — confirm before re-enabling
#itemsets_list = []
#trans = trans[0:17]
#for cust in tqdm(trans, total=len(trans), desc="Iterating over customer to analyze transactions"):
#    itemsets = apriori(cust, supp=70, zmin=20, target='m')
    

Iterating over customers:   0%|                                                                  | 0/1 [00:00<?, ?it/s]

Verbose: True
itemsInDataset:  ['15034', '15036', '15056N', '20695', '20697', '20698', '20705', '20718', '20724', '20725', '21034', '21035', '21080', '21154', '21166', '21169', '21172', '21175', '21181', '21216', '21231', '21232', '21479', '21485', '21497', '21531', '21533', '21535', '21539', '21770', '21843', '21844', '21870', '21872', '21874', '21903', '21908', '21980', '21982', '22064', '22072', '22111', '22112', '22114', '22191', '22193', '22302', '22303', '22311', '22312', '22332', '22365', '22366', '22423', '22440', '22449', '22468', '22469', '22470', '22494', '22557', '22616', '22632', '22637', '22652', '22654', '22665', '22666', '22667', '22670', '22671', '22672', '22697', '22698', '22704', '22705', '22727', '22729', '22752', '22834', '22835', '22865', '22866', '22867', '22894', '22895', '22896', '22897', '22898', '22903', '22905', '22908', '22922', '22923', '22969', '22992', '23007', '23076', '23077', '23123', '23181', '23184', '23191', '23196', '23206', '23208', '23232', '232

                                                                                                                       

Result, lvl 1: [(['15034'], 1), (['15036'], 2), (['15056N'], 1), (['20695'], 1), (['20697'], 1), (['20698'], 1), (['20705'], 3), (['20718'], 2), (['20724'], 3), (['20725'], 3), (['21034'], 1), (['21035'], 1), (['21080'], 4), (['21154'], 2), (['21166'], 4), (['21169'], 3), (['21172'], 5), (['21175'], 2), (['21181'], 5), (['21216'], 4), (['21231'], 5), (['21232'], 5), (['21479'], 4), (['21485'], 2), (['21497'], 4), (['21531'], 2), (['21533'], 2), (['21535'], 2), (['21539'], 1), (['21770'], 2), (['21843'], 1), (['21844'], 4), (['21870'], 2), (['21872'], 2), (['21874'], 2), (['21903'], 1), (['21908'], 3), (['21980'], 3), (['21982'], 4), (['22064'], 2), (['22072'], 4), (['22111'], 6), (['22112'], 6), (['22114'], 6), (['22191'], 5), (['22193'], 2), (['22302'], 3), (['22303'], 3), (['22311'], 5), (['22312'], 5), (['22332'], 6), (['22365'], 2), (['22366'], 4), (['22423'], 3), (['22440'], 4), (['22449'], 5), (['22468'], 2), (['22469'], 1), (['22470'], 3), (['22494'], 5), (['22557'], 5), (['2261



In [18]:
# count how many transactions in trans[16] contain the product `prod`
# NOTE(review): `prod` is not assigned in any visible cell and this cell's
# execution count is out of order — it relies on state already in the kernel
tot = sum(prod in tran for tran in trans[16])
print(tot)

2


In [13]:
# number 16 is scary with sup 10
# considerations on the whole set:
# with sup in range 80-99 everything is ok
# sup 70 is DEATH