In [2]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import OrderedDict
from fim import apriori

In [3]:
# Read the cleaned transactions table: tab-separated, with the saved
# index stored in the first column.
df = pd.read_csv(
    'datasets/cleaned_dataframe.csv',
    sep='\t',
    index_col=0,
)
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta
0,536365,2010-12-01 08:26:00,2.55,17850,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6
1,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,71053,WHITE METAL LANTERN,6
2,536365,2010-12-01 08:26:00,2.75,17850,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8
3,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6
4,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6


# CUSTOMER ANALYSIS

Not all customers are suitable for this kind of analysis, especially customers with few orders! If the number of orders is low, mining patterns becomes almost infeasible.

In [4]:
# Read the per-customer feature table: tab-separated, with CustomerID
# stored in the first column and used as the index.
dfc = pd.read_csv(
    'datasets/customer_dataframe.csv',
    sep='\t',
    index_col=0,
)
dfc.head()

Unnamed: 0_level_0,TProd,DProd,MeanProdOrder,TSale,MinPSale,MaxPSale,MeanSaleOrder,TRProd,MeanPSale,TOrder,SETSaleQta,SESaleQtaOrder,MeanTimeGap,MaxOrderMonth,MaxOrderDay
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
17850,1702,21,48.629,5317.89,6.36,107.25,151.94,31,3.96,34,4.137,3.186,2.088,Dec,Thu
13047,1355,105,84.688,3089.1,6.64,68.0,193.069,35,3.926,9,5.421,3.875,21.8,Aug,Wed
12583,4978,114,292.824,6629.34,6.8,132.8,389.961,50,2.14,15,5.804,4.087,23.188,Sep,Fri
13748,439,24,87.8,948.25,9.36,204.0,189.65,0,3.996,5,4.08,2.322,69.5,Apr,Mon
15100,58,1,9.667,635.1,175.2,350.4,105.85,22,10.95,3,2.252,2.252,8.6,Dec,Wed


In [20]:
# Report how many customers fall below each candidate minimum number of orders.
orders_per_customer = dfc['TOrder']
print("Total amount of customers:", len(orders_per_customer))
for threshold in (5, 4, 3):
    n_below = len(dfc[orders_per_customer < threshold])
    print(f"Total amount of customers with < {threshold} orders:", n_below)
# here we can decide which ones to prune, < 5 can be good maybe
to_prune = dfc[orders_per_customer < 5].index

Total amount of customers: 4333
Total amount of customers with < 5 orders: 3232
Total amount of customers with < 4 orders: 2837
Total amount of customers with < 3 orders: 2335


Int64Index([15100, 18074, 17420, 16250, 13705, 13747, 15862, 12791, 14045,
            17908,
            ...
            16000, 15195, 14087, 14204, 15471, 13436, 15520, 13298, 14569,
            12713],
           dtype='int64', name='CustomerID', length=3232)

In [27]:
# prune away all irrelevant customers (the CustomerIDs collected in to_prune)
# .copy() makes df an independent frame rather than a filtered slice, so the
# later column assignment (the BasketDate conversion) does not trigger
# SettingWithCopyWarning.
df = df[~df['CustomerID'].isin(to_prune)].copy()

In [3]:
# utility: flatten one customer's {date: items} mapping into a list of transactions
def getCustomerTransactions(cust_trans_dates,customer):
    """Return the transactions of one customer as a list of item lists.

    `cust_trans_dates` maps each customer to a dict of {date: [items]}.
    The dates are dropped and the item lists are returned in the dict's
    iteration order. Raises KeyError if `customer` is not present.
    """
    return list(cust_trans_dates[customer].values())

In [4]:
# BasketDate is stored year-first (e.g. "2010-12-01 08:26:00"), so no
# dayfirst hint is passed: it would contradict the data and could swap day
# and month if parsing ever fell back to the day-first path.
df['BasketDate'] = pd.to_datetime(df["BasketDate"])

#### It is easy to notice that all rows of an order are made on the same day (their timestamps differ by at most one minute within an order)

In [5]:
# Build two per-customer views of the purchase history:
#   cust_trans_dates = {customer: OrderedDict{date: [item1, item2, ...]}}
#   cust_trans       = {customer: [[item1, item2, ...], ...]}
cust_trans_dates = {}
cust_trans = {}
# Group once instead of re-filtering the whole frame for every customer and
# every basket. sort=False keeps customers and baskets in order of first
# appearance, the same order .unique() produced.
# Rows with a missing CustomerID/BasketID are skipped by groupby.
customer_groups = df[['CustomerID', 'BasketID', 'BasketDate', 'ProdID']].groupby('CustomerID', sort=False)
for customer, cust_df in tqdm(customer_groups, total=customer_groups.ngroups, desc="Iterating over customers"):
    cust_trans_ord_dict = OrderedDict()
    cust_trans_list = list()
    for basket, basket_df in cust_df.groupby('BasketID', sort=False):
        # TODO: confirm that a transaction should keep each ProdID only once (no repetitions)
        prod_list = basket_df['ProdID'].unique().tolist()
        # rows of one order differ by at most one minute, so the first timestamp stands for the whole order
        date = basket_df['BasketDate'].unique()[0]
        # NOTE: if two baskets of the same customer share this timestamp, the later one
        # overwrites the earlier one in the date-keyed dict (cust_trans keeps both)
        cust_trans_ord_dict[date] = prod_list
        cust_trans_list.append(prod_list)
    cust_trans_dates[customer] = cust_trans_ord_dict
    cust_trans[customer] = cust_trans_list

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4333.0), HTML(value='')))




In [6]:
# usage example: list every transaction (item list per order) of customer 17850,
# with the order dates dropped
getCustomerTransactions(cust_trans_dates,17850)

[['85123A', '71053', '84406B', '84029G', '84029E', '22752', '21730'],
 ['22633', '22632'],
 ['22632', '22633'],
 ['85123A',
  '71053',
  '84406B',
  '20679',
  '37370',
  '21871',
  '21071',
  '21068',
  '82483',
  '82486',
  '82482',
  '82494L',
  '84029G',
  '84029E',
  '22752',
  '21730'],
 ['85123A',
  '71053',
  '84406B',
  '20679',
  '37370',
  '21871',
  '21071',
  '21068',
  '82483',
  '82486',
  '82482',
  '82494L',
  '84029G',
  '84029E',
  '22752',
  '21730'],
 ['22632', '22633'],
 ['85123A',
  '71053',
  '84406B',
  '15056BL',
  '20679',
  '37370',
  '21871',
  '21071',
  '21068',
  '82483',
  '82486',
  '82482',
  '82494L',
  '84029G',
  '84029E',
  '22752',
  '22803',
  '21730'],
 ['22632', '22633'],
 ['85123A',
  '71053',
  '84406B',
  '20679',
  '37370',
  '21871',
  '21071',
  '21068',
  '82483',
  '82486',
  '82482',
  '82494L',
  '84029G',
  '84029E',
  '22752',
  '22803',
  '21730'],
 ['22632', '22633'],
 ['85123A',
  '71053',
  '82483',
  '82486',
  '82482',
  '824

In [7]:
# Print the docstring of fim's apriori to look up its parameters (target, supp, zmin, ...)
help(apriori)

Help on built-in function apriori in module fim:

apriori(...)
    apriori (tracts, target='s', supp=10, zmin=1, zmax=None, report='a',
             eval='x', agg='x', thresh=10, prune=None, algo='b', mode='',
             border=None)
    Find frequent item sets with the Apriori algorithm.
    tracts  transaction database to mine (mandatory)
            The database must be an iterable of transactions;
            each transaction must be an iterable of items;
            each item must be a hashable object.
            If the database is a dictionary, the transactions are
            the keys, the values their (integer) multiplicities.
    target  type of frequent item sets to find     (default: s)
            s/a   sets/all   all     frequent item sets
            c     closed     closed  frequent item sets
            m     maximal    maximal frequent item sets
            g     gens       generators
            r     rules      association rules
    supp    minimum support of an i

In [13]:
# Preview: mine itemsets for the first customers only.
n_preview = 2
# target='m' asks for maximal frequent item sets and supp is the minimum support
# (see help(apriori)); zmin is presumably the minimum item set size — TODO confirm.
apriori_params = dict(supp=10, zmin=10, target='m')
trans = list(cust_trans.values())
for cust in tqdm(trans[:n_preview], total=n_preview, desc="Iterating over customer to analyze transactions"):
    itemsets = apriori(cust, **apriori_params)
    print(itemsets)

HBox(children=(HTML(value='Iterating over customer to analyze transactions'), FloatProgress(value=0.0, max=2.0…

[(('15056BL', '20679', '22803', '82486', '84406B', '82482', '21068', '37370', '22752', '82483', '21071', '82494L', '84029E', '84029G', '71053', '85123A', '21730'), 4), (('20679', '22803', '82486', '21871', '84406B', '82482', '21068', '37370', '22752', '82483', '21071', '82494L', '84029E', '84029G', '71053', '85123A', '21730'), 4), (('22803', '22411', '82486', '21871', '84406B', '82482', '21068', '37370', '22752', '82483', '21071', '82494L', '84029E', '84029G', '71053', '85123A', '21730'), 4)]
[(('23118', '23029', '22423', '85123A', '23028', '23112', '23110', '23111', '23236', '23245', '22720'), 2), (('23093', '23240', '23173', '23028', '23112', '23110', '23111', '23236', '23245', '21755', '22720'), 2)]

