In [41]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import OrderedDict

In [42]:
# load dataframe
df = pd.read_csv('datasets/cleaned_dataframe.csv', sep='\t', index_col=0)
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta
0,536365,2010-12-01 08:26:00,2.55,17850,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6
1,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,71053,WHITE METAL LANTERN,6
2,536365,2010-12-01 08:26:00,2.75,17850,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8
3,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6
4,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6


In [60]:
# utility to transform list of dicts into list of lists of transactions, given a particular customer
def getCustomerTransactions(cust_trans,customer):
    cust_dict_list = cust_trans[customer]
    cust_trans_list = list()
    for _,v in cust_dict_list.items():
        cust_trans_list.append(v)
    return cust_trans_list
    

In [44]:
df['BasketDate'] = pd.to_datetime(df["BasketDate"], dayfirst=True)

#### It is easy to notice that orders are always made on same day (differ at max by one minute intraorder)

In [45]:
"{customer: {date: [[transaction1][transaction2]]}}"
cust_trans = {}
for customer in tqdm(df['CustomerID'].unique(), total=len(df['CustomerID'].unique()), desc="Iterating over customers"):
    cust_trans_ord_dict = OrderedDict()
    cust_df = df.loc[df['CustomerID'] == customer,['BasketID', 'BasketDate', 'ProdID']]
    for basket in cust_df['BasketID'].unique():
        prod_list = cust_df[cust_df['BasketID'] == basket]['ProdID'].unique() #REMINDER FOR MYSELF: IS IT CORRECT TO MAINTAIN IN A TRANSACTION ONLY UNIQUE PRODIDS, NO REPETITIONS? FROM WHAT I SEE THIS SEEMS TO BE THE CASE BUT TRY TO SEARCH FOR CONFIRMATION
        date = cust_df[cust_df['BasketID'] == basket]['BasketDate'].unique()[0] #because of what said above we can take first date of order (at max we will have 2 elements differing of 1 minute)
        cust_trans_ord_dict[date] = prod_list
    cust_trans[customer] = cust_trans_ord_dict
    
"""
for a simpler version (IF WE NEED IT)
"{customer: [[transaction1][transaction2]]}"
cust_trans = {}
for customer in tqdm(df['CustomerID'].unique(), total=len(df['CustomerID'].unique()), desc="Iterating over customers"):
    cust_trans_list = list()
    cust_df = df.loc[df['CustomerID'] == customer,['BasketID', 'ProdID']]
    for basket in cust_df['BasketID'].unique():
        prod_list = cust_df[cust_df['BasketID'] == basket]['ProdID'].unique() #REMINDER FOR MYSELF: IS IT CORRECT TO MAINTAIN IN A TRANSACTION ONLY UNIQUE PRODIDS, NO REPETITIONS? FROM WHAT I SEE THIS SEEMS TO BE THE CASE BUT TRY TO SEARCH FOR CONFIRMATION
        cust_trans_list.append(prod_list)
    cust_trans[customer] = cust_trans_list
    
"""


HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4333.0), HTML(value='')))




'\nfor a simpler version\n"{customer: [[transaction1][transaction2]]}"\ncust_trans = {}\nfor customer in tqdm(df[\'CustomerID\'].unique(), total=len(df[\'CustomerID\'].unique()), desc="Iterating over customers"):\n    cust_trans_list = []\n    cust_df = df.loc[df[\'CustomerID\'] == customer,[\'BasketID\', \'ProdID\']]\n    for basket in cust_df[\'BasketID\'].unique():\n        prod_list = cust_df[cust_df[\'BasketID\'] == basket][\'ProdID\'].unique() #REMINDER FOR MYSELF: IS IT CORRECT TO MAINTAIN IN A TRANSACTION ONLY UNIQUE PRODIDS, NO REPETITIONS? FROM WHAT I SEE THIS SEEMS TO BE THE CASE BUT TRY TO SEARCH FOR CONFIRMATION\n        cust_trans_list.append(prod_list)\n    cust_trans[customer] = cust_trans_list\n    \n'

In [66]:
# usage example
getCustomerTransactions(cust_trans,17850)

[array(['85123A', '71053', '84406B', '84029G', '84029E', '22752', '21730'],
       dtype=object),
 array(['22633', '22632'], dtype=object),
 array(['22632', '22633'], dtype=object),
 array(['85123A', '71053', '84406B', '20679', '37370', '21871', '21071',
        '21068', '82483', '82486', '82482', '82494L', '84029G', '84029E',
        '22752', '21730'], dtype=object),
 array(['85123A', '71053', '84406B', '20679', '37370', '21871', '21071',
        '21068', '82483', '82486', '82482', '82494L', '84029G', '84029E',
        '22752', '21730'], dtype=object),
 array(['22632', '22633'], dtype=object),
 array(['85123A', '71053', '84406B', '15056BL', '20679', '37370', '21871',
        '21071', '21068', '82483', '82486', '82482', '82494L', '84029G',
        '84029E', '22752', '22803', '21730'], dtype=object),
 array(['22632', '22633'], dtype=object),
 array(['85123A', '71053', '84406B', '20679', '37370', '21871', '21071',
        '21068', '82483', '82486', '82482', '82494L', '84029G', '84029E',
