In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
# Read the cleaned transactions table (tab-separated, first column used as the
# row index) and preview its first rows.
df = pd.read_csv(
    'datasets/cleaned_dataframe.csv',
    sep='\t',
    index_col=0,
)
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta
0,536365,2010-12-01 08:26:00,2.55,17850,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6
1,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,71053,WHITE METAL LANTERN,6
2,536365,2010-12-01 08:26:00,2.75,17850,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8
3,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6
4,536365,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6


In [18]:
# Check whether any basket (order) carries more than one distinct BasketDate.
# Only rows with CustomerID > -1000 are considered
# (NOTE(review): presumably this excludes placeholder customer IDs -- confirm).
dfp = df.loc[df['CustomerID'] > -1000, ['BasketID', 'BasketDate']]

# A single groupby replaces the per-basket re-filtering of the frame; the old
# filter also built its mask from `df` instead of `dfp`, so the mask index did
# not match the frame being indexed. sort=False keeps baskets in order of
# first appearance, so the printed output order is unchanged.
dates_per_basket = dfp.groupby('BasketID', sort=False)['BasketDate'].unique()
for basket_dates in dates_per_basket:
    if len(basket_dates) > 1:
        print(basket_dates)

['2010-12-01 16:57:00' '2010-12-01 16:58:00']
['2011-01-05 13:40:00' '2011-01-05 13:41:00']
['2011-01-19 16:18:00' '2011-01-19 16:19:00']
['2011-01-20 10:47:00' '2011-01-20 10:48:00']
['2011-01-21 15:56:00' '2011-01-21 15:57:00']
['2011-01-23 13:33:00' '2011-01-23 13:34:00']
['2011-01-26 12:35:00' '2011-01-26 12:36:00']
['2011-02-01 11:19:00' '2011-02-01 11:20:00']
['2011-02-04 10:31:00' '2011-02-04 10:32:00']
['2011-02-11 16:19:00' '2011-02-11 16:20:00']
['2011-02-22 15:09:00' '2011-02-22 15:10:00']
['2011-02-24 17:50:00' '2011-02-24 17:51:00']
['2011-03-02 17:32:00' '2011-03-02 17:33:00']
['2011-03-03 16:25:00' '2011-03-03 16:26:00']
['2011-03-18 12:55:00' '2011-03-18 12:56:00']
['2011-03-24 14:55:00' '2011-03-24 14:56:00']
['2011-04-07 11:59:00' '2011-04-07 12:00:00']
['2011-04-17 12:37:00' '2011-04-17 12:38:00']
['2011-04-17 14:05:00' '2011-04-17 14:06:00']
['2011-04-19 16:30:00' '2011-04-19 16:31:00']
['2011-05-15 15:13:00' '2011-05-15 15:14:00']
['2011-05-16 14:52:00' '2011-05-16

In [35]:
# The timestamps inside a basket differ only by a minute (see the check above),
# so the time-of-day part can be dropped: keeping just the calendar date gives
# a coarser granularity for indexing the transactions.
# `dayfirst` is deliberately not passed: the raw values are year-first ISO
# strings (e.g. '2010-12-01 08:26:00'), so a day-first hint would contradict
# the data.
df['BasketDate'] = pd.to_datetime(df['BasketDate']).dt.date

#### Orders are always placed within a single day: the timestamps inside one order differ by at most one minute.

In [7]:
# Build, for every customer, the list of their transactions (baskets); each
# transaction is the array of distinct ProdIDs found in that basket.
# TODO: confirm that keeping only the distinct ProdIDs per transaction
# (no repetitions) is the right representation.
cust_trans = {}

# Grouping once avoids re-filtering the whole frame for every customer and
# every basket. sort=False keeps customers and baskets in order of first
# appearance, matching the iteration order of `.unique()`.
customer_groups = df[['CustomerID', 'BasketID', 'ProdID']].groupby('CustomerID', sort=False)
for customer, cust_df in tqdm(customer_groups, total=customer_groups.ngroups, desc="Iterating over customers"):
    cust_trans[customer] = [
        prod_ids.unique()
        for _, prod_ids in cust_df.groupby('BasketID', sort=False)['ProdID']
    ]


HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4333.0), HTML(value='')))


