In [151]:
import pandas as pd
from os import listdir
from os.path import isfile, join

import re

import plotly.express as px

DATA_PATH = 'data/'

In [152]:
# Read files
onlyfiles = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]

# Get files name
cashier_arrival = sorted(filter(lambda f : re.match("cashier-arrival", f), onlyfiles))
customer_arrival = sorted(filter(lambda f : re.match("customer-arrival", f), onlyfiles))
customer_basket = sorted(filter(lambda f : re.match("customer-basket", f), onlyfiles))

# Read files
cashier_arrival = list(map(lambda x: pd.read_csv(join(DATA_PATH, x)), cashier_arrival))
customer_arrival = list(map(lambda x: pd.read_csv(join(DATA_PATH, x)), customer_arrival))
customer_basket = list(map(lambda x: pd.read_csv(join(DATA_PATH, x)), customer_basket))

In [153]:
def preprocess_df_arrival(df):
    # Preprocess data
    df = df.drop(columns=["NA"], axis=1, errors='ignore')
    df.iloc[:,-1] = df.iloc[:,-1].apply(round)
    df.iloc[:,-1] = df.iloc[:,-1].astype('int32')
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["timestamp"] = df["timestamp"].apply(lambda x: x.replace(minute=0, second=0))
    # Aggregate time
    df = df.groupby(by="timestamp").sum()
    df = df.reset_index()
    # Normalize data
    df["norm"] = df.iloc[:,-1] / sum(df.iloc[:,-1])

    return df

In [154]:
def df_slice_time(cashier_arrival, customer_arrival):
    # Min bound
    min_cashier = max([min(x["timestamp"]) for x in cashier_arrival])
    min_customer = max([min(x["timestamp"]) for x in customer_arrival])
    min_bound = max([min_cashier, min_customer])
    # Max bound
    max_cashier = min([max(x["timestamp"]) for x in cashier_arrival])
    max_customer = min([max(x["timestamp"]) for x in customer_arrival])
    max_bound = max([max_cashier, max_customer])
    # Slice
    query_string = f'timestamp >= "{min_bound}" and timestamp <= "{max_bound}"'
    new_cashier_arrival = [x.query(query_string) for x in cashier_arrival]
    new_customer_arrival = [x.query(query_string) for x in customer_arrival]
    return new_cashier_arrival, new_customer_arrival

In [155]:
def combine_dataframe(cashier_df, customer_df):
    for id, df in enumerate(customer_arrival):
        cashier_df[id]["store"] = id
        cashier_df[id]["type"] = "customer"
    for id, df in enumerate(cashier_arrival):
        customer_df[id]["store"] = id
        customer_df[id]["type"] = "cashier"
    # Rename
    cashier_df = map(lambda x: x.coluns = ["a","b"], cashier_df)
    customer_df = map(lambda x: x.columns = [str(a) for a in range(5)], customer_df)
    return cashier_df

In [156]:
cashier_arrival = list(map(preprocess_df_arrival, cashier_arrival))
customer_arrival = list(map(preprocess_df_arrival, customer_arrival))
cashier_arrival, customer_arrival = df_slice_time(cashier_arrival, customer_arrival)

In [157]:
arrivals = combine_dataframe(cashier_arrival, customer_arrival)
arrivals

[              timestamp  number-of-cashiers      norm  store      type
 1   2018-02-01 07:00:00                   4  0.008368      0  customer
 2   2018-02-01 08:00:00                   0  0.000000      0  customer
 3   2018-02-01 09:00:00                   2  0.004184      0  customer
 4   2018-02-01 10:00:00                   5  0.010460      0  customer
 5   2018-02-01 11:00:00                   2  0.004184      0  customer
 ..                  ...                 ...       ...    ...       ...
 324 2018-02-14 18:00:00                   0  0.000000      0  customer
 325 2018-02-14 19:00:00                   5  0.010460      0  customer
 326 2018-02-14 20:00:00                   0  0.000000      0  customer
 327 2018-02-14 21:00:00                   0  0.000000      0  customer
 328 2018-02-14 22:00:00                   0  0.000000      0  customer
 
 [328 rows x 5 columns],
               timestamp  number-of-cashiers      norm  store      type
 1   2018-02-01 07:00:00             