# 4. Generating and filtering

Once we have assigned weights to all the variables that we wanted to predict for the individual household, we can use them to generate their predictions. These predictions are made by using the Pandas DataFrame.sample'' function and setting a weights’’ parameter. This function returns a random sample of items from the dataframe~\cite{pandas}. For example, we can use this function to return one randomized value (0,1, or 2) from the dataframe for Mondays with the ``weights’’ variable to determine whether or not this household will shop on Monday.

### Import libraries 

In [1]:
# %matplotlib notebook
%matplotlib inline
#Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
#NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Matplotlib is a plotting library for python and pyplot gives us a MatLab like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
#Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
from matplotlib.ticker import StrMethodFormatter
from matplotlib.pyplot import figure
class bcolors:
    WARNING = '\033[91m'
    BOLD = '\033[1m'
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree
# importing the required function for CHI2 tests
from scipy.stats import chi2_contingency

### Load and view data 

In [2]:
# Load the dataframe that was saved after step 2.2. Visualizing the data
df = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/DATA-HH (dummy).csv")
df.head()

Unnamed: 0,week,order_ID,item_name,amount,price_unit,price_total,date,day,timestamp,time,...,item_id,type_id,order_amount,order_price,day_num,storetype_num,storename_num,cat_num,time_num,promo_num
0,1,1,RABEKO choco light 250g,2,2.82,5.64,2021-11-23,Tuesday,12:32:00,noon,...,0,0,9,16.77,5,4,6,2,3,0
1,1,1,JOYVALLE pudding griesmeel natuur 135g,4,0.99,3.96,2021-11-23,Tuesday,12:32:00,noon,...,1,1,9,16.77,5,4,6,7,3,0
2,1,1,BONI tomatensoep met balletjes 950ml,1,1.99,1.99,2021-11-23,Tuesday,12:32:00,noon,...,2,2,9,16.77,5,4,6,3,3,0
3,1,1,LIEBIG DELISOUP 9 groenten brik 1L,1,2.59,2.59,2021-11-23,Tuesday,12:32:00,noon,...,3,2,9,16.77,5,4,6,3,3,0
4,1,1,LIEBIG DELISOUP tom. Balletjes brik 1L,1,2.59,2.59,2021-11-23,Tuesday,12:32:00,noon,...,4,2,9,16.77,5,4,6,3,3,0


In [3]:
# load the dataframe for just the orders (not all individual items) as saved after step 2.2. Visualizing the data
df_orders = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/DATA-HH (dummy-orders).csv")
df_orders = df_orders.drop_duplicates()
df_orders.head()

Unnamed: 0,week,order_ID,store_name,storename_num,store_type,storetype_num,day,day_num,time,time_num,timestamp,order_amount,order_price
0,1,1,Okay,6,supermarket,4,Tuesday,5,noon,3,12:32:00,9,16.77
1,1,2,baker,7,bakery,0,Tuesday,5,noon,3,12:39:00,7,6.1
2,1,4,Delhaize,3,supermarket,4,Friday,0,afternoon,0,17:25:00,50,103.32998
3,2,5,Albert Heijn,0,supermarket,4,Wednesday,6,morning,2,11:43:00,1,3.99
4,2,6,baker,7,bakery,0,Wednesday,6,morning,2,09:57:00,9,7.7


## 4.1. Days of the week (model step 1)

### 4.1.1. Load weighted dataframes per day of week

In [4]:
df_Mondays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Mondays).csv")
df_Tuesdays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Tuesdays).csv")
df_Wednesdays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Wednesdays).csv")
df_Thursdays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Thursdays).csv")
df_Fridays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Fridays).csv")
df_Saturdays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Saturdays).csv")
df_Sundays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Sundays).csv")

In [5]:
df_Sundays

Unnamed: 0,Sunday,weights
0,0.0,4
1,3.0,2
2,2.0,1
3,1.0,1


### 4.1.2.  Generate random shopping week

In [6]:
#create random samples for each day based on the assigned weights
def weekdays():
    df_Monday = df_Mondays.sample(n=1, weights='weights') 
    df_Monday = df_Monday.rename(columns={"Monday": "times"})
    
    df_Tuesday = df_Tuesdays.sample(n=1, weights='weights')
    df_Tuesday = df_Tuesday.rename(columns={"Tuesday": "times"})
    
    df_Wednesday = df_Wednesdays.sample(n=1, weights='weights')
    df_Wednesday = df_Wednesday.rename(columns={"Wednesday": "times"})
    
    df_Thursday = df_Thursdays.sample(n=1, weights='weights')
    df_Thursday = df_Thursday.rename(columns={"Thursday": "times"})
    
    df_Friday = df_Fridays.sample(n=1, weights='weights')
    df_Friday = df_Friday.rename(columns={"Friday": "times"})
    
    df_Saturday = df_Saturdays.sample(n=1, weights='weights')
    df_Saturday = df_Saturday.rename(columns={"Saturday": "times"})
    
    df_Sunday = df_Sundays.sample(n=1, weights='weights')
    df_Sunday = df_Sunday.rename(columns={"Sunday": "times"})

    #combine all random samplers and print the final (random) week9
    df_week9 = pd.concat([df_Monday, df_Tuesday, df_Wednesday, df_Thursday, df_Friday,df_Saturday,df_Sunday])
    df_week9['day']=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    del df_week9['weights']
    return df_week9

In [7]:
weekdays()

Unnamed: 0,times,day
0,0.0,Monday
0,2.0,Tuesday
0,0.0,Wednesday
0,0.0,Thursday
1,0.0,Friday
1,1.0,Saturday
0,0.0,Sunday


### 4.1.3. Set limits for min/max amounts of visits and shopping days per week

In [8]:
# Making sure all categories of days and weeks are included in the [orders] dataframe (so that 0 may be a possible lower limit)
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_orders.loc[:, ('day')] = pd.Categorical(df_orders.loc[:, ('day')], categories=days)

weeks = [1,2,3,4,5,6,7,8]
df_orders.loc[:, ('week')] = pd.Categorical(df_orders.loc[:, ('week')], categories=weeks)

In [9]:
# Count the minimum and maximum amounts of grocery visits per week (sum of multiple per day)
def CountTotalVisits():
    Count = df_orders.groupby(['week'], observed = False)['order_ID'].nunique()
    Count = pd.DataFrame (Count)
    Count = Count.reset_index()

    min = Count.min(axis=0,numeric_only=True)['order_ID']
    max = Count.max(axis=0,numeric_only=True)['order_ID']
    return min, max

CountTotalVisits()

(2, 7)

In [10]:
# Count the minimum and maximum amounts of grocery shopping days per week (count the days, not the total visits)
def CountTotalDays():
    Count = df_orders.groupby(['week'], observed=False)['day'].nunique()
    Count = pd.DataFrame (Count)
    Count = Count.reset_index()

    min = Count.min(axis=0,numeric_only=True)['day']
    max = Count.max(axis=0,numeric_only=True)['day']
    return min, max

CountTotalDays()

(2, 4)

### 4.1.4. Generate shopping days (with limits - loop until within the limits)

In [11]:
weekdays()

Unnamed: 0,times,day
0,0.0,Monday
1,0.0,Tuesday
1,2.0,Wednesday
1,2.0,Thursday
1,0.0,Friday
1,1.0,Saturday
0,0.0,Sunday


In [12]:
def dayofweek():
    i = 0

    while i < 2:

        ### 1. What DAY?
        dfweek = weekdays()
        
        ### CHECKPOINT: range for visits/week and days/week

            # Check if the total GROCERY VISITS PER WEEK are within the normal range
        dftimes = dfweek[dfweek['times'] > 0]
        visitcount = dftimes['times'].sum()
        totalvisits = CountTotalVisits()
        minvisits = totalvisits[0]
        maxvisits = totalvisits[1]
        if minvisits <= visitcount <= maxvisits:
            i= i+1
        else:
            print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too few/many visits per week", "| minimum:", minvisits, "| maximum:", maxvisits, "| generated:", visitcount)
            i=0
            continue

            # Check if the total GROCERY DAYS PER WEEK are within the normal range
        dftimes = dfweek[dfweek['times'] > 0]
        dayscount = dftimes['times'].count()
        totaldays = CountTotalDays()
        mindays = totaldays[0]
        maxdays = totaldays[1]
        if mindays <= dayscount <= maxdays:
            i = i+1
            dfweek['times'] = dfweek['times'].astype(int)
            print("FINAL grocery days count:", dayscount)
            print("FINAL visit count:", visitcount)
            return dfweek
        else:
            print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too few/many grocery days per week", "| minimum:", mindays, "| maximum:", maxdays, "| generated:", dayscount)
            i=0
            continue

In [13]:
df1 = dayofweek()
df1

[91m[1m MODEL WAS RERUN - Too few/many visits per week | minimum: 2 | maximum: 7 | generated: 1.0
FINAL grocery days count: 3
FINAL visit count: 4.0


Unnamed: 0,times,day
0,0,Monday
0,2,Tuesday
2,1,Wednesday
0,0,Thursday
0,1,Friday
0,0,Saturday
0,0,Sunday


We now have our first model output: days on which they will shop (+ amount of visits per day)

## 4.2. Store Name (model step 2)

### 4.2.1.  Load weighted dataframes for week/weekend

In [14]:
df_week = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_weekstore.csv")
df_end = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_weekendstore.csv")

df_end

Unnamed: 0,store_name,store_type,weight
0,Albert Heijn,supermarket,1
1,Carrefour,supermarket,1
2,Delhaize,supermarket,2
3,Okay,supermarket,2
4,baker,bakery,8
5,butcher,butcher,2


### 4.2.2.  Generate random store name for week/weekend

In [15]:
#create random samples for weekdays based on the assigned weights
def week(n):
    df_weeks = df_week.sample(n=n, weights='weight', replace=True)
    df_weeks = df_weeks.assign(day='week')
    del df_weeks['weight']

    return df_weeks

In [16]:
week(0)

Unnamed: 0,store_name,store_type,day


In [17]:
#create random samples for weekends based on the assigned weights
def weekend(n):
    df_ends = df_end.sample(n=n, weights='weight', replace=True)
    df_ends = df_ends.assign(day='weekend')
    del df_ends['weight']

    return df_ends

In [18]:
weekend(1)

Unnamed: 0,store_name,store_type,day
4,baker,bakery,weekend


In [19]:
### Generate (random) stores based on the shopping week (df1)
            # identify counts for visits per weekday
mo = df1[["times", "day"]].values[0]
tu = df1[["times", "day"]].values[1]
we = df1[["times", "day"]].values[2]
th = df1[["times", "day"]].values[3]
fr = df1[["times", "day"]].values[4]
sa = df1[["times", "day"]].values[5]
su = df1[["times", "day"]].values[6]

    # Generate times for the [previously identified] shopping dags + corresponding time
stores = [week(mo[0]), week(tu[0]), week(we[0]), week(th[0]), week(fr[0]), weekend(sa[0]), weekend(su[0])]

    # create one df for all visited stores + day of the week
stores = pd.concat(stores)

stores

Unnamed: 0,store_name,store_type,day
3,baker,bakery,week
2,Okay,supermarket,week
3,baker,bakery,week
4,butcher,butcher,week


In [20]:
# Convert the shooping days (of the week) into a seperate list

# get just the rows for which there is minimum one store visit
dftimes = df1[df1['times']>0]
dftimes = dftimes.reset_index(drop = True)

# convert these rows to lists
l1 = dftimes['day'].tolist()
l2 = dftimes['times'].tolist()

dow = list(zip(l1, l2))

# multiply both lists
dow = [char for char, count in dow for _ in range(count)]

# Append the list of the days of the week (dow) to this df
stores['dow'] = dow
stores

Unnamed: 0,store_name,store_type,day,dow
3,baker,bakery,week,Tuesday
2,Okay,supermarket,week,Tuesday
3,baker,bakery,week,Wednesday
4,butcher,butcher,week,Friday


### 4.2.3. Set limits for min/max different store visits per day, stores per week (type, name)

In [21]:
# Count min/max (different) store visits per day (IF they shop on that day)
def CountVisitsPerDay(day):
    df_orders['week']=df_orders['week'].astype('category')
    df_count = df_orders[df_orders['day']== day]

    Count = df_count.sort_values(by=['week'])
    Count = Count.groupby(['week'], observed = False)['store_name'].nunique()
    Count = pd.DataFrame (Count)
    Count = Count.reset_index()

    min = Count.min(axis=0,numeric_only=True)['store_name']
    max = Count.max(axis=0,numeric_only=True)['store_name']
    return min, max

CountVisitsPerDay('Friday')

(0, 1)

In [22]:
# Count min/max visits to one store per week (IF that store is visited)
def CountStoreName(store):
    df_orders['week']=df_orders['week'].astype('category')
    df_count = df_orders[df_orders['store_name']== store]

    Count = df_count.sort_values(by=['week'])
    Count = df_count.groupby(['week'], observed = False)['order_ID'].nunique()
    Count = pd.DataFrame (Count)
    Count = Count.reset_index()

    min = Count.min(axis=0,numeric_only=True)['order_ID']
    max = Count.max(axis=0,numeric_only=True)['order_ID']
    return min, max

CountStoreName('Carrefour')

(0, 1)

In [23]:
# Count min/max visits to one store type per week (IF that store type is visited)
def CountStoreType(store):
    df_orders['week']=df_orders['week'].astype('category')
    df_count = df_orders[df_orders['store_type']== store]

    Count = df_count.sort_values(by=['week'])
    Count = Count.groupby(['week'], observed = False)['order_ID'].nunique()
    Count = pd.DataFrame (Count)
    Count = Count.reset_index()

    min = Count.min(axis=0,numeric_only=True)['order_ID']
    max = Count.max(axis=0,numeric_only=True)['order_ID']
    return min, max

CountStoreType('butcher')

(0, 2)

### 4.2.4. Generate storenames (and matching -types; with limits - loop until within the limits)

In [24]:
# data for HH
df = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/DATA-HH (dummy).csv")

def storeperday(dataframes):
    i = 0

    while i < 3:

        ### 2. What STORE?
            # identify counts for visits per weekday
        mo = int(df1["times"].values[0])
        tu = int(df1["times"].values[1])
        we = int (df1["times"].values[2])
        th = int (df1["times"].values[3])
        fr = int (df1["times"].values[4])
        sa = int (df1["times"].values[5])
        su = int (df1["times"].values[6])

        # Generate times for the [previously identified] shopping dags + corresponding time
        stores = [week(mo), week(tu), week(we), week(th), week(fr), weekend(sa), weekend(su)]
        # create one df for all visited stores + day of the week
        stores = pd.concat(stores)

        # get just the rows for which there is minimum one store visit
        dftimes = df1[df1['times']>0]
        dftimes = dftimes.reset_index(drop = True)

        # convert these rows to lists
        l1 = dftimes['day'].tolist()
        l2 = dftimes['times'].tolist()

        dow = list(zip(l1, l2))

        # multiply both lists
        dow = [char for char, count in dow for _ in range(count)]

        # Append the list of the days of the week (dow) to this df
        stores['dow'] = dow

        ### CHECKPOINT: range for amount of stores per week.day

            # Check if the total STORE TYPES VISITED PER WEEK are within the normal range
        for storetype in df['store_type']:
            typecount = stores[stores['store_type']==storetype]['store_type'].count()
            countstoretype = CountStoreType(storetype)
            minstoretype = countstoretype[0]
            maxstoretype = countstoretype[1]
            if typecount < minstoretype:
                print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too few store type:", storetype, "| minimum:", minstoretype, "| generated:", typecount)
                i=0
                continue
            elif maxstoretype < typecount:
                print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too many store type:", storetype, "| maximum:", maxstoretype, "| generated:", typecount)
                i=0
                continue
            else:
                i= i+1

            # Check if the total STORE NAMES VISITED PER WEEK are within the normal range
        for storename in df['store_name']:
            namecount = stores[stores['store_name']==storename]['store_name'].count()
            countstorename = CountStoreName(storename)
            minstorename = countstorename[0]
            maxstorename = countstorename[1]
            if namecount < minstorename:
                print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too few store name:", storename, "| minimum:", minstorename, "| generated:", namecount)
                i=0
                continue
            elif maxstorename < namecount:
                print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too many store name:", storename, "| maximum:", maxstorename, "| generated:", namecount)
                i=0
                continue
            else:
                i= i+1

            # Check if the total STORES VISITED PER DAY are within the normal range
        for day in df['day']:
            perdaycount = stores[stores['dow']==day]['dow'].count()
            CountTotalPerday = CountVisitsPerDay(day)
            minstoreperday = CountTotalPerday[0]
            maxstoreperday = CountTotalPerday[1]
            if perdaycount < minstoreperday:
                print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too few different stores on:", day, "| minimum:", minstoreperday, "| generated:", perdaycount)
                i=0
                continue
            elif maxstoreperday < perdaycount:
                print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too many different stores on:", day, "| maximum:", maxstoreperday, "| generated:", perdaycount)
                i=0
                continue
            else:
                i= i+1
                return stores

In [25]:
df2 = storeperday(df1)
df2

Unnamed: 0,store_name,store_type,day,dow
0,Albert Heijn,supermarket,week,Tuesday
0,Albert Heijn,supermarket,week,Tuesday
0,Albert Heijn,supermarket,week,Wednesday
3,baker,bakery,week,Friday


## 4.3.  Time of day (model step 3)

### 4.3.1. Load the dataframes per day

In [26]:
# data for different days vs stores
df_Mondays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_MoTime.csv")
df_Tuesdays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_TuTime.csv")
df_Wednesdays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_WeTime.csv")
df_Thursdays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_ThTime.csv")
df_Fridays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_FrTime.csv")
df_Saturdays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_SaTime.csv")
df_Sundays = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_SuTime.csv")

In [27]:
df_Sundays

Unnamed: 0,time,weight
0,morning,9


### 4.3.2.  Select times (per day)

In [28]:
#create random samples for this day based on the assigned weights
def Monday(n):
    df_Monday = df_Mondays.sample(n=n, weights='weight', replace=True) 
    df_Monday = df_Monday.assign(day='Monday')
    del df_Monday['weight']
    del df_Monday['day']
    return df_Monday

#create random samples for this day based on the assigned weights
def Tuesday(n):
    df_Tuesday = df_Tuesdays.sample(n=n, weights='weight', replace=True) 
    df_Tuesday = df_Tuesday.assign(day='Tuesday')
    del df_Tuesday['weight']
    del df_Tuesday['day']
    return df_Tuesday

#create random samples for this day based on the assigned weights
def Wednesday(n):
    df_Wednesday = df_Wednesdays.sample(n=n, weights='weight', replace=True) 
    df_Wednesday = df_Wednesday.assign(day='Wednesday')
    del df_Wednesday['weight']
    del df_Wednesday['day']
    return df_Wednesday

#create random samples for this day based on the assigned weights
def Thursday(n):
    df_Thursday = df_Thursdays.sample(n=n, weights='weight', replace=True) 
    df_Thursday = df_Thursday.assign(day='Thursday')
    del df_Thursday['weight']
    del df_Thursday['day']
    return df_Thursday

#create random samples for this day based on the assigned weights
def Friday(n):
    df_Friday = df_Fridays.sample(n=n, weights='weight', replace=True) 
    df_Friday = df_Friday.assign(day='Friday')
    del df_Friday['weight']
    del df_Friday['day']
    return df_Friday

# #create random samples for this day based on the assigned weights
def Saturday(n):
    df_Saturday = df_Saturdays.sample(n=n, weights='weight', replace=True) 
    df_Saturday = df_Saturday.assign(day='Saturday')
    del df_Saturday['weight']
    del df_Saturday['day']
    return df_Saturday

#create random samples for this day based on the assigned weights
def Sunday(n):
    df_Sunday = df_Sundays.sample(n=n, weights='weight', replace=True) 
    df_Sunday = df_Sunday.assign(day='Sunday')
    del df_Sunday['weight']
    del df_Sunday['day']
    return df_Sunday

In [29]:
Sunday(2)

Unnamed: 0,time
0,morning
0,morning


### 4.3.3. Set limits for min/max visits per time of day

In [30]:
def CountTimingPerDay(time):
    df_count = df_orders[df_orders['time']== time]
    Count = df_count.groupby(['week', 'day'], observed=False)['order_ID'].count()
    Count = pd.DataFrame (Count)
    Count = Count.reset_index()

    min = Count.min(axis=0,numeric_only=True)['order_ID']
    max = Count.max(axis=0,numeric_only=True)['order_ID']
    return min, max

In [31]:
CountTimingPerDay('morning')

(0, 3)

### 4.3.4. Generate times of day (morning, noon, afternoon; with limits - loop until within the limits)

In [32]:
def timeperday(dataframes):
    restart = True

    while restart:

        ### 2. What STORE?
        # identify counts for visits per weekday
        mo = int(df1["times"].values[0])
        tu = int(df1["times"].values[1])
        we = int (df1["times"].values[2])
        th = int (df1["times"].values[3])
        fr = int (df1["times"].values[4])
        sa = int (df1["times"].values[5])
        su = int (df1["times"].values[6])
        
        # Generate store names for the [previously identified] shopping dags + corresponding store type
        times = [Monday(mo), Tuesday(tu), Wednesday(we), Thursday(th), Friday(fr), Saturday(sa), Sunday(su)]
        # create one df for all visited stores + day of the week
        timesdf = pd.concat(times)


        ### CHECKPOINT: range for visits/week and days/week

        # Check if the total STORE TYPES VISITED PER day are within the set limits
        timecount = timesdf['time']
        timecount = timecount.to_numpy()

        for timing in timecount:
            timeecount = timesdf[timesdf['time']==timing]['time'].count()
            counttimesperday = CountTimingPerDay(timing)
            mindaytimes = counttimesperday[0]
            maxdaytimes = counttimesperday[1]
            if mindaytimes <= timeecount <= maxdaytimes:
                restart = False
                return timesdf
                break
            else:
                print(f"{bcolors.WARNING}{bcolors.BOLD} RERUN - Too few/many different times per day", "| minimum:", mindaytimes, "| maximum:", maxdaytimes, "| generated:", timeecount)
                restart = True
                break

In [33]:
df3 = timeperday(df2)
#convert this df (time of day) to a list, so that it can be appended to df2
listtime = df3['time'].tolist()

df2['time'] = listtime
df3 = df2
df3

Unnamed: 0,store_name,store_type,day,dow,time
0,Albert Heijn,supermarket,week,Tuesday,noon
0,Albert Heijn,supermarket,week,Tuesday,noon
0,Albert Heijn,supermarket,week,Wednesday,morning
3,baker,bakery,week,Friday,afternoon


We now have our third model output: times at which they will shop (morning, noon, afternoon) ---> this will help to define the next variable: (different) item amount (count)

## 4.4. Amount of (different) items bought (model step 4)

### 4.4.1.  Load weighted dataframes for item amounts

This part of the model does not need a separate weighted dataframe since the weight is defined by the amount of times the item types are present in the whole dataframe 

### 4.4.2.  Generate random item amount for store name

In [34]:
import random
def StoreCount(storename):
    df_store = df[df['store_name'] == storename]
    df_store = df_store.groupby(['order_ID'], observed = False)['amount'].count()
    df_store = pd.DataFrame (df_store)
    df_store = df_store.reset_index()
    # The amount of (different) items they will buy will be a random number between the minimum amount of items they ever bought at this store(name) and the max amount.
    min = df_store.min(axis=0)['amount']
    max = df_store.max(axis=0)['amount']
    return random.randint(min, max)

# Test
StoreCount('Okay')

24

### 4.4.3. Set limits for min/max weekly item amounts

In [35]:
def WeekCount():
    df_count = df.groupby(['week'], observed = False)['amount'].count()
    df_count = pd.DataFrame (df_count)
    df_count = df_count.reset_index()
    # The amount of (different) items they will buy will be limited between the minimum amount of (different) items they ever bought in a week and the max amount.    
    min = df_count.min(axis=0)['amount']
    max = df_count.max(axis=0)['amount']
    return (min,max)

# Test
WeekCount()

(7, 86)

### 4.4.4. Generate item amount (with weekly (different) item amount limits - loop until within the limits)

In [36]:
dataframe = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/DATA-HH (dummy).csv")


def countsperstore(dataframes):
    storetime = dataframes[['store_name','dow']]
    
    restart = True 

    while restart:
        storecounts = []

        for store, time in storetime.itertuples(index=False):
            storecounts.append(StoreCount(store))

        dataframes['counts'] = storecounts

        # Check if the total ITEMS PER STORE are within the normal range
        for items in dataframe['item_type']:
            week9count = dataframes['counts'].sum()
            NormWeekCount = WeekCount()
            minWeekCount = NormWeekCount[0]
            maxWeekCount = NormWeekCount[1]
            if week9count < minWeekCount:
                print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too few different items this week", "| minimum:", minWeekCount, "| generated:", week9count)
                restart = True
                break # force restart
            elif maxWeekCount < week9count:
                print(f"{bcolors.WARNING}{bcolors.BOLD} MODEL WAS RERUN - Too many different items this week", "| maximum:", maxWeekCount, "| generated:", week9count)
                restart = True
                break # force restart
            else:
                restart = False
                print("FINAL total itemcount (amount) =", week9count)
                return dataframes
                break

In [37]:
# generate item counts per gorcery visit
df4 = countsperstore(df3)

# add a column to identify each grocery visit (I = identifier)
df4['I'] = np.arange(df4.shape[0])
# df4 = df4.drop(['day'], axis=1)
df4

FINAL total itemcount (amount) = 27


Unnamed: 0,store_name,store_type,day,dow,time,counts,I
0,Albert Heijn,supermarket,week,Tuesday,noon,10,0
0,Albert Heijn,supermarket,week,Tuesday,noon,5,1
0,Albert Heijn,supermarket,week,Wednesday,morning,10,2
3,baker,bakery,week,Friday,afternoon,2,3


## 4.5. Grocery lists (model step 5 - FINAL STEP)

### 4.5.1.  Load weighted dataframes for item amounts

The amount of times an item type is present in the whole dataframe makes up the weight for this step in the model, for this we need to create a new column in the dataframe:

In [38]:
df['weights_itemtype'] = df.groupby('item_type')['item_type'].transform('count')

### 4.5.2.  Generate random item lists

In [39]:
# Generate grocery lists for supermarkets, indicate the amount of items on the list (manually)
def supermarket(dataframes, counts):
        ## Create dataframes for each day they go shopping
    typedf = {}
    for storetype in dataframes['store_type']:
            storeday = df[df['store_type'] == storetype]
            items = storeday[['item_type', 'item_name', 'store_type','store_name', 'amount', 'category', 'weights_itemtype']]
            typedf[storetype] = pd.DataFrame(items)
            typedf[storetype].drop_duplicates() 
        ## sample dataframes per store type
    if 'supermarket' in dataframes.values:
            grocerylist = dataframes[dataframes['store_type'] == 'supermarket']
            if 'supermarket' in dataframes.values:
                typedf['supermarket'] = typedf['supermarket'].drop_duplicates()
                return typedf['supermarket'].sample(n=counts, replace=False)


# automatically take the item count from df4 (model step 4) by specifying the index number
def supermarkets(dataframes, index):
    storesup = dataframes[dataframes['store_type'] == 'supermarket']
    storesup = storesup[['store_type', 'counts', 'I']]
    storesup = storesup.to_numpy()
    storetypes = {}
    supermarkets = pd.DataFrame(columns = ['item_type', 'item_name', 'amount', 'category'])
    for storetype, counts, i  in storesup:
        if storetype == 'supermarket':
            storetypes[i] = pd.DataFrame(supermarket(dataframes, counts))
    return storetypes[index]


In [40]:
# Generate grocery lists for bakeries, indicate the amount of items on the list (manually)
def bakery(dataframes, counts):
        ## Create dataframes for each day they go shopping
    typedf = {}
    for storetype in dataframes['store_type']:
            storeday = df[df['store_type'] == storetype]
            items = storeday[['item_type', 'item_name', 'store_type','store_name', 'amount', 'category', 'weights_itemtype']]
            typedf[storetype] = pd.DataFrame(items)
            typedf[storetype].drop_duplicates() 
        ## sample dataframes per store type
    if 'bakery' in dataframes.values:
            grocerylist = dataframes[dataframes['store_type'] == 'bakery']
            if 'bakery' in dataframes.values:
                typedf['bakery'] = typedf['bakery'].drop_duplicates()
                return typedf['bakery'].sample(n=counts, replace=False)


# automatically take the item count from df4 (model step 4) by specifying the index number
def bakeries(dataframes, index):
    storesup = dataframes[dataframes['store_type'] == 'bakery']
    storesup = storesup[['store_type', 'counts', 'I']]
    storesup = storesup.to_numpy()
    storetypes = {}
    supermarkets = pd.DataFrame(columns = ['item_type', 'item_name', 'amount', 'category'])
    for storetype, counts, i  in storesup:
        if storetype == 'bakery':
            storetypes[i] = pd.DataFrame(bakery(dataframes, counts))
    return storetypes[index]

In [41]:
# Generate grocery lists for bakeries, indicate the amount of items on the list (manually)
def butcher(dataframes, counts):
        ## Create dataframes for each day they go shopping
    typedf = {}
    for storetype in dataframes['store_type']:
            storeday = df[df['store_type'] == storetype]
            items = storeday[['item_type', 'item_name', 'store_type','store_name', 'amount', 'category', 'weights_itemtype']]
            typedf[storetype] = pd.DataFrame(items)
            typedf[storetype].drop_duplicates() 
        ## sample dataframes per store type
    if 'butcher' in dataframes.values:
            grocerylist = dataframes[dataframes['store_type'] == 'butcher']
            if 'butcher' in dataframes.values:
                typedf['butcher'] = typedf['butcher'].drop_duplicates()
                return typedf['butcher'].sample(n=counts, replace=False)


# automatically take the item count from df4 (model step 4) by specifying the index number
def butchers(dataframes, index):
    storesup = dataframes[dataframes['store_type'] == 'butcher']
    storesup = storesup[['store_type', 'counts', 'I']]
    storesup = storesup.to_numpy()
    storetypes = {}
    supermarkets = pd.DataFrame(columns = ['item_type', 'item_name', 'amount', 'category'])
    for storetype, counts, i  in storesup:
        if storetype == 'butcher':
            storetypes[i] = pd.DataFrame(butcher(dataframes, counts))
    return storetypes[index]

### 4.5.3. Set limits for min/max items per store 

In [42]:
def CountTypePerStore(itemtype, storetype):
    df_type = df[df['store_type'] == storetype]
    count = int(df['order_ID'].nunique())
    data = {'order_ID': range(count)}
    orderID = pd.DataFrame(data)
    df_type = df_type[df_type['item_type'] == itemtype]
    df_type = df_type.groupby(['order_ID'])['amount'].count()
    df_type = pd.DataFrame (df_type)
    df_type = df_type.reset_index() 
    df_type = orderID.merge(df_type, how='left').fillna(0)
    min = df_type.min(axis=0)['amount']
    max = df_type.max(axis=0)['amount']
    return (min, max)

CountTypePerStore('lunch salad', 'supermarket')

(0.0, 1.0)

In [43]:
CountTotalPeritemSTORE = CountTypePerStore('lunch salad', 'supermarket')
minitemperdaySTORE = CountTotalPeritemSTORE[0]
maxitemeperdaySTORE = CountTotalPeritemSTORE[1]
maxitemeperdaySTORE

1.0

In [44]:
# count items per category for entire period
def PerCatCountALL(category):
    data = {'week': [1,2,3,4,5,6,7,8]}
    weeks = pd.DataFrame(data)
    df_cat = df[df['category'] == category]
    df_cat = df_cat.groupby(['week'])['amount'].count()
    df_cat = pd.DataFrame (df_cat)
    df_cat = df_cat.reset_index() 
    df_cat = weeks.merge(df_cat, how='left').fillna(0)
    min = df_cat.min(axis=0)['amount']
    max = df_cat.max(axis=0)['amount']
    return (min, max)

In [45]:
df_amount = df_orders.groupby(['week'], observed=False)['order_amount'].sum()
df_amount = pd.DataFrame (df_amount)
df_amount = df_amount.reset_index()

df_amount

Unnamed: 0,week,order_amount
0,1,66
1,2,61
2,3,82
3,4,45
4,5,118
5,6,47
6,7,14
7,8,34


In [46]:
# count items per week
def WeekAmount():
    df_amount = df_orders.groupby(['week'], observed=False)['order_amount'].sum()
    df_amount = pd.DataFrame (df_amount)
    df_amount = df_amount.reset_index()

    min = df_amount['order_amount'].min(axis=0)
    max = df_amount['order_amount'].max(axis=0)
    return (min,max)

WeekAmount()

(14, 118)

### 4.5.4. Generate grocery list per grocery visit (with limits per store - loop until within the limits)

In [47]:
df4

Unnamed: 0,store_name,store_type,day,dow,time,counts,I
0,Albert Heijn,supermarket,week,Tuesday,noon,10,0
0,Albert Heijn,supermarket,week,Tuesday,noon,5,1
0,Albert Heijn,supermarket,week,Wednesday,morning,10,2
3,baker,bakery,week,Friday,afternoon,2,3


In [48]:
restart = True 
while restart:
    # GET A LIST PER STORE VISITED IN THIS WEEK [ADJUST ACCORDING TO WEEKPLANNING]
    # butcher1 = butchers(df4, 3)
    super1 = supermarkets(df4, 0)
    super2 = supermarkets(df4,1)
    # super3 = supermarkets(df4, 2)
    bakery1 = bakeries(df4, 2)
    bakery2 = bakeries(df4, 3)

    
    # get lists per store type
    supermarketlist = pd.concat([super1])
    supermarketlist = pd.concat([super2])
    # supermarketlist = pd.concat([super3])
    bakerlist = pd.concat([bakery1])
    bakerlist = pd.concat([bakery2])
    # butcherlist = pd.concat([butcher1])

    # append the different lists to the entire week
    weeklist = [super1, super2, bakery1, bakery2]
    week = pd.concat(weeklist)

    #########
    # Check if the categories per WEEK counts are ok:
    for category in df['category']:
            percatcount = week[week['category']==category]['category'].count()
            CountTotalPerCat = PerCatCountALL(category)
            mincatperday = CountTotalPerCat[0]
            maxcatperday = CountTotalPerCat[1]
            if mincatperday <= percatcount <= maxcatperday:
                pass
            else:
                print(f"{bcolors.WARNING}{bcolors.BOLD} RERUN -  week - Too many/few of category:", category, "| minimum:", mincatperday, "| maximum:", maxcatperday, "| generated:", percatcount)
                restart = True
                break       
                
    for items in df['item_type']:
            week9Amount = week['amount'].sum()
            NormWeekAmount = WeekAmount()
            minWeekAmount = NormWeekAmount[0]
            maxWeekAmount = NormWeekAmount[1]
            if minWeekAmount <= week9Amount <= maxWeekAmount:
                pass
            else:
                print(f"{bcolors.WARNING}{bcolors.BOLD} RERUN - AMOUNT/WEEK - to many:", week9Amount, "| minimum:", minWeekAmount, "| maximum:", maxWeekAmount, "| generated:", week9Amount)
                restart = True
                break  
               
    # Check if the itemtype counts PER STORE TYPE are ok:
    for itemtype in df['item_type']:
            peritemcountSTORE = supermarketlist[supermarketlist['item_type']==itemtype]['item_type'].count()
            CountTotalPeritemSTORE = CountTypePerStore(itemtype, 'supermarket')
            minitemperdaySTORE = CountTotalPeritemSTORE[0]
            maxitemeperdaySTORE = CountTotalPeritemSTORE[1]
            if minitemperdaySTORE <= peritemcountSTORE <= maxitemeperdaySTORE:
                # print('gud')
                restart = False
                continue
            else:
                print(f"{bcolors.WARNING}{bcolors.BOLD} RERUN -  week - Too many/few of item:", itemtype, "| minimum:", minitemperdaySTORE, "| maximum:", maxitemeperdaySTORE, "| generated:", peritemcountSTORE)
                restart = True
                break
            

KeyError: 2

In [1562]:
#check grocery list for the entire week
week.sort_values(by=['category']).head()

Unnamed: 0,item_type,item_name,store_type,store_name,amount,category,weights_itemtype
178,bread,ZWARTE WOUD BROOD,bakery,baker,1,bakery,16
181,pastry,BOOTJE,bakery,baker,1,bakery,2
7,sandwich,SANDWICH,bakery,baker,4,bakery,7
281,danish,BOOTJE,bakery,baker,2,bakery,9
189,bread,BROOD grof gesneden,bakery,baker,1,bakery,16


In [1565]:
# Check individual lists (if supermarket super[i], if bakery bakery[i]; if butcher butcher[i])
super2.sort_values(by=['category'])

Unnamed: 0,item_type,item_name,store_type,store_name,amount,category,weights_itemtype
318,ice tea,"BONI Iced Tea Peach niet-bruisend 0,5L",supermarket,Okay,6,beverages,4
326,beer,ST BERNARDUS,supermarket,Albert Heijn,1,beverages,4
259,soup,BONI tomatensoep met balletjes 950ml,supermarket,Okay,1,beverages,6
45,tea,20 CLI RAS7MIN BIO,supermarket,Delhaize,1,beverages,4
264,tomato paste,BONI tomatenpuree dubbel geconcentr. 70g,supermarket,Okay,2,canned foods,2
309,sauce,GO TAN WOKSAUS ZOET ZUUR,supermarket,Delhaize,1,condiments,1
98,cheese,PORT SALUT 135g,supermarket,Okay,1,dairy & plant based,20
221,milk,BONI halfvolle melk PET 50cl,supermarket,Okay,6,dairy & plant based,1
155,cheese,GOUDAJONG SN 300,supermarket,Carrefour,1,dairy & plant based,20
163,eggs,SIMPL SCHAR EI X12,supermarket,Carrefour,1,dairy & plant based,5
