# 3. Assigning weights

## Import libraries 

In [12]:
# %matplotlib notebook
%matplotlib inline
#Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
#NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Matplotlib is a plotting library for python and pyplot gives us a MatLab like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
#Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
from matplotlib.ticker import StrMethodFormatter
from matplotlib.pyplot import figure
class bcolors:
    WARNING = '\033[91m'
    BOLD = '\033[1m'

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeClassifier, export_graphviz, plot_tree

# importing the required function for CHI2 tests
from scipy.stats import chi2_contingency

## Load and view data 

In [13]:
# Load the dataframe that was saved after step 2.2. Visualizing the data
df = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/DATA-HH (dummy).csv")
df.head()

Unnamed: 0,week,order_ID,item_name,amount,price_unit,price_total,date,day,timestamp,time,...,item_id,type_id,order_amount,order_price,day_num,storetype_num,storename_num,cat_num,time_num,promo_num
0,1,1,RABEKO choco light 250g,2,2.82,5.64,2021-11-23,Tuesday,12:32:00,noon,...,0,0,9,16.77,5,4,6,2,3,0
1,1,1,JOYVALLE pudding griesmeel natuur 135g,4,0.99,3.96,2021-11-23,Tuesday,12:32:00,noon,...,1,1,9,16.77,5,4,6,7,3,0
2,1,1,BONI tomatensoep met balletjes 950ml,1,1.99,1.99,2021-11-23,Tuesday,12:32:00,noon,...,2,2,9,16.77,5,4,6,3,3,0
3,1,1,LIEBIG DELISOUP 9 groenten brik 1L,1,2.59,2.59,2021-11-23,Tuesday,12:32:00,noon,...,3,2,9,16.77,5,4,6,3,3,0
4,1,1,LIEBIG DELISOUP tom. Balletjes brik 1L,1,2.59,2.59,2021-11-23,Tuesday,12:32:00,noon,...,4,2,9,16.77,5,4,6,3,3,0


In [14]:
# load the dataframe for just the orders (not all individual items) as saved after step 2.2. Visualizing the data
df_orders = pd.read_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/DATA-HH (dummy-orders).csv")
df_orders = df_orders.drop_duplicates()
df_orders.head()

Unnamed: 0,week,order_ID,store_name,storename_num,store_type,storetype_num,day,day_num,time,time_num,timestamp,order_amount,order_price
0,1,1,Okay,6,supermarket,4,Tuesday,5,noon,3,12:32:00,9,16.77
1,1,2,baker,7,bakery,0,Tuesday,5,noon,3,12:39:00,7,6.1
2,1,4,Delhaize,3,supermarket,4,Friday,0,afternoon,0,17:25:00,50,103.32998
3,2,5,Albert Heijn,0,supermarket,4,Wednesday,6,morning,2,11:43:00,1,3.99
4,2,6,baker,7,bakery,0,Wednesday,6,morning,2,09:57:00,9,7.7


## 3.1. New Dataframes per day of week

In [15]:
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day'] = pd.Categorical(df['day'], categories=days, ordered=True)
df_dow = df.sort_values(by=['week','day'])

# grouping the variables for week, day and unique order id's
df_dow = df.groupby(['week', 'day'], observed = False)['order_ID'].nunique()
df_dow = pd.DataFrame (df_dow)
df_dow.head()

# make grid for days vs. week
df_dowgrid1 = df_dow.groupby(['week', 'day'], observed = False)['order_ID'].aggregate('first').unstack()
df_dowgrid1 = df_dowgrid1.reset_index()
df_dowgrid1.replace(0, np.nan, inplace=True)
df_dowgrid1

# second grid to generate extra variables
df_dowgrid2 = df_dowgrid1.copy()
del df_dowgrid2["week"]
# column for total grocery visits
df_dowgrid1['sum'] = df_dowgrid2.sum(axis=1)
# column for total days shopped
df_dowgrid1['ndays'] = df_dowgrid2.count(axis=1)
# column for median visits/week
df_dowgrid1['med'] = df_dowgrid2.median(numeric_only=True, axis=1)

df_dowgrid1 = df_dowgrid1.round(0)

In [16]:
df_dowgrid1 = df_dowgrid1.round(0)
df_dowgrid1 = df_dowgrid1. replace(np. nan,0)
df_dowgrid1

day,week,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,sum,ndays,med
0,1,0.0,2.0,0.0,0.0,1.0,0.0,0.0,3.0,2,2.0
1,2,0.0,0.0,2.0,2.0,0.0,0.0,3.0,7.0,3,2.0
2,3,0.0,0.0,0.0,1.0,0.0,1.0,2.0,4.0,3,1.0
3,4,0.0,0.0,1.0,0.0,0.0,1.0,3.0,5.0,3,1.0
4,5,2.0,2.0,0.0,0.0,0.0,3.0,0.0,7.0,3,2.0
5,6,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2,1.0
6,7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,3,1.0
7,8,2.0,0.0,1.0,1.0,1.0,0.0,0.0,5.0,4,1.0


###  3.1.1. Set weights for amounts of visits per day

In [17]:
# Assign weights based on how the numbers per day are represented
df_Mondays = pd.DataFrame(df_dowgrid1['Monday'])
df_Mondays['weights'] = df_Mondays.groupby(['Monday'])['Monday'].transform('count')
df_Mondays = df_Mondays.drop_duplicates()

df_Tuesdays = pd.DataFrame(df_dowgrid1['Tuesday'])
df_Tuesdays['weights'] = df_Tuesdays.groupby(['Tuesday'])['Tuesday'].transform('count')
df_Tuesdays = df_Tuesdays.drop_duplicates()

df_Wednesdays = pd.DataFrame(df_dowgrid1['Wednesday'])
df_Wednesdays['weights'] = df_Wednesdays.groupby(['Wednesday'])['Wednesday'].transform('count')
df_Wednesdays = df_Wednesdays.drop_duplicates()

df_Thursdays = pd.DataFrame(df_dowgrid1['Thursday'])
df_Thursdays['weights'] = df_Thursdays.groupby(['Thursday'])['Thursday'].transform('count')
df_Thursdays = df_Thursdays.drop_duplicates()

df_Fridays = pd.DataFrame(df_dowgrid1['Friday'])
df_Fridays['weights'] = df_Fridays.groupby(['Friday'])['Friday'].transform('count')
df_Fridays = df_Fridays.drop_duplicates()

df_Saturdays = pd.DataFrame(df_dowgrid1['Saturday'])
df_Saturdays['weights'] = df_Saturdays.groupby(['Saturday'])['Saturday'].transform('count')
df_Saturdays = df_Saturdays.drop_duplicates()

df_Sundays = pd.DataFrame(df_dowgrid1['Sunday'])
df_Sundays['weights'] = df_Sundays.groupby(['Sunday'])['Sunday'].transform('count')
df_Sundays = df_Sundays.drop_duplicates()

In [18]:
df_Sundays

Unnamed: 0,Sunday,weights
0,0.0,4
1,3.0,2
2,2.0,1
5,1.0,1


In [19]:
df_Mondays.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Mondays).csv", index = None, header=True)
df_Tuesdays.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Tuesdays).csv", index = None, header=True)
df_Wednesdays.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Wednesdays).csv", index = None, header=True)
df_Thursdays.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Thursdays).csv", index = None, header=True)
df_Fridays.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Fridays).csv", index = None, header=True)
df_Saturdays.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Saturdays).csv", index = None, header=True)
df_Sundays.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1(Sundays).csv", index = None, header=True)

### 3.1.2: Conclusion:  MODEL STEP 1 : ON WHAT DAYS WILL THEY SHOP?

Outcome (example):
1. input: Monday,Tuesday...
2. output: 0 times, 1 time, 2 times... (weights)

## 3.2. New Dataframes per times of the day

In [20]:
# Select only one day
Monday = 'Monday'
Tuesday = 'Tuesday'
Wednesday = 'Wednesday'
Thursday = 'Thursday'
Friday = 'Friday'
Saturday = 'Saturday'
Sunday = 'Sunday'

# select only these data for the df
df_Monday = df_orders[df_orders['day'] == 'Monday']
df_Tuesday = df_orders[df_orders['day'] == 'Tuesday']
df_Wednesday = df_orders[df_orders['day'] == 'Wednesday']
df_Thursday = df_orders[df_orders['day'] == 'Thursday']
df_Friday = df_orders[df_orders['day'] == 'Friday']
df_Saturday = df_orders[df_orders['day'] == 'Saturday']
df_Sunday = df_orders[df_orders['day'] == 'Sunday']

df_Tuesday

Unnamed: 0,week,order_ID,store_name,storename_num,store_type,storetype_num,day,day_num,time,time_num,timestamp,order_amount,order_price
0,1,1,Okay,6,supermarket,4,Tuesday,5,noon,3,12:32:00,9,16.77
1,1,2,baker,7,bakery,0,Tuesday,5,noon,3,12:39:00,7,6.1
21,5,24,Okay,6,supermarket,4,Tuesday,5,morning,2,11:40:00,53,104.36096
22,5,25,baker,7,bakery,0,Tuesday,5,morning,2,11:55:00,6,3.6


###  3.2.1. Set weights for times per day

In [21]:
df_Mo = df_Monday.groupby(['time'])['order_ID'].nunique()
df_Mo = pd.DataFrame (df_Mo)
df_Mo = df_Mo.reset_index()
df_Mo = df_Mo.loc[~(df_Mo==0).any(axis=1)]
df_Mo = df_Mo.rename(columns={"order_ID": "weight"})

df_Tu = df_Tuesday.groupby(['time'])['order_ID'].nunique()
df_Tu = pd.DataFrame (df_Tu)
df_Tu = df_Tu.reset_index()
df_Tu = df_Tu.loc[~(df_Tu==0).any(axis=1)]
df_Tu = df_Tu.rename(columns={"order_ID": "weight"})

df_We = df_Wednesday.groupby(['time'])['order_ID'].nunique()
df_We = pd.DataFrame (df_We)
df_We = df_We.reset_index()
df_We = df_We.loc[~(df_We==0).any(axis=1)]
df_We = df_We.rename(columns={"order_ID": "weight"})

df_Th = df_Thursday.groupby(['time'])['order_ID'].nunique()
df_Th = pd.DataFrame (df_Th)
df_Th = df_Th.reset_index()
df_Th = df_Th.loc[~(df_Th==0).any(axis=1)]
df_Th = df_Th.rename(columns={"order_ID": "weight"})

df_Fr = df_Friday.groupby(['time'])['order_ID'].nunique()
df_Fr = pd.DataFrame (df_Fr)
df_Fr = df_Fr.reset_index()
df_Fr = df_Fr.loc[~(df_Fr==0).any(axis=1)]
df_Fr = df_Fr.rename(columns={"order_ID": "weight"})

df_Sa = df_Saturday.groupby(['time'])['order_ID'].nunique()
df_Sa = pd.DataFrame (df_Sa)
df_Sa = df_Sa.reset_index()
df_Sa = df_Sa.loc[~(df_Sa==0).any(axis=1)]
df_Sa = df_Sa.rename(columns={"order_ID": "weight"})

df_Su = df_Sunday.groupby(['time'])['order_ID'].nunique()
df_Su = pd.DataFrame (df_Su)
df_Su = df_Su.reset_index()
df_Su = df_Su.loc[~(df_Su==0).any(axis=1)]
df_Su = df_Su.rename(columns={"order_ID": "weight"})

In [22]:
df_Mo

Unnamed: 0,time,weight
0,afternoon,1
1,morning,2
2,noon,2


In [23]:
df_Mo.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_MoTime.csv", index = None, header=True)
df_Tu.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_TuTime.csv", index = None, header=True)
df_We.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_WeTime.csv", index = None, header=True)
df_Th.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_ThTime.csv", index = None, header=True)
df_Fr.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_FrTime.csv", index = None, header=True)
df_Sa.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_SaTime.csv", index = None, header=True)
df_Su.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_SuTime.csv", index = None, header=True)

### 3.2.2: Conclusion:  MODEL STEP 2 : At what time will they shop?

#### If they go to store X (on day Y) => at X time (noon, afternoon, evening)

Data to build the algorithm:
>  2. Number of visits per time (per week)
    1. Assign weights to each time (based on times shopped on these times)
    
    
We then have the second given:
> 1. Week 9: HH3 will shop on <b>Monday/Tuesday/..., X times</b>
> 2. Week 9: HH3 will shop on <b>noon/afternoon/..., X times</b> ==> related to the day

## 3.3. New dataframes per week/weekend

In [24]:
df.to_csv(r"/workspaces/DesignerlyAlgorithmicPrototyping/database/DATA-HH (dummy).csv", index=False)
df.head()

Unnamed: 0,week,order_ID,item_name,amount,price_unit,price_total,date,day,timestamp,time,...,item_id,type_id,order_amount,order_price,day_num,storetype_num,storename_num,cat_num,time_num,promo_num
0,1,1,RABEKO choco light 250g,2,2.82,5.64,2021-11-23,Tuesday,12:32:00,noon,...,0,0,9,16.77,5,4,6,2,3,0
1,1,1,JOYVALLE pudding griesmeel natuur 135g,4,0.99,3.96,2021-11-23,Tuesday,12:32:00,noon,...,1,1,9,16.77,5,4,6,7,3,0
2,1,1,BONI tomatensoep met balletjes 950ml,1,1.99,1.99,2021-11-23,Tuesday,12:32:00,noon,...,2,2,9,16.77,5,4,6,3,3,0
3,1,1,LIEBIG DELISOUP 9 groenten brik 1L,1,2.59,2.59,2021-11-23,Tuesday,12:32:00,noon,...,3,2,9,16.77,5,4,6,3,3,0
4,1,1,LIEBIG DELISOUP tom. Balletjes brik 1L,1,2.59,2.59,2021-11-23,Tuesday,12:32:00,noon,...,4,2,9,16.77,5,4,6,3,3,0


In [25]:
# aggregate afternoon and evening
df['day'] = df['day'].replace(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'], 'weekday')
df['day'] = df['day'].replace(['Saturday', 'Sunday'], 'weekend')

# aggregate afternoon and evening
df['store_type'] = df['store_type'].replace(['drugstore', 'furniture store'], 'other')
df.drop(df[df['store_type']== 'other'].index, inplace = True)

  df['day'] = df['day'].replace(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'], 'weekday')
  df['day'] = df['day'].replace(['Saturday', 'Sunday'], 'weekend')


In [26]:
# select only these data for the df
df_weekdays = df[df['day'] == 'weekday']
df_weekends = df[df['day'] == 'weekend']

df_week = df_weekdays.groupby(['store_name', 'store_type'])['order_ID'].nunique()
df_week = pd.DataFrame (df_week)
df_week = df_week.reset_index()
df_week = df_week.loc[~(df_week==0).any(axis=1)]
df_week = df_week.rename(columns={"order_ID": "weight"})

df_end = df_weekends.groupby(['store_name', 'store_type'])['order_ID'].nunique()
df_end = pd.DataFrame (df_end)
df_end = df_end.reset_index()
df_end = df_end.loc[~(df_end==0).any(axis=1)]
df_end = df_end.rename(columns={"order_ID": "weight"})

In [27]:
df_week.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_weekstore.csv", index = None, header=True)
df_end.to_csv (r"/workspaces/DesignerlyAlgorithmicPrototyping/database/df_HH1_weekendstore.csv", index = None, header=True)

In [28]:
df_end

Unnamed: 0,store_name,store_type,weight
0,Albert Heijn,supermarket,1
1,Carrefour,supermarket,1
2,Delhaize,supermarket,2
3,Okay,supermarket,2
4,baker,bakery,8
5,butcher,butcher,2


#### We now have all weighted dataframes necessary to be able to create our designerly baseline model. The steps in this model will be:
1. Generate a shopping week (per day of the week: (1) Will they shop? (0/1) (2) How many store visits will they do on this day? --> limited by min/max amounts of visits and shopping days per week
2. Generate a store (name) per shopping visit ---> limited by min/max different store visits per day, stores per week (type, name)
3. Generate time per day per store type ---> limited by min/max different store visits per time of day
4. Generate amount of items bought for each shopping visit ---> limited by min/max (unique) item amounts per week
5. Generate item types per shopping visit (= individual SHOPPING LISTS)