In [1]:
%cd ../../

%load_ext autoreload
%autoreload 2

/home/hoanghu/projects/Food-Waste-Optimization


In [2]:
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [3]:
import plotly.io as pio

pio.templates.default = "seaborn"

# Load data

In [4]:
path_dir_processed = Path("experiments_hoangle/processed")

path_lunches = Path("src/data/basic_mvp_data/Sold lunches.csv")
path_fact = path_dir_processed / "fact.csv"

raw_lunch = pd.read_csv(path_lunches, delimiter=';', encoding='utf-8-sig', parse_dates=['Date'], low_memory=False)
raw_fact = pd.read_csv(path_fact, header=0, parse_dates=['date'])

# Process data

## With 'lunches'

In [5]:
lunches = (
    raw_lunch
    .replace(
        {
            '600 Chemicum': 'Chemicum',
            '610 Physicum': 'Physicum',
            '620 Exactum': 'Exactum',
            'Kala': 'fish',
            'Liha': 'meat',
            'Vegaani': 'vegan',
            'Kasvis': 'vegetarian',
            'Kana': 'chicken'
        },
    )
    .rename(columns={
        'Date': 'date',
        'Restaurant': 'restaurant',
        'Food Category': 'category',
        'Dish': 'dish'
    })
    
)

lunches['pcs'] = pd.to_numeric(lunches.pcs, errors='coerce')
lunches['date'] = pd.to_datetime(lunches.date, format='%d.%m.%Y', errors='coerce')

lunches = (
    lunches
    .groupby(['date', 'restaurant', 'category', 'dish'])['pcs']
    .sum()
    .reset_index()
)

# Remove 'Not Napped'
lunches = lunches[lunches['category'] != 'Not Mapped']

# Remove 'takeaway'
def is_takeaway(s: str):
    return s.lower().count('take away') > 0

lunches['is_takeaway'] = lunches['dish'].map(is_takeaway)
lunches = lunches[~lunches['is_takeaway']].drop(columns='is_takeaway')

# Add dish name processing
def _f(s: str):
    s = s.split(',')[0]
    s = s.split('&')[0]
    s = s.strip()

    return s

lunches['dish'] = lunches['dish'].apply(_f)

# Sum pieces again since on some dates, same dish appears twice
lunches = lunches.groupby(['date', 'restaurant', 'category', 'dish'])['pcs'].sum().reset_index()


lunches.head(10)

Unnamed: 0,date,restaurant,category,dish,pcs
0,2023-01-02,Chemicum,fish,Kalapuikot tillikermaviilikast,78.0
1,2023-01-02,Chemicum,meat,Uunimakkaraa,165.0
2,2023-01-02,Chemicum,vegan,Marokkolainen linssipata,84.0
3,2023-01-03,Chemicum,fish,Herkkulohipihvit,105.0
4,2023-01-03,Chemicum,fish,Kalapuikot tillikermaviilikast,52.0
5,2023-01-03,Chemicum,meat,Pasta Carbonara,17.0
6,2023-01-03,Chemicum,meat,Uunimakkaraa,56.0
7,2023-01-03,Chemicum,vegan,Marokkolainen linssipata,62.0
8,2023-01-03,Chemicum,vegan,Vegaaninen buttertofu,51.0
9,2023-01-03,Chemicum,vegetarian,Feta-pinaattilasagnette,29.0


### Create table containing dishes' name and dish quantity by date and restaurant

In [6]:
tmp1 = (
    lunches
    .groupby(['date', 'restaurant', 'category'])
    .agg({
        'dish': lambda x: [x] if isinstance(x, str) else x,
    })
    .reset_index()
)

tmp1['category'] = tmp1['category'].map(lambda x: f"dishes_{x}")
tmp1['dish'] = tmp1['dish'].map(lambda x: np.array([x]) if isinstance(x, str) else x)

tmp1 = tmp1.pivot(index=['date', 'restaurant'], columns='category', values='dish')

tmp1.head()

Unnamed: 0_level_0,category,dishes_chicken,dishes_fish,dishes_meat,dishes_vegan,dishes_vegetarian
date,restaurant,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-02,Chemicum,,[Kalapuikot tillikermaviilikast],[Uunimakkaraa],[Marokkolainen linssipata],
2023-01-03,Chemicum,,"[Herkkulohipihvit, Kalapuikot tillikermaviilik...","[Pasta Carbonara, Uunimakkaraa]","[Marokkolainen linssipata, Vegaaninen buttertofu]",[Feta-pinaattilasagnette]
2023-01-04,Chemicum,,"[Herkkulohipihvit, Rapea kalaleike]",[Lihapullat],[Punajuuripyörykät],
2023-01-05,Chemicum,[Kievin kana],"[Rapea kalaleike, Sitruunaiset kalapalat]",,[Meksikon Beanit Chilipata],
2023-01-09,Chemicum,,"[Kalapuikot tillikermaviilikast, Sitruunaiset ...",[Chorizo lihap ja ruskkastike],"[Kasvis-jalapnuget ja tomatsals, Kasvisjauhisp...",


In [7]:
tmp2 = (
    lunches
    .groupby(['date', 'restaurant', 'category'])['dish']
    .count()
    .reset_index()
)

tmp2['category'] = tmp2['category'].map(lambda x: f"no_{x}")

tmp2 = tmp2.pivot(index=['date', 'restaurant'], columns='category', values='dish')

tmp2.fillna(0.0, inplace=True)

tmp2.head()

Unnamed: 0_level_0,category,no_chicken,no_fish,no_meat,no_vegan,no_vegetarian
date,restaurant,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-02,Chemicum,0.0,1.0,1.0,1.0,0.0
2023-01-03,Chemicum,0.0,2.0,2.0,2.0,1.0
2023-01-04,Chemicum,0.0,2.0,1.0,1.0,0.0
2023-01-05,Chemicum,1.0,2.0,0.0,1.0,0.0
2023-01-09,Chemicum,0.0,2.0,1.0,2.0,0.0


In [8]:
lunches = tmp1.merge(tmp2, on=['date', 'restaurant'], how='inner')

lunches.head()

Unnamed: 0_level_0,category,dishes_chicken,dishes_fish,dishes_meat,dishes_vegan,dishes_vegetarian,no_chicken,no_fish,no_meat,no_vegan,no_vegetarian
date,restaurant,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-01-02,Chemicum,,[Kalapuikot tillikermaviilikast],[Uunimakkaraa],[Marokkolainen linssipata],,0.0,1.0,1.0,1.0,0.0
2023-01-03,Chemicum,,"[Herkkulohipihvit, Kalapuikot tillikermaviilik...","[Pasta Carbonara, Uunimakkaraa]","[Marokkolainen linssipata, Vegaaninen buttertofu]",[Feta-pinaattilasagnette],0.0,2.0,2.0,2.0,1.0
2023-01-04,Chemicum,,"[Herkkulohipihvit, Rapea kalaleike]",[Lihapullat],[Punajuuripyörykät],,0.0,2.0,1.0,1.0,0.0
2023-01-05,Chemicum,[Kievin kana],"[Rapea kalaleike, Sitruunaiset kalapalat]",,[Meksikon Beanit Chilipata],,1.0,2.0,0.0,1.0,0.0
2023-01-09,Chemicum,,"[Kalapuikot tillikermaviilikast, Sitruunaiset ...",[Chorizo lihap ja ruskkastike],"[Kasvis-jalapnuget ja tomatsals, Kasvisjauhisp...",,0.0,2.0,1.0,2.0,0.0


## With 'fact'

In [9]:
cols = [
    'date',
    'restaurant',
    'amnt_waste_customer',
    'amnt_waste_coffee',
    'amnt_waste_kitchen',
    'amnt_waste_hall'
]

wastes = raw_fact[cols].dropna(axis=0)
wastes.head()

Unnamed: 0,date,restaurant,amnt_waste_customer,amnt_waste_coffee,amnt_waste_kitchen,amnt_waste_hall
0,2023-01-02,Chemicum,4.7,1.2,12.0,0.0
1,2023-01-03,Chemicum,5.0,1.4,14.8,0.0
2,2023-01-04,Chemicum,4.15,4.0,7.1,0.0
3,2023-01-05,Chemicum,10.0,3.3,8.5,0.0
5,2023-01-09,Chemicum,7.65,2.1,4.9,0.0


## Create final fact table containing waste amount in different types, dishes by date and restaurant

In [10]:
fact = lunches.merge(wastes, on=['date', 'restaurant'], how='inner')

fact.head()

Unnamed: 0,date,restaurant,dishes_chicken,dishes_fish,dishes_meat,dishes_vegan,dishes_vegetarian,no_chicken,no_fish,no_meat,no_vegan,no_vegetarian,amnt_waste_customer,amnt_waste_coffee,amnt_waste_kitchen,amnt_waste_hall
0,2023-01-02,Chemicum,,[Kalapuikot tillikermaviilikast],[Uunimakkaraa],[Marokkolainen linssipata],,0.0,1.0,1.0,1.0,0.0,4.7,1.2,12.0,0.0
1,2023-01-03,Chemicum,,"[Herkkulohipihvit, Kalapuikot tillikermaviilik...","[Pasta Carbonara, Uunimakkaraa]","[Marokkolainen linssipata, Vegaaninen buttertofu]",[Feta-pinaattilasagnette],0.0,2.0,2.0,2.0,1.0,5.0,1.4,14.8,0.0
2,2023-01-04,Chemicum,,"[Herkkulohipihvit, Rapea kalaleike]",[Lihapullat],[Punajuuripyörykät],,0.0,2.0,1.0,1.0,0.0,4.15,4.0,7.1,0.0
3,2023-01-05,Chemicum,[Kievin kana],"[Rapea kalaleike, Sitruunaiset kalapalat]",,[Meksikon Beanit Chilipata],,1.0,2.0,0.0,1.0,0.0,10.0,3.3,8.5,0.0
4,2023-01-09,Chemicum,,"[Kalapuikot tillikermaviilikast, Sitruunaiset ...",[Chorizo lihap ja ruskkastike],"[Kasvis-jalapnuget ja tomatsals, Kasvisjauhisp...",,0.0,2.0,1.0,2.0,0.0,7.65,2.1,4.9,0.0


# Explore

Check how the dishes affect the waste amount

In [11]:
restaurant = "Chemicum"

## Check: no. dishes per type -> waste amount per type

In [34]:
cols_X = ['no_chicken', 'no_fish', 'no_meat', 'no_vegan', 'no_vegetarian']
cols_y = ['amnt_waste_customer', 'amnt_waste_coffee', 'amnt_waste_kitchen', 'amnt_waste_hall']

X = fact[fact['restau rant'] == restaurant][cols_X]
y = fact[fact['restaurant'] == restaurant][cols_y]

In [13]:
is_multi = True

def _get_reg(model, is_multi: bool):
    if is_multi is True:
        return MultiOutputRegressor(model)
    else:
        return model

models = {
    'Linear': {'model': LinearRegression(), 'rmse': 0., 'r2': 0.},
    'Ridge': {'model': Ridge(), 'rmse': 0., 'r2': 0.},
    'Lasso': {'model': Lasso(), 'rmse': 0., 'r2': 0.},
    'SVM': {'model': _get_reg(SVR(), is_multi), 'rmse': 0., 'r2': 0.},
    'RF': {'model': _get_reg(RandomForestRegressor(), is_multi), 'rmse': 0., 'r2': 0.},
    'GB': {'model': _get_reg(GradientBoostingRegressor(), is_multi), 'rmse': 0., 'r2': 0.},
    'XGB': {'model': _get_reg(XGBRegressor(), is_multi), 'rmse': 0., 'r2': 0.},
    'CatBoost': {'model': _get_reg(CatBoostRegressor(verbose=False), is_multi), 'rmse': 0., 'r2': 0.},
    'LightBGM': {'model': _get_reg(LGBMRegressor(verbose=-1), is_multi), 'rmse': 0., 'r2': 0.},
}

for v in models.values():
    model = v['model']
    model.fit(X, y)

    y_pred = model.predict(X)

    v['rmse'] = root_mean_squared_error(y, y_pred)
    v['r2'] = r2_score(y, y_pred)

### Plot results

In [14]:
rmse_vals = pd.DataFrame.from_records([
    {'model': name, 'rmse': v['rmse']}
    for name, v in models.items()
])

r2_vals = pd.DataFrame.from_records([
    {'model': name, 'r2': v['r2']}
    for name, v in models.items()
])

rmse_vals.head()

Unnamed: 0,model,rmse
0,Linear,5.483743
1,Ridge,5.483751
2,Lasso,5.540983
3,SVM,5.629358
4,RF,4.946256


In [15]:
fig = make_subplots(
    rows=2, cols=1,
    specs=[
        [{'type': 'bar'}],
        [{'type': 'bar'}]
    ]
)

fig.add_trace(
    go.Bar(x=rmse_vals['model'], y=rmse_vals['rmse'], name='RMSE'),
    row=1, col=1
)
fig.add_trace(
    go.Bar(x=r2_vals['model'], y=r2_vals['r2'], name='R2'),
    row=2, col=1
)

fig.update_layout(
    height=600, 
    width=800,
    title_text=f"<b>Metrics of regressors predicting waste amnt per type from<br> no. dishes per type for {restaurant}</b>",
    title_font_size=20,
    xaxis_tickangle=-90,
    title_x=0.5,
)
fig.show()

## Check: dishes' name -> waste amount per type

In [16]:
cols = ['dishes_chicken', 'dishes_fish', 'dishes_meat', 'dishes_vegan', 'dishes_vegetarian']

names = set()

for col in cols:
    dish_names = fact[col].dropna()
    dish_names = set(np.hstack(dish_names.to_list()))

    names = names.union(dish_names)

name2id = {
    name: idx+1
    for idx, name in enumerate(names) 
}
id2name = {
    idx+1: name
    for idx, name in enumerate(names) 
}

### Using dish names with Ordinal Encoding

In [39]:
MAX_LEN_X = 16

X, y = [], []

cols_X = ['dishes_chicken', 'dishes_fish', 'dishes_meat', 'dishes_vegan', 'dishes_vegetarian']
cols_y = ['amnt_waste_customer', 'amnt_waste_coffee', 'amnt_waste_kitchen', 'amnt_waste_hall']

for r in fact[fact['restaurant'] == restaurant].itertuples():
    # Create X
    dishes = []

    for col in cols_X:
        val = r.__getattribute__(col)

        if val is np.nan:
            continue

        dishes.extend([name2id[name] for name in val])

    # Pad 0s
    if len(dishes) < MAX_LEN_X:
        dishes.extend([0]*(MAX_LEN_X - len(dishes)))

    X.append(dishes)

    # Create y
    y.append([r.__getattribute__(col) for col in cols_y])

X, y = np.float32(X), np.float32(y)
    

In [18]:
is_multi = True

def _get_reg(model, is_multi: bool):
    if is_multi is True:
        return MultiOutputRegressor(model)
    else:
        return model

models = {
    'Linear': {'model': LinearRegression(), 'rmse': 0., 'r2': 0.},
    'Ridge': {'model': Ridge(), 'rmse': 0., 'r2': 0.},
    'Lasso': {'model': Lasso(), 'rmse': 0., 'r2': 0.},
    'SVM': {'model': _get_reg(SVR(), is_multi), 'rmse': 0., 'r2': 0.},
    'RF': {'model': _get_reg(RandomForestRegressor(), is_multi), 'rmse': 0., 'r2': 0.},
    'GB': {'model': _get_reg(GradientBoostingRegressor(), is_multi), 'rmse': 0., 'r2': 0.},
    'XGB': {'model': _get_reg(XGBRegressor(), is_multi), 'rmse': 0., 'r2': 0.},
    'CatBoost': {'model': _get_reg(CatBoostRegressor(verbose=False), is_multi), 'rmse': 0., 'r2': 0.},
    'LightBGM': {'model': _get_reg(LGBMRegressor(verbose=-1), is_multi), 'rmse': 0., 'r2': 0.},
}

for v in models.values():
    model = v['model']
    model.fit(X, y)

    y_pred = np.clip(model.predict(X), a_min=0, a_max=None)

    v['rmse'] = root_mean_squared_error(y, y_pred)
    v['r2'] = r2_score(y, y_pred)

### Plot results

In [19]:
rmse_vals = pd.DataFrame.from_records([
    {'model': name, 'rmse': v['rmse']}
    for name, v in models.items()
])

r2_vals = pd.DataFrame.from_records([
    {'model': name, 'r2': v['r2']}
    for name, v in models.items()
])

rmse_vals.head()

Unnamed: 0,model,rmse
0,Linear,5.432481
1,Ridge,5.432481
2,Lasso,5.434919
3,SVM,5.570986
4,RF,2.218575


In [20]:
fig = make_subplots(
    rows=2, cols=1,
    specs=[
        [{'type': 'bar'}],
        [{'type': 'bar'}]
    ]
)

fig.add_trace(
    go.Bar(x=rmse_vals['model'], y=rmse_vals['rmse'], name='RMSE'),
    row=1, col=1
)
fig.add_trace(
    go.Bar(x=r2_vals['model'], y=r2_vals['r2'], name='R2'),
    row=2, col=1
)

fig.update_layout(
    height=600, 
    width=800,
    title_text=f"<b>Metrics of regressors predicting waste amnt per type <br>from dish names for {restaurant}</b>",
    title_font_size=20,
    xaxis_tickangle=-90,
    title_x=0.5,
)
fig.show()

### Get indices in input matrix X affecting the most to the waste of customer/kitchen

In [21]:
model = models['XGB']['model']

In [35]:
waste_type = "kitchen"
K = 5

In [36]:
if waste_type == "customer":
    nth = 0
elif waste_type == "kitchen":
    nth = 2
else:
    raise NotImplementedError()

indices = np.argpartition(model.estimators_[nth].feature_importances_, -K)[-K:]

indices

array([6, 4, 3, 2, 8])

In [40]:
dishes = np.hstack([X[:, idx] for idx in indices])

out = np.unique(dishes, return_counts=True)
df_counts = pd.DataFrame({
    'ordinal': out[0],
    'count': out[1]
})

df_counts = df_counts[df_counts['ordinal'] != 0]

counts_topK = df_counts.sort_values('count', ascending=False).head(K)
counts_topK['ordinal'] = counts_topK['ordinal'].astype(np.int32)
counts_topK['dish'] = [id2name[ordinal] for ordinal in counts_topK['ordinal']]

counts_topK

Unnamed: 0,ordinal,count,dish
49,84,44,Härkisbolognese
137,231,40,Lihapullat
111,189,39,Pasta Carbonara
25,43,34,Linssibolognesea
89,152,31,Meksikolaista uunimakkaraa
