In [1]:
%cd ../

/home/hoanghu/projects/Food-Waste-Optimization/experiments_hoangle


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error

from utils import Paths

# Load related dim tables

In [3]:
dim_biowaste = pd.read_excel(Paths.dim_biowaste(), index_col=None)
dim_biowaste.head()

Unnamed: 0,date,restaurant,amnt_waste_customer,amnt_waste_coffee,amnt_waste_kitchen,amnt_waste_hall
0,2023-01-02,Chemicum,4.7,1.2,12.0,0.0
1,2023-01-03,Chemicum,5.0,1.4,14.8,0.0
2,2023-01-04,Chemicum,4.15,4.0,7.1,0.0
3,2023-01-05,Chemicum,10.0,3.3,8.5,0.0
4,2023-01-09,Chemicum,7.65,2.1,4.9,0.0


In [4]:
dim_lunches = pd.read_excel(Paths.dim_lucnhes(), index_col=None)
dim_lunches.head()

Unnamed: 0,date,restaurant,category,meal,pcs
0,2023-01-02,Chemicum,fish,Kalapuikot tillikermaviilikast,78
1,2023-01-02,Chemicum,meat,Uunimakkaraa,165
2,2023-01-02,Chemicum,vegan,Marokkolainen linssipata,84
3,2023-01-03,Chemicum,fish,Herkkulohipihvit,105
4,2023-01-03,Chemicum,fish,Kalapuikot tillikermaviilikast,52


# Create data

## OneHot encode dish name

In [5]:
enc_onehot = OneHotEncoder(sparse_output=False)

enc_onehot.fit(dim_lunches[['meal']])

vectors = enc_onehot.transform(dim_lunches[['meal']])
embs = [x.squeeze() for x in vectors]

In [6]:
dim_lunches['embd'] = embs
dim_lunches['embd'] = dim_lunches['embd'] * dim_lunches['pcs']
dim_lunches.head()

Unnamed: 0,date,restaurant,category,meal,pcs,embd
0,2023-01-02,Chemicum,fish,Kalapuikot tillikermaviilikast,78,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2023-01-02,Chemicum,meat,Uunimakkaraa,165,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2023-01-02,Chemicum,vegan,Marokkolainen linssipata,84,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2023-01-03,Chemicum,fish,Herkkulohipihvit,105,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2023-01-03,Chemicum,fish,Kalapuikot tillikermaviilikast,52,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Get embedding per day and restaurant 

In [7]:
embds_by_day_res = (
    dim_lunches
    .groupby(['date', 'restaurant'])['embd']
    .sum()
    .reset_index()
)

embds_by_day_res.head()

Unnamed: 0,date,restaurant,embd
0,2023-01-02,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2023-01-03,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2023-01-04,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,2023-01-05,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2023-01-09,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Get total waste per day and restaurant

In [8]:
dim_biowaste['waste'] = (
    dim_biowaste['amnt_waste_customer']
    + dim_biowaste['amnt_waste_coffee']
    + dim_biowaste['amnt_waste_kitchen']
    + dim_biowaste['amnt_waste_hall']
)

dim_biowaste.drop(
    columns=['amnt_waste_customer', 'amnt_waste_coffee', 'amnt_waste_kitchen', 'amnt_waste_hall'],
    inplace=True
)

dim_biowaste.head()

Unnamed: 0,date,restaurant,waste
0,2023-01-02,Chemicum,17.9
1,2023-01-03,Chemicum,21.2
2,2023-01-04,Chemicum,15.25
3,2023-01-05,Chemicum,21.8
4,2023-01-09,Chemicum,14.65


## Create single table containing embeddings and biowaste per day and restaurant

In [9]:
fact = (
    embds_by_day_res
    .merge(dim_biowaste, on=['date', 'restaurant'])
)

fact.head()

Unnamed: 0,date,restaurant,embd,waste
0,2023-01-02,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",17.9
1,2023-01-03,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",21.2
2,2023-01-04,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",15.25
3,2023-01-05,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",21.8
4,2023-01-09,Chemicum,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",14.65


# Find waste amount emitted by each dish

In [10]:
model = LinearRegression(positive=True, fit_intercept=False)

X = np.stack(fact['embd'].to_numpy())
y = fact['waste']

model.fit(X, y)

model.coef_

array([5.68979614e-02, 5.04847297e-02, 4.91931273e-02, 0.00000000e+00,
       4.00150815e-02, 0.00000000e+00, 6.92182652e-02, 6.26239007e-02,
       4.67846599e-02, 3.05455947e-01, 4.93720100e-02, 0.00000000e+00,
       4.56874154e-02, 2.36042022e-01, 1.96228954e-01, 4.72136401e-02,
       1.21907128e-01, 6.45894009e-02, 1.60250643e-02, 4.84570109e-02,
       3.67576362e-02, 4.37330356e-02, 8.66946231e-02, 2.58907871e-01,
       1.82319270e-01, 4.99114180e-02, 6.01881711e-02, 3.74656337e-02,
       9.06737675e-02, 0.00000000e+00, 2.56833975e-02, 1.17994451e-01,
       5.64394726e-02, 6.79587885e-03, 4.82819166e+00, 2.11610235e-01,
       2.98349722e-01, 1.69491797e-02, 3.86955568e-02, 0.00000000e+00,
       0.00000000e+00, 1.07866269e-01, 0.00000000e+00, 4.78112637e-02,
       8.70554680e-02, 2.21063338e-02, 6.67182682e-02, 4.18795855e-01,
       5.45730892e-02, 0.00000000e+00, 2.29989499e-01, 0.00000000e+00,
       3.12145038e-02, 1.07681355e-01, 0.00000000e+00, 3.31397305e-02,
      

In [11]:
r2 = r2_score(y, model.predict(X))
rmse = root_mean_squared_error(y, model.predict(X))

print(f"r2   = {r2:.4f}")
print(f"rmse = {rmse:.4f}")

r2   = 0.6777
rmse = 8.2531


In [12]:
# This amount is added to waste of all dishes, which
# is based on the assumption that every dish emits waste

THETA = 1e-4

dish_biowaste = pd.DataFrame.from_records([
    {'dish': name, 'waste': waste}
    for name, waste in zip(enc_onehot.categories_[0], model.coef_)
])

dish_biowaste.head()

Unnamed: 0,dish,waste
0,Aurajuusto-pinaattilasagnettea,0.056898
1,BBQ-Broilerikastiketta,0.050485
2,Bangladeshilainen linssipata,0.049193
3,Bataatti-maapähkinäkeitto,0.0
4,Bataattipihvit,0.040015


In [14]:
dish_biowaste.to_excel(Paths.pred_biowaste(), index=False)