In [1]:
# Move two directory levels up so the relative paths used later in the notebook
# (e.g. "src/data/...") resolve; the recorded output suggests this lands in the
# project root. NOTE: the path is relative, so re-running this cell in the same
# kernel moves up another two levels.
%cd ../../

# Load the autoreload extension and enable mode 2.
# NOTE(review): intended to pick up edits to imported project modules without
# restarting the kernel - confirm this is still needed, no project module is
# imported in the visible cells.
%load_ext autoreload
%autoreload 2

/home/hoanghu/projects/Food-Waste-Optimization


In [2]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import TargetEncoder, MinMaxScaler
from sklearn.metrics import root_mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.decomposition import PCA

# Load data

In [3]:
# Semicolon-separated export of sold lunches (see the preview printed below).
path = "src/data/basic_mvp_data/Sold lunches.csv"

# 'Date' is intentionally read as plain text. The values are day-first
# ("2.1.2023" = 2 January), and letting read_csv guess the format either leaves
# the column as strings or risks a month-first interpretation. The column is
# parsed with an explicit '%d.%m.%Y' format in the cleaning cell instead.
raw_lunch = pd.read_csv(path, delimiter=';', encoding='utf-8-sig', low_memory=False)
raw_lunch.head()

Unnamed: 0,Date,Receipt time,Restaurant,Food Category,Dish,pcs,Hiilijalanjälki
0,2.1.2023,10:31,600 Chemicum,Liha,"Uunimakkaraa,sinappikastiketta",1,9
1,2.1.2023,10:32,600 Chemicum,Kala,Kalapuikot tillikermaviilikast,1,104
2,2.1.2023,10:32,600 Chemicum,Liha,"Uunimakkaraa,sinappikastiketta",1,9
3,2.1.2023,10:35,600 Chemicum,Kala,Kalapuikot tillikermaviilikast,1,104
4,2.1.2023,10:36,600 Chemicum,Liha,"Uunimakkaraa,sinappikastiketta",2,18


# Extract menu per day for each restaurant

In [4]:
# Restaurant codes and Finnish category labels -> short English labels.
# NOTE: a flat mapping passed to DataFrame.replace is matched against the
# values of every column, not only 'Restaurant' / 'Food Category'.
label_translations = {
    '600 Chemicum': 'Chemicum',
    '610 Physicum': 'Physicum',
    '620 Exactum': 'Exactum',
    'Kala': 'fish',
    'Liha': 'meat',
    'Vegaani': 'vegan',
    'Kasvis': 'vegetarian',
    'Kana': 'chicken',
}
column_names = {
    'Date': 'date',
    'Restaurant': 'restaurant',
    'Food Category': 'category',
    'Dish': 'dish',
}

lunches = raw_lunch.replace(label_translations).rename(columns=column_names)

# Unparseable quantities become NaN and unparseable dates become NaT
# (errors='coerce') instead of raising.
lunches = lunches.assign(
    pcs=pd.to_numeric(lunches['pcs'], errors='coerce'),
    date=pd.to_datetime(lunches['date'], format='%d.%m.%Y', errors='coerce'),
)

# Total pieces sold per (date, restaurant, category, dish).
group_keys = ['date', 'restaurant', 'category', 'dish']
lunches = lunches.groupby(group_keys)['pcs'].sum().reset_index()

# Remove 'Not Mapped'
lunches = lunches[lunches['category'].ne('Not Mapped')]

# Remove 'takeaway'
def is_takeaway(s: str):
    """Return True when the dish name contains 'take away', ignoring case."""
    return 'take away' in s.lower()

# Filter with a boolean mask instead of writing a scratch column onto the
# already-filtered frame and dropping it again (that pattern can raise pandas'
# SettingWithCopyWarning). `.copy()` hands later cells an independent frame
# they can safely add columns to.
takeaway_mask = lunches['dish'].map(is_takeaway).astype(bool)
lunches = lunches[~takeaway_mask].copy()

lunches.head(10)

Unnamed: 0,date,restaurant,category,dish,pcs
2,2023-01-02,Chemicum,fish,Kalapuikot tillikermaviilikast,78.0
4,2023-01-02,Chemicum,meat,"Uunimakkaraa,sinappikastiketta",165.0
5,2023-01-02,Chemicum,vegan,Marokkolainen linssipata,84.0
8,2023-01-03,Chemicum,fish,"Herkkulohipihvit, punajuurimaj",105.0
10,2023-01-03,Chemicum,fish,Kalapuikot tillikermaviilikast,52.0
12,2023-01-03,Chemicum,meat,Pasta Carbonara,17.0
13,2023-01-03,Chemicum,meat,"Uunimakkaraa,sinappikastiketta",56.0
14,2023-01-03,Chemicum,vegan,Marokkolainen linssipata,62.0
16,2023-01-03,Chemicum,vegan,Vegaaninen buttertofu,51.0
17,2023-01-03,Chemicum,vegetarian,Feta-pinaattilasagnette,29.0


In [5]:
# Number of rows (i.e. distinct dishes, since `lunches` holds one row per
# date/restaurant/category/dish) in each category, per restaurant and day.
# size() + unstack() builds the same wide table as value_counts() + pivot()
# without depending on the auto-generated 'count' column name, which differs
# between pandas versions.
dish_count = (
    lunches
    .groupby(['date', 'restaurant', 'category'])
    .size()
    .unstack('category')
    .fillna(0.0)  # a category not served that day counts as 0 dishes
    .rename(columns={
        'chicken': 'num_cat_chicken',
        'fish': 'num_cat_fish',
        'meat': 'num_cat_meat',
        'vegan': 'num_cat_vegan',
        'vegetarian': 'num_cat_vegetarian',
    })
    .reset_index()
)
dish_count.head()

category,date,restaurant,num_cat_chicken,num_cat_fish,num_cat_meat,num_cat_vegan,num_cat_vegetarian
0,2023-01-02,Chemicum,0.0,1.0,1.0,1.0,0.0
1,2023-01-03,Chemicum,0.0,2.0,2.0,2.0,1.0
2,2023-01-04,Chemicum,0.0,2.0,1.0,1.0,0.0
3,2023-01-05,Chemicum,1.0,2.0,0.0,1.0,0.0
4,2023-01-09,Chemicum,0.0,2.0,1.0,2.0,0.0


In [17]:
# Total pieces sold per restaurant and day, summed over all dishes.
num_pcs = lunches.groupby(['date', 'restaurant'], as_index=False)['pcs'].sum()
num_pcs.head()

Unnamed: 0,date,restaurant,pcs
0,2023-01-02,Chemicum,327.0
1,2023-01-03,Chemicum,372.0
2,2023-01-04,Chemicum,387.0
3,2023-01-05,Chemicum,471.0
4,2023-01-09,Chemicum,568.0


In [18]:
# Attach the daily totals to the per-category dish counts. The date is only
# needed as a join key and is dropped afterwards.
merged_daily = pd.merge(dish_count, num_pcs, on=['date', 'restaurant'], how='inner')
dishes = merged_daily.drop(columns=['date'])

dishes.head()

Unnamed: 0,restaurant,num_cat_chicken,num_cat_fish,num_cat_meat,num_cat_vegan,num_cat_vegetarian,pcs
0,Chemicum,0.0,1.0,1.0,1.0,0.0,327.0
1,Chemicum,0.0,2.0,2.0,2.0,1.0,372.0
2,Chemicum,0.0,2.0,1.0,1.0,0.0,387.0
3,Chemicum,1.0,2.0,0.0,1.0,0.0,471.0
4,Chemicum,0.0,2.0,1.0,2.0,0.0,568.0


## Study the distribution of pieces sold per dish

In [47]:
category = 'vegetarian'
min_observations = 5  # dishes with fewer rows than this are left out of the plot

fig = make_subplots(
    rows=1, cols=1,
    specs=[
        [{'type': 'box'}],
    ],
)

df = lunches[lunches['category'] == category]

# Collect every dish's sold pieces in a single pass (dishes kept in order of
# first appearance) instead of re-filtering the frame once per dish.
pcs_by_dish = {
    dish: pcs
    for dish, pcs in df.groupby('dish', sort=False)['pcs']
    if len(pcs) >= min_observations
}

# Order the dishes by ascending median of sold pieces.
dish_names = sorted(
    ((dish, pcs.median()) for dish, pcs in pcs_by_dish.items()),
    key=lambda item: item[1],
)

# One horizontal box per dish, added in the sorted order.
for dish, _ in dish_names:
    fig.add_trace(
        go.Box(x=pcs_by_dish[dish], name=dish, showlegend=False, boxpoints=False),
        row=1, col=1,
    )

fig.update_layout(
    height=1000,
    width=1000,
    title_text=f"<b>Distribution of sold pieces of dishes in group '{category}'</b>",
    title_font_size=30,
    xaxis_tickangle=-90,
    xaxis_title="Sold pieces",
    title_x=0.5,
)
fig.show()

## Employ ML models

### Inputs are dish-name embeddings

In [8]:
# Location of the stored dish-name embeddings.
path_embds = "experiments_hoangle/hypotheses/res/6_dish_vs_customers/names_embd.npy"

# The file holds a pickled Python object wrapped in a NumPy array; `.item()`
# unwraps it. It is used below as a lookup from dish name to embedding
# (presumably a dict - verify against the script that wrote the file).
# SECURITY NOTE: allow_pickle=True unpickles the file contents, which can
# execute arbitrary code - only load files from a trusted source.
embds = np.load(path_embds, allow_pickle=True).item()

In [9]:
# Look up each dish name in `embds` and store the result in a new 'embd' column.
lunches = lunches.assign(embd=lunches['dish'].map(embds))

lunches.head()

Unnamed: 0,date,restaurant,category,dish,pcs,embd
2,2023-01-02,Chemicum,fish,Kalapuikot tillikermaviilikast,78.0,"[0.64499474, -11.448155, 12.628082, -0.4857183..."
4,2023-01-02,Chemicum,meat,"Uunimakkaraa,sinappikastiketta",165.0,"[15.018338, 4.3914127, -3.3570702, 3.8625162, ..."
5,2023-01-02,Chemicum,vegan,Marokkolainen linssipata,84.0,"[5.133725, -5.3080544, -2.7053263, 8.477409, 8..."
8,2023-01-03,Chemicum,fish,"Herkkulohipihvit, punajuurimaj",105.0,"[10.697122, -0.18853807, -3.7573192, 8.4619465..."
10,2023-01-03,Chemicum,fish,Kalapuikot tillikermaviilikast,52.0,"[0.64499474, -11.448155, 12.628082, -0.4857183..."


In [55]:
# lunches.groupby(['date', 'restaurant']).agg({'dish': lambda x: ' + '.join(x), 'pcs': 'sum'}).reset_index()

In [56]:
# Per (date, category): average the dish embeddings and sum the sold pieces.
# The restaurant is not part of the key, so all restaurants are pooled.
daily_by_category = lunches.groupby(['date', 'category'])
tmp = daily_by_category.agg({'embd': 'mean', 'pcs': 'sum'}).reset_index()

# Feature matrix: one float32 row per (date, category); target: pieces sold.
X = np.asarray(tmp['embd'].tolist(), dtype=np.float32)
y = tmp['pcs'].to_numpy()

tmp

Unnamed: 0,date,category,embd,pcs
0,2023-01-02,fish,"[0.64499474, -11.448155, 12.628082, -0.4857183...",78.0
1,2023-01-02,meat,"[15.018338, 4.3914127, -3.3570702, 3.8625162, ...",165.0
2,2023-01-02,vegan,"[5.133725, -5.3080544, -2.7053263, 8.477409, 8...",84.0
3,2023-01-03,fish,"[5.671058, -5.818347, 4.4353814, 3.988114, 2.9...",157.0
4,2023-01-03,meat,"[10.395273, 1.9188652, -3.4813218, 2.5981388, ...",73.0
...,...,...,...,...
1715,2024-06-14,vegan,"[9.347745, 4.0702252, -5.3099523, 5.742154, 5....",180.0
1716,2024-06-14,vegetarian,"[12.235868, 2.8195195, -1.0143621, -2.1413262,...",14.0
1717,2024-06-17,fish,"[30.779861, -1.978827, -4.9003344, -1.2698876,...",114.0
1718,2024-06-17,meat,"[15.018338, 4.3914127, -3.3570702, 3.8625162, ...",223.0


In [11]:
# Candidate regressors, looked up from an explicit name -> class mapping
# rather than through eval() on the name string.
regressors = {
    'Ridge': Ridge,
    'Lasso': Lasso,
    'RandomForestRegressor': RandomForestRegressor,
    'XGBRegressor': XGBRegressor,
    'LGBMRegressor': LGBMRegressor,
    'GradientBoostingRegressor': GradientBoostingRegressor,
}

# Results keyed by model name: {'<name>': {'rmse': <float>}}.
models = {name: {} for name in regressors}

for name, regressor_cls in regressors.items():
    model = regressor_cls()  # library-default hyper-parameters
    model.fit(X, y)

    # NOTE: this is an in-sample error - the model is scored on the same
    # X, y it was fitted on, so it does not measure generalisation.
    rmse = root_mean_squared_error(y, model.predict(X))
    models[name]['rmse'] = rmse

models


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.882e+06, tolerance: 5.267e+03



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064906 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261031
[LightGBM] [Info] Number of data points in the train set: 1720, number of used features: 1024
[LightGBM] [Info] Start training from score 223.995930


{'Ridge': {'rmse': 111.76282655228317},
 'Lasso': {'rmse': 114.90574457604124},
 'RandomForestRegressor': {'rmse': 51.29199734519531},
 'XGBRegressor': {'rmse': 32.867793430608636},
 'LGBMRegressor': {'rmse': 34.446530429658345},
 'GradientBoostingRegressor': {'rmse': 73.0787377615688}}

In [12]:
# Project the embeddings onto 150 principal components.
# random_state is fixed because, for an input of this size, PCA's default
# svd_solver='auto' can select the randomized SVD solver, which otherwise
# gives slightly different components on every run.
X_pca = PCA(n_components=150, random_state=0).fit_transform(X)

X_pca.shape

(1720, 150)

In [13]:
# Candidate regressors, looked up from an explicit name -> class mapping
# rather than through eval() on the name string.
regressors = {
    'Ridge': Ridge,
    'Lasso': Lasso,
    'RandomForestRegressor': RandomForestRegressor,
    'XGBRegressor': XGBRegressor,
    'LGBMRegressor': LGBMRegressor,
    'GradientBoostingRegressor': GradientBoostingRegressor,
}

# Results keyed by model name: {'<name>': {'rmse': <float>}}.
models = {name: {} for name in regressors}

for name, regressor_cls in regressors.items():
    model = regressor_cls()  # library-default hyper-parameters
    model.fit(X_pca, y)

    # NOTE: this is an in-sample error - the model is scored on the same
    # PCA-reduced features and targets it was fitted on.
    rmse = root_mean_squared_error(y, model.predict(X_pca))
    models[name]['rmse'] = rmse

models

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38248
[LightGBM] [Info] Number of data points in the train set: 1720, number of used features: 150
[LightGBM] [Info] Start training from score 223.995930


{'Ridge': {'rmse': 119.95938412183764},
 'Lasso': {'rmse': 120.00892872466535},
 'RandomForestRegressor': {'rmse': 51.124217877058435},
 'XGBRegressor': {'rmse': 28.40254835394544},
 'LGBMRegressor': {'rmse': 35.04268165926529},
 'GradientBoostingRegressor': {'rmse': 81.7226033861491}}

In [14]:
# Refit XGBoost on the PCA-reduced embeddings and predict on the same rows.
model = XGBRegressor()
model.fit(X_pca, y)
y_pred = model.predict(X_pca).squeeze()

fig = make_subplots(
    rows=1, cols=1,
    specs=[
        [{'type': 'scatter'}]
    ]
)

# Overlay the density curves of the true and the predicted values.
# Trace 1 of each distplot figure is taken to be its KDE curve (as in the
# original cell); only that curve is copied into the shared figure.
for values, label in ((y, 'target'), (y_pred, 'prediction')):
    distplot = ff.create_distplot([values], ['distplot'], curve_type='kde')
    kde_curve = distplot.data[1]
    fig.add_trace(
        go.Scatter(x=kde_curve['x'], y=kde_curve['y'], name=label),
        row=1, col=1
    )

fig.update_layout(
    height=800,
    width=800,
    title_text="<b>Target vs. predicted sold pieces (XGBRegressor, PCA embeddings, in-sample)</b>",
    title_font_size=18,
    xaxis_tickangle=-90,
    xaxis_title="Sold pieces",
    yaxis_title="Density",
    title_x=0.5,
)
fig.show()

### Inputs are the number of dishes per category

In [20]:
# Work on an independent (deep) copy so `dishes` itself is left untouched.
encoded = dishes.copy(deep=True)

In [21]:
# Name of the target column.
col_y = 'pcs'

# Scaler mapping the target onto [0, 10]. It is only constructed here: the
# fit_transform call that would rescale the target is disabled, so `encoded`
# keeps the raw piece counts.
scaler_y = MinMaxScaler(feature_range=(0, 10))

# encoded[col_y] = scaler_y.fit_transform(encoded[col_y].to_numpy().reshape(-1, 1))

encoded.head()

Unnamed: 0,restaurant,num_cat_chicken,num_cat_fish,num_cat_meat,num_cat_vegan,num_cat_vegetarian,pcs
0,Chemicum,0.0,1.0,1.0,1.0,0.0,327.0
1,Chemicum,0.0,2.0,2.0,2.0,1.0,372.0
2,Chemicum,0.0,2.0,1.0,1.0,0.0,387.0
3,Chemicum,1.0,2.0,0.0,1.0,0.0,471.0
4,Chemicum,0.0,2.0,1.0,2.0,0.0,568.0


In [48]:
# Features: number of dishes offered per category. 'restaurant' is not a
# feature because the models below are fitted on a single restaurant.
cols_X = [
    'num_cat_chicken',
    'num_cat_fish',
    'num_cat_meat',
    'num_cat_vegan',
    'num_cat_vegetarian',
]
col_y = 'pcs'

# Use the first restaurant in order of appearance.
restaurant = encoded['restaurant'].unique()[0]

df = encoded[encoded['restaurant'] == restaurant]

# Candidate regressors, looked up from an explicit name -> class mapping
# rather than through eval() on the name string.
regressors = {
    'Ridge': Ridge,
    'Lasso': Lasso,
    'RandomForestRegressor': RandomForestRegressor,
    'XGBRegressor': XGBRegressor,
    'LGBMRegressor': LGBMRegressor,
    'GradientBoostingRegressor': GradientBoostingRegressor,
}
linear_models = ('Ridge', 'Lasso')

# Results keyed by model name: per-feature weight plus the 'rmse' entry.
models = {}

X = df[cols_X]
y = df[col_y]
for name, regressor_cls in regressors.items():
    model = regressor_cls()  # library-default hyper-parameters
    model.fit(X, y)

    # Linear models expose coefficients; the tree ensembles expose importances.
    if name in linear_models:
        feats = model.coef_.squeeze()
    else:
        feats = model.feature_importances_

    models[name] = dict(zip(cols_X, feats))

    # NOTE: in-sample error - scored on the same rows the model was fitted on.
    rmse = root_mean_squared_error(y, model.predict(X))
    models[name]['rmse'] = rmse

# Show the collected weights and errors, like the other model-comparison cells.
models

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000791 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21
[LightGBM] [Info] Number of data points in the train set: 367, number of used features: 5
[LightGBM] [Info] Start training from score 728.310627


In [23]:
# NOTE: `model`, `X` and `y` are whatever the preceding fitting cell left in
# the kernel; `model` is the last regressor fitted in that loop. Its class name
# is put in the title so the figure states which model it shows.
y_pred = model.predict(X).squeeze()

fig = make_subplots(
    rows=1, cols=1,
    specs=[
        [{'type': 'scatter'}]
    ]
)

# Overlay the density curves of the true and the predicted values.
# Trace 1 of each distplot figure is taken to be its KDE curve (as in the
# original cell); only that curve is copied into the shared figure.
for values, label in ((y, 'target'), (y_pred, 'prediction')):
    distplot = ff.create_distplot([values], ['distplot'], curve_type='kde')
    kde_curve = distplot.data[1]
    fig.add_trace(
        go.Scatter(x=kde_curve['x'], y=kde_curve['y'], name=label),
        row=1, col=1
    )

fig.update_layout(
    height=800,
    width=800,
    title_text=f"<b>Target vs. predicted sold pieces ({type(model).__name__}, in-sample)</b>",
    title_font_size=18,
    xaxis_tickangle=-90,
    xaxis_title="Sold pieces",
    yaxis_title="Density",
    title_x=0.5,
)
fig.show()

### Visualize the RMSE

In [61]:
# RMSE figures entered by hand rather than computed in this cell. The
# `rmse_dishname` values equal the output recorded for the PCA-embedding model
# comparison earlier in the notebook.
# NOTE(review): the source run of the `rmse_nodish_per_type` values is not
# visible in the notebook - confirm before relying on them.
model_names = [
    'Ridge',
    'Lasso',
    'RandomForestRegressor',
    'XGBRegressor',
    'LGBMRegressor',
    'GradientBoostingRegressor',
]

rmse_nodish_per_type = pd.DataFrame({
    'model': model_names,
    'rmse': [
        170.33021265443364,
        170.40908480245872,
        150.18939825979098,
        146.5323867897732,
        164.63485166495823,
        156.78630967346677,
    ],
})
rmse_dishname = pd.DataFrame({
    'model': model_names,
    'rmse': [
        119.95938412183764,
        120.00892872466535,
        51.124217877058435,
        28.40254835394544,
        35.04268165926529,
        81.7226033861491,
    ],
})

rmse_dishname

Unnamed: 0,model,rmse
0,Ridge,119.959384
1,Lasso,120.008929
2,RandomForestRegressor,51.124218
3,XGBRegressor,28.402548
4,LGBMRegressor,35.042682
5,GradientBoostingRegressor,81.722603


In [70]:
fig = make_subplots(
    rows=1, cols=1,
    specs=[
        [{'type': 'bar'}]
    ]
)

# One bar series per feature set, sharing the model names on the x axis.
rmse_series = (
    (rmse_nodish_per_type, 'No. dishes per type'),
    (rmse_dishname, 'Dish name'),
)
for rmse_frame, label in rmse_series:
    fig.add_trace(
        go.Bar(x=rmse_frame['model'], y=rmse_frame['rmse'], name=label),
        row=1, col=1
    )

fig.update_layout(
    height=600,
    width=800,
    title_text="<b>How powerful using dish name over no. dishes per type to predict the piece quantity</b>",
    title_font_size=18,
    xaxis_tickangle=-90,
    # Axis labels so the figure can be read without the surrounding cells.
    xaxis_title="Model",
    yaxis_title="RMSE",
    title_x=0.5,
)
fig.show()