In [1]:
# Step up one directory so the relative paths used later in the notebook resolve.
# NOTE: not idempotent — re-running this cell without restarting the kernel moves up one more level.
%cd ../

/home/hoanghu/projects/Food-Waste-Optimization


In [2]:
from pathlib import Path
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold, GridSearchCV
from sklearn import metrics
from skl2onnx import to_onnx
import onnxruntime as rt

In [3]:
# Global plotting look: matplotlib's bundled seaborn-v0_8 style sheet with an 8pt base font
plt.style.use('seaborn-v0_8')
plt.rcParams['font.size'] = 8

In [4]:
# Directory constants (relative to the current working directory)
path_root_trained_model = Path("trained_models")
path_dir_processed = Path("experiments_hoangle/processed")

# CSV files expected inside the processed-data folder
path_fact = path_dir_processed.joinpath("fact.csv")
path_metrics = path_dir_processed.joinpath("metrics.csv")

# Read dataset

In [5]:
# Load the fact table; the first CSV line is the header and 'date' is parsed to datetime
raw = pd.read_csv(
    path_fact,
    parse_dates=['date'],
    header=0,
)

raw.head(5)

Unnamed: 0,date,restaurant,num_fish,num_chicken,num_vegetarian,num_meat,num_NotMapped,num_vegan,num_customer_in,num_customer_out,num_rcpts,amnt_waste_customer,amnt_waste_coffee,amnt_waste_kitchen,amnt_waste_hall
0,2023-01-02,Chemicum,85.0,0.0,0.0,171.0,1.0,91.0,,,272.0,4.7,1.2,12.0,0.0
1,2023-01-03,Chemicum,163.0,0.0,32.0,78.0,1.0,120.0,,,327.0,5.0,1.4,14.8,0.0
2,2023-01-04,Chemicum,70.0,0.0,0.0,218.0,3.0,137.0,,,351.0,4.15,4.0,7.1,0.0
3,2023-01-05,Chemicum,232.0,85.0,0.0,2.0,4.0,178.0,,,437.0,10.0,3.3,8.5,0.0
4,2023-01-06,Chemicum,,,,,,,,,,,,,


# Start experiments

In [6]:
# Distinct restaurant names found in the fact table; every loop below iterates over them
RESTAURANTS = raw['restaurant'].unique()

# Feature columns fed to the regressors (commented-out entries are deliberately excluded)
cols_X = [
    'num_fish',
    'num_chicken',
    'num_vegetarian',
    'num_meat',
    # 'num_NotMapped',
    'num_vegan',
]

# Target columns predicted jointly (commented-out entries are deliberately excluded)
cols_y = [
    'amnt_waste_customer',
    # 'amnt_waste_coffee',
    'amnt_waste_kitchen',
    # 'amnt_waste_hall'
]

## 1. Select best model using KFold

In [7]:
# Switch for the model-comparison cell below; when False that cell does nothing
is_run_selection = False

In [8]:
if is_run_selection:
    # Fixed seed so the fold assignment (and therefore the model ranking) is
    # reproducible across Restart & Run All.
    SELECTION_SEED = 42

    # Short labels for the plotted targets, derived from cols_y so the cell
    # keeps working when targets are added or removed
    # (e.g. 'amnt_waste_customer' -> 'waste_customer').
    target_labels = [col.replace('amnt_', '', 1) for col in cols_y]

    results = []
    for restaurant in RESTAURANTS:
        # Prepare data
        ## Extract X and y
        data = raw[raw['restaurant'] == restaurant][['date', *cols_X, *cols_y]].set_index('date')

        ## Drop every row that has a NaN in any selected feature/target column
        data = data.dropna(how='any')

        X, y = data[cols_X], data[cols_y]

        # Start cross-validation
        kf = KFold(n_splits=8, shuffle=True, random_state=SELECTION_SEED)
        models = {
            'linear': LinearRegression(),
            'lasso': Lasso(),
            'ridge': Ridge(),
            'rf': RandomForestRegressor(random_state=SELECTION_SEED),
            'gb': MultiOutputRegressor(GradientBoostingRegressor(random_state=SELECTION_SEED)),
            'svm': MultiOutputRegressor(SVR()),
            'xgb': XGBRegressor(),
            'knn': KNeighborsRegressor(),
            # 'voting': MultiOutputRegressor(VotingRegressor([
            #     ('ridge', Ridge()),
            #     ('svm', MultiOutputRegressor(SVR())),
            #     ('knn', KNeighborsRegressor()),
            #     ('rf', RandomForestRegressor()),
            # ])),
        }

        for name, regressor in models.items():
            # Per-fold MSE, one value per target column
            fold_mse = []
            for train_idx, test_idx in kf.split(X, y):
                regressor.fit(X.iloc[train_idx], y.iloc[train_idx])
                y_test_pred = regressor.predict(X.iloc[test_idx])

                mse = metrics.mean_squared_error(y.iloc[test_idx], y_test_pred, multioutput='raw_values')
                fold_mse.append(mse)

            # Average over folds -> one mean MSE per target, in cols_y order
            mean_mse = np.vstack(fold_mse).mean(axis=0)
            results.extend(
                {
                    'restaurant': restaurant,
                    'model': name,
                    'type': label,
                    'value': value,
                }
                for label, value in zip(target_labels, mean_mse)
            )

    df_results = pd.DataFrame.from_records(results)
    fig = plt.figure(figsize=(8, 12))
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    # One stacked subplot per restaurant (row count follows the data, not a fixed 3)
    for i, restaurant in enumerate(RESTAURANTS):
        df = df_results[df_results['restaurant'] == restaurant]

        ax = fig.add_subplot(len(RESTAURANTS), 1, i + 1)
        sns.barplot(df, hue='model', y='value', x='type', ax=ax)
        ax.set_title(restaurant)


## 2. Build model

In [9]:
def tuning(X: pd.DataFrame, y: pd.DataFrame, estimator=None):
    """Grid-search Lasso hyperparameters and return the best fitted estimator.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    y : pd.DataFrame
        Target matrix (one column per predicted quantity).
    estimator : optional
        Estimator to tune. Defaults to a fresh ``Lasso()`` so the function no
        longer depends on a global ``regressor`` variable being defined by the
        calling cell. A custom estimator must accept the ``alpha``,
        ``max_iter`` and ``selection`` parameters searched below.

    Returns
    -------
    The ``best_estimator_`` found by a 5-fold ``GridSearchCV`` scored with
    negative mean squared error.
    """
    if estimator is None:
        estimator = Lasso()

    params_grid = {
        'alpha': [1, 1e-2, 1e-3, 1e-4],
        'max_iter': [100, 500, 1000, 1500, 2000],
        'selection': ['cyclic', 'random']
    }

    grid_search = GridSearchCV(estimator=estimator, param_grid=params_grid, cv=5, scoring='neg_mean_squared_error')

    # Fit the grid search to the data
    grid_search.fit(X, y)

    # Best parameter combination, as exposed by the fitted search object
    return grid_search.best_estimator_

# When True, the training cell replaces the default Lasso with the estimator returned by tuning()
is_fine_tune = True

In [10]:
# Fitted regressor per restaurant, keyed by restaurant name
models = {}
for restaurant in RESTAURANTS:
    # Prepare data
    ## Extract X and y
    data = raw[raw['restaurant'] == restaurant][['date', *cols_X, *cols_y]].set_index('date')

    ## Drop every row that has a NaN in any selected feature/target column
    data = data.dropna(how='any')

    X, y = data[cols_X], data[cols_y]

    # Build model
    ## Baseline estimator; replaced by the tuned one when fine-tuning is enabled
    regressor = Lasso()
    if is_fine_tune:
        # tuning() returns GridSearchCV's best_estimator_, which (with the
        # default refit=True) is already trained on all of X, y — fitting it
        # a second time would only repeat that work.
        regressor = tuning(X, y)
    else:
        regressor.fit(X, y)

    # Store
    models[restaurant] = regressor

## 3. Predict

In [11]:
# Hand-picked feature values (one per cols_X entry) for a manual prediction, per restaurant

# Chemicum
Chemicum_fish = 85.0
Chemicum_chicken = 0.0
Chemicum_vegetarian = 0.0
Chemicum_meat = 17.0
Chemicum_vegan = 91.0

# Physicum
Physicum_fish = 1000.0001
Physicum_chicken = 0.0001
Physicum_vegetarian = 31.6001
Physicum_meat = 1.2001
Physicum_vegan = 0.0001

# Exactum
Exactum_fish = 0.5001
Exactum_chicken = 0.0001
Exactum_vegetarian = 0.0001
Exactum_meat = 0.0001
Exactum_vegan = 0.0001

In [12]:
restaurant = 'Chemicum'
regressor = models[restaurant]

# Single-row feature frame; columns are listed in the same order as cols_X
X_predict = pd.DataFrame(
    [[Chemicum_fish, Chemicum_chicken, Chemicum_vegetarian, Chemicum_meat, Chemicum_vegan]],
    columns=['num_fish', 'num_chicken', 'num_vegetarian', 'num_meat', 'num_vegan'],
)

# One prediction row with one value per cols_y target
y_predict = regressor.predict(X_predict)
y_predict

array([[4.09503505, 9.96202844]])

## 4. Save model

In [20]:
# Make sure the export folder exists before writing into it
path_dir_biowaste = path_root_trained_model / "biowaste"
path_dir_biowaste.mkdir(parents=True, exist_ok=True)

# One sample feature row for the converter, taken explicitly from the dataset
# instead of the `X` variable left over from the last training-loop iteration.
# Presumably the converter only uses it to infer input dtype/shape — see the
# skl2onnx `to_onnx` documentation.
sample_input = raw[cols_X].head(1).to_numpy()

for restaurant in RESTAURANTS:
    onx = to_onnx(models[restaurant], sample_input)

    path_model = path_dir_biowaste / f"Jul24_Lasso_{restaurant}.onnx"
    path_model.write_bytes(onx.SerializeToString())

# Load models

In [22]:
# ONNX Runtime inference session per restaurant, restricted to the CPU provider
model_sessions = {}

for restaurant in RESTAURANTS:
    path_model = Path(f"trained_models/biowaste/Jul24_Lasso_{restaurant}.onnx")
    session = rt.InferenceSession(path_model, providers=["CPUExecutionProvider"])
    model_sessions[restaurant] = session

In [23]:
restaurant = "Chemicum"
sess = model_sessions[restaurant]

# Names of the session's first input and first output
input_name, label_name = sess.get_inputs()[0].name, sess.get_outputs()[0].name

# Run the exported model on the same single-row features used for the scikit-learn prediction
onnx_feed = {input_name: X_predict.to_numpy()}
(pred_onx,) = sess.run([label_name], onnx_feed)

pred_onx

array([[4.09503505, 9.96202844]])

In [26]:
# NOTE(review): this cell looks stale relative to the rest of the notebook.
# `models` as built by the training cell above is keyed by restaurant name and
# holds fitted regressors, so `models['biowaste']` would presumably raise a
# KeyError on a fresh Restart & Run All. The code below appears to expect a
# different mapping ({'biowaste': {restaurant: forecaster}}) whose forecasters
# take a horizon in `predict(...)` and return an object with `pd_dataframe()`.
# It also reads `amnt_waste_coffee` / `amnt_waste_hall`, which are not among the
# targets trained in this notebook. TODO: confirm where those forecasters are
# built and load them explicitly under a distinct name.

# Forecast horizon passed to each forecaster's predict()
num_of_days = 3

# Nested result: {date string: {'date': ..., restaurant: {waste column: value}}}
predictions = {}

# Forecast the future
for restaurant in RESTAURANTS:
    pred = models['biowaste'][restaurant].predict(num_of_days)

    # Flatten the forecast into a frame with a 'date' column formatted as YYYY-MM-DD
    df_pred = pred.pd_dataframe().reset_index()
    df_pred['date'] = df_pred['date'].dt.strftime(r"%Y-%m-%d")

    for row in df_pred.itertuples():
        # First restaurant to hit a date creates that date's entry
        if row.date not in predictions:
            predictions[row.date] = {'date': row.date}

        predictions[row.date][restaurant] = {
            'amnt_waste_customer': row.amnt_waste_customer,
            'amnt_waste_coffee': row.amnt_waste_coffee,
            'amnt_waste_kitchen': row.amnt_waste_kitchen,
            'amnt_waste_hall': row.amnt_waste_hall,
        }

print(json.dumps(predictions, indent=2))

{
  "2024-05-09": {
    "date": "2024-05-09",
    "Chemicum": {
      "amnt_waste_customer": 15.335807096971186,
      "amnt_waste_coffee": 6.749373675160983,
      "amnt_waste_kitchen": 13.120679698841172,
      "amnt_waste_hall": 12.76089686219699
    },
    "Physicum": {
      "amnt_waste_customer": -0.09098052978515625,
      "amnt_waste_coffee": 11.28973388671875,
      "amnt_waste_kitchen": 12.328250885009766,
      "amnt_waste_hall": 0.17769622802734375
    },
    "Exactum": {
      "amnt_waste_customer": 5.2109375,
      "amnt_waste_coffee": 1.4091796875,
      "amnt_waste_kitchen": 8.2734375,
      "amnt_waste_hall": -1.39208984375
    }
  },
  "2024-05-10": {
    "date": "2024-05-10",
    "Chemicum": {
      "amnt_waste_customer": 14.879137084963158,
      "amnt_waste_coffee": 8.28143831276184,
      "amnt_waste_kitchen": 16.26109595145685,
      "amnt_waste_hall": 14.719872271067954
    },
    "Physicum": {
      "amnt_waste_customer": -0.07980728149414062,
      "amnt_waste