In [1]:
# Step up one directory so the relative "src/data/..." paths used below resolve
# (presumably to the repository root — confirm against where this notebook lives).
# NOTE(review): the path is relative, so re-running this cell moves up one more
# level; run it once per kernel session.
%cd ../


/home/hoanghu/projects/Food-Waste-Optimization


In [2]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Global plot appearance: apply the seaborn-flavoured matplotlib style first,
# then override the base font size (the style sheet would otherwise reset it).
plt.style.use('seaborn-v0_8')
plt.rcParams['font.size'] = 8

# 1. Load data

In [4]:
# Raw biowaste measurements (semicolon-separated CSV).
path = "src/data/basic_mvp_data/Biowaste.csv"
biowaste = pd.read_csv(path, sep=';')

# Preview the first rows
biowaste.head()

Unnamed: 0,Date,Ravintola,Asiakasbiojäte. tiski (kg),Biojäte kahvi. porot (kg),Keittiön biojäte (ruoanvalmistus) (kg),Salin biojäte (jämät) (kg)
0,1.1.2023,600 Chemicum,0.0,0.0,0.0,0.0
1,1.1.2023,610 Physicum,0.0,0.0,0.0,0.0
2,1.1.2023,620 Exactum,0.0,0.0,0.0,0.0
3,2.1.2023,600 Chemicum,4.7,1.2,12.0,0.0
4,2.1.2023,610 Physicum,0.0,0.0,0.0,0.0


In [5]:
# Raw hourly receipt counts (Excel workbook; the first row is the header,
# which is the pandas default).
path = "src/data/basic_mvp_data/tuntidata2.xlsx"
receipts = pd.read_excel(path)

# Preview the first rows
receipts.head()

Unnamed: 0,Date,Kuitin tunti,Ravintola,Kuitti kpl
0,2023-01-02,8,600 Chemicum,1
1,2023-01-02,10,600 Chemicum,18
2,2023-01-02,11,600 Chemicum,83
3,2023-01-02,12,600 Chemicum,79
4,2023-01-02,13,600 Chemicum,90


In [6]:
# Raw people-counter events (Excel workbook; default header row and a plain
# positional index, i.e. no column is used as the index).
path = "src/data/basic_mvp_data/supersight.xlsx"
occupancy = pd.read_excel(path)

# Preview the first rows
occupancy.head()

Unnamed: 0,dateCreated,countIn,countOut,phoneName
0,2024-03-15T04:26:15.000Z,0,1,S63
1,2024-03-15T04:26:24.000Z,1,2,S63
2,2024-03-15T04:26:27.000Z,1,0,S63
3,2024-03-15T05:19:19.000Z,1,0,S63
4,2024-03-15T05:37:09.000Z,1,0,S63


In [7]:
# Raw sold-meals report (semicolon-separated CSV). The file has several header
# rows, so it is read without a header and the columns keep integer labels.
path = "src/data/basic_mvp_data/kumpula_lounaat_kat.csv"
meals = pd.read_csv(path, sep=';', header=None)

# Preview the first rows
meals.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,,Ravintola,,,,,,,,,...,,,,,,,,,,
1,,600 Chemicum,,,,,,,,,...,,,,,,,,,,Total
2,,Kala,%-osuus,Kana,%-osuus,Kasvis,%-osuus,Liha,%-osuus,Not Mapped,...,Kasvis,%-osuus,Liha,%-osuus,Not Mapped,%-osuus,Vegaani,%-osuus,Total,
3,,Lounaat ilman lisämyyntiä,,Lounaat ilman lisämyyntiä,,Lounaat ilman lisämyyntiä,,Lounaat ilman lisämyyntiä,,Lounaat ilman lisämyyntiä,...,Lounaat ilman lisämyyntiä,,Lounaat ilman lisämyyntiä,,Lounaat ilman lisämyyntiä,,Lounaat ilman lisämyyntiä,,Lounaat ilman lisämyyntiä,Lounaat ilman lisämyyntiä
4,Total,73 554,"26,4 %",47 233,"16,9 %",1 524,"0,5 %",41 839,"15,0 %",14 822,...,3 910,"5,3 %",7 744,"10,6 %",195,"0,3 %",31 626,"43,2 %",73 132,397 770


# 2. Pre-process

## With `biowaste`

In [8]:
# Map the raw restaurant labels onto the short names used across all tables.
restaurant_names = {
    "600 Chemicum": "Chemicum",
    "610 Physicum": "Physicum",
    "620 Exactum": "Exactum"
}

# The four waste streams, under their English column names.
waste_cols = ['amnt_waste_customer', 'amnt_waste_coffee', 'amnt_waste_kitchen', 'amnt_waste_hall']

biowaste = (
    biowaste
    .rename(columns={
        'Date': 'date',
        'Ravintola': 'restaurant',
        'Asiakasbiojäte. tiski (kg)': 'amnt_waste_customer',
        'Biojäte kahvi. porot (kg)': 'amnt_waste_coffee',
        'Keittiön biojäte (ruoanvalmistus) (kg)': 'amnt_waste_kitchen',
        'Salin biojäte (jämät) (kg)': 'amnt_waste_hall',
    })
    .assign(
        # Vectorized parse of the day.month.year strings (e.g. "2.1.2023")
        date=lambda df: pd.to_datetime(df['date'], format=r"%d.%m.%Y"),
        restaurant=lambda df: df['restaurant'].replace(restaurant_names),
    )
    # Keep only rows where the four waste amounts sum to more than zero,
    # i.e. drop rows in which nothing was recorded for any stream.
    .loc[lambda df: df[waste_cols].sum(axis=1) > 0]
)

biowaste.head()

Unnamed: 0,date,restaurant,amnt_waste_customer,amnt_waste_coffee,amnt_waste_kitchen,amnt_waste_hall
3,2023-01-02,Chemicum,4.7,1.2,12.0,0.0
6,2023-01-03,Chemicum,5.0,1.4,14.8,0.0
9,2023-01-04,Chemicum,4.15,4.0,7.1,0.0
12,2023-01-05,Chemicum,10.0,3.3,8.5,0.0
24,2023-01-09,Chemicum,7.65,2.1,4.9,0.0


## With `receipts`

In [9]:
# Short restaurant names shared with the other tables.
restaurant_labels = {
    "600 Chemicum": "Chemicum",
    "610 Physicum": "Physicum",
    "620 Exactum": "Exactum"
}

# English column names for the raw receipt export.
receipt_columns = {
    'Date': 'date',
    'Kuitin tunti': 'hour',
    'Ravintola': 'restaurant',
    'Kuitti kpl': 'num_rcpts'
}

receipts = receipts.rename(columns=receipt_columns)
receipts = receipts.assign(restaurant=receipts['restaurant'].replace(restaurant_labels))

# Collapse the hourly rows into one receipt total per (date, restaurant).
receipts = receipts.groupby(['date', 'restaurant'], as_index=False)['num_rcpts'].sum()

receipts.head()

Unnamed: 0,date,restaurant,num_rcpts
0,2023-01-02,Chemicum,272
1,2023-01-03,Chemicum,327
2,2023-01-04,Chemicum,351
3,2023-01-04,Physicum,1
4,2023-01-05,Chemicum,437


## With `occupancy`

Since we are only concerned with how the occupancy data relates to the Unicafe data, we focus only on cameras `S163`, `S216`, and `S217`.

In [10]:
# Camera (phoneName) -> restaurant it is associated with. Only these three
# cameras are kept; every other device is dropped.
phoneName2restaurant = {
    'S163': 'Exactum',
    'S216': 'Chemicum',
    'S217': 'Physicum',
}

# Filter first, then add the column through `.assign`, which returns a new
# frame. Assigning a column directly onto the boolean-filtered slice (as was
# done before) triggers pandas' SettingWithCopyWarning.
occupancy = (
    occupancy
    .loc[occupancy['phoneName'].isin(list(phoneName2restaurant))]
    .assign(restaurant=lambda df: df['phoneName'].map(phoneName2restaurant))
)

occupancy.head()

Unnamed: 0,dateCreated,countIn,countOut,phoneName,restaurant
1138,2024-03-15T06:45:23.000Z,1,0,S163,Exactum
1139,2024-03-15T06:51:18.000Z,0,1,S163,Exactum
1140,2024-03-15T07:58:16.000Z,0,1,S163,Exactum
1141,2024-03-15T08:23:55.000Z,1,0,S163,Exactum
1142,2024-03-15T08:47:09.000Z,0,1,S163,Exactum


In [11]:
# Daily in/out totals per restaurant.
#
# * `dateCreated` is parsed and its timezone dropped so the result is a naive
#   timestamp that can be merged with the other (naive) date columns.
#   NOTE(review): the raw values shown above end in "Z" (UTC), so dropping the
#   timezone bins events by UTC calendar day, not by local day — confirm this
#   is acceptable for events close to midnight.
# * `.assign` is used instead of item assignment so nothing is written onto a
#   filtered slice (avoids SettingWithCopyWarning).
# * Only the two count columns are selected on the groupby before resampling,
#   so the grouping column `restaurant` is never part of the aggregation
#   (including it is deprecated in pandas 2.2).
occupancy = (
    occupancy
    .assign(date=pd.to_datetime(occupancy['dateCreated']).dt.tz_localize(None))
    .set_index('date')
    .groupby('restaurant')[['countIn', 'countOut']]
    .resample('D')
    .sum()
    # Bring `restaurant` and `date` back as ordinary columns
    .reset_index()
    .rename(columns={
        'countIn': 'num_customer_in',
        'countOut': 'num_customer_out'
    })
)

occupancy.head()

Unnamed: 0,restaurant,date,num_customer_in,num_customer_out
0,Chemicum,2024-05-27,11,6
1,Chemicum,2024-05-28,767,709
2,Chemicum,2024-05-29,671,677
3,Chemicum,2024-05-30,717,697
4,Chemicum,2024-05-31,689,668


## With `meals`

In [12]:
# Meal categories, in the order their (count, percentage) column pairs are named.
meal_categories = ['fish', 'chicken', 'vegetable', 'meat', 'NotMapped', 'vegan']

# Column names of one restaurant's block in the raw meals table:
# a `num_*` / `percent_*` pair per category, followed by the block total.
cols_name = [
    f'{prefix}_{category}'
    for category in meal_categories
    for prefix in ('num', 'percent')
]
cols_name.append('total')

# One less than the block width: used as the offset of the last column when
# slicing with `.loc`, whose end bound is inclusive.
len_cols_per_restaurant = len(cols_name) - 1

restaurants = ['Chemicum', 'Physicum', 'Exactum']

# Columns kept in the tidy meals table (keys + absolute counts only).
cols_important = ['date', 'restaurant'] + [f'num_{category}' for category in meal_categories]

def _f_process(x: str) -> float:
    """Convert one raw cell of the meals report to a float.

    The report formats numbers with a space as thousands separator, a comma
    as decimal mark and an optional trailing percent sign, e.g. ``"73 554"``
    or ``"26,4 %"``.

    Args:
        x: The raw cell. Normally a string; non-string values (such as the
            float NaN pandas uses for an empty cell, or an already numeric
            value) are passed straight to ``float``.

    Returns:
        The parsed number. Percentages are returned as written (``"26,4 %"``
        gives ``26.4``), not as fractions.

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    # Empty cells arrive as float NaN, which has no `.replace`; keep them as NaN
    # instead of raising AttributeError.
    if not isinstance(x, str):
        return float(x)

    # Drop every whitespace character, including the no-break spaces that
    # spreadsheet exports often use as thousands separators.
    cleaned = ''.join(x.split())
    # Strip the percent sign (with or without a preceding space) and switch
    # the decimal comma to a dot.
    cleaned = cleaned.replace('%', '').replace(',', '.')

    return float(cleaned)

In [13]:
# Get date column
dates = meals.loc[5:, 0].apply(lambda x: datetime.strptime(x, r"%Y-%m-%d"))

# Extract restaurant sold meals data
data_restaurants = []

idx_col = 1
for restaurant in restaurants:
    df_restaurant = meals.loc[5:, idx_col:idx_col + len_cols_per_restaurant].copy()

    df_restaurant = df_restaurant.set_axis(cols_name, axis=1)

    df_restaurant = df_restaurant.map(_f_process)

    df_restaurant['restaurant'] = restaurant
    df_restaurant['date'] = dates.copy()

    df_restaurant = df_restaurant[cols_important]
    
    data_restaurants.append(df_restaurant)


In [14]:
# Stack the per-restaurant frames into one tidy table. Every frame carries the
# same row labels (taken from the raw sheet), so a fresh 0..n-1 index is built
# to avoid duplicate labels in the combined table.
meals = pd.concat(data_restaurants, ignore_index=True)

meals.head()

Unnamed: 0,date,restaurant,num_fish,num_chicken,num_vegetable,num_meat,num_NotMapped,num_vegan
5,2023-01-02,Chemicum,85.0,0.0,0.0,171.0,1.0,91.0
6,2023-01-03,Chemicum,163.0,0.0,32.0,78.0,1.0,120.0
7,2023-01-04,Chemicum,70.0,0.0,0.0,218.0,3.0,137.0
8,2023-01-05,Chemicum,232.0,85.0,0.0,2.0,4.0,178.0
9,2023-01-09,Chemicum,107.0,0.0,0.0,264.0,0.0,207.0


# 3. Create fact tables

In [15]:
# Period covered by the fact table (both ends inclusive).
date_start = pd.to_datetime('2023-01-02')
date_end = pd.to_datetime('2024-07-02')

# Business days (Mon-Fri) only
date_range = pd.date_range(start=date_start, end=date_end, freq='B')

# Skeleton of the fact table: one row per (business day, restaurant), built by
# repeating the calendar once per restaurant and stacking the copies.
calendar = pd.DataFrame({'date': date_range})
fact = pd.concat([calendar.assign(restaurant=name) for name in restaurants])

In [16]:
# Keys shared by the skeleton and every source table.
foreign_keys = ['restaurant', 'date']

# Left-join each source onto the skeleton so that every (date, restaurant)
# row is kept; combinations missing from a source are filled with NaN.
fact = (
    fact
    .merge(right=meals, how='left', on=foreign_keys)
    .merge(right=occupancy, how='left', on=foreign_keys)
    .merge(right=receipts, how='left', on=foreign_keys)
    .merge(right=biowaste, how='left', on=foreign_keys)
)

fact.head()

Unnamed: 0,date,restaurant,num_fish,num_chicken,num_vegetable,num_meat,num_NotMapped,num_vegan,num_customer_in,num_customer_out,num_rcpts,amnt_waste_customer,amnt_waste_coffee,amnt_waste_kitchen,amnt_waste_hall
0,2023-01-02,Chemicum,85.0,0.0,0.0,171.0,1.0,91.0,,,272.0,4.7,1.2,12.0,0.0
1,2023-01-03,Chemicum,163.0,0.0,32.0,78.0,1.0,120.0,,,327.0,5.0,1.4,14.8,0.0
2,2023-01-04,Chemicum,70.0,0.0,0.0,218.0,3.0,137.0,,,351.0,4.15,4.0,7.1,0.0
3,2023-01-05,Chemicum,232.0,85.0,0.0,2.0,4.0,178.0,,,437.0,10.0,3.3,8.5,0.0
4,2023-01-06,Chemicum,,,,,,,,,,,,,


In [17]:
# path_fact = "experiments_hoangle/processed/fact.csv"

# fact.to_csv(path_fact, index=False)

# 4. Find correlations

In [23]:
# Rows for which every source (meals, occupancy, receipts, biowaste) has a value
fact.dropna(axis=0, how='any')

Unnamed: 0,date,restaurant,num_fish,num_chicken,num_vegetable,num_meat,num_NotMapped,num_vegan,num_customer_in,num_customer_out,num_rcpts,amnt_waste_customer,amnt_waste_coffee,amnt_waste_kitchen,amnt_waste_hall
1098,2024-03-15,Exactum,116.0,56.0,0.0,260.0,137.0,217.0,26.0,83.0,346.0,8.6,2.08,5.13,0.0
1099,2024-03-18,Exactum,264.0,0.0,0.0,389.0,143.0,147.0,33.0,94.0,354.0,3.4,0.86,10.6,0.0
1100,2024-03-19,Exactum,353.0,1.0,0.0,0.0,152.0,370.0,234.0,189.0,350.0,3.76,1.1,5.4,0.0
1101,2024-03-20,Exactum,223.0,282.0,0.0,0.0,127.0,175.0,372.0,231.0,515.0,6.89,1.56,9.75,0.0
1102,2024-03-21,Exactum,117.0,340.0,0.0,0.0,166.0,233.0,262.0,179.0,371.0,3.17,1.67,7.89,0.0
1104,2024-03-25,Exactum,112.0,0.0,0.0,335.0,143.0,287.0,305.0,239.0,329.0,9.36,1.27,0.0,5.86
1105,2024-03-26,Exactum,353.0,0.0,8.0,15.0,160.0,303.0,252.0,184.0,355.0,3.0,0.0,2.1,0.0
1106,2024-03-27,Exactum,153.0,267.0,0.0,0.0,141.0,261.0,474.0,535.0,359.0,3.3,2.1,6.5,0.0
1112,2024-04-04,Exactum,226.0,78.0,0.0,92.0,153.0,268.0,296.0,313.0,259.0,2.02,0.1,3.82,0.0
1114,2024-04-08,Exactum,142.0,33.0,0.0,328.0,147.0,264.0,425.0,449.0,308.0,3.21,1.75,5.64,0.0
