In [51]:
# TODO : Remove the faulty frames extracted from side angles videos
# TODO : Move the cleaned metadata file to final-dataset

In [42]:
import os
import shutil
import subprocess
from pathlib import Path
from pprint import pprint

import pandas as pd

## Preprocess metadata file

In [41]:
# There are two raw metadata files, one per cafe.
# Documented line format of each file:
# dish_id, total_calories, total_mass, total_fat, total_carb, total_protein, num_ingrs, (ingr_1_id, ingr_1_name, ingr_1_grams, ingr_1_calories, ingr_1_fat, ingr_1_carb, ingr_1_protein, ...)
# NOTE(review): the parsing cells read the ingredient groups starting right
# after total_protein (column index 6), so the files apparently do not contain
# the num_ingrs column - confirm against the raw data.

METADATA_DIR = Path("./metadata")
dish_metadata1 = METADATA_DIR / "dish_metadata_cafe1.csv"
dish_metadata2 = METADATA_DIR / "dish_metadata_cafe2.csv"

In [42]:
# Convert dish_metadata1 into a structured table with one row per dish:
# (dish_id, calories per gram, carbohydrate per gram, protein per gram,
#  fat per gram, ingredient names joined into one comma-separated string).
# The dish-level totals are read from columns 1-5 of every line; everything
# from column 6 onward is treated as repeating 7-field ingredient groups whose
# second field is the ingredient name.
rows = []
with open(dish_metadata1) as file:
    # Stream the file line by line instead of loading it all with readlines().
    for line in file:
        if not line.strip():
            # A blank line (e.g. a trailing newline at end of file) holds no
            # dish and would otherwise fail on the missing columns.
            continue
        content = line.rstrip("\n").split(",")
        dish_id = content[0]
        total_calorie = float(content[1])
        total_mass = float(content[2])
        total_fat = float(content[3])
        total_carb = float(content[4])
        total_protein = float(content[5])

        # Ingredient name is at offset 1 of every 7-field group.
        ingredients_content = content[6:]
        ingredients_list = ingredients_content[1::7]

        # Normalise every nutrient by the dish mass. The order here must match
        # the column list passed to the DataFrame below.
        rows.append(
            [
                dish_id,
                total_calorie / total_mass,
                total_carb / total_mass,
                total_protein / total_mass,
                total_fat / total_mass,
                ",".join(ingredients_list),
            ]
        )
cleaned_dish_metadata1 = pd.DataFrame(
    rows,
    columns=[
        "dish_id",
        "Calories(kcal)",
        "Carbohydrate(g)",
        "Protein(g)",
        "Fat(g)",
        "Ingredients",
    ],
)

In [43]:
# Preview the first rows of the cafe 1 table to sanity-check the parse.
cleaned_dish_metadata1.head()

Unnamed: 0,dish_id,Calories(kcal),Carbohydrate(g),Protein(g),Fat(g),Ingredients
0,dish_1561662216,1.55852,0.146209,0.096549,0.064184,"soy sauce,garlic,white rice,parsley,onions,bro..."
1,dish_1562688426,1.563295,0.058977,0.117011,0.093818,"roasted potatoes,chicken apple sausage"
2,dish_1561662054,1.436434,0.090245,0.088735,0.081638,"pepper,white rice,mixed greens,garlic,soy sauc..."
3,dish_1562008979,1.320471,0.035081,0.121881,0.076637,"jalapenos,lemon juice,pork,wheat berry,cabbage..."
4,dish_1560455030,0.199903,0.044903,0.009282,0.001437,"cherry tomatoes,cucumbers,baby carrots"


**Note: An ingredient named "deprecated" is present in the data. After checking several occurrences, it appears to be heavily linked to actual food items.**

In [44]:
# Convert dish_metadata2 into a structured table with one row per dish:
# (dish_id, calories per gram, carbohydrate per gram, protein per gram,
#  fat per gram, ingredient names joined into one comma-separated string).
# dish_metadata2 did not provide dish-level nutrition info, so the totals are
# summed from the per-ingredient values; only the dish mass (column 2) is taken
# from the dish-level fields.
rows = []
with open(dish_metadata2) as file:
    # Stream the file line by line instead of loading it all with readlines().
    for line in file:
        if not line.strip():
            # A blank line (e.g. a trailing newline at end of file) holds no
            # dish and would otherwise fail on the missing columns.
            continue
        content = line.rstrip("\n").split(",")
        dish_id = content[0]
        total_mass = float(content[2])

        # Everything from column 6 onward is read as repeating 7-field
        # ingredient groups: offset 1 is the name, offsets 3-6 are calories,
        # fat, carbohydrate and protein.
        ingredients_content = content[6:]
        ingredients_list = []
        total_calorie = 0.0
        total_fat = 0.0
        total_carb = 0.0
        total_protein = 0.0
        for i in range(0, len(ingredients_content), 7):
            ingredients_list.append(ingredients_content[i + 1])
            total_calorie += float(ingredients_content[i + 3])
            total_fat += float(ingredients_content[i + 4])
            total_carb += float(ingredients_content[i + 5])
            total_protein += float(ingredients_content[i + 6])

        # Normalise every nutrient by the dish mass. The order here must match
        # the column list passed to the DataFrame below.
        rows.append(
            [
                dish_id,
                total_calorie / total_mass,
                total_carb / total_mass,
                total_protein / total_mass,
                total_fat / total_mass,
                ",".join(ingredients_list),
            ]
        )
cleaned_dish_metadata2 = pd.DataFrame(
    rows,
    columns=[
        "dish_id",
        "Calories(kcal)",
        "Carbohydrate(g)",
        "Protein(g)",
        "Fat(g)",
        "Ingredients",
    ],
)

In [45]:
# Preview the first rows of the cafe 2 table to sanity-check the parse.
cleaned_dish_metadata2.head()

Unnamed: 0,dish_id,Calories(kcal),Carbohydrate(g),Protein(g),Fat(g),Ingredients
0,dish_1572974428,1.954041,0.161582,0.043231,0.138704,"tomatoes,avocado,multigrain bread,olive oil,ed..."
1,dish_1572464692,1.66,0.14,0.079,0.096,hummus
2,dish_1571931594,1.41,0.18,0.021,0.068,roasted potatoes
3,dish_1575478635,1.48,0.016,0.1,0.11,scrambled eggs
4,dish_1572887017,0.917682,0.064411,0.076443,0.036212,"hash browns,egg whites,salsa"


In [46]:
# Number of dishes parsed from the cafe 1 metadata file.
len(cleaned_dish_metadata1)

4768

In [47]:
# Concatenate both datasets into one table (cafe 1 rows first, then cafe 2).
# Each frame keeps its own index here, so index labels repeat until the index
# is reset.
cleaned_metadata = pd.concat([cleaned_dish_metadata1, cleaned_dish_metadata2])

In [48]:
# Replace the index with a fresh 0..n-1 range and discard the old labels.
cleaned_metadata = cleaned_metadata.reset_index(drop=True)

In [49]:
# Display the combined table (pandas truncates the rendering of long frames).
cleaned_metadata

Unnamed: 0,dish_id,Calories(kcal),Carbohydrate(g),Protein(g),Fat(g),Ingredients
0,dish_1561662216,1.558520,0.146209,0.096549,0.064184,"soy sauce,garlic,white rice,parsley,onions,bro..."
1,dish_1562688426,1.563295,0.058977,0.117011,0.093818,"roasted potatoes,chicken apple sausage"
2,dish_1561662054,1.436434,0.090245,0.088735,0.081638,"pepper,white rice,mixed greens,garlic,soy sauc..."
3,dish_1562008979,1.320471,0.035081,0.121881,0.076637,"jalapenos,lemon juice,pork,wheat berry,cabbage..."
4,dish_1560455030,0.199903,0.044903,0.009282,0.001437,"cherry tomatoes,cucumbers,baby carrots"
...,...,...,...,...,...,...
5001,dish_1571934465,0.330000,0.080000,0.007000,0.003000,strawberries
5002,dish_1573073666,0.549019,0.077154,0.024811,0.022488,"broccoli,olive oil,bread crumbs,salt,jalapenos"
5003,dish_1575924356,1.316000,0.248000,0.051000,0.011000,pasta
5004,dish_1574359199,1.506565,0.118590,0.059204,0.086729,"scrambled eggs,roasted potatoes,crepes"


In [58]:
# Write the cleaned metadata file.
# The file is tab-separated despite the .csv extension (the Ingredients column
# holds comma-joined names); floats are written with four decimal places and
# the DataFrame index is not written.
destination_file = Path("./metadata/cleaned_metadata.csv")
cleaned_metadata.to_csv(destination_file, sep="\t", index=False, float_format="%.4f")

## Preprocess video to images

In [8]:
# Reload the cleaned metadata from disk; the separator must match the tab used
# when the file was written.
cleaned_metadata = pd.read_csv("./metadata/cleaned_metadata.csv", sep="\t")

In [10]:
# Collect the dish ids that have image data.
# side_angles entries are used by their full name, realsense_overhead entries
# by their stem (name without the file extension).
source_dir = Path("./imagery/side_angles")
source_dir2 = Path("./imagery/realsense_overhead")
all_dir = [x.name for x in source_dir.iterdir()]
all_dir2 = [x.stem for x in source_dir2.iterdir()]

In [11]:
# Merge the ids from both image sources into a single de-duplicated set.
all_dir = set(all_dir).union(all_dir2)

In [13]:
# Remove rows without image data.
# A single vectorised membership mask replaces the per-dish scan-and-drop loop,
# which re-scanned and copied the whole DataFrame once per missing dish.
# The surviving rows keep their original index labels, as before.
all_dish_id = cleaned_metadata["dish_id"]
cleaned_metadata = cleaned_metadata[all_dish_id.isin(all_dir)]

In [14]:
# Sanity check: after filtering, there should be exactly one metadata row per
# dish id found in the image directories. This compares counts only, not the
# ids themselves.
assert len(all_dir) == len(
    cleaned_metadata
), "The total entries of metadata does not matched with the total dish_id in image directory"

In [58]:
# Update the metadata file: overwrite it with the rows that have image data,
# using the same tab-separated, four-decimal format as the first write.
destination_file = Path("./metadata/cleaned_metadata.csv")
cleaned_metadata.to_csv(destination_file, sep="\t", index=False, float_format="%.4f")

### Extract frames from side_angles directory

In [28]:
# Frames extracted from the side-angle videos are written under destination_dir.
destination_dir = Path("../final-dataset/images/generic")
source_dir = Path("./imagery/side_angles")
# parents=True creates the missing ../final-dataset/images levels on a fresh
# checkout; exist_ok=True lets this cell be re-run without FileExistsError.
destination_dir.mkdir(parents=True, exist_ok=True)

In [39]:
# Extract every FRAME_STEP-th frame of each side-angle video into
# <destination_dir>/<dish_id>/<camera_name>_frame_NNN.jpeg.
FRAME_STEP = 5
failed_videos = []
for directory in source_dir.iterdir():
    if not directory.is_dir():
        # Stray files in the source folder are not dish directories.
        continue
    output_dir = destination_dir / directory.name
    # exist_ok lets the cell be re-run. Frames from an earlier run are
    # overwritten by ffmpeg (-y) but never deleted.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_dir = output_dir.resolve().as_posix()
    for video_file in directory.iterdir():
        input_file = video_file.resolve().as_posix()
        # Pass the arguments as a list so no shell is involved and paths with
        # spaces or shell metacharacters need no quoting. The raw string keeps
        # the backslash that escapes the comma for ffmpeg's filter parser
        # (a plain "\," is an invalid Python escape sequence).
        command = [
            "ffmpeg",
            "-y",
            "-i",
            input_file,
            "-vf",
            rf"select=not(mod(n\,{FRAME_STEP}))",
            "-fps_mode",
            "vfr",
            f"{output_dir}/{video_file.stem}_frame_%03d.jpeg",
        ]
        # check=False: one broken video must not abort the whole extraction.
        # A missing ffmpeg executable still raises FileNotFoundError.
        result = subprocess.run(command, check=False)
        if result.returncode != 0:
            failed_videos.append(input_file)

# Report the videos ffmpeg could not process instead of ignoring the exit status.
if failed_videos:
    print(f"ffmpeg failed for {len(failed_videos)} video(s):")
    for failed_video in failed_videos:
        print(f"  {failed_video}")

In [None]:
# 10.39pm "D:\School Materials\FoodNet\Food Datasets\final-dataset\images\generic\dish_1575393132" camera B frames have issues
# "D:\School Materials\FoodNet\Food Datasets\final-dataset\images\generic\dish_1575407477" camera B and camera C frames have issues

### Preprocess single top images from realsense overhead directory

In [40]:
# The overhead images go into the same per-dish destination folders as the
# side-angle frames; only the source directory changes.
destination_dir = Path("../final-dataset/images/generic")
source_dir = Path("./imagery/realsense_overhead")

In [50]:
# Copy each overhead image into the folder of its dish:
# <destination_dir>/<dish_id>/<original file name>.
for file in source_dir.iterdir():
    if not file.is_file():
        # Only plain files can be copied; skip any stray sub-directory.
        continue
    output_dir = destination_dir / file.stem
    # The dish folder may already exist (created by the side-angle frame
    # extraction, which writes to the same destination, or by an earlier run
    # of this cell); a bare mkdir() would raise FileExistsError in that case.
    output_dir.mkdir(parents=True, exist_ok=True)
    destination_path = output_dir / file.name
    shutil.copy(file, destination_path)