**Author :  Ong Cheng Kei TP055620** <br>
**Description :**
<br>This file contains code that serializes the Recipes5k, Nutrition5k, and Food101 datasets into TFRecord files for more efficient model training.<br>
This file is also a prerequisite for running *build_model.ipynb*, as it creates the one-hot encoder needed to encode the categorical data.<br> 
The output is a set of TFRecord files for each dataset, stored in *../Food Datasets/final-dataset/tfrecord*.

# Setup

In [1]:
import json
from pathlib import Path
from types import SimpleNamespace

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras

In [2]:
# Enable memory growth on every visible GPU, then report how many physical and
# logical GPU devices TensorFlow sees. Nothing happens when no GPU is present.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Queried only after memory growth has been configured for all GPUs.
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# The physical device list is only needed for this setup step; remove it from
# the notebook namespace.
del gpus

1 Physical GPUs, 1 Logical GPUs


# Build a universal one hot encoder that encodes cross-dataset category and ingredients 

In [3]:
class OneHotEncoder:
    """Encode food categories and ingredients shared across all datasets.

    Both vocabularies are sorted, and every name is mapped to its position in
    the sorted vocabulary. Those integer ids are then used to build one-hot
    (category) and multi-hot (ingredients) vectors.

    Attributes:
        all_food_categories: Sorted list of category names.
        all_food_categories_integer_encoded: Dict of category name -> integer id.
        all_ingredients: Sorted list of ingredient names.
        all_ingredients_integer_encoded: Dict of ingredient name -> integer id.
    """

    def __init__(self, all_category_list, all_ingredient_list):
        """Build the sorted vocabularies and their name -> id lookups.

        Args:
            all_category_list: Iterable of category names.
            all_ingredient_list: Iterable of ingredient names.
        """
        # sorted() returns new lists, so the caller's sequences are never
        # reordered as a side effect of constructing the encoder.
        self.all_food_categories = sorted(all_category_list)
        self.all_food_categories_integer_encoded = (
            self.__encode_categories_to_integers()
        )
        self.all_ingredients = sorted(all_ingredient_list)
        self.all_ingredients_integer_encoded = self.__encode_ingredients_to_integers()

    def get_category_one_hot_encoding(self, category_name):
        """Return the uint8 one-hot vector for a single category name.

        Raises:
            KeyError: If ``category_name`` is not part of the vocabulary.
        """
        index = self.__lookup(self.all_food_categories_integer_encoded, category_name)
        num_classes = len(self.all_food_categories)
        return keras.utils.to_categorical(index, num_classes, dtype="uint8")

    def get_ingredients_one_hot_encoding(self, ingredient_list):
        """Return the uint8 multi-hot tensor for a list of ingredient names.

        Raises:
            KeyError: If any ingredient is not part of the vocabulary.
        """
        # Resolve every name first so an unknown ingredient is reported before
        # any TensorFlow layer is created.
        ingredient_indices = [
            self.__transform_ingredient_to_integer(name) for name in ingredient_list
        ]
        multi_one_hot_layer = tf.keras.layers.CategoryEncoding(
            num_tokens=len(self.all_ingredients), output_mode="multi_hot"
        )
        return tf.cast(multi_one_hot_layer(ingredient_indices), dtype=tf.uint8)

    def __transform_ingredient_to_integer(self, ingredient_name):
        """Return the integer id of one ingredient name."""
        return self.__lookup(self.all_ingredients_integer_encoded, ingredient_name)

    @staticmethod
    def __lookup(mapping, name):
        """Return ``mapping[name]``, raising a descriptive KeyError when absent."""
        try:
            return mapping[name]
        except KeyError:
            # Same exception type as the plain dict lookup, but with a message
            # that says what went wrong.
            raise KeyError(f"{name} does not have an integer mapping") from None

    def __encode_categories_to_integers(self):
        """Map each category name to its index in the sorted category list."""
        return {
            category_name: index
            for index, category_name in enumerate(self.all_food_categories)
        }

    def __encode_ingredients_to_integers(self):
        """Map each ingredient name to its index in the sorted ingredient list."""
        return {
            ingredient_name: index
            for index, ingredient_name in enumerate(self.all_ingredients)
        }

# Build dataset loaders for each dataset

In [4]:
class DatasetLoader:
    """Base loader that pairs a dataset's image folder with its metadata table.

    The metadata is read from ``<metadata_dir>/<dataset_name>_metadata.csv``
    (tab separated). ``all_files`` lists the records that can be fetched with
    ``get_tensors``; by default it is a copy of the metadata.

    Attributes:
        image_dir: Root folder containing the images.
        metadata_dir: Folder containing the metadata CSV files.
        name: Dataset name, also stored in the ``dataset_name`` column.
        metadata: DataFrame loaded from the metadata CSV.
        all_files: DataFrame with one row per retrievable image.
        all_categories: Unique values of the ``Category`` column.
        all_ingredients: Unique ingredient names found in ``Ingredients``.
    """

    # Lower-cased file suffixes that can be re-encoded by _load_encoded_image.
    _JPEG_SUFFIXES = (".jpeg", ".jpg")
    _PNG_SUFFIXES = (".png",)

    def __init__(self, image_dir, metadata_dir, dataset_name):
        self.image_dir = Path(image_dir)
        self.metadata_dir = Path(metadata_dir)
        self.name = dataset_name
        self.metadata = self.load_metadata(
            self.metadata_dir / f"{dataset_name}_metadata.csv"
        )
        # Default : all_files = metadata, since metadata consists of records of all files
        self.all_files = self.metadata.copy()
        self.all_categories = self.extract_all_categories()
        self.all_ingredients = self.extract_all_ingredients()

    def load_image_to_arr(self, path):
        """Load the image at ``path`` and return it as a 224x224 uint8 tensor."""
        image = tf.keras.preprocessing.image.load_img(path)
        img_tensor = tf.keras.preprocessing.image.img_to_array(image, dtype="uint8")
        img_tensor = tf.image.resize(img_tensor, (224, 224))
        # Cast the resized image back to uint8 so it can be re-encoded.
        return tf.cast(img_tensor, tf.uint8)

    def load_metadata(self, path):
        """Read the tab-separated metadata CSV and tag every row with the dataset name."""
        metadata = pd.read_csv(path, sep="\t")
        metadata["dataset_name"] = self.name
        return metadata

    def extract_all_categories(self):
        """Return the unique category labels in order of first appearance."""
        return self.metadata["Category"].unique().tolist()

    def extract_all_ingredients(self):
        """Return the sorted unique ingredient names across all metadata rows.

        Each ``Ingredients`` cell is split on commas.
        """
        unique_ingredients = set()
        for ingredient_list in self.metadata["Ingredients"]:
            unique_ingredients.update(ingredient_list.split(","))
        # Sorted so the result does not depend on set iteration order.
        return sorted(unique_ingredients)

    def extract_file_pointers(self):
        """Return a frame of (metadata_index, dataset_name) for every row of ``all_files``."""
        return pd.DataFrame(
            {
                "metadata_index": self.all_files.index,
                "dataset_name": self.all_files["dataset_name"],
            }
        )

    def _load_encoded_image(self, img_path):
        """Load, resize and re-encode an image in the format given by its suffix.

        The suffix is compared case-insensitively and validated before the
        image is decoded, so unsupported files fail fast.

        Raises:
            AssertionError: If the suffix is not .jpeg, .jpg or .png.
        """
        suffix = img_path.suffix.lower()
        if suffix not in self._JPEG_SUFFIXES + self._PNG_SUFFIXES:
            # Raised explicitly instead of `assert False` so the check is not
            # stripped under `python -O`; AssertionError is kept so the
            # exception type is unchanged.
            raise AssertionError(f"Invalid image format present: {img_path}")
        img_tensor = self.load_image_to_arr(img_path)
        if suffix in self._PNG_SUFFIXES:
            return tf.io.encode_png(img_tensor)
        return tf.io.encode_jpeg(img_tensor, format="rgb")

    def get_tensors(self, index):
        """Return ``(encoded_image, labels)`` for the ``all_files`` row labelled ``index``.

        The image is looked up at ``image_dir / Category / "ID/File Name"``.
        ``labels`` is a dict of constant tensors keyed by ``category_output``,
        ``calorie_output``, ``carbs_output``, ``protein_output``,
        ``fat_output`` and ``ingredients_output``.

        Raises:
            AssertionError: If the image file has an unsupported suffix.
        """
        row = self.all_files.loc[index]
        img_path = self.image_dir / row["Category"] / row["ID/File Name"]
        img_tensor = self._load_encoded_image(img_path)
        return img_tensor, {
            "category_output": tf.constant(row["Category"]),
            "calorie_output": tf.constant(row["Calorie(kcal)"]),
            "carbs_output": tf.constant(row["Carbohydrate(g)"]),
            "protein_output": tf.constant(row["Protein(g)"]),
            "fat_output": tf.constant(row["Fat(g)"]),
            "ingredients_output": tf.constant(row["Ingredients"]),
        }

    def flatten_tensors(self, tensor):
        """Flatten a ``get_tensors`` result into a list of numpy values.

        The image comes first, followed by the label values in the insertion
        order of the label dict.
        """
        img_tensor, labels = tensor[0], tensor[1]
        return [img_tensor.numpy(), *(value.numpy() for value in labels.values())]

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata)

In [5]:
class Recipes5k(DatasetLoader):
    """DatasetLoader whose dataset name is fixed to "recipes5k"."""

    def __init__(self, image_dir, metadata_dir):
        super().__init__(image_dir, metadata_dir, "recipes5k")

In [11]:
class Nutrition5k(DatasetLoader):
    """DatasetLoader for nutrition5k, whose metadata is per dish rather than per image.

    ``metadata`` holds one row per dish, while ``all_files`` is replaced by the
    per-image table read from ``nutrition5k_all_images.csv``. Labels for an
    image are looked up in ``metadata`` through the image's ``dish_id``.
    """

    def __init__(self, image_dir, metadata_dir):
        super().__init__(image_dir, metadata_dir, "nutrition5k")
        # Replace all_files: the nutrition5k metadata only contains dish-level
        # records, not image-level ones.
        self.all_files = pd.read_csv(self.metadata_dir / "nutrition5k_all_images.csv")

    def _dish_metadata(self, dish_id):
        """Return the single metadata row describing ``dish_id``.

        Raises:
            KeyError: If no metadata row matches ``dish_id``.
            ValueError: If more than one metadata row matches ``dish_id``.
        """
        matches = self.metadata.loc[self.metadata["dish_id"] == dish_id]
        # Fail here instead of handing an empty or multi-row frame to
        # tf.constant, where the problem would surface far from its cause.
        if matches.empty:
            raise KeyError(f"No metadata row found for dish_id {dish_id!r}")
        if len(matches) > 1:
            raise ValueError(
                f"Expected one metadata row for dish_id {dish_id!r}, found {len(matches)}"
            )
        return matches.iloc[0]

    # Override method from DatasetLoader
    def get_tensors(self, index):
        """Return ``(encoded_image, labels)`` for the ``all_files`` row labelled ``index``.

        The image is looked up at ``image_dir / "generic" / dish_id / "ID/File Name"``
        and the labels come from the dish-level metadata row with the same
        ``dish_id``.

        Raises:
            AssertionError: If the image file has an unsupported suffix.
            KeyError: If the dish has no metadata row.
            ValueError: If the dish has several metadata rows.
        """
        row = self.all_files.loc[index]
        img_path = self.image_dir / "generic" / row["dish_id"] / row["ID/File Name"]

        # Validate the (case-insensitive) suffix before decoding the image.
        suffix = img_path.suffix.lower()
        if suffix not in (".jpeg", ".jpg", ".png"):
            # Raised explicitly instead of `assert False` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            raise AssertionError(f"Invalid image format present: {img_path}")

        img_tensor = self.load_image_to_arr(img_path)
        if suffix == ".png":
            img_tensor = tf.io.encode_png(img_tensor)
        else:
            img_tensor = tf.io.encode_jpeg(img_tensor, format="rgb")

        dish_metadata_row = self._dish_metadata(row["dish_id"])
        return img_tensor, {
            "category_output": tf.constant(dish_metadata_row["Category"]),
            "calorie_output": tf.constant(dish_metadata_row["Calorie(kcal)"]),
            "carbs_output": tf.constant(dish_metadata_row["Carbohydrate(g)"]),
            "protein_output": tf.constant(dish_metadata_row["Protein(g)"]),
            "fat_output": tf.constant(dish_metadata_row["Fat(g)"]),
            "ingredients_output": tf.constant(dish_metadata_row["Ingredients"]),
        }

    # Overriding the method from DatasetLoader
    def __len__(self):
        """Return the number of images (rows of ``all_files``), not dishes."""
        return len(self.all_files)

In [7]:
class Food101(DatasetLoader):
    """DatasetLoader whose dataset name is fixed to "food101"."""

    def __init__(self, image_dir, metadata_dir):
        super().__init__(image_dir, metadata_dir, "food101")

# Initializing one hot encoder

In [8]:
# Instantiate one loader per dataset. Each loader exposes the unique categories
# and ingredients of its dataset, which are later merged into one encoder.
# All three loaders read from the same image and metadata folders.
RECIPES5K, NUTRITION5K, FOOD101 = (
    loader_class(
        image_dir="../Food Datasets/final-dataset/images",
        metadata_dir="../Food Datasets/final-dataset/metadata",
    )
    for loader_class in (Recipes5k, Nutrition5k, Food101)
)

# The position of a loader in DATASETS is the integer dataset index used by the pipeline.
DATASETS = [RECIPES5K, NUTRITION5K, FOOD101]
DATASETS_NAME = [loader.name for loader in DATASETS]


def create_one_hot_encoder(datasets):
    """Build a OneHotEncoder over the union of all loaders' vocabularies.

    Args:
        datasets: Iterable of loaders exposing ``all_categories`` and
            ``all_ingredients``.

    Returns:
        A OneHotEncoder covering every distinct category and ingredient.
    """
    unique_categories = set()
    unique_ingredients = set()
    for loader in datasets:
        unique_categories.update(loader.all_categories)
        unique_ingredients.update(loader.all_ingredients)
    return OneHotEncoder(list(unique_categories), list(unique_ingredients))

In [9]:
# Shared encoder built from the categories and ingredients of every loader in DATASETS.
ONE_HOT_ENCODER = create_one_hot_encoder(DATASETS)

# Building data pipeline that streams the file index and dataset index

In [12]:
def get_file_data(index, dataset_index):
    """Fetch one record as a flat list of numpy values.

    Args:
        index: Row label inside the chosen loader's ``all_files``.
        dataset_index: Position of the loader in the global ``DATASETS`` list.
    """
    loader = DATASETS[dataset_index]
    record_tensors = loader.get_tensors(index)
    return loader.flatten_tensors(record_tensors)


def build_data_pipeline(datasets, sample_size=None):
    """Create a shuffled tf.data pipeline of (metadata_index, dataset_index) pairs.

    Args:
        datasets: List of loaders to draw file pointers from.
        sample_size: Optional list with one sampling fraction per loader;
            every loader is fully sampled when omitted.

    Returns:
        A ``tf.data.Dataset`` whose elements are (metadata_index, dataset_index),
        where dataset_index is the loader's position in the global ``DATASETS_NAME``.
    """
    fractions = [1.0] * len(datasets) if sample_size is None else sample_size
    assert len(fractions) == len(
        datasets
    ), "Illegal array of sample sizes provided. Number of sample size does not match number of datasets"

    # Sample each loader's pointers separately, then shuffle the combined table.
    sampled_pointers = []
    for loader, fraction in zip(datasets, fractions):
        pointers = loader.extract_file_pointers()
        sampled_pointers.append(pointers.sample(frac=fraction, random_state=999))
    shuffled_pointers = pd.concat(sampled_pointers).sample(frac=1, random_state=999)
    print(f"Total samples : {len(shuffled_pointers)}")

    # Translate dataset names into their integer position in DATASETS_NAME.
    dataset_indices = [
        DATASETS_NAME.index(name) for name in shuffled_pointers["dataset_name"]
    ]
    metadata_indices = shuffled_pointers["metadata_index"].tolist()
    return tf.data.Dataset.from_tensor_slices((metadata_indices, dataset_indices))

In [11]:
# One shuffled pointer pipeline per dataset; sample_size is left at its default,
# so every file of each dataset is kept.
recipes5k_dataset = build_data_pipeline([RECIPES5K])
food101_dataset = build_data_pipeline([FOOD101])
nutrition5k_dataset = build_data_pipeline([NUTRITION5K])

Total samples : 4826
Total samples : 101000
Total samples : 271407


In [12]:
# Peek at the first two (metadata_index, dataset_index) pairs of the pipeline.
list(recipes5k_dataset.take(2))

[(<tf.Tensor: shape=(), dtype=int32, numpy=2344>,
  <tf.Tensor: shape=(), dtype=int32, numpy=0>),
 (<tf.Tensor: shape=(), dtype=int32, numpy=4556>,
  <tf.Tensor: shape=(), dtype=int32, numpy=0>)]

## Serializing Data Pipeline to TFRecord with TFDS Features

In [13]:
# TFDS feature schema used to serialize one example per image. The keys match
# the example dict assembled in shard_and_write. Category and ingredients are
# stored as raw strings, not as one-hot vectors.
# NOTE(review): the doc strings below say "per gram", but the loaders pass the
# metadata columns through unchanged — confirm the units.
FEATURE_DICTIONARY = tfds.features.FeaturesDict(
    {
        "image_raw": tfds.features.Image(
            shape=(224, 224, 3), doc="Raw bytes of food images encoded with tf.io"
        ),
        "category": tfds.features.Scalar(dtype=tf.string, doc="Category label"),
        "calorie": tfds.features.Scalar(
            dtype=tf.float32, doc="Calorie of the food per gram"
        ),
        "carbs": tfds.features.Scalar(
            dtype=tf.float32, doc="Carbs of the food per gram"
        ),
        "protein": tfds.features.Scalar(
            dtype=tf.float32, doc="Protein of the food per gram"
        ),
        "fat": tfds.features.Scalar(dtype=tf.float32, doc="Fat of the food per gram"),
        "ingredients": tfds.features.Scalar(
            dtype=tf.string, doc="Ingredients of food separated with comma"
        ),
    }
)

### Shard and write to TFRecord file

In [14]:
def shard_and_write(dataset, num_shards, path, dataset_name):
    """Serialize a pointer pipeline into sharded TFRecord files plus TFDS metadata.

    Each element of ``dataset`` is a (metadata_index, dataset_index) pair. It is
    resolved to the actual record with ``get_file_data``, serialized with
    ``FEATURE_DICTIONARY`` and written to the shard it belongs to. All records
    are written to a single "train" split.

    Args:
        dataset: ``tf.data.Dataset`` of (metadata_index, dataset_index) pairs.
        num_shards: Number of TFRecord files to produce.
        path: Output directory; created (including parents) when missing.
        dataset_name: Name used in the shard file names.

    Returns:
        List with the number of records written to each shard.
    """
    path = Path(path)
    # The output path is nested (<root>/<dataset>/<version>), so missing parent
    # directories must be created as well.
    path.mkdir(parents=True, exist_ok=True)

    # Single source for the template used by both the shard names and the metadata.
    filename_template = "{DATASET}-{SPLIT}-{SHARD_X_OF_Y}.{FILEFORMAT}"
    sharded_template_generator = tfds.core.ShardedFileTemplate(
        data_dir=path.as_posix(),
        template=filename_template,
        dataset_name=dataset_name,
        filetype_suffix="tfrecord",
        split="train",
    )
    shard_length = []
    total_bytes = 0
    sharded_filepaths = sharded_template_generator.sharded_filepaths(num_shards)
    for i in range(num_shards):
        current_shard = dataset.shard(num_shards, i)
        with tf.io.TFRecordWriter(sharded_filepaths[i].as_posix()) as writer:
            length = 0
            for record in current_shard.as_numpy_iterator():
                data = get_file_data(record[0], record[1])
                # Order follows DatasetLoader.flatten_tensors: image first,
                # then the labels in the order get_tensors defines them.
                example = {
                    "image_raw": data[0],
                    "category": data[1],
                    "calorie": data[2],
                    "carbs": data[3],
                    "protein": data[4],
                    "fat": data[5],
                    "ingredients": data[6],
                }
                example_bytes = FEATURE_DICTIONARY.serialize_example(example)
                writer.write(example_bytes)
                length += 1
                total_bytes += len(example_bytes)
            shard_length.append(length)
    split_info = [
        tfds.core.SplitInfo(
            name="train",
            shard_lengths=shard_length,
            # Total size of the serialized examples instead of a hard-coded 0.
            num_bytes=total_bytes,
            filename_template=sharded_template_generator,
        )
    ]
    tfds.folder_dataset.write_metadata(
        data_dir=path.as_posix(),
        features=FEATURE_DICTIONARY,
        filename_template=filename_template,
        split_infos=split_info,
    )
    return shard_length

In [19]:
# Write recipes5k as 10 TFRecord shards; shard_length holds the record count per shard.
shard_length = shard_and_write(
    recipes5k_dataset,
    10,
    f"../Food Datasets/final-dataset/tfrecord/{RECIPES5K.name}/1.0.0",
    RECIPES5K.name,
)

In [15]:
# Write food101 as 30 TFRecord shards.
# NOTE(review): this version directory is 1.1.0 while the other two datasets
# are written to 1.0.0 — confirm this is intentional.
shard_length2 = shard_and_write(
    food101_dataset,
    30,
    f"../Food Datasets/final-dataset/tfrecord/{FOOD101.name}/1.1.0",
    FOOD101.name,
)

Metadata written. Testing by reading first example. Set check_data=False to skip.


In [14]:
# Write nutrition5k as 20 TFRecord shards.
shard_length3 = shard_and_write(
    nutrition5k_dataset,
    20,
    f"../Food Datasets/final-dataset/tfrecord/{NUTRITION5K.name}/1.0.0",
    NUTRITION5K.name,
)

Metadata written. Testing by reading first example. Set check_data=False to skip.


# Exported 

In [15]:
# Bundle the loaders and the shared encoder under one name.
# NOTE(review): presumably consumed by build_model.ipynb — confirm.
EXPORTED = {"datasets": DATASETS, "one_hot_encoder": ONE_HOT_ENCODER}

In [16]:
# Re-bind EXPORTED from a dict to a namespace so its entries can be read as
# attributes (EXPORTED.datasets, EXPORTED.one_hot_encoder).
EXPORTED = SimpleNamespace(**EXPORTED)

## Export Encoded Categories and Ingredients for Decoding

In [12]:
# Persist the category name -> integer id mapping for later decoding.
# The context manager closes the file, so the JSON is fully flushed to disk.
with open("./encoded_food_categories.json", "w") as categories_file:
    json.dump(ONE_HOT_ENCODER.all_food_categories_integer_encoded, categories_file)

In [13]:
# Persist the ingredient name -> integer id mapping for later decoding.
# The context manager closes the file, so the JSON is fully flushed to disk.
with open("./encoded_ingredients.json", "w") as ingredients_file:
    json.dump(ONE_HOT_ENCODER.all_ingredients_integer_encoded, ingredients_file)

# Data Statistics

## Categories Statistics

In [87]:
# The value printed is the size of the category vocabulary, so label it as such.
print(f"The total number of categories : {len(ONE_HOT_ENCODER.all_food_categories)}")

The total number of ingredients : 102


In [30]:
# Statistics below use the recipes5k and food101 metadata only; nutrition5k
# metadata is not part of this frame.
df = pd.concat([RECIPES5K.metadata, FOOD101.metadata])

In [31]:
# Group the combined metadata rows by category label.
df_grouped_categories = df.groupby("Category")

In [103]:
# Smallest, largest and (floor) average number of rows per category.
total_imgs = df_grouped_categories.size().sum()
total_category = len(df_grouped_categories)

print(
    f"The lowest number of images per category : {df_grouped_categories.size().min()}",
    f"The highest number of images per category : {df_grouped_categories.size().max()}",
    f"The average number of images per category : {total_imgs//total_category}",
    sep="\n",
)

The lowest number of images per category : 1008
The highest number of images per category : 1050
The average number of images per category : 1047


## Ingredients Statistics

In [80]:
# Size of the ingredient vocabulary shared by all datasets.
print(f"The total number of ingredients : {len(ONE_HOT_ENCODER.all_ingredients)}")

The total number of ingredients : 1037


In [105]:
# Floor average of the comma-separated ingredient count per row. Computed in a
# single expression so `average` never holds a running sum and no loop
# variables are left behind in the notebook namespace.
average = sum(len(ing.split(",")) for ing in df["Ingredients"]) // len(df)
print(
    f"The average number of ingredients for each dish (recipes5k + food101) : {average}"
)

The average number of ingredients for each dish (recipes5k + food101) : 9


In [81]:
# Dish-level nutrition5k metadata (one row per dish, not per image).
df_nutrition5k = NUTRITION5K.metadata

In [104]:
# Floor average of the comma-separated ingredient count per dish. Computed in a
# single expression so `average` never holds a running sum and no loop
# variables are left behind in the notebook namespace.
average = sum(
    len(ing.split(",")) for ing in df_nutrition5k["Ingredients"]
) // len(df_nutrition5k)
print(f"The average number of ingredients for each dish (nutrition5k) : {average}")

The average number of ingredients for each dish (nutrition5k) : 5


# Testing

In [None]:
# Smoke test: encoded image and label tensors for row 0 of recipes5k.
RECIPES5K.get_tensors(0)

In [None]:
# Smoke test: flattened record for row 1 of DATASETS[2] (the food101 loader).
get_file_data(1, 2)