In [1]:
from keras.applications.resnet_v2 import ResNet50V2, ResNet101V2

import datetime
import keras
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import numpy as np
import tensorflow as tf

from keras_preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.losses import CategoricalCrossentropy
from keras.optimizers import adam_v2, rmsprop_v2
from keras.layers import Input, Conv2D, MaxPool2D, Flatten, Dense, Dropout, BatchNormalization, Layer
from typing import List, Set

In [2]:
# Install gdown, the command-line tool the next cell uses to download the dataset from Google Drive
!pip install gdown

In [3]:
# Download the dataset archive from Google Drive with gdown
# (presumably saved as food-recognition-challenge-2021.zip, the file unzipped afterwards — confirm)
!gdown https://drive.google.com/uc?id=1MrjIi8zPwjoUIpwzn7UlomedT18i4gaD

In [4]:
# Unpack the downloaded archive into ./dataset (-q keeps unzip from listing every file)
!unzip -q "food-recognition-challenge-2021.zip" -d dataset

In [5]:
# The training images sit in a nested dataset/train_set/train_set folder:
# copy them one level up, then delete the now-redundant nested folder
!cp dataset/train_set/train_set/* dataset/train_set/
!rm -rf dataset/train_set/train_set

In [6]:
INPUT_DATASET = './dataset/train_set'             # Path to a folder that contains all training images
PATH_LABELS = './dataset/train_labels.csv'        # Path to CSV file containing training labels
PATH_DESTINATION = './dataset/train_set_labelled' # Where new folder will be created with dataset organised in folders by label

# Get a dict mapping file name to its label
# Example: {'train_1.jpg': 21}
file_label_dict = pd.read_csv(PATH_LABELS).set_index('img_name').to_dict()['label']

# Snapshot the listing before moving anything: the loop moves files out of INPUT_DATASET,
# and pathlib leaves it unspecified whether iterdir() reflects changes made while iterating.
for file in list(Path(INPUT_DATASET).iterdir()):
    if not file.is_file() or file.name not in file_label_dict:
        # Sub-folders and files that have no row in the labels CSV cannot be sorted into a label folder
        print(f'Skipping {file.name}: not a labelled image')
        continue
    label = file_label_dict[file.name]
    dest_path = Path(PATH_DESTINATION) / str(label)
    # Create missing directories if necessary
    dest_path.mkdir(parents=True, exist_ok=True)
    # Move (not copy) the file into the folder named after its label
    file.rename(dest_path / file.name)


In [7]:
# Rename every numeric label folder to a fixed-width, letters-only name:
# each digit 0-9 becomes 'A'-'J' and the result is left-padded with 'A' to three characters
# (for example '21' -> 'ACB').
# NOTE(review): presumably done so that alphabetical folder order equals numeric label order — confirm.
#
# The listing is snapshotted first because folders are renamed inside the loop, and folders whose
# name is not purely numeric are skipped so the cell can be re-run after a previous (partial) run.
for folder in list(Path(PATH_DESTINATION).iterdir()):
    if not folder.name.isdecimal():
        continue
    new_filename = ''.join(chr(int(char) + 65) for char in folder.name).rjust(3, 'A')
    folder.rename(folder.with_name(new_filename))

In [9]:
# TODO: TensorFlow still prints its red debugging output after this call; find a setting that silences it
# NOTE(review): the native TensorFlow log level is presumably controlled by the TF_CPP_MIN_LOG_LEVEL
# environment variable, set before tensorflow is imported — confirm.
tf_logger = tf.get_logger()
tf_logger.setLevel('WARN')

# Report the GPU device name TensorFlow sees
gpu_name = tf.test.gpu_device_name()
print(f'Using GPU {gpu_name}')

In [10]:
# Global params and constants
WIDTH = 224    # Width in pixels the images are resized to
HEIGHT = 224   # Height in pixels the images are resized to
BATCH_SIZE = 32
EPOCHS = 50
TRAIN_IMAGES_PATH = r'./dataset/train_set_labelled'
TEST_IMAGES_PATH = r'./dataset/test_set'
TRAIN_LABELS_PATH = r'./dataset/train_labels.csv'
PREDICTIONS_PATH = r'predictions.csv'
# Number of .jpg files anywhere below the training folder
NUM_EXAMPLES = len(list(Path(TRAIN_IMAGES_PATH).rglob('*.jpg')))
# One class per label sub-folder; stray files in the root are not counted, because
# flow_from_directory only treats sub-directories as classes
NUM_CLASSES = sum(1 for entry in Path(TRAIN_IMAGES_PATH).iterdir() if entry.is_dir())
print(f'Num classes: {NUM_CLASSES}  num samples: {NUM_EXAMPLES}')

In [11]:
# Generators stream the images from disk in batches, so the dataset never has to fit in memory.
#
# Pixel values are deliberately left in the raw 0-255 range (the preview cell casts them to uint8):
#  * no `rescale` argument: the previous `1. // 255` was a floor division, i.e. 0.0, and Keras applies
#    no rescaling for a factor of 0 — use `1. / 255` explicitly if 0-1 inputs are ever wanted;
#  * no `featurewise_center` / `featurewise_std_normalization`: they need statistics computed by
#    `generator.fit(...)`, which is never called, so they only produced warnings.

# Fraction of every class held out for validation; shared by both generators so that they
# agree on which files belong to which subset
VALIDATION_SPLIT = 0.1

# Training images are randomly augmented
generator = ImageDataGenerator(
    validation_split=VALIDATION_SPLIT,
    rotation_range=90,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    fill_mode='nearest',
    brightness_range=[0.8, 1.2]
)
# Validation and test images are fed unmodified: augmenting them would make the validation
# metrics noisy and would base the final predictions on randomly distorted images
eval_generator = ImageDataGenerator(validation_split=VALIDATION_SPLIT)

train_gen = generator.flow_from_directory(
    directory=TRAIN_IMAGES_PATH,
    class_mode='categorical',
    batch_size=BATCH_SIZE,
    target_size=(HEIGHT, WIDTH),  # Keras expects (height, width)
    shuffle=True,
    subset='training'
)
validation_gen = eval_generator.flow_from_directory(
    directory=TRAIN_IMAGES_PATH,
    class_mode='categorical',
    batch_size=BATCH_SIZE,
    target_size=(HEIGHT, WIDTH),
    shuffle=True,
    subset='validation'
)
# No `subset` here, so every image found below TEST_IMAGES_PATH is used.
# shuffle=False keeps the batches in the same order as `test_gen.filenames`.
test_gen = eval_generator.flow_from_directory(
    directory=TEST_IMAGES_PATH,
    class_mode=None,
    batch_size=BATCH_SIZE,
    target_size=(HEIGHT, WIDTH),
    shuffle=False
)

In [13]:
from matplotlib import pyplot as plt

# Preview the first image of five consecutive training batches
for _ in range(5):
    batch_images, _batch_labels = train_gen.next()
    first_image = batch_images[0]
    # Largest value in the top pixel row, printed as a quick check of the pixel value range
    print(np.max(first_image[0]))
    plt.imshow(first_image.astype(np.uint8))
    plt.show()

In [14]:
def make_predictions(model: keras.Model, test_gen: ImageDataGenerator):
    """
    Output a CSV with model's predictions on test set that is ready to be submitted to Kaggle.
    The file will be created in the current working directory, named 'predictions <current_time>.csv',
    with the columns `img_name` and `label`.

    :param model: trained model whose output is one score per class for every image
    :param test_gen: unshuffled directory iterator over the test images (the object returned by
                     `flow_from_directory`); it must expose `filenames`, `reset()` and `len()`
    :return: None, the predictions are only written to disk
    """
    # Rewind the iterator so the first predicted batch corresponds to the first entry of `filenames`
    test_gen.reset()
    # The iterator already yields batches, so `batch_size` must not be passed to predict();
    # `steps` limits prediction to exactly one pass over the test images
    predictions = model.predict(test_gen, steps=len(test_gen), verbose=True)
    # Get names of test files in the same order they were used for predictions;
    # only the file name is kept, whatever sub-folder or path separator precedes it
    file_names = [Path(name).name for name in test_gen.filenames]
    # Obtain final labels for predictions, add one since classes start from one
    # NOTE(review): assumes class index i corresponds to label i + 1 — confirm against the folder naming
    labels = predictions.argmax(axis=1) + 1
    result = pd.DataFrame({'img_name': file_names, 'label': labels}).set_index('img_name')
    # Save the CSV file, time-stamped so earlier predictions are never overwritten
    timestamp = datetime.datetime.now().strftime("%d-%m-%Y %Hh %Mm %Ss")
    result.to_csv(f'predictions {timestamp}.csv')

In [20]:
from tensorflow.keras.applications import ResNet50

# ImageNet-pretrained ResNet50 without its classification head, used as the feature extractor
feature_extractor = ResNet50(weights='imagenet',
                             input_shape=(HEIGHT, WIDTH, 3),  # Keras expects (height, width, channels)
                             include_top=False)

# Freeze the first half of the backbone's layers; only the second half is fine-tuned
num_layers = len(feature_extractor.layers)
for layer in feature_extractor.layers[:num_layers // 2]:
    layer.trainable = False

model = Sequential()
model.add(tf.keras.Input(shape=(HEIGHT, WIDTH, 3)))
# The ImageNet weights expect inputs run through ResNet's own preprocess_input (RGB -> BGR and
# per-channel zero-centering, no scaling). Doing it inside the model means the model must be fed
# raw 0-255 RGB pixels, and training, validation and prediction are all treated identically.
model.add(tf.keras.layers.Lambda(tf.keras.applications.resnet50.preprocess_input,
                                 name='resnet50_preprocess'))
model.add(feature_extractor)
# Collapse each feature map to a single value before the classification head
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(Dropout(0.5))
model.add(BatchNormalization())
# One softmax output per class
model.add(tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'))

model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

model.summary()

In [21]:
# Create the checkpoint folder; unlike `!mkdir training` this does not fail when the cell is re-run
Path('training').mkdir(parents=True, exist_ok=True)
# File name pattern filled in by Keras with the epoch number and validation loss
checkpoint_filepath = 'training/weights.{epoch:02d}-{val_loss:.2f}.hdf5'
# Save only the weights, and only when the validation accuracy improves on the best value so far
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [22]:
# Train the model and keep the per-epoch metrics for later inspection.
# The step counts are passed explicitly so that one epoch is exactly one pass over each iterator.
# NOTE(review): iterators created through keras_preprocessing may be handled as endless Python
# generators, in which case an epoch would never finish without these bounds — confirm.
history = model.fit(
    train_gen,
    epochs=EPOCHS,
    steps_per_epoch=len(train_gen),
    validation_data=validation_gen,
    validation_steps=len(validation_gen),
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Write the model's test-set predictions to a time-stamped file
make_predictions(model=model, test_gen=test_gen)