# Portugese Meals Classification: Data preprocessing

In [1]:
import pandas as pd
import os
import cv2
import numpy as np
from sklearn.utils import shuffle
from IPython.display import clear_output
import yaml
import albumentations as A
from sklearn.preprocessing import LabelBinarizer


In [2]:
%cd ..

d:\GigaFolder\projects\Portuguese-Meals-Classification


In [3]:
config = yaml.safe_load(open('config.yaml', 'r'))

In [4]:
df = pd.read_csv(config['dataset']['csv_dir'] + 'full.csv')

In [5]:
classes = os.listdir(config['dataset']['data_dir'])
classes.sort()


def print_class_count(df, name: str, ret=False):
    counts = {}
    print(f'{name} CLASS COUNTS:\n')
    for cls in classes:
        count = df["label"].where(df["label"] == cls).dropna().count()
        counts[cls] = count
        print(f'{cls:20s} : {count}')

    print(f'\nTotal samples: {len(counts)}')
    if ret:
        return counts


class_counts = print_class_count(df, 'Initial', True)
clear_output()


In [6]:
a = df.copy()
assert len(df) == len(a)

In [7]:
# first of all - duplicate (twice) underrepresented classes (~30 ex.)
for idx in range(len(a)):
    cls, path = a.iloc[idx]
    if class_counts[cls] < 50:
        a = a.append({'label': cls, 'path': path}, ignore_index=True)
        a = a.append({'label': cls, 'path': path}, ignore_index=True)

clear_output()
len(a)
# now there're ~60-90 examples of examples of under represented classes


6836

In [8]:
# now I'll remove around 250 examples of overrepresented classes
b = a.copy()
for idx in range(len(a)):
    cls, path = a.iloc[idx]
    if b["label"].where(b["label"] == cls).dropna().count() > 250:
        b = b.drop(b[b.path == path].index)

clear_output()
len(b)
# now there 250 examples of over-represented classes
# (60-90)underrepresented, 200aletria, 250overrepresented, 100normaly distibuted


4056

In [9]:
print_class_count(b, 'Step 1 of processing')

Step 1 of processing CLASS COUNTS:

aletria              : 234
arroz_cabidela       : 97
bacalhau_bras        : 250
bacalhau_natas       : 97
batatas_fritas       : 250
bolo_chocolate       : 250
cachorro             : 250
caldo_verde          : 69
cozido_portuguesa    : 104
croissant            : 96
donuts               : 250
esparguete_bolonhesa : 250
feijoada             : 99
francesinha          : 250
gelado               : 250
hamburguer           : 250
jardineira           : 98
nata                 : 98
ovo                  : 96
pasteis_bacalhau     : 111
pizza                : 250
tripas_moda_porto    : 107
waffles              : 250

Total samples: 23


In [10]:
df = shuffle(b, random_state=config['random_seed'])
del a
del b
len(df)

4056

In [11]:
transformation = A.Compose(
    [
        A.Rotate(limit=25, p=1),
        A.HorizontalFlip(p=0.5),
        A.GaussianBlur(blur_limit=(3, 9), p=0.6),
        A.RandomBrightnessContrast(p=0.4),
    ]
)


def resize(image, image_size=config['img_shape'][0]):
    # I'll go for a square image
    return cv2.resize(image, (image_size, image_size), interpolation=cv2.INTER_AREA)


def augment(image):
    return transformation(image=image)['image']


def normalize(image):
    image = image / 255
    return image


def preprocess(image):
    image = augment(image)
    image = resize(image)
    image = normalize(image)
    return image


In [12]:
labels = []
images = []

clear_output()
for idx in range(len(df)):
    cls, path = df.iloc[idx]
    image = cv2.imread(path)
    
    im1 = preprocess(image)

    labels.append(cls)
    images.append(im1)

    if not (cls == 'aletria' or class_counts[cls] >= 250):
        im2 = preprocess(image)

        image = resize(image)
        image = normalize(image)

        labels.append(cls)
        labels.append(cls)

        images.append(im2)
        images.append(image)

len(labels)


6200

In [13]:
arr = np.array(labels)
u, c = np.unique(arr, return_counts=True)
print(np.asarray((u, c)).T)

[['aletria' '234']
 ['arroz_cabidela' '291']
 ['bacalhau_bras' '250']
 ['bacalhau_natas' '291']
 ['batatas_fritas' '250']
 ['bolo_chocolate' '250']
 ['cachorro' '250']
 ['caldo_verde' '207']
 ['cozido_portuguesa' '312']
 ['croissant' '288']
 ['donuts' '250']
 ['esparguete_bolonhesa' '250']
 ['feijoada' '297']
 ['francesinha' '250']
 ['gelado' '250']
 ['hamburguer' '250']
 ['jardineira' '294']
 ['nata' '294']
 ['ovo' '288']
 ['pasteis_bacalhau' '333']
 ['pizza' '250']
 ['tripas_moda_porto' '321']
 ['waffles' '250']]


In [14]:
encoder = LabelBinarizer()
labels = np.asarray(labels)
labels = encoder.fit_transform(labels)

images = np.asarray(images, dtype = np.float16)

In [15]:
def compress_splits(X, Y, dir):
    np.savez_compressed(dir + 'Xvalues.npz', X)
    np.savez_compressed(dir + 'Yvalues.npz', Y)

In [16]:
compress_splits(images, labels, config['dataset']['augmented_dir'])