# Portugese Meals Classification: Data exploration

In [1]:
import pandas as pd
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from IPython.display import clear_output
import yaml


In [2]:
%cd ..

d:\GigaFolder\projects\Portuguese-Meals-Classification


In [3]:
config = yaml.safe_load(open('config.yaml', 'r'))

In [4]:
classes = os.listdir(config['dataset']['data_dir'])
classes.sort()

labels = []
paths = []

for class_name in classes:
    class_path = os.path.join(config['dataset']['data_dir'], class_name)

    file_names = os.listdir(class_path)
    for file_name in file_names:
        file_path = os.path.join(class_path, file_name)

        labels.append(class_name)
        paths.append(file_path)

        # try:
        #     img = cv2.imread(file_path)
        #     shape = img.shape
        #     if len(shape) != 3:
        #         print('Image file ', file_path,
        #               ' is not an rgb image and was not used in the dataframe')
        #     else:
        #         labels.append(class_name)
        #         paths.append(file_path)
        # except:
        #     print('Image file ', file_path, ' is an invalid image and was not used')
# No exceptions = all images are valid

labels = pd.Series(labels, name='label')
paths = pd.Series(paths, name='path')
df = pd.concat([labels, paths], axis=1)
df.to_csv(config['dataset']['csv_dir'] + 'full.csv', index=False)
df

Unnamed: 0,label,path
0,aletria,./data/raw/aletria\1.jpg
1,aletria,./data/raw/aletria\10.jpg
2,aletria,./data/raw/aletria\100.jpg
3,aletria,./data/raw/aletria\101.jpg
4,aletria,./data/raw/aletria\102.jpg
...,...,...
6721,waffles,./data/raw/waffles\971843.jpg
6722,waffles,./data/raw/waffles\97524.jpg
6723,waffles,./data/raw/waffles\98238.jpg
6724,waffles,./data/raw/waffles\995085.jpg


From this it's easy to see that `caldo_verde` and `croissant` are extremle under represented  
I quess I'll oversample theese classes (more augmented examples and duplication)

In [5]:
def print_class_count(df, name: str, ret=False):
    counts = {}
    print(f'{name} CLASS COUNTS:\n')
    for cls in classes:
        count = df["label"].where(df["label"] == cls).dropna().count()
        counts[cls] = count
        print(f'{cls:20s} : {count}')

    print(f'\nTotal samples: {len(counts)}')
    if ret:
        return counts

In [6]:
class_counts = print_class_count(df, 'Initial', True)

Initial CLASS COUNTS:

aletria              : 234
arroz_cabidela       : 97
bacalhau_bras        : 495
bacalhau_natas       : 97
batatas_fritas       : 541
bolo_chocolate       : 500
cachorro             : 490
caldo_verde          : 23
cozido_portuguesa    : 104
croissant            : 32
donuts               : 495
esparguete_bolonhesa : 491
feijoada             : 99
francesinha          : 500
gelado               : 525
hamburguer           : 494
jardineira           : 98
nata                 : 98
ovo                  : 96
pasteis_bacalhau     : 111
pizza                : 500
tripas_moda_porto    : 107
waffles              : 499

Total samples: 23


In [7]:
# Knowing this is good I quess? Though I didn't ever use this
height = []
width = []
for path in df['path']:
    im = cv2.imread(path)
    width.append(im.shape[0])
    height.append(im.shape[1])

print(f'Average aspect ration: {np.mean(width)/np.mean(height)}')
print(f'\nAverage width: {np.mean(width)}')
print(f'Min|Max width: {np.min(width)}, {np.max(width)}')

print(f'\nAverage height: {np.mean(height)}')
print(f'Min|Max height: {np.min(height)}, {np.max(height)}')

Average aspect ration: 0.8651361721825422

Average width: 571.5801367826346
Min|Max width: 140, 4296

Average height: 660.6822777282189
Min|Max height: 176, 4928
