Vérifier les tailles des images CT et pneumonie.

_Résultat_.
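Comme prévu, les images COVID sont toutes de même taille : 299 x 299.
Les images de pneumonie ont elles aussi toutes cette taille, mais environ 90 % d'entre elles (1205 sur 1345) sont en noir et blanc,
tandis qu'environ 10 % (140 sur 1345) sont "en couleur" (tenseurs 3D).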

_Conséquence_.
Il faudra vérifier si les images "en couleur" ont des canaux de couleur réellement différents ; si les 3 canaux sont identiques, n'en garder qu'un seul.

Next version:
in 'path2DF_iter_im' : return an iterator of pairs number, image.
in 'path2DF_sizes' :
* store the numbers,
* use them for the index,
* return sorted DF (by index).  


In [1]:
import os
import matplotlib.image as mpimg
import numpy as np
import time
import pandas as pd

from functools import partial

In [2]:
# Column labels for image shapes: the two spatial axes, and the same two
# followed by a third 'Color' axis.
DIM_NAMES = ['Height', 'Width']
DIM_NAMES3 = [*DIM_NAMES, 'Color']

In [None]:
# Fonctions purement techniques
# pour passer du chemin vers un dossier d'images
# et d'une fonction d'extraction de descripteurs
#
# à une DF avec les numéros des images et descripteurs.

def fname_num(fname: str) -> int:
    """
    Extract the image number from a file name shaped like '<Type>-<num>.<ext>'.

    The number is the text located between the last '-' of the name and the
    first '.' that follows it. Dots or dashes appearing earlier (inside the
    '<Type>' part or in a leading directory such as '../data/') therefore do
    not disturb the parsing.

    fname (str)
    ->
    num (int)

    Raises ValueError if that piece of text is not a valid integer.

    Examples:
    >>> fname_num("Viral Pneumonia-42.png")
    42
    >>> fname_num("v1.2-COVID-7.png")
    7
    """
    # Cut at the LAST dash first, so dots before it can never truncate the name.
    after_last_dash = fname.rsplit('-', 1)[-1]
    # Then drop the extension(s): keep only what precedes the first dot.
    str_num = after_last_dash.split('.', 1)[0]
    return int(str_num)


def path2DF_iter_im(path_folder):
    """
    Iterate over the images stored in a folder.

    path to folder with images
    ->
    iterator of pairs (image number, image array)

    The folder is listed immediately, but each image is read only when the
    iterator reaches it. Pairs come in the order given by os.listdir, i.e.
    not sorted by image number.
    """
    def _number_and_image(fname):
        # The number is parsed from the file name, the pixels read from disk.
        full_path = os.path.join(path_folder, fname)
        return fname_num(fname), mpimg.imread(full_path)

    return map(_number_and_image, os.listdir(path=path_folder))


def extract_feat_2DF(fun_feat, iter_img, feat_names):
    """
    Build a table of features, one row per image.

    fun_feat   : function mapping one image to its sequence of features
    iter_img   : iterable of pairs (image number, image)
    feat_names : column names, one per feature
    ->
    DF (N * features) with:
    image number as index (sorted in increasing order),
    features as data

    If the same image number occurs several times, the last occurrence
    is the one kept.
    """
    # Map each image number to its feature vector.
    feats_by_num = {pair[0]: fun_feat(pair[1]) for pair in iter_img}

    # One row per dictionary key, then rows ordered by image number.
    return pd.DataFrame.from_dict(
        feats_by_num, orient='index', columns=feat_names
    ).sort_index()


In [4]:
# Fonctions liées à l'extraction de données spécifiques à cet exercice:
# la taille d'une image
# (vecteur de longueur 3 pour pouvoir s'adapter aux images codées en noir-et-blanc ou "avec 3 canaux de couleurs").

def arr_len3(tup):
    """
    Fit a tuple of ints into a vector of exactly 3 entries.

    tuple of ints
    ->
    np.ndarray of length 3 (dtype int32)

    Only the first 3 values are kept; when fewer than 3 are given,
    the remaining entries stay at zero.
    """
    n_kept = min(len(tup), 3)
    padded = np.zeros(3, dtype=np.int32)
    padded[:n_kept] = tup[:n_kept]
    return padded


def path2DF_sizes(path_folder):
    """
    The main function: table of the image sizes found in a folder.

    path to folder with images
    ->
    DF (N * 3) with:
    image number as index,
    sizes as data (columns given by DIM_NAMES3)

    The column 'Color' is zero for an image whose array has no third axis.
    """
    def _shape_len3(mat):
        # Shape of the image array, zero-padded to 3 entries.
        return arr_len3(mat.shape)

    numbered_images = path2DF_iter_im(path_folder)
    return extract_feat_2DF(_shape_len3, numbered_images, DIM_NAMES3)


Tailles des images COVID.

In [5]:
# Relative path to the folder of COVID images: ../data/COVID/images
condition_2plot_COV = "COVID"
path_dir_images = os.path.join("../data", condition_2plot_COV, "images")

In [6]:
# Build the table of COVID image sizes and time how long it takes.
# perf_counter() is the clock intended for measuring intervals: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
start = time.perf_counter()
df_sizes_COV = path2DF_sizes(path_dir_images)
done = time.perf_counter()

elapsed = done - start
print(f'time = {elapsed:.2f} sec \n\n')  # 5 sec

print(df_sizes_COV.shape)  # (3616, 3)
df_sizes_COV.head()


time = 4.71 sec 


(3616, 3)


Unnamed: 0,Height,Width,Color
1,299,299,0
2,299,299,0
3,299,299,0
4,299,299,0
5,299,299,0


In [7]:
# Distribution of each dimension over the COVID images.
# Expected: Height 299 -> 3616, Width 299 -> 3616, Color 0 -> 3616.
for position, dim_name in enumerate(DIM_NAMES3):
    if position > 0:
        # Separator between two consecutive dimensions.
        print('------')
    print(f'{dim_name}:')
    print(df_sizes_COV[dim_name].value_counts())


Height:
299    3616
Name: Height, dtype: int64
------
Width:
299    3616
Name: Width, dtype: int64
------
Color:
0    3616
Name: Color, dtype: int64


Tailles des images Pneumonie.

In [8]:
# Relative path to the folder of viral pneumonia images: ../data/Viral Pneumonia/images
condition_2plot_pneumo = "Viral Pneumonia"
path_dir_images_pneumo = os.path.join("../data", condition_2plot_pneumo, "images")

In [9]:
# Build the table of pneumonia image sizes and time how long it takes.
# perf_counter() is the clock intended for measuring intervals: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
start = time.perf_counter()
df_sizes_pneumo = path2DF_sizes(path_dir_images_pneumo)

done = time.perf_counter()

elapsed = done - start
print(f'time = {elapsed:.2f} sec \n\n') # 2 sec

print(df_sizes_pneumo.shape)  # (1345, 3)
df_sizes_pneumo.head()


time = 1.98 sec 


(1345, 3)


Unnamed: 0,Height,Width,Color
1,299,299,0
2,299,299,0
3,299,299,0
4,299,299,0
5,299,299,0


In [10]:
# Distribution of each dimension over the pneumonia images.
# Expected: Height 299 -> 1345, Width 299 -> 1345,
#           Color 0 -> 1205 and 3 -> 140.
for position, dim_name in enumerate(DIM_NAMES3):
    if position > 0:
        # Separator between two consecutive dimensions.
        print('------')
    print(f'{dim_name}:')
    print(df_sizes_pneumo[dim_name].value_counts())


Height:
299    1345
Name: Height, dtype: int64
------
Width:
299    1345
Name: Width, dtype: int64
------
Color:
0    1205
3     140
Name: Color, dtype: int64
