# Exploration des données

## 1. Préparation

In [1]:
import os
from PIL import Image, UnidentifiedImageError
from PIL.TiffTags import TAGS
import pandas as pd
import numpy as np

import time

In [2]:
# Root of the local project checkout; every contributor must adapt it to their own directory tree.
project_path = '/Users/ben/Work/mle/ds-project/mai25_bds_extraction/'

# Raw inputs and processed outputs sit side by side under <project_path>/data.
raw_data_path, processed_data_path = (
    os.path.join(project_path, 'data', stage) for stage in ('raw', 'processed')
)

# Root folder of the RVL-CDIP image tree inside the raw data.
rvl_cdip_images_path = os.path.join(os.path.join(raw_data_path, 'RVL-CDIP'), 'images')

## 2. Fichiers images RVL-CDIP

In [3]:
def get_rvl_cdip_image_files(path):
    """Index the image files found under ``path``.

    The tree is walked recursively. Every folder that directly contains
    files is expected to hold exactly one ``.tif`` image; hidden files
    (names starting with ``.``, e.g. ``.DS_Store`` on macOS) are ignored.

    Args:
        path: Root directory of the image tree.

    Returns:
        A DataFrame with one row per image and two columns:
        ``relative_path`` (the folder holding the image, relative to
        ``path``) and ``filename``.

    Raises:
        AssertionError: If a folder holds more than one visible file, or a
            visible file whose name does not end with ``.tif``.
    """
    rows = []
    for foldername, _, filenames in os.walk(path):
        # Drop hidden files first, so that a stray .DS_Store sitting next to
        # an image neither trips the structure check nor hides the image.
        visible = [name for name in filenames if not name.startswith('.')]
        if not visible:
            continue
        # Sanity checks on the expected layout: one .tif per leaf folder.
        assert len(visible) == 1, f"{foldername},{visible}"
        filename = visible[0]
        assert filename.endswith(".tif"), f"{foldername},{filename}"
        # Paths are made relative to the `path` argument (not to a
        # module-level global), so the function works for any root.
        rows.append([os.path.relpath(foldername, path), filename])
    return pd.DataFrame(rows, columns=["relative_path", "filename"])


In [4]:
# Time the indexing of the image tree.
# perf_counter() is a monotonic clock meant for measuring durations, so the
# result cannot be skewed by a system clock adjustment during the run.
t = time.perf_counter()
df = get_rvl_cdip_image_files(rvl_cdip_images_path)
print(f"Duree d'exécution: {time.perf_counter() - t:.3f} secondes.")
print(f"({len(df)} images traitées)\n")

# Preview the first rows of the index.
df.head()

Duree d'exécution: 17.251 secondes.
(400001 images traitées)



Unnamed: 0,relative_path,filename
0,imagesr/r/r/r/rrr93a00,504230207+-0208.tif
1,imagesr/r/r/r/rrr56d00,522734895+-4900.tif
2,imagesr/r/r/r/rrr76d00,ti16741083.tif
3,imagesr/r/r/r/rrr60e00,93502397.tif
4,imagesr/r/r/r/rrr95d00,50469246-9246.tif


In [5]:
# Data available on the images: inspect one sample (the first row of df).
# The two columns are selected by name so that this cell keeps working after
# df has gained extra columns (unpacking the whole row positionally would
# then fail with "too many values to unpack").
relative_filepath, filename = df.iloc[0][["relative_path", "filename"]]
file_path = os.path.join(rvl_cdip_images_path, relative_filepath, filename)
with Image.open(file_path) as img:
    # Basic properties
    print("Format:", img.format)
    print("Size:", img.size)
    print("Mode:", img.mode)

    # Attributes and methods
    print("\nAttributes and methods list:")
    print(dir(img))

    # Attributes
    print("\nAttributes:")
    for key, value in img.__dict__.items():
        print(f"{key}: {value}")

    # Metadata tags: numeric tag ids are mapped to names through TAGS,
    # falling back to the raw id when no name is known.
    print("\nTIFF Metadata:")
    for tag, value in img.tag.items():
        tag_name = TAGS.get(tag, tag)
        print(f"{tag_name}: {value}")


# SEE HERE for everything that can be extracted: https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image

Format: TIFF
Size: (754, 1000)
Mode: L

Attributes and methods list:
['_Image__transformer', '_TiffImageFile__first', '_TiffImageFile__frame', '_TiffImageFile__next', '__annotations__', '__array_interface__', '__arrow_c_array__', '__arrow_c_schema__', '__class__', '__copy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_close_exclusive_fp_after_loading', '_close_fp', '_compression', '_copy', '_crop', '_dump', '_ensure_mutable', '_exclusive_fp', '_exif', '_expand', '_fp', '_frame_pos', '_get_safe_box', '_im', '_load_libtiff', '_min_frame', '_mode', '_n_frames', '_new', '_open', '_planar_configuration', '_readonly', '_reload_exif', '_repr_image', '_

In [6]:
# A réécrire: 1 seule fonction et 1 apply
def get_image_data(file_path): # [TODO: refine the list of parameters]
    """Read basic properties of the image stored at ``file_path``.

    Args:
        file_path: Path of the image file to open.

    Returns:
        A ``(format, width, length, mode)`` tuple read from the opened image,
        where ``width, length`` are the two components of ``img.size``. All
        four values are ``np.nan`` when Pillow cannot identify the file.
    """
    try:
        with Image.open(file_path) as image:
            # TODO: double-check the names given to the two size components
            # (Pillow documents ``size`` as (width, height)).
            (width, length), format_, mode = image.size, image.format, image.mode
    except UnidentifiedImageError:
        # TODO: revisit the use of NaN as the placeholder for unreadable files.
        return np.nan, np.nan, np.nan, np.nan
    return format_, width, length, mode

def get_image_data_to_df(df):
    """Collect the image properties of every file listed in ``df``.

    Args:
        df: DataFrame with ``relative_path`` and ``filename`` columns; each
            row's file is read from
            ``rvl_cdip_images_path/relative_path/filename``.

    Returns:
        A DataFrame sharing the index of ``df``, with the columns ``format``,
        ``width``, ``length`` and ``mode`` as returned by ``get_image_data``.
        An empty ``df`` yields an empty DataFrame with those four columns.
    """
    # Iterate over the two needed columns directly rather than through a
    # row-wise ``DataFrame.apply``: no Series is built per row, and an empty
    # frame is handled (``apply(axis=1)`` on an empty frame does not produce
    # the per-row tuples, so the former ``.tolist()`` call failed).
    records = [
        get_image_data(os.path.join(rvl_cdip_images_path, relative_filepath, filename))
        for relative_filepath, filename in zip(df["relative_path"], df["filename"])
    ]
    return pd.DataFrame(
        records,
        columns=["format", "width", "length", "mode"],
        index=df.index,
    )

In [7]:
# Time the extraction of the image properties for every indexed file.
# perf_counter() is a monotonic clock meant for measuring durations, so the
# result cannot be skewed by a system clock adjustment during the run.
t = time.perf_counter()
# NOTE: running this cell again appends the four property columns a second
# time, because df is concatenated with freshly computed columns on each run.
df = pd.concat([df, get_image_data_to_df(df)], axis = 1)
print(f"Duree d'exécution: {time.perf_counter() - t:.3f} secondes.")
print(f"({len(df)} images traitées)\n")
df



Duree d'exécution: 123.042 secondes.
(400001 images traitées)



Unnamed: 0,relative_path,filename,format,width,length,mode
0,imagesr/r/r/r/rrr93a00,504230207+-0208.tif,TIFF,754.0,1000.0,L
1,imagesr/r/r/r/rrr56d00,522734895+-4900.tif,TIFF,754.0,1000.0,L
2,imagesr/r/r/r/rrr76d00,ti16741083.tif,TIFF,754.0,1000.0,L
3,imagesr/r/r/r/rrr60e00,93502397.tif,TIFF,808.0,1000.0,L
4,imagesr/r/r/r/rrr95d00,50469246-9246.tif,TIFF,754.0,1000.0,L
...,...,...,...,...,...,...
399996,imagesw/w/w/w/www54c00,01254356.tif,TIFF,754.0,1000.0,L
399997,imagesw/w/w/w/www21c00,2085572813.tif,TIFF,754.0,1000.0,L
399998,imagesw/w/w/w/www02e00,2028718121-a.tif,TIFF,780.0,1000.0,L
399999,imagesw/w/w/w/www30d00,524384632+-4633.tif,TIFF,754.0,1000.0,L


In [8]:
# Save the current draft of the table to the working directory, once as a
# pickle and once as a CSV file.
df.to_pickle(path="rvl_cdip_draft0.pkl")
df.to_csv(path_or_buf="rvl_cdip_draft0.csv")