<h2> Initialisation et Construction des Jeux de Données depuis l'Herbier </h2>

In [1]:
import os
import zipfile
from pathlib import Path
import re
import cv2
import numpy as np
import pandas as pd

---

#### Importation du "Dataset" des Images d'un Herbier

In [2]:
# Project root is taken as the parent of the current working directory
# (an absolute path, despite the variable name); all data lives in its
# 'Data' sub-folder, which is created here if it is missing.
relativeProjectPath = Path.cwd().parent
dataPath = relativeProjectPath.joinpath('Data')
os.makedirs(dataPath, exist_ok=True)

In [3]:
# Download the zipped herbarium dataset from docs.google.com by running the
# `wget` Python module through a shell escape; the archive is written to
# ../Data/Dataset.zip (relative to the notebook's working directory).
# NOTE(review): the `confirm=` value embeds a second copy of the download URL
# and the `id` parameter appears twice - looks like a paste artifact; confirm
# the link is the intended one.
!python -m wget "https://docs.google.com/uc?export=download&confirm=$https://docs.google.com/uc?export=download&id=1NKaDghYPk3NEdQ4ZBVqXjSs1hNacNW6b&id=1NKaDghYPk3NEdQ4ZBVqXjSs1hNacNW6b" -o ../Data/Dataset.zip


Saved under ../Data/Dataset.zip


In [4]:
# Unpack the downloaded archive into the data folder, then remove the .zip
# so only the extracted files remain on disk.
archivePath = dataPath / 'Dataset.zip'
with zipfile.ZipFile(archivePath, mode='r') as zip_ref:
    zip_ref.extractall(path=dataPath)
archivePath.unlink()

---

#### Construction des Jeux de Données Initiaux depuis le Dossier "Dataset"

In [5]:
initDatasetPath = dataPath / 'Dataset'

# Collect every file found under the extracted dataset as a path string
# relative to dataPath (so each entry starts with 'Dataset'), using the
# OS-native separator.
# The relative path is derived structurally with Path.relative_to instead of
# regex-searching the absolute path for "Dataset": the regex matched the first
# occurrence of that word, so a parent folder whose name contained "Dataset"
# produced wrong paths. os.walk already yields the full directory path, so it
# is not joined onto initDatasetPath again.
lsImages = []
for dir_, _, files in os.walk(initDatasetPath):
    for file_name in files:
        lsImages.append(str(Path(dir_, file_name).relative_to(dataPath)))

In [6]:
# Known information for the 11 taxa, described by 4 descriptive characters.
# Open question from the author: should we apply a binary transformation?
# Bord (leaf margin)        : { lisse = 0 / denté = 1 }
# Phyllotaxie (phyllotaxy)  : { opposé = 0 / alterné = 1 }
# Type Feuille (leaf type)  : { simple = 0 / composée = 1 }
# Ligneux (woody)           : { non = 0 / oui = 1 }

def makeBinaryHerbarium(lsHerbarium):
    """Binary-encode the four descriptive characters of a taxon, in place.

    Positions 0..3 of ``lsHerbarium`` are overwritten with 0 when the value
    equals the reference label for that position ('lisse', 'opposé',
    'simple', 'non') and with 1 for any other value.

    Args:
        lsHerbarium: mutable sequence whose first four items are the labels
            [Bord, Phyllotaxie, Type Feuille, Ligneux].

    Returns:
        The same ``lsHerbarium`` object, now holding 0/1 in its first four
        positions. Pass a copy if the original labels must be preserved.
    """
    zeroLabels = ('lisse', 'opposé', 'simple', 'non')
    for position, zeroLabel in enumerate(zeroLabels):
        if lsHerbarium[position] == zeroLabel:
            lsHerbarium[position] = 0
        else:
            lsHerbarium[position] = 1
    return lsHerbarium

# Trait table: taxon name -> raw labels, one list of four values per taxon
# in the order [Bord, Phyllotaxie, Type Feuille, Ligneux].
Herbarium = {
    'amborella':      ['lisse', 'alterné', 'simple',   'oui'],
    'castanea':       ['denté', 'alterné', 'simple',   'oui'],
    'desmodium':      ['lisse', 'alterné', 'composée', 'non'],
    'eugenia':        ['lisse', 'opposé',  'simple',   'oui'],
    'laurus':         ['lisse', 'opposé',  'simple',   'oui'],
    'litsea':         ['lisse', 'alterné', 'simple',   'oui'],
    'magnolia':       ['lisse', 'alterné', 'simple',   'oui'],
    'rubus':          ['denté', 'alterné', 'composée', 'oui'],
    'ulmus':          ['denté', 'alterné', 'simple',   'oui'],
    'monimiaceae':    ['lisse', 'opposé',  'simple',   'oui'],
    'convolvulaceae': ['lisse', 'alterné', 'simple',   'non'],
}

# 0/1-encoded counterpart of Herbarium. Each label list is copied before
# encoding because makeBinaryHerbarium overwrites its argument in place.
binaryHerbarium = {
    taxon: makeBinaryHerbarium(list(labels))
    for taxon, labels in Herbarium.items()
}

In [7]:
# Build one metadata record per image: relative path, split name, pixel
# shape and element count, taxon name, its raw trait labels and their 0/1
# encoding. Paths are expected to look like Dataset/<Type>/<taxon>/<file>.
lsHerbarium = []
for pathImage in lsImages:
    # Split with pathlib rather than on a hard-coded '\\' so the path is
    # parsed with whatever separator the current OS uses.
    pathParts = Path(pathImage).parts
    typeImage = pathParts[-3]
    nameHerbarium = pathParts[-2]

    image = cv2.imread(str(dataPath / pathImage))
    if image is None:
        # cv2.imread returns None (no exception) when a file cannot be read
        # or decoded; fail with the offending path instead of an opaque
        # AttributeError on `.shape`.
        raise ValueError(f"Unable to read image file: {dataPath / pathImage}")
    shapeImage = image.shape
    # np.prod replaces np.product, which is deprecated and absent from NumPy 2.x.
    sizeImage = np.prod(shapeImage)

    lsHerbarium.append(
        [pathImage, typeImage, shapeImage, sizeImage, nameHerbarium]
        + Herbarium[nameHerbarium]
        + binaryHerbarium[nameHerbarium]
    )

dfImageHerbarium = pd.DataFrame(
    lsHerbarium,
    columns=[
        'ImagePath', 'Type', 'ImageShape', 'ImageSize', 'Herbarium',
        'Bord', 'Phyllotaxie', 'TypeFeuille', 'Ligneux',
        'binaryBord', 'binaryPhyllotaxie', 'binaryTypeFeuille', 'binaryLigneux',
    ],
)
# Split the records by the folder name found in the 'Type' column.
dfImageHerbariumTest = dfImageHerbarium[dfImageHerbarium['Type'] == 'Test']
dfImageHerbariumTrain = dfImageHerbarium[dfImageHerbarium['Type'] == 'Train']


In [8]:
# Save the Test and Train sub-datasets as CSV files in the data folder,
# without writing the DataFrame index as a column.
for splitFrame, csvName in (
    (dfImageHerbariumTest, 'initialTestDataset.csv'),
    (dfImageHerbariumTrain, 'initialTrainDataset.csv'),
):
    splitFrame.to_csv(dataPath / csvName, index=False)

---