## **Separamos el conjunto de datos de entrenamiento y validación**

In [9]:
# Parent directory of the dataset and the JSONL annotations file created during pre-processing.
# Alternative kept from the original notebook: jsonl_annotations = output_jsonl_file
dataset_parent_dir = "./data"
jsonl_annotations = "fire_images_png.jsonl"
print(jsonl_annotations)

fire_images_png.jsonl


Creamos los archivos JSONL para entrenamiento (train_annotations_png.jsonl) y validación (validation_annotations_png.jsonl).

In [10]:
import json
import os

# Folders that will hold the training and validation MLTable assets,
# e.g. ./data/training-mltable-folder
training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder")
validation_mltable_path = os.path.join(dataset_parent_dir, "validation-mltable-folder")

# Create both folders (no error if they already exist)
os.makedirs(training_mltable_path, exist_ok=True)
os.makedirs(validation_mltable_path, exist_ok=True)

# Out of every 5 records read from the original dataset, 1 goes to validation and
# 4 go to training (20% validation - 80% training)
train_validation_ratio = 5

# JSONL files that receive the training and validation records
train_annotations_file = os.path.join(training_mltable_path, "train_annotations_png.jsonl")
validation_annotations_file = os.path.join(validation_mltable_path, "validation_annotations_png.jsonl")

# Read the source JSONL. The encoding is explicit (instead of the platform default)
# because the MLTable definition in this notebook reads these files with `encoding: utf8`.
# Blank lines are dropped: they are not valid JSON records, would be rejected by
# `invalid_lines: error`, and would otherwise skew the train/validation ratio.
with open(jsonl_annotations, "r", encoding="utf-8") as annot_f:
    json_lines = [line for line in annot_f if line.strip()]

# Distribute the records of jsonl_annotations: every `train_validation_ratio`-th record
# (indices 0, 5, 10, ...) goes to validation, all the others go to training.
with open(train_annotations_file, "w", encoding="utf-8") as train_f, \
        open(validation_annotations_file, "w", encoding="utf-8") as validation_f:
    for index, json_line in enumerate(json_lines):
        target_f = validation_f if index % train_validation_ratio == 0 else train_f
        target_f.write(json_line)

**Creamos el MLTable para enviar al job de AutoML**

In [11]:
def create_ml_table_file(filename):
    """Return the text of an MLTable definition that points at ``./<filename>``.

    The definition reads the file as JSON lines (utf8, erroring on invalid
    lines, without a path column) and converts the ``image_url`` column to
    ``stream_info``. The returned string has no trailing newline.
    """
    yaml_lines = [
        "paths:",
        "  - file: ./{0}".format(filename),
        "transformations:",
        "  - read_json_lines:",
        "      encoding: utf8",
        "      invalid_lines: error",
        "      include_path_column: false",
        "  - convert_column_type:",
        "    - columns: image_url",
        "      column_type: stream_info",
    ]
    return "\n".join(yaml_lines)

def save_ml_table_file(output_path, mltable_file_contents):
    """Write ``mltable_file_contents`` to a file named "MLTable" inside ``output_path``.

    Example target: ./data/training-mltable-folder/MLTable. An existing file
    is overwritten.
    """
    mltable_file = os.path.join(output_path, "MLTable")
    with open(mltable_file, "w") as handle:
        handle.write(mltable_file_contents)

# Build the MLTable definitions; each one references its annotations file by base name,
# since the MLTable file is saved in the same folder as that JSONL file.
train_mltable_file_contents = create_ml_table_file(
    os.path.basename(train_annotations_file)
)
validation_mltable_file_contents = create_ml_table_file(
    os.path.basename(validation_annotations_file)
)

# Write the train MLTable, then the validation MLTable, into their folders
save_ml_table_file(training_mltable_path, train_mltable_file_contents)
save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)

**Creamos los inputs para mandar al job de AutoML**

In [16]:
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml import Input

# Training input: an MLTable asset referenced by its folder path
# (./data/training-mltable-folder)
my_training_data_input = Input(
    type=AssetTypes.MLTABLE,
    path=training_mltable_path,
)
print(my_training_data_input)

# Validation input: same asset type, pointing at the validation MLTable folder
my_validation_data_input = Input(
    type=AssetTypes.MLTABLE,
    path=validation_mltable_path,
)
print(my_validation_data_input)

{'type': 'mltable', 'path': './data/training-mltable-folder'}
./data/training-mltable-folder
{'type': 'mltable', 'path': './data/validation-mltable-folder'}


In [17]:
# Recovery cell: rebuilds the AutoML inputs from the MLTable folders already on disk,
# so the split / MLTable generation cells do not have to be re-run.
# The imports are repeated here so this cell does not depend on the previous
# cell having been executed in the current kernel session.
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes

training_mltable_path = './data/training-mltable-folder'
validation_mltable_path = './data/validation-mltable-folder'

my_training_data_input = Input(type=AssetTypes.MLTABLE, path=training_mltable_path)
print(my_training_data_input)
my_validation_data_input = Input(type=AssetTypes.MLTABLE, path=validation_mltable_path)
print(my_validation_data_input)

{'type': 'mltable', 'path': './data/training-mltable-folder'}
{'type': 'mltable', 'path': './data/validation-mltable-folder'}
