# Construção do dataset

### Imports

In [None]:
from os.path import join
from os import makedirs
from json import load, dump
from re import search

import pandas as pd

from sklearn.model_selection import train_test_split

from scripts.data import analyse_dataset

### Filtragem dos IDs de exames de aproximação

In [None]:
DATA_PATH = join('..', 'data')

# Load the raw list of exams.
raw_dataset_path = join(DATA_PATH, 'stt_raw_data', 'dataset', 'dataset.json')

with open(raw_dataset_path, 'r', encoding='utf-8') as file:
    data = load(file)

# Maps exam id -> {'id': ..., 'images': [...]} for every exam that has at
# least one series whose description matches the pattern below.
aproximation_data = {}

for exam in data:
    # NOTE(review): 'LesÃ£o' looks like mojibake for 'Lesão'; presumably it
    # mirrors how the raw descriptions are stored - confirm against the data.
    # The trailing '.' is unescaped, so it matches any character.
    matching_series = [
        series
        for series in exam['series']
        if search(r'LesÃ£o \d+.', series['seriesdescription'])
    ]

    if not matching_series:
        continue

    entry = {'id': exam['id_exame'], 'images': []}

    # Concatenate the instances of every matching series, in series order.
    for series in matching_series:
        entry['images'].extend(series['instances'])

    aproximation_data[exam['id_exame']] = entry

### Conversão de CSV para JSON

In [None]:
reports_csv_path = join(DATA_PATH, 'stt_raw_data', 'REDE_QUALIDADE-laudos-reemitidos.csv')
df = pd.read_csv(reports_csv_path)

# Substrings rewritten in every report, applied in this order:
# escaped newlines and HTML line breaks become real newlines, HTML entities
# are decoded and the en dash becomes a plain hyphen.
REPLACEMENTS = {
    '\\n': '\n',
    '<br />': '\n',
    '&emsp;': ' ',
    '&lt;': '<',
    '&gt;': '>',
    '–': '-',
}

for _, csv_row in df.iterrows():
    exam_id = int(csv_row['id_exame'])

    # Only exams selected in `aproximation_data` receive a report.
    if exam_id in aproximation_data:
        cleaned_report = csv_row['laudo']

        for old, new in REPLACEMENTS.items():
            cleaned_report = cleaned_report.replace(old, new)

        aproximation_data[exam_id]['report'] = cleaned_report

### Geração dos laudos estruturados

In [None]:
# Closed vocabularies used by the parsing loop below: the leading lines of a
# report are consumed section by section while they belong to the matching
# list. Membership is tested with exact string equality.

# Accepted values for the 'elementary_lesions' section.
elementary_lesions_domain = [
    'Mácula/mancha',
    'Pápula',
    'Placa',
    'Nódulo',
    'Vesícula',
    'Pústula',
    'Bolha',
    'Cisto',
    'Comedão',
    'Urtica/ponfo',
    'Púrpura',
    'Petéquia',
    'Equimose',
    'Telangectasias',
    'Úlcera',
    'Ausente',
    'Tumor'
]

# Accepted values for the 'secondary_lesions' section.
# 'Ausente' is also a valid elementary lesion value.
secondary_lesions_domain = [
    'Escamas',
    'Crostas',
    'Exulceração',
    'Erosão',
    'Fissura',
    'Liquenificação',
    'Atrofia',
    'Cicatriz',
    'Ausente',
    'Escoriação',
    'Ceratose',
    'Alopécia',
    'Maceração'
]

# Accepted values for the 'coloration' section.
# NOTE(review): 'Eucrômica' appears twice below. This is harmless for the
# membership tests, but a different value may have been intended - confirm.
coloration_domain = [
    'Eritematosa (avermelhada)',
    'Castanha',
    'Negra',
    'Perlácea',
    'Violácea',
    'Azulada',
    'Hipo/acrômica (despigmentada)',
    'Eucrômica',
    'Amarelada',
    'Eucrômica'
]

# Accepted values for the 'morphology' section.
morphology_domain = [
    'Linear',
    'Zosteriforme',
    'Gutata',
    'Lenticular',
    'Anular',
    'Numular',
    'Policíclica',
    'Circinada',
    'Circular ou Arredondada',
    'Irregular/assimétrica',
    'Séssil / Pedunculada',
    'Papilomatosa / Verrucosa',
    'Intertriginosa',
    'Arboriforme',
    'Puntiforme',
    'Folicular'
]

# Accepted values for the 'size' field (at most one per report).
size_domain = [
    '< 1',
    '1 a 2',
    '2 a 4',
    '> 4'
]

# Accepted values for the 'distribution' section.
distribution_domain = [
    'Única',
    'Localizada',
    'Disseminada',
    'Generalizada'
]

# Accepted values for the 'risk' field (at most one per report).
risk_domain = [
    'VERMELHA - QUADROS AGUDOS E GRAVES',
    'AMARELA - ENCAMINHAMENTO COM PRIORIDADE PARA O AMBULATÓRIO DE REFERÊNCIA TERCIÁRIO',
    'AMARELA - ENCAMINHAMENTO COM PRIORIDADE PARA O AMBULATÓRIO DE REFERÊNCIA',  # Alternative wording
    'VERDE - AVALIAÇÃO CLÍNICO-CIRURGIA COM ESPECIALISTA',
    'AZUL - TRATAMENTO NA UNIDADE BÁSICA DE SAÚDE (UBS)',
    'BRANCA - SEM NECESSIDADE DE INTERVENÇÃO OU ACOMPANHAMENTO'
]

def _pop_leading(parts, domain):
    """Remove and return the leading items of ``parts`` that are in ``domain``.

    Consumption stops at the first item outside ``domain`` or when ``parts``
    runs out, so a report that ends early yields a short (possibly empty)
    list instead of raising ``IndexError``.
    """
    taken = []

    while parts and parts[0] in domain:
        taken.append(parts.pop(0))

    return taken


def _pop_first(parts, domain=None):
    """Remove and return the first item of ``parts``.

    Returns ``''`` without consuming anything when ``parts`` is empty or,
    if ``domain`` is given, when the first item does not belong to it.
    """
    if not parts or (domain is not None and parts[0] not in domain):
        return ''

    return parts.pop(0)


for exam_id, exam in aproximation_data.items():
    # The [1:] drops the first line, which holds the report type (according
    # to the original author it is always "Exame de Teledermatologia").
    # NOTE: exams that never received a 'report' raise KeyError here.
    report_parts = list(map(str.strip, exam['report'].split('\n')))[1:]

    # The values are evaluated top to bottom and each call consumes lines
    # from the front of `report_parts`, so the key order below IS the
    # parsing order and must not be changed.
    structured_report = {
        'elementary_lesions': _pop_leading(report_parts, elementary_lesions_domain),
        'secondary_lesions': _pop_leading(report_parts, secondary_lesions_domain),
        'coloration': _pop_leading(report_parts, coloration_domain),
        'morphology': _pop_leading(report_parts, morphology_domain),
        'size': _pop_first(report_parts, size_domain),
        'local': _pop_first(report_parts),  # No fixed domain
        'distribution': _pop_leading(report_parts, distribution_domain),
        'risk': _pop_first(report_parts, risk_domain),
        'skin_lesion': _pop_first(report_parts),  # No fixed domain
        # Whatever is left over is kept verbatim as free-text observations.
        'observations': '\n'.join(report_parts)
    }

    aproximation_data[exam_id]['report'] = structured_report

# Flatten the id -> exam mapping into a list, keeping insertion order.
approximation_data_list = list(aproximation_data.values())

output_dir = join(DATA_PATH, 'stt_data')
makedirs(output_dir, exist_ok=True)

# Persist the full structured dataset as pretty-printed UTF-8 JSON.
with open(join(output_dir, 'dataset.json'), 'w', encoding='utf-8') as file:
    dump(approximation_data_list, file, indent=4, ensure_ascii=False)

data_analysis = analyse_dataset(approximation_data_list, DATA_PATH, 'dataset.json')

### Remoção de lesões raras

In [None]:
# Keep only the exams whose skin lesion class has a 'value' of at least 10
# in the analysis of the full dataset.
data = [
    exam
    for exam in approximation_data_list
    if data_analysis['skin_lesion_distribution']['classes'][exam['report']['skin_lesion']]['value'] >= 10
]

data_analysis = analyse_dataset(data, DATA_PATH, 'filtered_dataset.json')

### Seccionamento dos dados

In [None]:
# Target share of the filtered dataset for each split (they sum to 1).
TRAINING_PROPORTION = 0.8
VALIDATION_PROPORTION = 0.1
TEST_PROPORTION = 0.1

labels = [sample['report']['skin_lesion'] for sample in data]

# Step 1: hold out the test set. Everything else is kept for the
# training/validation split, so no sample is discarded.
training_validation_data, test_data = train_test_split(
    data,
    test_size=TEST_PROPORTION,
    stratify=labels,
    random_state=42
)

training_validation_labels = [
    sample['report']['skin_lesion'] for sample in training_validation_data
]

# Step 2: divide the remainder into disjoint training and validation sets.
# The validation share is rescaled because it is now taken from the
# training + validation portion rather than from the whole dataset.
# Both outputs are kept so that validation samples are not also present in
# the training set.
training_data, validation_data = train_test_split(
    training_validation_data,
    test_size=VALIDATION_PROPORTION / (TRAINING_PROPORTION + VALIDATION_PROPORTION),
    stratify=training_validation_labels,
    random_state=42
)

# (samples, output file name) for each split, written in this order.
dataset_pairs = (
    (training_data, 'training_dataset.json'),
    (test_data, 'test_dataset.json'),
    (validation_data, 'validation_dataset.json'),
)

for split_samples, split_filename in dataset_pairs:
    split_path = join(DATA_PATH, 'stt_data', split_filename)

    # Save the split as pretty-printed UTF-8 JSON, then run the dataset
    # analysis for it.
    with open(split_path, 'w', encoding='utf-8') as file:
        dump(split_samples, file, indent=4, ensure_ascii=False)

    analyse_dataset(split_samples, DATA_PATH, split_filename)