# Construção do dataset

### Imports

In [131]:
from os.path import join
from os import makedirs
from json import load, dump
from re import search

import pandas as pd

### Filtragem dos IDs de exames de aproximação

In [132]:
# Root directory that holds both the raw inputs and the generated outputs.
DATA_PATH = join('..', 'data')

raw_dataset_path = join(DATA_PATH, 'stt_raw_data', 'dataset', 'dataset.json')

with open(raw_dataset_path, 'r', encoding='utf-8') as file:
    data = load(file)

# Maps exam id -> {'id': ..., 'images': [...]} for every exam that has at
# least one series whose description matches the pattern below.
aproximation_data = {}

for exam in data:
    # NOTE(review): 'LesÃ£o' looks like a mis-decoded 'Lesão'; presumably it
    # mirrors how the descriptions are stored in the raw data - confirm
    # before "fixing" the pattern.
    matching_series = [
        series
        for series in exam['series']
        if search(r'LesÃ£o \d+.', series['seriesdescription'])
    ]

    if not matching_series:
        continue

    exam_id = exam['id_exame']

    # Concatenate the instances of all matching series, in series order.
    images = []
    for series in matching_series:
        images += series['instances']

    aproximation_data[exam_id] = {'id': exam_id, 'images': images}

### Conversão de CSV para JSON

In [133]:
reports_csv_path = join(DATA_PATH, 'stt_raw_data', 'REDE_QUALIDADE-laudos-reemitidos.csv')
df = pd.read_csv(reports_csv_path)

# Textual clean-up applied to every report, in this order: escaped newlines
# and HTML line breaks become real newlines, a few HTML entities are decoded,
# and the en dash is replaced by a plain hyphen.
REPLACEMENTS = {
    '\\n': '\n',
    '<br />': '\n',
    '&emsp;': ' ',
    '&lt;': '<',
    '&gt;': '>',
    '–': '-',
}

for raw_exam_id, raw_report in zip(df['id_exame'], df['laudo']):
    exam_id = int(raw_exam_id)

    # Only exams selected in the filtering step receive a report.
    if exam_id in aproximation_data:
        report = raw_report

        for pattern, replacement in REPLACEMENTS.items():
            report = report.replace(pattern, replacement)

        aproximation_data[exam_id]['report'] = report

### Geração dos laudos estruturados

In [134]:
# Closed vocabularies used to split a free-text report into structured
# fields. Each report line is compared verbatim against these lists, so the
# spelling of every entry must stay exactly as it appears in the reports.

# Accepted values for the elementary (primary) lesion lines.
elementary_lesions_domain = [
    'Mácula/mancha',
    'Pápula',
    'Placa',
    'Nódulo',
    'Vesícula',
    'Pústula',
    'Bolha',
    'Cisto',
    'Comedão',
    'Urtica/ponfo',
    'Púrpura',
    'Petéquia',
    'Equimose',
    'Telangectasias',
    'Úlcera',
    'Ausente',
    'Tumor',
]

# Accepted values for the secondary lesion lines.
secondary_lesions_domain = [
    'Escamas',
    'Crostas',
    'Exulceração',
    'Erosão',
    'Fissura',
    'Liquenificação',
    'Atrofia',
    'Cicatriz',
    'Ausente',
    'Escoriação',
    'Ceratose',
    'Alopécia',
    'Maceração',
]

# Accepted values for the coloration lines.
# NOTE(review): 'Eucrômica' used to be listed twice; the duplicate was
# dropped (membership tests are unaffected). If the second entry was meant
# to be a different colour, add it here.
coloration_domain = [
    'Eritematosa (avermelhada)',
    'Castanha',
    'Negra',
    'Perlácea',
    'Violácea',
    'Azulada',
    'Hipo/acrômica (despigmentada)',
    'Eucrômica',
    'Amarelada',
]

# Accepted values for the morphology lines.
morphology_domain = [
    'Linear',
    'Zosteriforme',
    'Gutata',
    'Lenticular',
    'Anular',
    'Numular',
    'Policíclica',
    'Circinada',
    'Circular ou Arredondada',
    'Irregular/assimétrica',
    'Séssil / Pedunculada',
    'Papilomatosa / Verrucosa',
    'Intertriginosa',
    'Arboriforme',
    'Puntiforme',
    'Folicular',
]

# Accepted values for the size line.
size_domain = [
    '< 1',
    '1 a 2',
    '2 a 4',
    '> 4',
]

# Accepted values for the distribution lines.
distribution_domain = [
    'Única',
    'Localizada',
    'Disseminada',
    'Generalizada',
]

# Accepted values for the risk classification line.
risk_domain = [
    'VERMELHA - QUADROS AGUDOS E GRAVES',
    'AMARELA - ENCAMINHAMENTO COM PRIORIDADE PARA O AMBULATÓRIO DE REFERÊNCIA TERCIÁRIO',
    'AMARELA - ENCAMINHAMENTO COM PRIORIDADE PARA O AMBULATÓRIO DE REFERÊNCIA',  # Alternative wording
    'VERDE - AVALIAÇÃO CLÍNICO-CIRURGIA COM ESPECIALISTA',
    'AZUL - TRATAMENTO NA UNIDADE BÁSICA DE SAÚDE (UBS)',
    'BRANCA - SEM NECESSIDADE DE INTERVENÇÃO OU ACOMPANHAMENTO',
]

def _take_leading(parts, domain):
    """Pop and return every leading item of ``parts`` that is in ``domain``.

    Stops at the first item outside the domain or when ``parts`` is
    exhausted, so a report that ends early yields a (possibly empty) list
    instead of raising IndexError.
    """
    taken = []

    while parts and parts[0] in domain:
        taken.append(parts.pop(0))

    return taken


def _take_next(parts, domain=None):
    """Pop and return the next item of ``parts``, or '' when unavailable.

    With ``domain`` given, the item is consumed only if it belongs to that
    domain; without it, the next item is consumed unconditionally.
    """
    if parts and (domain is None or parts[0] in domain):
        return parts.pop(0)

    return ''


# Ids of exams that never received a report text and therefore cannot be
# turned into a structured report.
exams_without_report = []

for exam_id, exam in aproximation_data.items():
    raw_report = exam.get('report')

    if isinstance(raw_report, dict):
        # Already structured (e.g. this cell was executed twice); keep it.
        continue

    if not isinstance(raw_report, str):
        exams_without_report.append(exam_id)
        continue

    # The [1:] drops the first line, the report type, which is always
    # "Exame de Teledermatologia".
    report_parts = [part.strip() for part in raw_report.split('\n')][1:]

    # Dict entries are evaluated top to bottom, so each field consumes the
    # report lines left over by the previous one, in the report's own order.
    exam['report'] = {
        'elementary_lesions': _take_leading(report_parts, elementary_lesions_domain),
        'secondary_lesions': _take_leading(report_parts, secondary_lesions_domain),
        'coloration': _take_leading(report_parts, coloration_domain),
        'morphology': _take_leading(report_parts, morphology_domain),
        'size': _take_next(report_parts, size_domain),
        'local': _take_next(report_parts),  # No defined domain
        'distribution': _take_leading(report_parts, distribution_domain),
        'risk': _take_next(report_parts, risk_domain),
        'skin_lesion': _take_next(report_parts),  # No defined domain
        'observations': '\n'.join(report_parts),  # Whatever lines remain
    }

if exams_without_report:
    print('Exams skipped (no report text):', exams_without_report)

# Only exams that ended up with a structured report are exported; an entry
# without one would be unusable by the analysis step that reads this file.
aproximation_data_list = [
    exam
    for exam in aproximation_data.values()
    if isinstance(exam.get('report'), dict)
]

makedirs(join(DATA_PATH, 'stt_data'), exist_ok=True)

with open(join(DATA_PATH, 'stt_data', 'dataset.json'), 'w', encoding='utf-8') as file:
    dump(aproximation_data_list, file, indent=4, ensure_ascii=False)

### Análise dos dados

In [137]:
with open(join(DATA_PATH, 'stt_data', 'dataset.json'), 'r', encoding='utf-8') as file:
    data = load(file)

# Report fields to tally, in the order their '<field>_distribution' entries
# appear in the output. A field holds either a list of values or a single
# string; both shapes are counted by the same loop below.
REPORT_FIELDS = (
    'elementary_lesions',
    'secondary_lesions',
    'coloration',
    'morphology',
    'size',
    'local',
    'distribution',
    'risk',
    'skin_lesion',
)

total_size = len(data)
total_images = sum(len(exam['images']) for exam in data)

data_analysis = {
    'total_size': total_size,
    # Guard against an empty dataset, which would otherwise divide by zero.
    'average_images_per_exam': total_images / total_size if total_size else 0,
}

for field in REPORT_FIELDS:
    data_analysis[field + '_distribution'] = {}

for exam in data:
    report = exam['report']

    for field in REPORT_FIELDS:
        value = report[field]
        counts = data_analysis[field + '_distribution']

        # Treat a single string as a one-item list so both shapes share the
        # same counting code; '' (field absent in the report) is counted too.
        for item in (value if isinstance(value, list) else [value]):
            counts[item] = counts.get(item, 0) + 1

with open(join(DATA_PATH, 'data_analysis.json'), 'w', encoding='utf-8') as file:
    dump(data_analysis, file, indent=4, ensure_ascii=False)