# Construção do dataset

### Imports

In [None]:
from os.path import join
from os import makedirs
from json import load, dump
from re import search

import pandas as pd

from sklearn.model_selection import train_test_split

from scripts.data import analyse_dataset, RawData, RawApproximationExam, RawReport, Report, ApproximationExam

import scripts.definitions as defs

### Filtragem dos IDs de exames de aproximação

In [None]:
# Load the raw exam dump; every JSON entry is turned into a RawData model.
with open(join(defs.DATA_PATH, 'stt_raw_data', 'dataset', 'dataset.json'), 'r', encoding='utf-8') as file:
    dataset = [RawData(**data) for data in load(file)]

# Exams that own at least one approximation series, keyed by exam id.
# The report text starts empty here and is expected to be filled in afterwards.
raw_approximation_exams = {}

for raw_exam in dataset:
    # NOTE(review): 'LesÃ£o' is what the UTF-8 bytes of "Lesão" look like when
    # decoded as Latin-1. Presumably the series descriptions are stored with
    # that encoding damage, so the pattern is kept exactly as it was — confirm
    # against the raw data before "fixing" the spelling.
    # The trailing unescaped '.' requires one more character after the digits.
    approximation_series = [
        series
        for series in raw_exam.series
        if search(r'LesÃ£o \d+.', series.seriesdescription)
    ]

    if not approximation_series:
        continue

    # Carry the real exam id instead of a 0 placeholder so the stored object
    # agrees with the dictionary key it is filed under.
    raw_approximation_exam = RawApproximationExam(exam_id=raw_exam.id_exame, images=[], report='')

    # Gather the instances of every matching series of this exam.
    for series in approximation_series:
        raw_approximation_exam.images += series.instances

    raw_approximation_exams[raw_exam.id_exame] = raw_approximation_exam

### Conversão de CSV para JSON

In [None]:
# Report CSV; each row is validated into a RawReport below.
df = pd.read_csv(join(defs.DATA_PATH, 'stt_raw_data', 'REDE_QUALIDADE-laudos-reemitidos.csv'))

# Markup / escape sequences replaced in the report text, mapped to their
# plain-text form. They are applied in declaration order.
REPLACEMENTS = {
    '\\n': '\n',
    '<br />': '\n',
    '&emsp;': ' ',
    '&lt;': '<',
    '&gt;': '>',
    '–': '-',
}


def _apply_replacements(text):
    """Return *text* after substituting every REPLACEMENTS pattern in order."""
    for pattern, replacement in REPLACEMENTS.items():
        text = text.replace(pattern, replacement)
    return text


for _, row in df.iterrows():
    raw_report = RawReport(**row.to_dict())
    exam_id = int(raw_report.id_exame)

    # Only exams previously selected as approximation exams receive a report.
    if exam_id in raw_approximation_exams:
        raw_approximation_exams[exam_id].report = _apply_replacements(raw_report.laudo)

### Geração dos laudos estruturados

In [None]:
def _take_while(parts, domain):
    """Pop and return the leading items of *parts* that belong to *domain*.

    Stops at the first item outside *domain* or when *parts* runs out, so a
    truncated report yields an empty list instead of raising IndexError.
    """
    taken = []
    while parts and parts[0] in domain:
        taken.append(parts.pop(0))
    return taken


def _take_if(parts, domain):
    """Pop and return the first item of *parts* if it is in *domain*, else ''."""
    if parts and parts[0] in domain:
        return parts.pop(0)
    return ''


# Structured report per exam id.
reports = {}

# Exams whose report has no content after the type line; they cannot be
# parsed into a structured report and are left out of the dataset.
unreported_exam_ids = []

for exam_id, raw_exam in raw_approximation_exams.items():
    # The [1:] drops the report type, which is always "Exame de Teledermatologia".
    report_parts = list(map(str.strip, raw_exam.report.splitlines()))[1:]

    if not report_parts:
        # Previously this case aborted the whole run with an IndexError.
        unreported_exam_ids.append(exam_id)
        continue

    structured_report = Report(
        elementary_lesions=[],
        secondary_lesions=[],
        coloration=[],
        morphology=[],
        size='',
        local='',
        distribution=[],
        risk='',
        skin_lesion='',
        conclusion='',
    )

    # The report lines are consumed strictly in this order; each step only
    # takes lines that belong to its own domain.
    structured_report.elementary_lesions = _take_while(report_parts, defs.ELEMENTARY_LESIONS_DOMAIN)
    # An absent secondary lesion section is recorded explicitly as 'Nenhuma'.
    structured_report.secondary_lesions = _take_while(report_parts, defs.SECONDARY_LESIONS_DOMAIN) or ['Nenhuma']
    structured_report.coloration = _take_while(report_parts, defs.COLORATION_DOMAIN)
    structured_report.morphology = _take_while(report_parts, defs.MORPHOLOGY_DOMAIN)
    structured_report.size = _take_if(report_parts, defs.SIZE_DOMAIN)

    # The location has no defined domain: whatever line comes next is taken.
    structured_report.local = report_parts.pop(0) if report_parts else ''

    structured_report.distribution = _take_while(report_parts, defs.DISTRIBUTION_DOMAIN)
    structured_report.risk = _take_if(report_parts, defs.RISK_DOMAIN)
    structured_report.skin_lesion = _take_if(report_parts, defs.SKIN_LESION_DOMAIN)

    # Everything that is left is the free-text conclusion.
    structured_report.conclusion = '\n'.join(report_parts)

    reports[exam_id] = structured_report

if unreported_exam_ids:
    print(f'Skipped {len(unreported_exam_ids)} exam(s) without report text: {unreported_exam_ids}')

# One dataset entry per image; all images of an exam share the exam's report.
# Iterating over `reports` keeps the original exam order and leaves out the
# exams skipped above.
approximation_exams: list[ApproximationExam] = [
    ApproximationExam(exam_id=exam_id, image=image, report=structured_report)
    for exam_id, structured_report in reports.items()
    for image in raw_approximation_exams[exam_id].images
]

approximation_exams_dicts = [approximation_exam.model_dump() for approximation_exam in approximation_exams]

makedirs(join(defs.DATA_PATH, 'stt_data'), exist_ok=True)

with open(join(defs.DATA_PATH, 'stt_data', 'dataset.json'), 'w', encoding='utf-8') as file:
    dump(approximation_exams_dicts, file, indent=4, ensure_ascii=False)

# The analysis result is reused later to look up per-class counts.
data_analysis = analyse_dataset(approximation_exams, defs.DATA_PATH, 'dataset.json')

### Remoção de lesões raras

In [None]:
# Keep only the entries whose skin-lesion class has a `count` of at least 10
# in the analysis of the complete dataset.
lesion_classes = data_analysis.skin_lesion_distribution.classes

dataset = [
    data
    for data in approximation_exams
    if lesion_classes[data.report.skin_lesion].count >= 10
]

_ = analyse_dataset(dataset, defs.DATA_PATH, 'filtered_dataset.json')

### Seccionamento dos dados

In [None]:
# Stratify on the skin-lesion label so each split keeps the class proportions.
# NOTE(review): the split is made per entry of `dataset`; if several entries
# come from the same exam they can end up in different splits — confirm that
# this is acceptable.
labels = [data.report.skin_lesion for data in dataset]

training_data, test_data = train_test_split(
    dataset,
    test_size=defs.TEST_PROPORTION,
    train_size=defs.TRAINING_PROPORTION,
    stratify=labels,
    random_state=defs.STATIC_RANDOM_STATE
)

training_labels = [data.report.skin_lesion for data in training_data]

# The validation fraction is rescaled because it is taken from the training
# portion rather than from the whole dataset.
# NOTE(review): the remainder of this split is discarded and `training_data`
# is exported in full below, so every validation sample is also present in
# training_dataset.json — confirm that this overlap is intended.
_, validation_data = train_test_split(
    training_data,
    test_size=defs.VALIDATION_PROPORTION / defs.TRAINING_PROPORTION,
    stratify=training_labels,
    random_state=defs.STATIC_RANDOM_STATE
)

dataset_pairs = ((training_data, 'training_dataset.json'),
                 (test_data, 'test_dataset.json'),
                 (validation_data, 'validation_dataset.json'))


# The loop variable is deliberately not called `dataset`: rebinding that name
# would leave it pointing at the validation split, so running this cell again
# would split the wrong data.
for split_data, dataset_name in dataset_pairs:
    split_dicts = [data.model_dump() for data in split_data]

    with open(join(defs.DATA_PATH, 'stt_data', dataset_name), 'w', encoding='utf-8') as file:
        dump(split_dicts, file, indent=4, ensure_ascii=False)

    _ = analyse_dataset(split_data, defs.DATA_PATH, dataset_name)