# Construção do dataset

### Imports

In [None]:
from os.path import join
from os import makedirs
from json import load, dump
from re import search

import pandas as pd

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import scripts.definitions as defs
import scripts.data as dt

### Filtragem dos IDs de exames de aproximação

In [None]:
# Load the raw exam dataset and keep, per exam, only the series whose
# description identifies a lesion close-up ("approximation") series.
raw_dataset_path = join(defs.DATA_PATH, 'stt_raw_data', 'dataset', 'dataset.json')

with open(raw_dataset_path, 'r', encoding='utf-8') as file:
    dataset = [dt.RawData(**data) for data in load(file)]

# Maps an exam id to the lesion records extracted from its approximation series.
raw_lesion_dataset: dict[int, list[dt.RawLesionData]] = {}

for raw_data in tqdm(dataset, desc='Processando exames: '):
    # The pattern is written in its mis-decoded form on purpose: the stored
    # descriptions appear to be UTF-8 text read as Latin-1 (the location text
    # below is repaired with the inverse transformation).
    approximation_series = [
        series
        for series in raw_data.series
        if search(r'LesÃ£o \d+.', series.seriesdescription)
    ]

    if not approximation_series:
        continue

    exam_lesions = []
    raw_lesion_dataset[raw_data.id_exame] = exam_lesions

    for series in approximation_series:
        tokens = series.seriesdescription.split()

        # Token 1 is the lesion number and tokens 3+ are the location; token 2
        # is skipped (presumably a separator - confirm against the raw data).
        mis_decoded_location = ' '.join(tokens[3:])
        location = mis_decoded_location.encode('latin1').decode('utf-8')

        exam_lesions.append(
            dt.RawLesionData(
                exam_id=raw_data.id_exame,
                images=series.instances,
                lesion_number=int(tokens[1]),
                lesion_location=location,
                report=''  # filled in later, once the reports are structured
            )
        )

### Conversão de CSV para JSON

In [None]:
# Read the re-issued reports CSV and keep the cleaned-up report text of every
# exam that has at least one approximation series.
df = pd.read_csv(join(defs.DATA_PATH, 'stt_raw_data', 'REDE_QUALIDADE-laudos-reemitidos.csv'))

# Literal substrings (not regexes) replaced in the report text, applied in
# insertion order. The first key is a backslash followed by "n", i.e. an
# escaped newline that survived in the text.
REPLACEMENTS = {
    '\\n': '\n',
    '<br />': '\n',
    '&emsp;': ' ',
    '&lt;': '<',
    '&gt;': '>',
    '–': '-',
}

# Maps an exam id to its normalized report text.
raw_report_dataset: dict[int, str] = {}

# iterrows() is a generator without a length, so the total is passed
# explicitly to let the progress bar show completion and ETA.
for _, row in tqdm(df.iterrows(), total=len(df), desc='Processando laudos: '):
    raw_report = dt.RawReport(**row.to_dict())

    exam_id = int(raw_report.id_exame)

    # Reports of exams without approximation series are of no use here.
    if exam_id not in raw_lesion_dataset:
        continue

    report = raw_report.laudo

    for pattern, replacement in REPLACEMENTS.items():
        report = report.replace(pattern, replacement)

    raw_report_dataset[exam_id] = report

### Geração dos laudos estruturados

In [None]:
# Structure every raw report, discard locations whose reports contradict each
# other, and attach the surviving reports to the lesion records of the exam.

# Report attributes that must agree between all reports describing the same
# location; a location whose reports differ on any of them is discarded.
CONSISTENCY_FIELDS = (
    'elementary_lesions',
    'secondary_lesions',
    'coloration',
    'morphology',
    'size',
    'distribution',
    'risk',
    'skin_lesion',
)

for exam_id, raw_report in tqdm(raw_report_dataset.items(), desc='Estruturando laudos: '):
    # The [1:] drops the report type line, which is always "Exame de Teledermatologia".
    report_parts = list(map(str.strip, raw_report.splitlines()))[1:]

    parsed_reports = []
    footnotes = {}

    # Parse reports one at a time until the parser signals there is no next
    # one; the footnotes are then parsed from the same list.
    # NOTE(review): presumably dt.parse_report consumes entries from
    # report_parts on each call - confirm in scripts.data.
    while True:
        report, has_next = dt.parse_report(report_parts)

        if not has_next:
            footnotes = dt.parse_report_footnote(report_parts)
            break

        parsed_reports.append(report)

    # Group the parsed reports by the body location they describe.
    locations: dict[str, list[dt.Report]] = {}

    for parsed_report in parsed_reports:
        locations.setdefault(parsed_report.location, []).append(parsed_report)

    # A location is inconsistent when any of its reports differs from the
    # first one on at least one of the compared attributes.
    inconsistent_locations = set()

    for location, location_reports in locations.items():
        reference_report = location_reports[0]

        if any(getattr(other, field) != getattr(reference_report, field)
               for other in location_reports[1:]
               for field in CONSISTENCY_FIELDS):
            inconsistent_locations.add(location)

    # Drop every report of an inconsistent location in a single pass.
    if inconsistent_locations:
        parsed_reports = [candidate for candidate in parsed_reports
                          if candidate.location not in inconsistent_locations]

    for raw_lesion_data in raw_lesion_dataset[exam_id]:
        # Give the lesion the first remaining report with the same location.
        # The report is popped so that it cannot be assigned to another lesion.
        for index, candidate in enumerate(parsed_reports):
            if raw_lesion_data.lesion_location != candidate.location:
                continue

            # A lesion that already carries a report must not get a second one.
            if raw_lesion_data.report != '':
                raise ValueError('Conflito de laudos')

            raw_lesion_data.report = parsed_reports.pop(index)
            break

        if raw_lesion_data.report != '':
            # Footnotes are looked up by lesion number and appended to the
            # conclusion of the lesion's report when present.
            footnote = footnotes.get(raw_lesion_data.lesion_number)

            if footnote is not None:
                formatted_footnote = '\n\nConclusão da lesão:\n' + '\n'.join(footnote)
                raw_lesion_data.report.conclusion += formatted_footnote  # type: ignore
# Flatten the per-exam lesion records into one entry per image, keeping only
# the lesions that received a report, then persist and analyse the result.
lesion_dataset: list[dt.LesionData] = []

for exam_id, exam_lesions in raw_lesion_dataset.items():
    for lesion in exam_lesions:
        # An empty string marks a lesion that was never matched to a report.
        if lesion.report == '':
            continue

        lesion_dataset.extend(
            dt.LesionData(exam_id=exam_id, image=image, report=lesion.report)  # type: ignore
            for image in lesion.images
        )

approximation_exams_dicts = [entry.model_dump() for entry in lesion_dataset]

output_dir = join(defs.DATA_PATH, 'stt_data')
makedirs(output_dir, exist_ok=True)

with open(join(output_dir, 'dataset.json'), 'w', encoding='utf-8') as file:
    dump(approximation_exams_dicts, file, indent=4, ensure_ascii=False)

# Kept in a variable because the analysis is read again when filtering classes.
data_analysis = dt.analyse_dataset(lesion_dataset, defs.DATA_PATH, 'dataset.json')

### Remoção de lesões raras

In [None]:
# Keep only the entries whose skin-lesion class has a count of at least 10 in
# the analysis of the full dataset, then analyse the filtered dataset.
dataset = [
    data
    for data in lesion_dataset
    if data_analysis.skin_lesion_distribution.classes[data.report.skin_lesion].count >= 10
]

_ = dt.analyse_dataset(dataset, defs.DATA_PATH, 'filtered_dataset.json')

### Seccionamento dos dados

In [None]:
# Split the filtered dataset into training, test and validation subsets
# (stratified by skin-lesion class), then write and analyse each subset.
# TODO: Solve the exams problem
# NOTE(review): the split is done per entry, so entries of the same exam can
# presumably end up in different subsets - confirm this is what the TODO means.
labels = [data.report.skin_lesion for data in dataset]

training_data, test_data = train_test_split(
    dataset,
    test_size=defs.TEST_PROPORTION,
    train_size=defs.TRAINING_PROPORTION,
    stratify=labels,
    random_state=defs.STATIC_RANDOM_STATE
)

training_labels = [data.report.skin_lesion for data in training_data]

# The validation proportion is divided by the training proportion because the
# validation subset is drawn from the training data, not from the full dataset.
# NOTE(review): the remainder of this split is discarded, so the training file
# written below still contains the validation samples - confirm this overlap
# is intended.
_, validation_data = train_test_split(
    training_data,
    test_size=defs.VALIDATION_PROPORTION / defs.TRAINING_PROPORTION,
    stratify=training_labels,
    random_state=defs.STATIC_RANDOM_STATE
)

dataset_pairs = ((training_data, 'training_dataset.json'),
                 (test_data, 'test_dataset.json'),
                 (validation_data, 'validation_dataset.json'))

# The loop variables deliberately avoid the name `dataset`: rebinding it here
# would make a re-run of this cell split the last subset instead of the
# filtered dataset.
for split_data, split_name in dataset_pairs:
    split_dicts = [data.model_dump() for data in split_data]

    with open(join(defs.DATA_PATH, 'stt_data', split_name), 'w', encoding='utf-8') as file:
        dump(split_dicts, file, indent=4, ensure_ascii=False)

    _ = dt.analyse_dataset(split_data, defs.DATA_PATH, split_name)