# Construção do dataset

### Conversão de CSV para JSON

In [1]:
from os.path import join

import pandas as pd

from json import dump, load

# Folder that holds the input CSV and every file generated by this notebook.
DATA_DIR = join('..', 'data')

# Fragments found in the raw report text mapped to their plain-text form.
# They are applied one after another, in the order listed here. Note that
# '\\n' is a literal backslash followed by "n", not a newline character.
HTML_REPLACEMENTS = {
    '\\n': '\n',
    '<br />': '\n',
    '&emsp;': ' ',
    '&lt;': '<',
    '&gt;': '>'
}


def _to_plain_text(raw_text):
    """Return raw_text with every HTML_REPLACEMENTS pattern substituted."""
    plain_text = raw_text

    for markup, substitute in HTML_REPLACEMENTS.items():
        plain_text = plain_text.replace(markup, substitute)

    return plain_text


csv_path = join(DATA_DIR, 'STT', 'REDE_QUALIDADE-laudos-reemitidos.csv')
df = pd.read_csv(csv_path)

# One dictionary per CSV row: the three identifiers plus the cleaned text.
json_data = []

for _, record in df.iterrows():
    cleaned_report = _to_plain_text(record['laudo'])

    json_data.append({
        'id_solicitacao': record['id_solicitacao'],
        'id_exame': record['id_exame'],
        'id_laudo': record['id_laudo'],
        'laudo': cleaned_report
    })

json_path = join(DATA_DIR, 'laudos.json')

# ensure_ascii=False keeps accented characters readable in the output file.
with open(json_path, 'w', encoding='utf-8') as output_file:
    dump(json_data, output_file, indent=4, ensure_ascii=False)

### Geração do texto legível com os laudos

In [2]:
# Reload the converted reports so this cell can run on its own.
with open(join(DATA_DIR, 'laudos.json'), 'r', encoding='utf-8') as json_file:
    data = load(json_file)

# Horizontal rule written after every report.
separator = '-' * 120 + '\n'

# Dump each report as a small human-readable section: the three identifiers,
# a blank line, the report text and the separator.
with open(join(DATA_DIR, 'laudos.txt'), 'w', encoding='utf-8') as text_file:
    for report in data:
        text_file.write(
            f'ID da solicitação: {report["id_solicitacao"]}\n'
            f'ID do exame: {report["id_exame"]}\n'
            f'ID do laudo: {report["id_laudo"]}\n\n'
            f'Laudo: {report["laudo"]}\n'
            + separator
        )

### Geração dos laudos estruturados

In [3]:
from re import search


# Prefixes that identify the attention-class line of a report.
ATTENTION_CLASS_PREFIXES = ('AZUL', 'BRANCA', 'VERDE', 'AMARELA')


def _first_index(lines, start, predicate, what):
    """Return the index of the first line at or after start matching predicate.

    Args:
        lines: List of report lines.
        start: Index at which the scan begins.
        predicate: Callable taking a line and returning a truthy value on match.
        what: Human-readable name of the section being looked for.

    Raises:
        ValueError: If no line from start onwards satisfies predicate.
    """
    for index in range(start, len(lines)):
        if predicate(lines[index]):
            return index

    raise ValueError(f'{what} not found')


def _parse_report(text):
    """Split the plain text of one report into its structured fields.

    The text is split on newlines and each line is stripped. The lines are
    then read positionally: the report type, zero or more description lines,
    the count line (the first following line that contains a digit), the
    location line, zero or more spread lines, the attention-class line (the
    first following line starting with one of ATTENTION_CLASS_PREFIXES), the
    lesion line, and every remaining line joined as the recommendations.

    Raises:
        ValueError: If any of the required lines is missing, instead of the
            bare IndexError an unguarded list access would produce.
    """
    lines = [line.strip() for line in text.split('\n')]

    count_index = _first_index(
        lines, 1,
        lambda line: search(r'\d', line) is not None,
        'count line (first line containing a digit)'
    )

    location_index = count_index + 1
    if location_index >= len(lines):
        raise ValueError('location line not found')

    # The location line itself is never tested as an attention class.
    attention_index = _first_index(
        lines, location_index + 1,
        lambda line: line.startswith(ATTENTION_CLASS_PREFIXES),
        'attention class line'
    )

    lesion_index = attention_index + 1
    if lesion_index >= len(lines):
        raise ValueError('lesion line not found')

    return {
        'report_type': lines[0],
        'description': lines[1:count_index],
        'count': lines[count_index],
        'location': lines[location_index],
        'spread': lines[location_index + 1:attention_index],
        'attention_class': lines[attention_index],
        'lesion': lines[lesion_index],
        'recommendations': '\n'.join(lines[lesion_index + 1:])
    }


with open(join(DATA_DIR, 'laudos.json'), 'r', encoding='utf-8') as file:
    data = load(file)

structured_reports = []

for item in data:
    try:
        fields = _parse_report(item['laudo'])
    except ValueError as error:
        # Name the offending report so a malformed entry can be located.
        raise ValueError(
            f'Cannot structure report {item["id_laudo"]}: {error}'
        ) from error

    structured_reports.append({
        'solicitation_id': item['id_solicitacao'],
        'exam_id': item['id_exame'],
        'report_id': item['id_laudo'],
        **fields
    })

with open(join(DATA_DIR, 'reports.json'), 'w', encoding='utf-8') as file:
    dump(structured_reports, file, indent=4, ensure_ascii=False)

### Análise dos dados

In [8]:
from collections import Counter, defaultdict
from json import load

with open(join(DATA_DIR, 'reports.json'), 'r', encoding='utf-8') as file:
    data = load(file)

# Identifier fields are unique per report, so counting them is not useful.
exclude_keys = ['solicitation_id', 'exam_id', 'report_id']

# Field name -> Counter of the distinct values seen for that field.
value_count_by_key = defaultdict(Counter)

for entry in data:
    for key, value in entry.items():
        if key in exclude_keys:
            continue

        if isinstance(value, list):
            # Lists are unhashable; a tuple can be used as a Counter key.
            value = tuple(value)
        elif isinstance(value, str):
            # Keep every value on a single line of the text report.
            value = value.replace('\n', ' ')

        value_count_by_key[key][value] += 1

# One section per field: values from most to least frequent (ties keep
# first-seen order), each with its share of the field total.
sections = []

for key, counts in value_count_by_key.items():
    total = sum(counts.values())

    section_lines = [f'{key}:\n']
    section_lines.extend(
        f'  {field_value}: {count} - {count / total:.2%}\n'
        for field_value, count in counts.most_common()
    )
    section_lines.append(f'  Total: {total}\n\n')

    sections.append(''.join(section_lines))

result = ''.join(sections)

with open(join(DATA_DIR, 'reports_key_value_count.txt'), 'w', encoding='utf-8') as file:
    file.write(result)