In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the notebook's dependencies; only torch and transformers are pinned to exact versions here.
!pip install torch==2.3 transformers==4.39.0 appdirs jsonpickle filelock h5py spacy nltk pytest radgraph tqdm


Collecting torch==2.3
  Downloading torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting transformers==4.39.0
  Downloading transformers-4.39.0-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3)

In [None]:
import os
import shutil

# Stage the RadGraph-XL archive from Drive into the directory this notebook
# treats as RadGraph's cache location.
cache_dir = "/root/.cache/radgraph/0.1.2"
src_path = "/content/drive/MyDrive/Dual view Slava/radgraph-xl.tar.gz"
dst_path = cache_dir + "/radgraph-xl.tar.gz"

# Create the cache directory (including parents); a pre-existing one is fine.
os.makedirs(cache_dir, exist_ok=True)
shutil.copy(src_path, dst_path)

print("Model copied to RadGraph cache location.")


In [None]:
import json
from tqdm import tqdm
from radgraph import RadGraph

def initialize_radgraph():
    """Create the RadGraph-XL model on CPU.

    Returns:
        The ``RadGraph`` instance, or ``None`` when construction raises any
        ``Exception`` (the error is printed rather than propagated).
    """
    try:
        model = RadGraph(model_type="radgraph-xl", device="cpu")
    except Exception as err:
        # Best-effort: report the problem and let the caller decide what to do.
        print(f"Failed to initialize RadGraph: {err}")
        model = None
    return model



def clean_text_with_context(text, radgraph_instance):
    """Keep only the sentences of ``text`` that mention a RadGraph entity.

    The text is annotated with ``radgraph_instance``; every entity whose label
    is not ``"O"`` contributes its (stripped, non-empty) token string. The text
    is then split into sentences, and a sentence is kept when it contains at
    least one of those token strings (plain, case-sensitive substring match)
    and has no underscore in it.

    Args:
        text: Report section to filter. ``None``, empty or whitespace-only
            input yields ``""`` without calling the model.
        radgraph_instance: Callable taking a list of texts and returning a
            mapping whose ``"0"`` item may hold an ``"entities"`` dict; each
            entity must provide ``"label"`` and ``"tokens"``.

    Returns:
        The kept sentences joined with ``". "`` and terminated by ``"."``, or
        ``""`` when nothing is kept or when processing raises.
    """
    import re  # function-scope import so the cell's import lines stay untouched

    if not text or not text.strip():
        return ""

    try:
        annotations = radgraph_instance([text])
        entities = annotations["0"].get("entities", {})

        important_tokens = set()
        for ent in entities.values():
            if ent["label"] == "O":
                continue
            token = ent["tokens"].strip()
            # A blank token would be a substring of every sentence and would
            # silently disable the filter, so it is ignored.
            if token:
                important_tokens.add(token)

        # Split on periods, except a period sitting between two digits, so
        # measurements such as "2.5 cm" are not cut in half.
        flat_text = text.replace('\n', ' ')
        pieces = re.split(r'\.(?!\d)|(?<!\d)\.', flat_text)
        sentences = [piece.strip() for piece in pieces if piece.strip()]

        # Sentences containing "_" are dropped (presumably de-identification
        # placeholders such as "___" -- TODO confirm against the data).
        kept_sentences = [
            sentence
            for sentence in sentences
            if "_" not in sentence
            and any(token in sentence for token in important_tokens)
        ]

        return '. '.join(kept_sentences) + '.' if kept_sentences else ""

    except Exception as e:
        # Best-effort: a single bad report must not abort a long batch run.
        print(f"Error processing text: {e}")
        return ""


def main(input_path="cleaned_reports.json", output_path="cleaned_reports_clinical.json"):
    """Filter the findings/impression text of every report through RadGraph.

    Loads a JSON object mapping study ids to report entries, replaces each
    entry's ``"findings"`` and ``"impression"`` with the output of
    ``clean_text_with_context``, and writes the whole mapping back out.

    Args:
        input_path: JSON file to read; defaults to the original hard-coded name.
        output_path: JSON file to write; defaults to the original hard-coded name.

    Returns:
        None. Failures to initialise the model, load the input or save the
        output are printed and end the run early instead of raising.
    """
    rg = initialize_radgraph()
    if rg is None:
        return

    try:
        # JSON is read as UTF-8 explicitly rather than with the locale default.
        with open(input_path, encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Failed to load input file: {e}")
        return

    # NOTE(review): results are only written after the whole loop finishes, so
    # an interrupted run loses all progress.
    for study_id, entry in tqdm(data.items()):
        try:
            # `or ""` treats a present-but-null section like a missing one.
            findings = clean_text_with_context(entry.get("findings") or "", rg)
            impression = clean_text_with_context(entry.get("impression") or "", rg)
            # Assign only after both succeed so an entry is never half-updated.
            entry["findings"] = findings
            entry["impression"] = impression
        except Exception as e:
            print(f"Error processing entry {study_id}: {e}")

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
    except Exception as e:
        print(f"Failed to save results: {e}")


if __name__ == "__main__":
    main()

Using device: cpu


100%|██████████| 32734/32734 [4:25:56<00:00,  2.05it/s]


In [None]:
import json

def has_short_sentence(text):
    """Return ``True`` when ``text`` is empty or whitespace-only, else ``False``.

    Despite the name, no sentence length is measured: the only check is
    whether anything remains after stripping whitespace.
    """
    return not text.strip()

def main(input_path="cleaned_reports_clinical.json", output_path="cleaned_radgraph_output.json"):
    """Drop studies whose findings or impression text is empty and save the rest.

    Reads a JSON object mapping study ids to entries, keeps only entries where
    both ``"findings"`` and ``"impression"`` contain non-whitespace text,
    writes the kept entries to ``output_path`` and prints removal statistics.

    Note: an earlier cell of this notebook also defines ``main``; running this
    cell rebinds the name to this function.

    Args:
        input_path: JSON file to read; defaults to the original hard-coded name.
        output_path: JSON file to write; defaults to the original hard-coded name.

    Returns:
        None. Load and save failures are printed and end the run early.
    """
    try:
        with open(input_path, encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Failed to load file: {e}")
        return

    cleaned_data = {}
    empty_findings_count = 0
    empty_impression_count = 0
    both_empty_count = 0

    for study_id, entry in data.items():
        # `or ""` covers keys that are present but null, which would otherwise
        # crash on `.strip()`.
        findings_empty = not (entry.get("findings") or "").strip()
        impression_empty = not (entry.get("impression") or "").strip()

        if findings_empty and impression_empty:
            both_empty_count += 1
        elif findings_empty:
            empty_findings_count += 1
        elif impression_empty:
            empty_impression_count += 1
        else:
            # Both sections carry text: the study is kept unchanged.
            cleaned_data[study_id] = entry

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=2)
    except Exception as e:
        print(f"Failed to save cleaned file: {e}")
        return

    # Statistics are reported only after a successful save.
    print(f"Original studies count: {len(data)}")
    print(f"Cleaned studies count: {len(cleaned_data)}")
    print(f"Removed studies with empty findings: {empty_findings_count}")
    print(f"Removed studies with empty impressions: {empty_impression_count}")
    print(f"Removed studies with both empty: {both_empty_count}")
    print(f"Total removed: {empty_findings_count + empty_impression_count + both_empty_count}")


if __name__ == "__main__":
    main()

Original studies count: 32734
Cleaned studies count: 31830
Removed studies with empty findings: 179
Removed studies with empty impressions: 725
Removed studies with both empty: 0
Total removed: 904
