In [2]:
import os

# Filename suffixes treated as images (compared against lowercased names)
image_extensions = (
    '.jpg', '.jpeg', '.png',
    '.bmp', '.gif', '.tiff',
)

# Base directories whose immediate subfolders are inspected
folders_to_scan = [
    # gout image sets
    'data/foot/gout/CAUNURI',
    'data/foot/gout/CAUOLD',
    # RA image sets
    'data/foot/RA/foot_ra_caunuri',
    'data/foot/RA/foot_ra_cauold',
]

def analyze_folders(base_paths, extensions=None, expected_count=6):
    """Count first-level subfolders and how many hold exactly ``expected_count`` images.

    Args:
        base_paths: Iterable of directory paths. Only the immediate
            subdirectories of each path are inspected (no recursion).
        extensions: Filename suffix (str) or tuple of suffixes that mark a file
            as an image. Matching is case-insensitive. When ``None`` the
            module-level ``image_extensions`` tuple is used.
        expected_count: Number of image files a subfolder must contain to be
            counted in the second element of the result. Defaults to 6.

    Returns:
        A ``(total_subfolders, matching_subfolders)`` tuple of ints, summed
        over all usable base paths.

    Base paths that do not exist, or that are not directories, are reported
    with ``print`` and skipped instead of raising.
    """
    if extensions is None:
        extensions = image_extensions
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Filenames are lowercased before matching, so the suffixes must be too;
    # str.endswith also requires a tuple rather than a list.
    extensions = tuple(ext.lower() for ext in extensions)

    total_subfolders = 0
    matching_subfolders = 0

    for base_path in base_paths:
        if not os.path.exists(base_path):
            print(f"Path not found: {base_path}")
            continue
        if not os.path.isdir(base_path):
            # os.listdir/os.scandir would raise NotADirectoryError here.
            print(f"Not a directory: {base_path}")
            continue

        with os.scandir(base_path) as entries:
            subfolder_paths = [entry.path for entry in entries if entry.is_dir()]

        for subfolder_path in subfolder_paths:
            total_subfolders += 1

            with os.scandir(subfolder_path) as entries:
                # is_file() keeps directories with image-like names out of the count.
                image_count = sum(
                    1
                    for entry in entries
                    if entry.is_file() and entry.name.lower().endswith(extensions)
                )

            if image_count == expected_count:
                matching_subfolders += 1

    return total_subfolders, matching_subfolders

# Scan the configured base folders and keep both tallies
total_subfolders, folders_with_6_images = analyze_folders(folders_to_scan)

# Report the two counts, one per line
print(
    f"Total number of subfolders: {total_subfolders}",
    f"Total number of folders with exactly 6 images: {folders_with_6_images}",
    sep="\n",
)


Total number of subfolders: 2423
Total number of folders with exactly 6 images: 469


In [7]:
import pandas as pd
import re
import unicodedata
import os

# Cleaning function (unchanged)
def clean_report(text, eos_token):
    """Normalise one free-text report string.

    Steps, in order:
      1. Missing values (anything ``pd.isna`` accepts) are returned unchanged.
      2. Unicode NFKC normalisation.
      3. The literal ``_x000D_`` artefact is replaced by a space. This runs
         before section handling so that it cannot defeat the duplicate check
         in step 7.
      4. Every run of non-ASCII characters is replaced by one space.
      5. Runs of ``.``, ``!`` or ``?`` are collapsed to a single character.
      6. Everything from the first ``[recommend]`` / ``[recommendation]`` tag
         onwards is dropped.
      7. If the text following ``[FINDING]`` and the text following
         ``[CONCLUSION]`` are equal (ignoring case and whitespace runs), the
         ``[CONCLUSION]`` section is removed.
      8. The ``[FINDING]`` / ``[CONCLUSION]`` / ``[DIAGNOSIS]`` tags themselves
         are removed and whitespace is collapsed and trimmed.

    Args:
        text: Report text (str), or a missing value.
        eos_token: Accepted for call compatibility; not used by this function.

    Returns:
        The cleaned string, or ``text`` itself when it is missing.
    """
    if pd.isna(text):
        return text

    flags = re.IGNORECASE | re.DOTALL

    text = unicodedata.normalize('NFKC', text)
    # Strip the escaped-carriage-return artefact up front: left in place it
    # makes "<finding>_x000D_" differ from an otherwise identical conclusion.
    text = text.replace('_x000D_', ' ')
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'([.!?]){2,}', r'\1', text)

    # Canonicalise tag spelling, e.g. "[ finding ]" -> "[FINDING]".
    for tag in ('FINDING', 'CONCLUSION', 'DIAGNOSIS'):
        text = re.sub(rf'\[\s*{tag}\s*\]', f'[{tag}]', text, flags=re.IGNORECASE)

    # Keep only what precedes the first recommendation tag.
    text = re.split(r'\[\s*recommend(?:ation)?\s*\]', text, maxsplit=1, flags=re.IGNORECASE)[0]

    # A section body runs up to the next "[" or the end of the string.
    finding_match = re.search(r'\[FINDING\](.*?)(?=\[|$)', text, flags=flags)
    conclusion_match = re.search(r'\[CONCLUSION\](.*?)(?=\[|$)', text, flags=flags)
    if finding_match and conclusion_match:
        # Compare on whitespace-normalised text: the final output collapses
        # whitespace anyway, so such differences are not meaningful.
        finding_key = ' '.join(finding_match.group(1).split()).lower()
        conclusion_key = ' '.join(conclusion_match.group(1).split()).lower()
        if finding_key == conclusion_key:
            text = re.sub(r'\[CONCLUSION\].*?(?=\[|$)', '', text, flags=flags)

    text = re.sub(r'\[\s*(FINDING|CONCLUSION|DIAGNOSIS)\s*\]', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Input reports are read from `report/`; results are written to `report/cleaned/`
report_folder = 'report'
output_folder = os.path.join(report_folder, 'cleaned')
os.makedirs(output_folder, exist_ok=True)

# Workbooks to process
excel_files = [
    'foot_ra_report_cau.xlsx',
    'foot_ra_report_old.xlsx',
    'gout_report_cau.xlsx',
    'gout_report_old.xlsx',
]

# Clean each workbook that is present and write a copy next to the others
for file in excel_files:
    input_path = os.path.join(report_folder, file)

    if os.path.exists(input_path):
        # Load every column as text so values are not reinterpreted
        df = pd.read_excel(input_path, dtype=str)

        if '판독문' not in df.columns:
            print(f"⚠️ '판독문' column not found in {file}. Skipping cleaning.")
        else:
            # Cleaned text goes into a new column; the source column is kept
            df['cleaned_판독문'] = df['판독문'].apply(clean_report, args=('<eos>',))

        # The workbook is written out whether or not the column was found
        output_file = 'cleaned_' + file
        output_path = os.path.join(output_folder, output_file)
        df.to_excel(output_path, index=False)
        print(f"✅ Saved cleaned file: {output_path}")
    else:
        print(f"❌ File not found: {input_path}")


✅ Saved cleaned file: report/cleaned/cleaned_foot_ra_report_cau.xlsx
✅ Saved cleaned file: report/cleaned/cleaned_foot_ra_report_old.xlsx
✅ Saved cleaned file: report/cleaned/cleaned_gout_report_cau.xlsx
✅ Saved cleaned file: report/cleaned/cleaned_gout_report_old.xlsx
