In [2]:
import os
import re
from io import BytesIO
import base64
from PIL import Image, UnidentifiedImageError
from paddleocr import PaddleOCR
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import numpy as np
import shutil
import logging
import pandas as pd
import warnings
import pydicom

In [None]:
# Set logging level for PaddleOCR
# Only records at ERROR level or above are emitted by the "ppocr" logger.
logging.getLogger("ppocr").setLevel(logging.ERROR)

# Initialize PaddleOCR
# Module-level OCR engine (English, angle classifier requested); it is the
# `ocr` object that extract_image_tags calls for every embedded image.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Silence UserWarning messages raised from modules whose name starts with "pydicom".
warnings.filterwarnings("ignore", category=UserWarning, module="pydicom")

In [None]:
# Module-level progress trackers shared through `global` statements:
# `done_df` lists folders whose report was scrubbed, `not_done_df` lists
# folders that were moved aside. Each gets its own empty single-column frame.
done_df, not_done_df = (pd.DataFrame(columns=['Folder']) for _ in range(2))

In [None]:
def _dicom_age_string(patient_name):
    """Return a DICOM Age String (``nnnY``) parsed from the end of ``patient_name``.

    The name must end with digits followed by Y, Yr, Yrs, Yea, Year or Years
    (case-insensitive). Returns None when no such suffix is present or when
    the number does not fit the three digits the AS value representation allows.
    """
    age_match = re.search(r'(\d+)(?:Y(?:rs?|ea(?:rs?)?)?)$', patient_name, re.IGNORECASE)
    if not age_match:
        return None
    years = int(age_match.group(1))
    if years > 999:
        return None
    return f"{years:03d}Y"


def modify_dicom_files(root_path, excel_path):
    """De-identify every ``.dcm`` file below ``root_path`` in place.

    For each file the function:
      * copies an age found at the end of PatientName into PatientAge
        (as a DICOM Age String such as ``045Y``) and deletes PatientName;
      * blanks file-meta elements (0002,0016) and (0002,0013) when present;
      * replaces InstitutionName with a numeric counter;
      * deletes the elements listed in ``tags_to_delete`` (PatientID included);
      * saves the dataset back to the same path.

    Args:
        root_path: Directory that is walked recursively for ``.dcm`` files.
        excel_path: Workbook holding the ``InstitutionName`` -> ``Counter``
            mapping. It is read when it exists and always (re)written, even
            when processing stops with an exception, because the counters
            have already been written into the saved DICOM files.

    Files that cannot be read or saved are reported with ``print`` and skipped.
    """
    # Check if the master Excel file exists
    if os.path.exists(excel_path):
        institution_df = pd.read_excel(excel_path)
    else:
        institution_df = pd.DataFrame(columns=["InstitutionName", "Counter"])

    # Name -> counter lookup. Names are normalised to str because Excel may
    # return numeric-looking names as numbers; the first occurrence wins.
    institution_counters = {}
    for known_name, known_counter in zip(institution_df["InstitutionName"], institution_df["Counter"]):
        institution_counters.setdefault(str(known_name), known_counter)
    new_institutions = []

    dicom_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(root_path)
        for file in files
        if file.endswith('.dcm')
    ]

    # (0010,0020) is PatientID, so it is removed here with a presence check
    # instead of an unguarded attribute delete.
    tags_to_delete = [(0x0021, 0x0012), (0x0400, 0x0561), (0x0009, 0x0010), (0x0008, 0x1070), (0x0002, 0x0013), (0x0008, 0x0090), (0x0002, 0x0016), (0x0010, 0x0020)]
    file_meta_tags_to_blank = [(0x0002, 0x0016), (0x0002, 0x0013)]

    try:
        for dicom_path in tqdm(dicom_files, desc="Processing DICOM files"):
            try:
                dicom_image = pydicom.dcmread(dicom_path, force=True)
            except Exception as e:
                print(f"Error reading DICOM file: {dicom_path}")
                print(f"Error message: {str(e)}")
                continue

            if 'PatientName' in dicom_image:
                # Keep the age that is embedded in the name before dropping the name.
                age_string = _dicom_age_string(str(dicom_image.PatientName))
                if age_string is not None:
                    dicom_image.PatientAge = age_string
                del dicom_image.PatientName

            # Files read with force=True may carry incomplete file meta, so
            # only blank the elements that are actually present.
            metadata = getattr(dicom_image, 'file_meta', None)
            if metadata is not None:
                for meta_tag in file_meta_tags_to_blank:
                    if meta_tag in metadata:
                        metadata[meta_tag].value = ''

            # Handle the InstitutionName tag
            if 'InstitutionName' in dicom_image:
                institution_name = str(dicom_image.InstitutionName)
                if institution_name not in institution_counters:
                    new_counter = len(institution_df) + len(new_institutions) + 1
                    institution_counters[institution_name] = new_counter
                    new_institutions.append({"InstitutionName": institution_name, "Counter": new_counter})
                dicom_image.InstitutionName = str(institution_counters[institution_name])

            for tag_number in tags_to_delete:
                if tag_number in dicom_image:
                    del dicom_image[tag_number]

            # Save the modified DICOM image with the same name
            try:
                dicom_image.save_as(dicom_path)
            except Exception as e:
                print(f"Error saving DICOM file: {dicom_path}")
                print(f"Error message: {str(e)}")
    finally:
        # Persist the mapping even on failure: without it the counters already
        # written into the files could no longer be resolved.
        if new_institutions:
            institution_df = pd.concat([institution_df, pd.DataFrame(new_institutions)], ignore_index=True)
        institution_df.to_excel(excel_path, index=False)

In [None]:
def save_dataframes():
    """Write the ``done_df`` and ``not_done_df`` tracking frames to Excel.

    The target paths are the module-level ``done_file`` and ``not_done_file``
    names, which erase_and_save_details assigns; both frames are written
    without their index and a confirmation line is printed afterwards.
    """
    # The globals are only read here, so no `global` declaration is required.
    done_df.to_excel(done_file, index=False)
    not_done_df.to_excel(not_done_file, index=False)
    print("DataFrames saved to Excel.")

def extract_image_tags(soup):
    """Find the first embedded image whose OCR text contains "Reg".

    Every ``<img>`` in ``soup`` is decoded with base64_to_image and passed to
    the module-level ``ocr`` engine; the recognised strings are joined with
    single spaces and searched (case-insensitively) for ``Reg(.*)``.

    Args:
        soup: Parsed document exposing ``find_all('img')``.

    Returns:
        ``(src, match, extracted_text)`` for the first image that matches, or
        ``(None, None, None)`` when no image matches. ``<img>`` tags without a
        usable ``src`` and images that cannot be decoded are skipped.
    """
    for img_tag in soup.find_all('img'):
        # An <img> without a src attribute has nothing to decode.
        base64str = img_tag.get('src')
        if not base64str:
            continue

        image = base64_to_image(base64str)
        if image is None:
            continue

        # Treat a missing OCR result as "no text"; empty lines are skipped below.
        result = ocr.ocr(np.array(image), cls=True) or []
        extracted_text = ' '.join(
            element[1][0] for line in result if line for element in line if element
        )

        match = re.search(r"Reg(.*)", extracted_text, re.IGNORECASE)
        if match:
            return base64str, match, extracted_text
    return None, None, None

def base64_to_image(base64str):
    base64_string = base64str.split(",")[-1]
    
    # Fix padding issues in base64 string
    missing_padding = len(base64_string) % 4
    if missing_padding:
        base64_string += '=' * (4 - missing_padding)
    
    try:
        image_data = base64.b64decode(base64_string)
        image = Image.open(BytesIO(image_data))
        return image
    except (UnidentifiedImageError, base64.binascii.Error) as e:
        logging.warning(f"Unidentified image file or invalid base64 string: {e}")
        return None

def replace_image_with_text(soup, original_base64str, text):
    """Replace the first ``<img>`` whose ``src`` contains ``original_base64str``.

    Args:
        soup: Parsed document exposing ``find_all('img')``.
        original_base64str: Substring looked for inside each image's ``src``.
        text: Replacement passed to the tag's ``replace_with``.

    ``<img>`` tags without a ``src`` attribute are skipped; nothing happens
    when no image matches.
    """
    img_tag = next(
        (tag for tag in soup.find_all('img') if original_base64str in (tag.get('src') or '')),
        None,
    )
    if img_tag is not None:
        img_tag.replace_with(text)

def _has_label(cell, label):
    """Return True when the first <b> element inside ``cell`` contains ``label``."""
    bold = cell.find('b')
    return bold is not None and label in bold.text


def _cell_value(cell, prefix):
    """Return the text of ``cell`` with ``prefix`` removed and whitespace stripped."""
    return cell.get_text().strip().replace(prefix, '').strip()


def _extract_and_scrub_fields(soup):
    """Collect the report header values from ``soup`` and erase the identifying ones.

    Patient Name, Patient ID, Accession Number and Referring Physician have
    the last child of their ``<td>`` replaced with an empty string. An age
    found inside the patient name (first digit onwards) is split off the name
    and written into the ``Age`` cell; otherwise the ``Age`` cell is read,
    defaulting to "0" when it is empty.

    Returns:
        dict keyed by the extracted_data.xlsx column names (without "Folder",
        "Extracted Text" and "Reg No").
    """
    fields = dict.fromkeys([
        "Patient ID", "Patient Name", "Age", "Gender", "Study",
        "Modality", "Study Date", "Accession", "Physician",
    ])

    for cell in soup.find_all('td'):
        if _has_label(cell, 'Patient Name'):
            patient_name = _cell_value(cell, 'Patient Name:')
            age_match = re.search(r'\d+', patient_name)
            if age_match:
                fields["Age"] = patient_name[age_match.start():].strip()
                # Remove the age part from the patient name
                patient_name = patient_name[:age_match.start()].strip()
            fields["Patient Name"] = patient_name
            # Replace text after 'Patient Name' with empty string
            cell.contents[-1].replace_with('')

        if _has_label(cell, 'Patient ID'):
            fields["Patient ID"] = _cell_value(cell, 'Patient ID:')
            cell.contents[-1].replace_with('')

        if _has_label(cell, 'Sex'):
            fields["Gender"] = _cell_value(cell, 'Sex:')

        if _has_label(cell, 'Modality'):
            fields["Modality"] = _cell_value(cell, 'Modality:')

        # 'Study ' (with a space) excludes labels such as 'Study Date'.
        if _has_label(cell, 'Study') and not _has_label(cell, 'Study '):
            fields["Study"] = _cell_value(cell, 'Study:')

        if _has_label(cell, 'Study Date'):
            fields["Study Date"] = _cell_value(cell, 'Study Date:')

        if _has_label(cell, 'Accession Number'):
            fields["Accession"] = _cell_value(cell, 'Accession Number:')
            cell.contents[-1].replace_with('')

        if _has_label(cell, 'Referring Physician'):
            fields["Physician"] = _cell_value(cell, 'Referring Physician:')
            cell.contents[-1].replace_with('')

    if fields["Age"]:
        # Insert the age taken from the patient name into the Age cell.
        for cell in soup.find_all('td'):
            if _has_label(cell, 'Age'):
                cell.contents[-1].replace_with(f':{fields["Age"]}')
    else:
        for cell in soup.find_all('td'):
            if _has_label(cell, 'Age'):
                fields["Age"] = _cell_value(cell, 'Age:') or "0"

    return fields


def erase_and_save_details(input_folder, excel_path, no_report_path, error_folder):
    """Scrub patient details from the "Approved*" reports of every sub-folder.

    For each sub-folder of ``input_folder`` not yet listed in
    ``done_folders.xlsx``:
      * without any ``.html`` file, the folder is moved to ``no_report_path``;
      * otherwise every file whose name starts with "Approved" is read as
        UTF-16 HTML. If an embedded image whose OCR text contains "Reg" is
        found, the header values are extracted, the identifying ones are
        erased, the image is replaced by the recognised text and the file is
        rewritten in place. If not, the folder is moved to ``error_folder``.

    Args:
        input_folder: Directory containing one sub-folder per study.
        excel_path: Directory receiving ``extracted_data.xlsx``,
            ``done_folders.xlsx`` and ``not_done_folders.xlsx``.
        no_report_path: Destination for folders without an HTML report.
        error_folder: Destination for folders whose report had no usable image.

    Side effects:
        Updates the module globals ``done_df``, ``not_done_df``, ``done_file``
        and ``not_done_file``. The three workbooks are written even when
        processing stops with an exception, so progress on already rewritten
        reports is not lost.
    """
    global done_df, not_done_df, done_file, not_done_file
    count = 0
    no_report = []

    columns = ["Folder", "Patient ID", "Patient Name", "Age", "Gender", "Study", "Modality", "Study Date", "Accession", "Physician", "Extracted Text", "Reg No"]
    # Rows are collected in lists and turned into DataFrames once at the end.
    data_rows = []
    done_rows = []
    not_done_rows = []

    # Load existing done folders from Excel if it exists
    done = set()
    done_file = os.path.join(excel_path, 'done_folders.xlsx')
    not_done_file = os.path.join(excel_path, 'not_done_folders.xlsx')

    if os.path.exists(done_file):
        done_df = pd.read_excel(done_file)
        # Excel returns numeric-looking folder names as numbers; compare as str.
        done = set(done_df['Folder'].astype(str))

    # Ensure directories exist
    os.makedirs(no_report_path, exist_ok=True)
    os.makedirs(error_folder, exist_ok=True)

    try:
        # Iterate through each folder in the input folder
        for folder in tqdm(os.listdir(input_folder)):
            inside_folder = os.path.join(input_folder, folder)
            # Plain files (for example the workbooks written below when
            # excel_path is the input folder itself) are not study folders.
            if folder in done or not os.path.isdir(inside_folder):
                continue

            html_files = [f for f in os.listdir(inside_folder) if f.endswith('.html')]
            if not html_files:
                shutil.move(inside_folder, os.path.join(no_report_path, folder))
                no_report.append(folder)
                continue

            for filename in os.listdir(inside_folder):
                if not filename.startswith('Approved'):
                    continue

                report_path = os.path.join(inside_folder, filename)
                # Read and parse the HTML report
                with open(report_path, 'r', encoding='utf-16') as file:
                    soup = BeautifulSoup(file.read(), 'html.parser')

                base64str, match, extracted_text = extract_image_tags(soup)

                if not (base64str and match):
                    count += 1
                    not_done_rows.append({'Folder': folder})
                    shutil.move(inside_folder, os.path.join(error_folder, folder))
                    # The folder no longer exists at this path; stop reading from it.
                    break

                fields = _extract_and_scrub_fields(soup)

                reg_no = match.group(0).strip()
                replace_image_with_text(soup, base64str, reg_no)

                data_rows.append({
                    "Folder": folder,
                    **fields,
                    "Extracted Text": extracted_text,
                    "Reg No": reg_no,
                })

                # Write the modified HTML back to the same file
                with open(report_path, 'w', encoding='utf-16') as file:
                    file.write(str(soup))

                done_rows.append({'Folder': folder})
    finally:
        if done_rows:
            done_df = pd.concat([done_df, pd.DataFrame(done_rows)], ignore_index=True)
        if not_done_rows:
            not_done_df = pd.concat([not_done_df, pd.DataFrame(not_done_rows)], ignore_index=True)

        # NOTE(review): extracted_data.xlsx is overwritten on every run, so rows
        # of folders finished in an earlier run are not carried over - confirm
        # whether the existing file should be appended to instead.
        data_df = pd.DataFrame(data_rows, columns=columns)
        data_df.to_excel(os.path.join(excel_path, 'extracted_data.xlsx'), index=False)

        # Save the done / not done folder lists
        done_df.to_excel(done_file, index=False)
        not_done_df.to_excel(not_done_file, index=False)

    # Print the counts after the main loop
    print(f"Total count of folders moved to error folder: {count}")
    print(f"Total count of folders without report: {len(no_report)}")

In [None]:
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
# Folder whose sub-folders erase_and_save_details processes and that
# modify_dicom_files walks for .dcm files.
input_folder = r"C:\Users\Techjiva\Downloads\Sagar\WRIST FRACTURE"
# Destination for folders that erase_and_save_details moves aside when no
# usable report image is found.
error_folder = r"C:\Users\Techjiva\Downloads\Sagar"
# Destination for folders that contain no .html report.
no_report_path = r"C:\Users\Techjiva\Downloads\Sagar"
# Directory where done_folders.xlsx, not_done_folders.xlsx and
# extracted_data.xlsx are written (here the same directory as input_folder).
excel_path = r"C:\Users\Techjiva\Downloads\Sagar\WRIST FRACTURE"
# Workbook holding the InstitutionName -> Counter mapping used by modify_dicom_files.
dic_excel_path = "C:/Users/Techjiva/Downloads/Sagar/WRIST FRACTURE/Institution_names.xlsx"

In [None]:
# Run the main function: scrub the "Approved*" HTML reports under input_folder
# in place and write the tracking / extracted-data workbooks to excel_path.
erase_and_save_details(input_folder, excel_path, no_report_path, error_folder)

In [None]:
# De-identify every .dcm file below input_folder in place and update the
# institution mapping workbook at dic_excel_path.
modify_dicom_files(input_folder, dic_excel_path)