In [None]:
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import os
import chardet

def extract_patient_details_from_text(content):
    """Pull the nine report-header fields out of flattened report text.

    Every field except the study date is captured lazily between its own
    label and the next occurrence of the following field's label, with
    ``.`` allowed to span newlines. The study date must look like two
    digits, three word characters and four digits joined by hyphens.
    The labels are lowercase and matching is case-sensitive, so the text
    has to be lowercased before it is passed in.

    Args:
        content: Report text to search.

    Returns:
        A 9-tuple ``(patient_id, patient_name, age, sex, accession,
        modality, physician, study, study_date)``. Each entry is the
        captured string with surrounding whitespace stripped, or ``None``
        when its pattern does not match.
    """
    # The order of this tuple is the order of the returned values.
    field_patterns = (
        r"patient id:\s*(.*?)\s*patient name:",
        r"patient name:\s*(.*?)\s*age:",
        r"age:\s*(.*?)\s*sex:",
        r"sex:\s*(.*?)\s*accession number:",
        r"accession number:\s*(.*?)\s*modality:",
        r"modality:\s*(.*?)\s*referring physician:",
        r"referring physician:\s*(.*?)\s*study:",
        r"study:\s*(.*?)\s*study date:",
        r"study date:\s*(\d{2}-\w{3}-\d{4})",
    )

    def capture(regex):
        found = re.search(regex, content, re.DOTALL)
        return found.group(1).strip() if found else None

    return tuple(capture(regex) for regex in field_patterns)

# Extract patient details from the four-cell rows of a bordered details table
def extract_patient_details_from_table1(soup):
    """Extract patient details from a bordered label/value table.

    The first table matching the attribute filter below is scanned row by
    row. Only rows with exactly four ``<td>`` cells are used; they are read
    as two ``label, value`` pairs. Labels must match exactly, including the
    trailing colon. When the same label occurs more than once, the last
    occurrence wins.

    If the patient name contains a digit, everything from the first digit
    onward is moved into the age (replacing any value read from an
    ``Age:`` cell) and the name is truncated before it.

    Args:
        soup: Parsed document exposing ``find``; the table, rows and cells
            it yields must expose ``find_all`` / ``get_text`` (a
            BeautifulSoup tree in this file).

    Returns:
        A 9-tuple ``(patient_id, patient_name, age, sex, accession,
        modality, physician, study, study_date)``. Fields that were not
        found are ``None``; all nine are ``None`` when no table matches.
    """
    # NOTE(review): 'cellapdding' looks like a misspelling of 'cellpadding'.
    # It is kept unchanged because only tables carrying this exact attribute
    # match -- confirm against the real report markup before correcting it.
    table = soup.find('table', {'border': '1', 'cellapdding': '0', 'cellspacing': '0'})

    if not table:
        return None, None, None, None, None, None, None, None, None

    # Cell label -> result field. Insertion order is the return order.
    label_to_field = {
        "Patient ID:": "patient_id",
        "Patient Name:": "patient_name",
        "Age:": "age",
        "Sex:": "sex",
        "Accession:": "accession",
        "Modality:": "modality",
        "Physician:": "physician",
        "Study:": "study",
        "Study Date:": "study_date",
    }
    details = dict.fromkeys(label_to_field.values())

    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 4:
            continue

        texts = [cell.get_text(strip=True) for cell in cells]
        # Left-hand pair first, then right-hand pair, as laid out in the row.
        for label, value in ((texts[0], texts[1]), (texts[2], texts[3])):
            field = label_to_field.get(label)
            if field is not None:
                details[field] = value

    # The name cell may carry the age as a suffix (e.g. "John Doe 45Y").
    # Guard against a missing name: re.search(None) would raise TypeError.
    patient_name = details["patient_name"]
    if patient_name:
        age_match = re.search(r'\d+', patient_name)
        if age_match:
            details["age"] = patient_name[age_match.start():].strip()
            details["patient_name"] = patient_name[:age_match.start()].strip()

    return tuple(details[field] for field in label_to_field.values())

# Extract patient details from <td> cells that carry a bold (<b>) field label
def extract_patient_details_from_table2(soup):
    """Extract patient details from ``<td>`` cells labelled with a ``<b>`` tag.

    Every ``<td>`` in the document is inspected. The text of its first
    ``<b>`` descendant selects the field by substring match, and the value
    is the cell's full text with the label prefix removed. A cell may feed
    more than one field if its label contains several of the keywords.
    When a label occurs more than once, the last occurrence wins.

    If the patient name contains a digit, everything from the first digit
    onward becomes the age and the name is truncated before it.

    Args:
        soup: Parsed document exposing ``find_all``; each cell must expose
            ``find`` and ``get_text`` (a BeautifulSoup tree in this file).

    Returns:
        A 9-tuple ``(patient_id, patient_name, age, sex, accession,
        modality, physician, study, study_date)``. Fields whose label was
        never seen are ``None``.
    """
    # Start every field at None so a missing label yields None instead of
    # an UnboundLocalError at the return statement.
    patient_id_value = None
    patient_name_value = None
    age_value = None
    sex_value = None
    accession_value = None
    modality_value = None
    physician_value = None
    study_value = None
    study_date_value = None

    for tag in soup.find_all('td'):
        bold = tag.find('b')
        if not bold:
            continue

        label = bold.text
        cell_text = tag.get_text().strip()

        if 'Patient Name' in label:
            patient_name_value = cell_text.replace('Patient Name:', '').strip()
            age_match = re.search(r'\d+', patient_name_value)
            if age_match:
                age_value = patient_name_value[age_match.start():].strip()
                patient_name_value = patient_name_value[:age_match.start()].strip()

        if 'Patient ID' in label:
            patient_id_value = cell_text.replace('Patient ID:', '').strip()

        if 'Sex' in label:
            sex_value = cell_text.replace('Sex:', '').strip()

        # 'Study ' (with a space) excludes the 'Study Date' label.
        if 'Study' in label and 'Study ' not in label:
            study_value = cell_text.replace('Study:', '').strip()

        if 'Modality' in label:
            modality_value = cell_text.replace('Modality:', '').strip()

        if 'Study Date' in label:
            study_date_value = cell_text.replace('Study Date:', '').strip()

        if 'Accession' in label:
            accession_value = cell_text.replace('Accession Number:', '').strip()

        if 'Physician' in label:
            physician_value = cell_text.replace('Referring Physician:', '').strip()

    return (
        patient_id_value,
        patient_name_value,
        age_value,
        sex_value,
        accession_value,
        modality_value,
        physician_value,
        study_value,
        study_date_value,
    )

def clean_value(value):
    """Strip ASCII control characters from a string value.

    Args:
        value: Any object. Only ``str`` instances are altered.

    Returns:
        For a string, a copy with every character in the ranges
        ``\\x00``-``\\x1F`` and ``\\x7F`` removed; any other value is
        returned unchanged.
    """
    if not isinstance(value, str):
        return value

    control_chars = r'[\x00-\x1F\x7F]'
    return re.sub(control_chars, '', value)

def _read_report_html(report_file_path):
    """Read a report file once and decode it with chardet's detected encoding."""
    with open(report_file_path, 'rb') as f:
        raw_data = f.read()

    # chardet reports an encoding of None when it cannot make a guess; fall
    # back to UTF-8 and replace undecodable bytes instead of aborting the run.
    file_encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
    content = raw_data.decode(file_encoding, errors='replace')

    # Normalise line endings the way a text-mode open() would.
    return content.replace('\r\n', '\n').replace('\r', '\n')


def _has_any_detail(details):
    """Return True if any field other than the age was extracted."""
    patient_id, name, _age, sex, accession, modality, physician, study, study_date = details
    return any((name, patient_id, sex, study, modality, study_date, accession, physician))


def _extract_details(soup, text):
    """Run the text extractor, then each table extractor until one yields data."""
    details = extract_patient_details_from_text(text)
    for fallback in (extract_patient_details_from_table1, extract_patient_details_from_table2):
        if _has_any_detail(details):
            break
        details = fallback(soup)
    return details


def save_details(input_folder, excel_path):
    """Extract patient details from every report and write them to Excel.

    Each sub-directory of ``input_folder`` is scanned. A sub-directory with
    no ``.html`` file is counted as having no report. Otherwise every file
    in it whose name starts with ``Final`` is parsed as HTML, its patient
    details are extracted, and it is labelled ``"Normal"`` when its
    lowercased text contains ``"no significant abnorm"`` and ``"Abnormal"``
    otherwise. Entries of ``input_folder`` that are not directories are
    skipped.

    The rows are written to ``<name>_details.xlsx`` inside ``excel_path``,
    where ``<name>`` is the last path component of ``input_folder``. Two
    summary lines are printed.

    Args:
        input_folder: Directory whose sub-directories hold the reports.
        excel_path: Directory in which the workbook is created.
    """
    columns = ["Folder", "Patient ID", "Patient Name", "Age", "Gender", "Study", "Modality", "Study Date", "Accession", "Physician", "Result"]
    # Rows are collected in a list and turned into a DataFrame once at the
    # end; concatenating inside the loop copies the whole frame every time.
    rows = []
    no_report = []

    for folder in tqdm(os.listdir(input_folder)):
        inside_folder = os.path.join(input_folder, folder)
        if not os.path.isdir(inside_folder):
            # A stray file next to the case folders; listing it would raise.
            continue

        filenames = os.listdir(inside_folder)
        if not any(f.endswith('.html') for f in filenames):
            no_report.append(folder)
            continue

        for filename in filenames:
            if not filename.startswith('Final'):
                continue

            content = _read_report_html(os.path.join(inside_folder, filename))
            soup = BeautifulSoup(content, 'html.parser')

            # The text extractor's patterns are lowercase, so lowercase here.
            text = soup.get_text().lower()
            result = "Normal" if "no significant abnorm" in text else "Abnormal"

            (patient_id_value, patient_name_value, age_value, sex_value,
             accession_value, modality_value, physician_value, study_value,
             study_date_value) = _extract_details(soup, text)

            # The name may carry the age as a suffix; split at the first digit.
            if patient_name_value:
                age_match = re.search(r'\d+', patient_name_value)
                if age_match:
                    age_value = patient_name_value[age_match.start():].strip()
                    patient_name_value = patient_name_value[:age_match.start()].strip()

            # Use "0" as the placeholder when no age could be extracted.
            if not age_value:
                age_value = "0"

            rows.append({
                "Folder": clean_value(folder),
                "Patient ID": clean_value(patient_id_value),
                "Patient Name": clean_value(patient_name_value),
                "Age": clean_value(age_value),
                "Gender": clean_value(sex_value),
                "Study": clean_value(study_value),
                "Modality": clean_value(modality_value),
                "Study Date": clean_value(study_date_value),
                "Accession": clean_value(accession_value),
                "Physician": clean_value(physician_value),
                "Result": clean_value(result),
            })

    # normpath drops a trailing slash; without it basename() returns '' and
    # the workbook would be named just "_details.xlsx".
    base_name = os.path.basename(os.path.normpath(input_folder))
    output_path = os.path.join(excel_path, f"{base_name}_details.xlsx")

    data_df = pd.DataFrame(rows, columns=columns)
    data_df.to_excel(output_path, index=False)

    print(f"Processed {len(rows)} reports; extracted details saved to '{output_path}'.")
    print(f"Folders with no report: {len(no_report)}")

# Directory that receives the workbook, and the directory of case folders to scan.
excel_path = "/data/AP-Extracted/"
input_folder = "/data/AP-Extracted/01/"

# Extract the details from every report and write the spreadsheet.
save_details(input_folder=input_folder, excel_path=excel_path)