In [5]:
import os
import time
import shutil
import pandas as pd
from collections import defaultdict
import medspacy
import spacy
from rapidfuzz import fuzz
from medspacy.context import ConText, ConTextRule
from medspacy.target_matcher import TargetRule
from medspacy.section_detection import Sectionizer, SectionRule
from medspacy.target_matcher import TargetMatcher
from medspacy.postprocess import PostprocessingPattern, PostprocessingRule
from spacy.tokens import Token, Span
from spacy.language import Language

# Define file paths for input reports (sorted by PHN) and output directory (sorted by cancer type).
# Both paths are relative, so they are resolved against the current working directory.
phn_sorted_path = "./Sorted_by_PHN_Cleaned_Reports"
sorted_path = "./Sorted_By_Cancer_Type"

# Register custom extensions for cancer detection.
# force=True lets the registration be repeated (e.g. when the notebook cell is re-run).
# NOTE(review): neither extension is read or written in the code visible here - confirm they are still needed.
Token.set_extension("cancer_type", default=None, force=True)
Span.set_extension("confidence_score", default=0.0, force=True)

# Define cancer types and their associated terms.
# These are all the keywords used in the other algorithm; test which ones work best.
# cancer_type_terms = {
#     "breast_cancer": [
#         "breast cancer", "breast carcinoma", "invasive ductal carcinoma", "invasive lobular carcinoma",
#         "ductal carcinoma in situ", "dcis", "lobular carcinoma in situ", "lcis", "her2", "triple negative",
#         "estrogen receptor", "progesterone receptor", "er positive", "er +", "pr +", "pr positive", "er negative",
#         "pr negative", "mastectomy", "lumpectomy", "mammary carcinoma",
#         "sentinel lymph node biopsy", "axillary dissection", "mammogram abnormality",
#         "left breast", "right breast", "nipple", "her2 signals per cell", "invasive ductal"
#     ],
#     "lung_cancer": [
#         "lung cancer", "lung carcinoma", "non-small cell lung cancer", "nsclc", "small cell lung cancer",
#         "sclc", "large cell carcinoma", "mucinous adenocarcinoma", "bronchial carcinoma", "ttf-1",
#         "adenocarcinoma of lung", "squamous cell carcinoma of lung", "egfr mutation",
#         "alk rearrangement", "pdl1", "pleural effusion", "lobectomy", "thoracotomy", "bronchogenic carcinoma",
#         "endobronchial carcinoma", "pulmonary carcinoma", "lung metastasis",
#         "bronchoscopy", "lung biopsy", "transbronchial biopsies", "bronchovascular"
#     ],
#     "lymph_cancer": [
#         "lymphoma", "hodgkin", "non-hodgkin", "b-cell lymphoma", "t-cell lymphoma",
#         "mantle cell lymphoma", "follicular lymphoma", "diffuse large b-cell lymphoma",
#         "dlbcl", "reed-sternberg cells", "splenomegaly", "lymphadenopathy",
#         "marginal zone lymphoma", "burkitt lymphoma", "cd20 positive", "cd30 positive",
#         "left supraclavicular lymph node", "extranodal lymphoma", "lymphocytic"
#     ],
#     "brain_cancer": [
#         "brain cancer", "glioblastoma", "gbm", "astrocytoma", "meningioma", "oligodendroglioma",
#         "craniopharyngioma", "medulloblastoma", "ependymoma", "glioma", "cns tumor",
#         "brain metastasis", "intracranial mass", "neurological deficit", "seizures",
#         "idh1", "idh2", "1p19q co-deletion", "mgmt methylation", "brain tumor", "craniotomy",
#         "brain biopsy", "stereotactic radiosurgery", "intracranial neoplasm"
#     ],
#     "skin_cancer": [
#         "skin cancer", "melanoma", "basal cell carcinoma", "bcc", "squamous cell carcinoma", "scc",
#         "merkel cell carcinoma", "melanoma in situ", "nevus", "dysplastic nevus",
#         "braf mutation", "skin lesion", "excisional biopsy", "dermatofibrosarcoma protuberans",
#         "breslow thickness", "clark level", "sentinel node biopsy", "nerve-derived tumor", "skin biopsy",
#         "wide local excision", "mohs surgery", "dermatologic surgery", "skin metastasis", "superficial spreading melanoma", "nodular melanoma"
#     ],
#     "colon_cancer": [
#         "colon cancer", "colorectal cancer", "rectal cancer", "colonic polyp",
#         "adenocarcinoma of colon", "colonic mucosa", "adenomatous mucosa",
#         "sigmoid colon", "fecal occult blood", "colonoscopy", "kras mutation", "nras mutation",
#         "microsatellite instability", "msi", "msi-h", "mismatch repair", "mmr", "braf mutation",
#         "lynch syndrome", "familial adenomatous polyposis", "fap", "rectal biopsy", "colon resection",
#         "colectomy"
#     ],
#     "uterine_or_ovarian_cancer": [
#         "ovarian cancer", "uterine cancer", "endometrial cancer",
#         "uterine sarcoma", "fallopian tube carcinoma", "endometrial hyperplasia",
#         "serous carcinoma", "clear cell carcinoma", "ca-125", "total abdominal hysterectomy",
#         "bilateral salpingo-oophorectomy", "pelvic mass", "endometriosis",
#         "pten mutation", "lynch syndrome", "ovaries",
#         "omentectomy", "ovarian papillary", "fallopian tube", "uterine malignancy",
#         "uterine biopsy", "hysterectomy", "myometrial invasion", "endometrial sampling", "uterine bleeding",
#         "uterine mass", "endometrial thickening", "uterine metastasis", "lymphadenectomy", "pelvic washing"
#     ],
#     "vulvar_cancer": [
#         "vulvar cancer", "vulvar intraepithelial neoplasia", "vin", "vulvar lesion",
#         "groin lesion", "female genital tract", "hpv-associated carcinoma",
#         "squamous cell carcinoma of vulva", "bartholin gland carcinoma",
#         "vulvar melanoma", "vulvar metastasis", "vulvar mass",
#         "vulvar biopsy", "radical vulvectomy", "vulvar resection", "inguinal lymph node dissection",
#         "vulvar reconstruction", "hpv related vulvar cancer", "introitus"
#     ],
#     "leukemia_cancer": [
#         "leukemia", "acute myeloid leukemia", "aml", "acute lymphoblastic leukemia", "all",
#         "chronic myeloid leukemia", "cml", "chronic lymphocytic leukemia", "cll", "bone marrow",
#         "white blood cell count", "philadelphia chromosome", "bcr-abl", "cd19",
#         "cd20", "cd34", "blast cells", "peripheral smear", "bone marrow biopsy",
#         "hairy cell leukemia", "myelodysplastic syndrome", "mds", "flow cytometry", "cytogenetics",
#         "bone marrow transplant", "stem cell transplant", "leukemic cells"
#     ],
#     "cervical_cancer": [
#         "cervical cancer", "cervical intraepithelial neoplasia", "cin", "cervix",
#         "endocervical curettings", "leep procedure", "cone biopsy",
#         "pap smear", "hpv positive", "squamous cell carcinoma of cervix",
#         "adenocarcinoma of cervix", "pelvic lymphadenopathy", "human papillomavirus",
#         "radical hysterectomy", "trachelectomy", "cervical biopsy", "cervical metastasis",
#         "parametrial invasion", "cervical screening"
#     ],
#     "stomach_cancer": [
#         "stomach cancer", "gastric cancer", "gastric adenocarcinoma",
#         "gastric lymphoma", "gastric mass", "gastric ulcer",
#         "signet ring cell", "epigastric pain", "upper endoscopy", "h. pylori", "gastrectomy",
#         "her2 positive gastric", "gastroesophageal junction", "endoscopy", "gastric biopsy",
#         "helicobacter pylori", "gastric resection", "stomach metastasis", "linitis plastica", "gastric cardia cancer"
#     ],
#     "pancreatic_cancer": [
#         "pancreatic cancer", "pancreas", "pancreatic adenocarcinoma",
#         "pancreatic neuroendocrine tumor", "pnet", "whipple procedure", "ca 19-9",
#         "pancreatic mass", "jaundice", "ampullary carcinoma", "intraductal papillary mucinous neoplasm",
#         "ipmn", "mucinous cystic neoplasm", "mcn", "pancreatic biopsy", "ercp", "pancreatic duct dilation",
#         "pancreatic resection", "distal pancreatectomy", "pancreatic head mass", "pancreatic tail mass", "pancreatic metastasis"
#     ],
#     "kidney_cancer": [
#         "kidney cancer", "renal cell carcinoma", "rcc", "clear cell renal carcinoma",
#         "papillary renal cell carcinoma", "nephrectomy", "renal mass", "hematuria",
#         "oncocytoma", "renal vein invasion", "abdominal ultrasound", "von hippel-lindau", "vhl",
#         "chromophobe renal cell carcinoma", "collecting duct carcinoma", "clear cell carcinoma",
#         "partial nephrectomy", "renal biopsy", "renal tumor", "kidney metastasis",
#         "von hippel-lindau disease", "renal cyst", "renal artery embolization", "renal"
#     ],
#     "thyroid_cancer": [
#         "thyroid cancer", "thyroid carcinoma", "papillary thyroid cancer", "ptc",
#         "follicular thyroid cancer", "medullary thyroid cancer", "mtc",
#         "thyroid nodule", "thyroidectomy", "tsh", "thyroglobulin",
#         "fine needle aspiration", "bethesda classification", "braf v600e",
#         "ret/ptc rearrangement", "anaplastic thyroid cancer", "thyroid",
#         "thyroid ultrasound", "thyroid biopsy", "thyroid metastasis", "thyroid lymph node"
#     ]
# }

# Define cancer types, smaller list to test first.
# Maps each cancer-type label to the lowercase keywords/phrases that indicate it.
# The labels are reused as entity-ruler labels, target-rule categories and
# output folder names; the terms are fuzzy-matched against entity text.
# Some terms are listed under more than one type (e.g. "brca1"/"brca2",
# "braf mutation", "lynch syndrome", "hematuria"), so a single mention can
# count toward several types.
# NOTE(review): very short or everyday terms such as "all", "cis", "cin" and
# "vin" look likely to match unrelated text - confirm they behave as intended.
cancer_type_terms = {
    "breast_cancer": [
        "breast cancer", "breast carcinoma", "invasive ductal carcinoma", "invasive lobular carcinoma",
        "ductal carcinoma in situ", "dcis", "lobular carcinoma in situ", "lcis", "her2", "triple negative",
        "estrogen receptor", "progesterone receptor", "er positive", "er +", "pr +", "pr positive", "er negative", 
        "pr negative", "brca1", "brca2", "mastectomy", "lumpectomy", "mammary carcinoma", 
        "sentinel lymph node biopsy", "axillary dissection", "mammogram abnormality"
    ],
    "lung_cancer": [
        "lung cancer", "lung carcinoma", "non-small cell lung cancer", "nsclc", "small cell lung cancer",
        "sclc", "large cell carcinoma", "mucinous adenocarcinoma", "bronchial carcinoma", "ttf-1",
        "adenocarcinoma of lung", "squamous cell carcinoma of lung", "egfr mutation",
        "alk rearrangement", "pdl1", "pleural effusion", "lobectomy", "thoracotomy", "bronchogenic carcinoma"
    ],
    "lymph_cancer": [
        "lymphoma", "hodgkin", "non-hodgkin", "b-cell lymphoma", "t-cell lymphoma",
        "mantle cell lymphoma", "follicular lymphoma", "diffuse large b-cell lymphoma",
        "dlbcl", "reed-sternberg cells", "splenomegaly", "lymphadenopathy", "bone marrow biopsy",
        "marginal zone lymphoma", "burkitt lymphoma", "cd20 positive", "cd30 positive"
    ],
    "brain_cancer": [
        "brain cancer", "glioblastoma", "gbm", "astrocytoma", "meningioma", "oligodendroglioma",
        "craniopharyngioma", "medulloblastoma", "ependymoma", "glioma", "cns tumor",
        "brain metastasis", "intracranial mass", "neurological deficit", "seizures",
        "idh1", "idh2", "1p19q co-deletion", "mgmt methylation"
    ],
    "skin_cancer": [
        "skin cancer", "melanoma", "basal cell carcinoma", "bcc", "squamous cell carcinoma", "scc",
        "merkel cell carcinoma", "melanoma in situ", "nevus", "dysplastic nevus",
        "braf mutation", "skin lesion", "excisional biopsy", "dermatofibrosarcoma protuberans",
        "breslow thickness", "clark level", "sentinel node biopsy"
    ],
    "colon_cancer": [
        "colon cancer", "colorectal cancer", "rectal cancer", "colonic polyp",
        "adenocarcinoma of colon", "colonic mucosa", "adenomatous mucosa",
        "sigmoid colon", "fecal occult blood", "colonoscopy", "kras mutation", "nras mutation",
        "microsatellite instability", "msi", "msi-h", "mismatch repair", "mmr", "braf mutation",
        "lynch syndrome", "familial adenomatous polyposis", "fap"
    ],
    "uterine_or_ovarian_cancer": [
        "ovarian cancer", "uterine cancer", "endometrial cancer",
        "uterine sarcoma", "fallopian tube carcinoma", "endometrial hyperplasia",
        "serous carcinoma", "clear cell carcinoma", "ca-125", "total abdominal hysterectomy",
        "bilateral salpingo-oophorectomy", "pelvic mass", "endometriosis",
        "pten mutation", "lynch syndrome", "brca1", "brca2"
    ],
    "vulvar_cancer": [
        "vulvar cancer", "vulvar intraepithelial neoplasia", "vin", "vulvar lesion",
        "groin lesion", "female genital tract", "hpv-associated carcinoma",
        "squamous cell carcinoma of vulva", "bartholin gland carcinoma",
        "vulvar melanoma", "vulvar metastasis"
    ],
    "leukemia_cancer": [
        "leukemia", "acute myeloid leukemia", "aml", "acute lymphoblastic leukemia", "all",
        "chronic myeloid leukemia", "cml", "chronic lymphocytic leukemia", "cll", "bone marrow",
        "white blood cell count", "philadelphia chromosome", "bcr-abl", "cd19",
        "cd20", "cd34", "blast cells", "peripheral smear", "chemotherapy",
        "hairy cell leukemia", "myelodysplastic syndrome", "mds"
    ],
    "cervical_cancer": [
        "cervical cancer", "cervical intraepithelial neoplasia", "cin", "cervix",
        "endocervical curettings", "leep procedure", "cone biopsy",
        "pap smear", "hpv positive", "squamous cell carcinoma of cervix",
        "adenocarcinoma of cervix", "pelvic lymphadenopathy", "human papillomavirus"
    ],
    "stomach_cancer": [
        "stomach cancer", "gastric cancer", "gastric adenocarcinoma",
        "gastric lymphoma", "gastric mass", "gastric ulcer",
        "signet ring cell", "epigastric pain", "upper endoscopy", "h. pylori", "gastrectomy",
        "her2 positive gastric", "gastroesophageal junction"
    ],
    "pancreatic_cancer": [
        "pancreatic cancer", "pancreas", "pancreatic adenocarcinoma",
        "pancreatic neuroendocrine tumor", "pnet", "whipple procedure", "ca 19-9",
        "pancreatic mass", "jaundice", "ampullary carcinoma", "intraductal papillary mucinous neoplasm",
        "ipmn", "mucinous cystic neoplasm", "mcn"
    ],
    "kidney_cancer": [
        "kidney cancer", "renal cell carcinoma", "rcc", "clear cell renal carcinoma",
        "papillary renal cell carcinoma", "nephrectomy", "renal mass", "hematuria",
        "oncocytoma", "renal vein invasion", "abdominal ultrasound", "von hippel-lindau", "vhl",
        "chromophobe renal cell carcinoma", "collecting duct carcinoma"
    ],
    "thyroid_cancer": [
        "thyroid cancer", "thyroid carcinoma", "papillary thyroid cancer", "ptc",
        "follicular thyroid cancer", "medullary thyroid cancer", "mtc",
        "thyroid nodule", "thyroidectomy", "tsh", "thyroglobulin",
        "fine needle aspiration", "bethesda classification", "braf v600e",
        "ret/ptc rearrangement", "anaplastic thyroid cancer"
    ],
    "liver_cancer": [
        "liver cancer", "hepatocellular carcinoma", "hcc", "cholangiocarcinoma",
        "liver mass", "liver nodule", "alpha-fetoprotein", "afp", "cirrhosis", "hepatic lesion",
        "transarterial chemoembolization", "tace", "liver biopsy", "radiofrequency ablation", "rfa",
        "portal vein thrombosis", "hepatitis b", "hepatitis c"
    ],
    "bladder_cancer": [
        "bladder cancer", "bladder carcinoma", "transitional cell carcinoma", "tcc",
        "urothelial carcinoma", "turbt", "cystectomy",
        "hematuria", "bcg treatment", "urine cytology", "carcinoma in situ", "cis",
        "muscle-invasive bladder cancer", "mibc", "non-muscle invasive bladder cancer", "nmibc"
    ]
}

# Load all reports from the PHN-sorted directory
def load_reports(base_path=None):
    """Collect every report file under the PHN-sorted input directory.

    Args:
        base_path: Directory to scan. Defaults to the module-level
            ``phn_sorted_path`` so existing callers keep working.

    Returns:
        A dict mapping each directory that directly contains files (as a path
        relative to ``base_path``) to ``{"files": [full file paths]}``.
        Directories without files are omitted. Nested directories are NOT
        skipped: each one becomes its own key (e.g. ``"123/sub"``), and files
        placed directly in ``base_path`` are grouped under ``"."``.
    """
    if base_path is None:
        base_path = phn_sorted_path

    all_files = {}
    for root, dirs, files in os.walk(base_path):
        # Sorting in place makes os.walk visit sub-directories in a stable
        # order, so repeated runs process folders in the same sequence.
        dirs.sort()
        if not files:
            continue
        phn = os.path.relpath(root, base_path)
        entry = all_files.setdefault(phn, {"files": []})
        entry["files"].extend(os.path.join(root, name) for name in sorted(files))
    return all_files


def setup_medspacy():
    """Build and return the NLP pipeline used to classify reports.

    Starting from spaCy's ``en_core_web_md`` model, the pipeline gets (each
    only if not already present): a sentencizer at the front, an entity ruler
    before ``ner``, a medspaCy target matcher, a medspaCy context component
    placed after the target matcher, and a medspaCy sectionizer. The ruler and
    the target matcher are both fed every term in ``cancer_type_terms``,
    labelled with its cancer type. Extra forward-looking negation rules and
    the report section headers are then registered.

    Returns:
        The configured spaCy ``Language`` object.
    """
    pipeline = spacy.load("en_core_web_md")

    if "sentencizer" not in pipeline.pipe_names:
        pipeline.add_pipe("sentencizer", first=True)

    # Flatten the keyword table once; both matchers consume the same pairs
    # in the same (cancer type, then term) order.
    labelled_terms = [
        (cancer_type, term)
        for cancer_type, terms in cancer_type_terms.items()
        for term in terms
    ]

    if "entity_ruler" not in pipeline.pipe_names:
        entity_ruler = pipeline.add_pipe("entity_ruler", before="ner")
        entity_ruler.add_patterns(
            [{"label": cancer_type, "pattern": term} for cancer_type, term in labelled_terms]
        )

    if "medspacy_target_matcher" not in pipeline.pipe_names:
        pipeline.add_pipe("medspacy_target_matcher")
        matcher = pipeline.get_pipe("medspacy_target_matcher")
        for cancer_type, term in labelled_terms:
            matcher.add(
                TargetRule(literal=term, category=cancer_type, pattern=None, on_match=None)
            )

    if "medspacy_context" not in pipeline.pipe_names:
        pipeline.add_pipe("medspacy_context", after="medspacy_target_matcher")
    context_component = pipeline.get_pipe("medspacy_context")

    # Phrases that negate whatever follows them ("ruled out" is listed twice,
    # exactly as in the original rule set).
    negation_phrases = [
        "negative", "no evidence of", "ruled out", "free of", "without",
        "no sign of", "no indication of", "negative for", "absent", "no",
        "denies", "deny", "ruled out", "not seen", "not identified", "unremarkable for"
    ]
    rules = []
    for phrase in negation_phrases:
        rules.append(
            ConTextRule(literal=phrase, category="NEGATED_EXISTENCE", direction="forward")
        )
    context_component.add(rules)

    if "medspacy_sectionizer" not in pipeline.pipe_names:
        pipeline.add_pipe("medspacy_sectionizer")
    sectionizer = pipeline.get_pipe("medspacy_sectionizer")

    # Each header is registered with its own text as both literal and category.
    section_names = (
        "diagnosis",
        "clinical history",
        "gross description",
        "microscopic description",
        "impression",
        "comment",
        "final diagnosis",
        "assessment",
        "conclusion",
        "findings",
    )
    sectionizer.add([SectionRule(name, name, pattern=None) for name in section_names])

    return pipeline

def get_section_weight(section_name):
    """Return the scoring weight for a report section.

    The lookup is case-insensitive. Section names that are not listed get the
    neutral weight ``1.0``.
    """
    # (weight, section names sharing that weight), highest weight first.
    weight_groups = (
        (5.0, ("final diagnosis",)),
        (4.0, ("diagnosis",)),
        (3.5, ("impression",)),
        (3.0, ("assessment", "conclusion")),
        (2.0, ("microscopic description", "findings")),
        (1.0, ("gross description",)),
        (0.5, ("clinical history", "comment")),
    )

    wanted = section_name.lower()
    for weight, names in weight_groups:
        if wanted in names:
            return weight
    return 1.0

def _section_label(section):
    """Return the section's name, or None when the section has no usable name."""
    # NOTE(review): the original code only looked at `section.title`. medspaCy's
    # Section objects appear to expose the header name as `category` instead -
    # confirm against the installed version. `title` is still tried first so
    # behavior is unchanged wherever it exists.
    return getattr(section, "title", None) or getattr(section, "category", None)


def _section_entities(doc, section):
    """Return the entities located inside *section* (empty if they cannot be resolved)."""
    sentences = getattr(section, "sentences", None)
    if sentences is not None:
        return [ent for sent in sentences for ent in sent.ents]

    # NOTE(review): presumably `section_span` is either a spaCy Span or a
    # (start, end) pair of token offsets, depending on the medspaCy version -
    # confirm. Anything else yields no entities rather than raising.
    span = getattr(section, "section_span", None)
    if isinstance(span, tuple) and len(span) == 2:
        span = doc[span[0]:span[1]]
    return list(getattr(span, "ents", ()))


def _entity_is_negated(ent):
    """Return True when the context component flagged *ent* as negated."""
    return bool(getattr(ent._, "is_negated", False))


def process_report(file_path, nlp):
    """Score one report file against every cancer type.

    The report is run through *nlp*. Non-negated entities found inside named
    sections are fuzzy-matched (token sort ratio > 90) against the terms in
    ``cancer_type_terms``; each hit adds ``similarity * 0.01 * section weight``
    to that cancer type. If that yields nothing, entities whose label is a
    cancer type are counted instead, weighted by their section (1.0 when the
    entity is not inside a named section).

    Args:
        file_path: Path of the report text file.
        nlp: Callable pipeline that turns the report text into a parsed doc.

    Returns:
        ``(cancer_scores, confidence_scores)`` as defaultdicts keyed by cancer
        type. ``cancer_scores`` are divided by the report's total weighted
        mentions; ``confidence_scores`` are raw mention counts. Both are empty
        when nothing matched or when processing failed (the error is printed,
        not raised, so one bad file does not stop the batch).
    """
    try:
        # Load from file path
        with open(file_path, "r") as f:
            content = f.read()

        # Process with NLP pipeline
        doc = nlp(content)

        cancer_scores = defaultdict(float)
        confidence_scores = defaultdict(float)
        total_weighted_mentions = 0
        # (start, end) token offsets -> weight of the first named section that
        # contains the entity; reused by the fallback below.
        entity_weights = {}

        # The original required `section.title` and `section.sentences` and
        # silently skipped any section lacking them, which can disable this
        # whole pass. The helpers accept both attribute layouts.
        for section in doc._.sections:
            label = _section_label(section)
            if not label:
                continue

            section_weight = get_section_weight(str(label))

            for ent in _section_entities(doc, section):
                entity_weights.setdefault((ent.start, ent.end), section_weight)

                if _entity_is_negated(ent):
                    continue

                ent_text = ent.text.lower()
                for cancer_type, terms in cancer_type_terms.items():
                    for term in terms:
                        # Fuzzy match to leave room for small wording differences
                        similarity = fuzz.token_sort_ratio(term.lower(), ent_text)
                        if similarity > 90:
                            weighted_score = similarity * 0.01 * section_weight
                            cancer_scores[cancer_type] += weighted_score
                            confidence_scores[cancer_type] += 1
                            total_weighted_mentions += weighted_score
                            # One hit per cancer type per entity; other cancer
                            # types are still checked for the same entity.
                            break

        # If no direct matches were found, fall back to the entity labels
        if not cancer_scores:
            for ent in doc.ents:
                if ent.label_ not in cancer_type_terms:
                    continue
                if _entity_is_negated(ent):
                    continue

                section_weight = entity_weights.get((ent.start, ent.end), 1.0)
                cancer_scores[ent.label_] += 1.0 * section_weight
                confidence_scores[ent.label_] += 1
                total_weighted_mentions += 1.0 * section_weight

        # Normalise so the scores are relative to this report's total
        if total_weighted_mentions > 0:
            for cancer_type in cancer_scores:
                cancer_scores[cancer_type] = cancer_scores[cancer_type] / total_weighted_mentions

        return cancer_scores, confidence_scores
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return defaultdict(float), defaultdict(float)

def _classify_folder(scores, mentions):
    """Choose the label and confidence record for one folder's combined scores."""
    if not scores:
        return "unknown_type", {
            "cancer_type": "unknown_type",
            "confidence": 0,
            "score": 0
        }

    best_score = max(scores.values())
    # Scores within 0.001 of the best are treated as tied (float comparison).
    leaders = [name for name, value in scores.items() if abs(value - best_score) < 0.001]

    if len(leaders) == 1:
        winner = leaders[0]
    else:
        # Break the tie on the raw mention count.
        most_mentions = max(mentions[name] for name in leaders)
        most_mentioned = [name for name in leaders if mentions[name] == most_mentions]
        if len(most_mentioned) != 1:
            return "conflicting_cancer_types", {
                "cancer_type": "conflicting_cancer_types",
                "confidence": 0,
                "score": 0,
                "candidates": leaders
            }
        winner = most_mentioned[0]

    return winner, {
        "cancer_type": winner,
        "confidence": mentions[winner],
        "score": scores[winner]
    }


def process_reports(all_files, nlp):
    """Classify every folder by combining the scores of its report files.

    Args:
        all_files: Mapping of folder name to ``{"files": [paths]}``.
        nlp: Pipeline passed through to ``process_report`` for each file.

    Returns:
        ``(report_classifications, confidence_data)``. The first maps each
        folder to a cancer type, ``"unknown_type"`` (no scores at all) or
        ``"conflicting_cancer_types"`` (an unbreakable tie). The second maps
        each folder to a dict with ``cancer_type``, ``confidence`` and
        ``score``, plus ``candidates`` for conflicts.
    """
    report_classifications = {}
    confidence_data = {}

    total_files = 0
    for entry in all_files.values():
        total_files += len(entry["files"])
    processed = 0

    for folder, entry in all_files.items():
        combined_scores = defaultdict(float)
        combined_mentions = defaultdict(float)

        for path in entry["files"]:
            processed += 1
            if processed % 100 == 0:
                print(f"Processed {processed}/{total_files} files...")

            file_scores, file_mentions = process_report(path, nlp)

            for name, value in file_scores.items():
                combined_scores[name] += value
            for name, value in file_mentions.items():
                combined_mentions[name] += value

        label, record = _classify_folder(combined_scores, combined_mentions)
        report_classifications[folder] = label
        confidence_data[folder] = record

    return report_classifications, confidence_data

def sort_reports(all_files, report_classifications, output_dir=None, cancer_types=None):
    """Copy every report into a directory tree organised by cancer type.

    WARNING: any existing ``output_dir`` is deleted first.

    The resulting layout is ``output_dir/<cancer type>/<folder>/<file name>``.
    A directory is created for every known cancer type plus ``unknown_type``
    and ``conflicting_cancer_types``, even when it ends up empty.

    Args:
        all_files: Mapping of folder name to ``{"files": [paths]}``.
        report_classifications: Mapping of folder name to its cancer type;
            folders missing from it are filed under ``unknown_type``.
        output_dir: Destination root. Defaults to the module-level
            ``sorted_path``.
        cancer_types: Cancer-type names to create directories for. Defaults
            to the keys of ``cancer_type_terms``.
    """
    if output_dir is None:
        output_dir = sorted_path
    if cancer_types is None:
        cancer_types = list(cancer_type_terms.keys())

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
        print("Previous sorted reports deleted")
    os.makedirs(output_dir, exist_ok=True)

    # Create folders for all cancer types plus unknown and conflicting
    for cancer_type in list(cancer_types) + ["unknown_type", "conflicting_cancer_types"]:
        os.makedirs(os.path.join(output_dir, cancer_type), exist_ok=True)

    # Sort and copy files
    for folder, folder_data in all_files.items():
        cancer_type = report_classifications.get(folder, "unknown_type")
        target_dir = os.path.join(output_dir, cancer_type, folder)

        for file_path in folder_data["files"]:
            os.makedirs(target_dir, exist_ok=True)
            new_file_path = os.path.join(target_dir, os.path.basename(file_path))
            # Byte-for-byte copy: a text-mode read/write depends on the default
            # encoding and can fail or alter content the text layer cannot decode.
            shutil.copyfile(file_path, new_file_path)

    print(f"Reports sorted successfully into {output_dir}")

def generate_statistics(report_classifications, confidence_data, output_dir=None):
    """Print a classification summary and save it as a CSV file.

    Prints the per-type table, the share of ``unknown_type`` /
    ``conflicting_cancer_types`` results, and the five most common conflicting
    combinations, then writes ``classification_statistics.csv``.

    Args:
        report_classifications: Mapping of folder name to assigned cancer type.
        confidence_data: Mapping of folder name to a dict that may hold
            ``confidence`` and, for conflicts, ``candidates``.
        output_dir: Directory for the CSV file; created if missing. Defaults
            to the module-level ``sorted_path``.

    Returns:
        A DataFrame with columns ``Cancer Type``, ``Count`` and
        ``Avg Confidence``, sorted by ``Count`` descending. It is empty (but
        keeps those columns) when there are no classifications.
    """
    from collections import Counter

    if output_dir is None:
        output_dir = sorted_path

    cancer_type_counts = Counter(report_classifications.values())

    confidence_by_type = defaultdict(list)
    for folder, cancer_type in report_classifications.items():
        conf_data = confidence_data.get(folder)
        if conf_data is not None and "confidence" in conf_data:
            confidence_by_type[cancer_type].append(conf_data["confidence"])

    # Create statistics dataframe
    stats_data = []
    for cancer_type, count in cancer_type_counts.items():
        confidences = confidence_by_type.get(cancer_type)
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0
        stats_data.append({
            "Cancer Type": cancer_type,
            "Count": count,
            "Avg Confidence": avg_confidence
        })

    # Naming the columns keeps them present for an empty result; without them
    # sort_values("Count") raises KeyError when nothing was classified.
    stats_df = pd.DataFrame(stats_data, columns=["Cancer Type", "Count", "Avg Confidence"])
    stats_df = stats_df.sort_values("Count", ascending=False)

    print("\nClassification Statistics:")
    print(stats_df)

    # Calculate problematic classifications
    total = sum(cancer_type_counts.values())
    problematic = cancer_type_counts.get("unknown_type", 0) + cancer_type_counts.get("conflicting_cancer_types", 0)
    problematic_pct = (problematic / total) * 100 if total > 0 else 0

    print(f"\nProblematic classifications: {problematic}/{total} ({problematic_pct:.2f}% of total)")

    # List the most common conflicting combinations
    conflicting_combinations = [
        tuple(sorted(data["candidates"]))
        for data in confidence_data.values()
        if data.get("cancer_type") == "conflicting_cancer_types" and "candidates" in data
    ]

    if conflicting_combinations:
        common_conflicts = Counter(conflicting_combinations).most_common(5)

        print("\nMost common conflicting cancer type combinations:")
        for combo, count in common_conflicts:
            print(f"  {' vs '.join(combo)}: {count} cases")

    # Save statistics to CSV
    os.makedirs(output_dir, exist_ok=True)
    stats_df.to_csv(os.path.join(output_dir, "classification_statistics.csv"), index=False)

    return stats_df

def main():
    """Run the whole workflow: load, classify, sort, summarise and time it.

    Besides the statistics file written by ``generate_statistics``, the
    per-folder confidence records are saved to ``classification_confidence.csv``
    inside ``sorted_path``.
    """
    # The clock starts before loading so the reported time covers everything.
    start = time.time()

    print("Loading reports...")
    all_files = load_reports()

    print("Setting up medspaCy...")
    nlp = setup_medspacy()

    print("Processing reports with NLP...")
    report_classifications, confidence_data = process_reports(all_files, nlp)

    print("Sorting reports into cancer type directories...")
    sort_reports(all_files, report_classifications)

    print("Generating statistics...")
    generate_statistics(report_classifications, confidence_data)

    # One row per folder; "Candidates" is only non-empty for conflicts.
    confidence_rows = []
    for folder, data in confidence_data.items():
        confidence_rows.append({
            "Folder": folder,
            "Cancer Type": data["cancer_type"],
            "Confidence": data.get("confidence", 0),
            "Score": data.get("score", 0),
            "Candidates": ", ".join(data.get("candidates", []))
        })
    confidence_csv = os.path.join(sorted_path, "classification_confidence.csv")
    pd.DataFrame(confidence_rows).to_csv(confidence_csv, index=False)

    elapsed = time.time() - start
    print(f"Total execution time: {elapsed:.2f} seconds")
    print("Done!")


if __name__ == "__main__":
    main()

Loading reports...
Setting up medspaCy...
Processing reports with NLP...
Processed 100/18488 files...
Processed 200/18488 files...
Processed 300/18488 files...
Processed 400/18488 files...
Processed 500/18488 files...
Processed 600/18488 files...
Processed 700/18488 files...
Processed 800/18488 files...
Processed 900/18488 files...
Processed 1000/18488 files...
Processed 1100/18488 files...
Processed 1200/18488 files...
Processed 1300/18488 files...
Processed 1400/18488 files...
Processed 1500/18488 files...
Processed 1600/18488 files...
Processed 1700/18488 files...
Processed 1800/18488 files...
Processed 1900/18488 files...
Processed 2000/18488 files...
Processed 2100/18488 files...
Processed 2200/18488 files...
Processed 2300/18488 files...
Processed 2400/18488 files...
Processed 2500/18488 files...
Processed 2600/18488 files...
Processed 2700/18488 files...
Processed 2800/18488 files...
Processed 2900/18488 files...
Processed 3000/18488 files...
Processed 3100/18488 files...
Proces