In [2]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.3-py3-none-any.whl.metadata (3.9 kB)
Collecting Levenshtein==0.27.3 (from python-Levenshtein)
  Downloading levenshtein-0.27.3-cp312-cp312-win_amd64.whl.metadata (3.7 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.3->python-Levenshtein)
  Downloading rapidfuzz-3.14.3-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.3-py3-none-any.whl (9.5 kB)
Downloading levenshtein-0.27.3-cp312-cp312-win_amd64.whl (94 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:-

In [4]:
import Levenshtein
from collections import Counter

def _edit_error_rate(reference, hypothesis) -> float:
    """
    Levenshtein distance between two sequences, normalised by len(reference).

    Works for strings (character level) and for lists of words (word level).
    With an empty reference the ratio is undefined: it is reported as 0.0 when
    the hypothesis is empty too (nothing to get wrong) and as 1.0 otherwise,
    so spurious output is never scored as a perfect result.
    """
    if not reference:
        return 1.0 if hypothesis else 0.0
    return Levenshtein.distance(reference, hypothesis) / len(reference)


def calculate_layout_aware_metrics(ground_truth: str, extracted_text: str) -> dict:
    """
    Calculates Character Error Rate (CER) and Word Error Rate (WER)
    using Levenshtein distance. These metrics are LAYOUT-AWARE
    (order matters).

    Args:
        ground_truth: Reference text. Leading/trailing whitespace is ignored.
        extracted_text: OCR output to score. Leading/trailing whitespace is ignored.

    Returns:
        Dictionary with four values rounded to 4 decimals:
        "Layout_Aware_CER" and "Layout_Aware_WER" (edit distance divided by the
        reference length; may exceed 1.0 when the extraction is much longer
        than the reference), and the matching "..._Accuracy" entries, computed
        as 1 - error and floored at 0.0 so they stay within [0, 1].
    """
    reference = ground_truth.strip()
    hypothesis = extracted_text.strip()

    # Character level: compare the raw strings.
    cer = _edit_error_rate(reference, hypothesis)

    # Word level: compare whitespace-separated token sequences.
    wer = _edit_error_rate(reference.split(), hypothesis.split())

    return {
        "Layout_Aware_CER": round(cer, 4),
        "Layout_Aware_WER": round(wer, 4),
        # An error rate above 1.0 would otherwise yield a negative "accuracy".
        "Layout_Aware_CER_Accuracy": round(max(0.0, 1 - cer), 4),
        "Layout_Aware_WER_Accuracy": round(max(0.0, 1 - wer), 4),
    }

def calculate_content_only_metrics(ground_truth: str, extracted_text: str) -> dict:
    """
    Bag-of-words comparison of the two texts (LAYOUT-AGNOSTIC: word order
    is ignored, only how often each lower-cased word occurs matters).

    Args:
        ground_truth: Reference text.
        extracted_text: OCR output to score.

    Returns:
        Dictionary with precision, recall and F1 (rounded to 4 decimals) plus
        the raw counts of shared, missing and extra words and both totals.
        A ratio whose denominator is zero is reported as 0.0.
    """
    def _bag(text: str) -> Counter:
        # Lower-cased, whitespace-separated tokens with their multiplicities.
        return Counter(text.strip().lower().split())

    def _share(part: int, whole: int) -> float:
        return part / whole if whole > 0 else 0.0

    reference_bag = _bag(ground_truth)
    candidate_bag = _bag(extracted_text)

    n_reference = sum(reference_bag.values())
    n_candidate = sum(candidate_bag.values())
    # Multiset intersection: each word counts min(reference, candidate) times.
    n_shared = sum((reference_bag & candidate_bag).values())

    precision = _share(n_shared, n_candidate)
    recall = _share(n_shared, n_reference)
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return {
        "Content_Precision": round(precision, 4),
        "Content_Recall": round(recall, 4),
        "Content_F1_Score": round(f1, 4),
        "Correct_Words": n_shared,
        "Missing_Words": n_reference - n_shared,
        "Extra_Words": n_candidate - n_shared,
        "Total_GT_Words": n_reference,
        "Total_Extracted_Words": n_candidate,
    }

def calculate_character_content_metrics(ground_truth: str, extracted_text: str) -> dict:
    """
    Character-level content comparison (layout-agnostic).
    Useful for languages with complex word boundaries like Sinhala/Tamil.

    Both texts are lower-cased, every whitespace character is dropped, and the
    remaining characters are compared as multisets (order is ignored).

    Args:
        ground_truth: Reference text.
        extracted_text: OCR output to score.

    Returns:
        Dictionary with "Character_Content_Precision",
        "Character_Content_Recall" and "Character_Content_F1", each rounded to
        4 decimals. A ratio whose denominator is zero is reported as 0.0.
    """
    def _char_bag(text: str) -> Counter:
        # str.split() with no argument splits on *any* whitespace (including
        # "\r", non-breaking space, form feed, ...), so re-joining the pieces
        # removes all of it rather than only " ", "\n" and "\t".
        return Counter("".join(text.lower().split()))

    gt_chars = _char_bag(ground_truth)
    ext_chars = _char_bag(extracted_text)

    # Multiset intersection: each character counts min(gt, extracted) times.
    correct_chars = sum((gt_chars & ext_chars).values())
    total_gt_chars = sum(gt_chars.values())
    total_ext_chars = sum(ext_chars.values())

    recall = correct_chars / total_gt_chars if total_gt_chars > 0 else 0.0
    precision = correct_chars / total_ext_chars if total_ext_chars > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {
        "Character_Content_Precision": round(precision, 4),
        "Character_Content_Recall": round(recall, 4),
        "Character_Content_F1": round(f1, 4),
    }

def evaluate_ocr_from_files(ground_truth_file: str, extracted_text_file: str) -> dict:
    """
    Main function to evaluate OCR accuracy from two text files.

    Both files are decoded as UTF-8. A leading byte-order mark, if present,
    is discarded so it is not scored as an OCR error.

    Args:
        ground_truth_file: Path to the ground truth text file
        extracted_text_file: Path to the extracted/predicted text file

    Returns:
        Dictionary containing all evaluation metrics (layout-aware,
        content-only and character-level results merged into one dict).
        If either file cannot be read, a message is printed and an empty
        dictionary is returned.
    """
    # Read files. 'utf-8-sig' decodes plain UTF-8 as well as UTF-8 with a BOM;
    # with plain 'utf-8' a BOM would survive as '\ufeff', which str.strip()
    # does not remove.
    try:
        with open(ground_truth_file, 'r', encoding='utf-8-sig') as f:
            ground_truth = f.read()
        with open(extracted_text_file, 'r', encoding='utf-8-sig') as f:
            extracted_text = f.read()
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
        return {}
    except Exception as e:
        # Deliberately broad: any read/decode failure yields "no results".
        print(f"Error reading files: {e}")
        return {}

    # Run every metric family on the same pair of texts and merge the results.
    results = {}
    for metric_fn in (
        calculate_layout_aware_metrics,        # traditional CER/WER
        calculate_content_only_metrics,        # word-level, order ignored
        calculate_character_content_metrics,   # character-level, order ignored
    ):
        results.update(metric_fn(ground_truth, extracted_text))

    return results

def print_results(results: dict):
    """
    Write a formatted report of the evaluation metrics to stdout.

    An empty or missing result set produces a single notice line. Otherwise
    the layout-aware, content-only and character-level sections are printed
    in that order, one metric per line.
    """
    if not results:
        print("No results to display.")
        return

    banner = "=" * 60
    divider = "-" * 60

    # Each entry is (text, key). With key=None the text is printed as-is;
    # otherwise the value stored under `key` is appended to the text.
    rows = (
        ("\n" + banner, None),
        ("OCR EVALUATION RESULTS", None),
        (banner, None),

        ("\nüìä LAYOUT-AWARE METRICS (Order Matters)", None),
        (divider, None),
        ("  CER (Character Error Rate):     ", "Layout_Aware_CER"),
        ("  CER Accuracy:                   ", "Layout_Aware_CER_Accuracy"),
        ("  WER (Word Error Rate):          ", "Layout_Aware_WER"),
        ("  WER Accuracy:                   ", "Layout_Aware_WER_Accuracy"),

        ("\nüìù CONTENT-ONLY METRICS (Order Ignored)", None),
        (divider, None),
        ("  Precision:                      ", "Content_Precision"),
        ("  Recall:                         ", "Content_Recall"),
        ("  F1 Score:                       ", "Content_F1_Score"),
        ("  Correct Words:                  ", "Correct_Words"),
        ("  Missing Words:                  ", "Missing_Words"),
        ("  Extra Words:                    ", "Extra_Words"),

        ("\nüî§ CHARACTER-LEVEL CONTENT METRICS", None),
        (divider, None),
        ("  Precision:                      ", "Character_Content_Precision"),
        ("  Recall:                         ", "Character_Content_Recall"),
        ("  F1 Score:                       ", "Character_Content_F1"),

        ("\n" + banner + "\n", None),
    )

    # Values are looked up one row at a time, immediately before printing.
    for text, key in rows:
        if key is None:
            print(text)
        else:
            print(f"{text}{results[key]}")

# === USAGE EXAMPLE ===
# Scores one ground-truth/prediction file pair and prints the report.
if __name__ == "__main__":
    # Paths are relative to the current working directory; point them at
    # your own ground-truth and OCR-output files.
    ground_truth_file = "original.txt"
    extracted_text_file = "predicted.txt"
    
    # Returns {} (after printing an error) when either file cannot be read.
    results = evaluate_ocr_from_files(ground_truth_file, extracted_text_file)
    
    # Prints "No results to display." for an empty result dict.
    print_results(results)
    
    # Optional: uncomment to persist the metrics dictionary as JSON.
    # import json
    # with open('evaluation_results.json', 'w') as f:
    #     json.dump(results, f, indent=2)


OCR EVALUATION RESULTS

üìä LAYOUT-AWARE METRICS (Order Matters)
------------------------------------------------------------
  CER (Character Error Rate):     0.2529
  CER Accuracy:                   0.7471
  WER (Word Error Rate):          0.2909
  WER Accuracy:                   0.7091

üìù CONTENT-ONLY METRICS (Order Ignored)
------------------------------------------------------------
  Precision:                      0.7707
  Recall:                         0.7333
  F1 Score:                       0.7516
  Correct Words:                  121
  Missing Words:                  44
  Extra Words:                    36

üî§ CHARACTER-LEVEL CONTENT METRICS
------------------------------------------------------------
  Precision:                      0.9725
  Recall:                         0.9567
  F1 Score:                       0.9645




In [1]:
import re
from difflib import SequenceMatcher

class MarkdownEvaluator:
    """
    Scores predicted Markdown text against ground-truth Markdown, section by
    section.

    Both texts are split into sections at header lines (lines starting with
    '#'). Each ground-truth section is paired with the unused predicted
    section whose header is similar enough and whose text is most similar;
    the final score is the mean similarity over all ground-truth sections
    (unmatched sections count as 0).

    Args:
        header_threshold: Minimum header similarity (exclusive, 0..1) for two
            sections to be considered candidates for the same section.
        autojunk: Passed to difflib.SequenceMatcher. The default False turns
            off difflib's "popular element" heuristic, which on texts of 200+
            characters can discard most characters from matching and collapse
            the similarity of near-identical texts. Set True to trade accuracy
            for speed on very large documents.
    """

    # One header line: a run of '#', optional blanks, then the title text.
    _HEADER_LINE = re.compile(r'^(#+)[ \t]*(.*)$', re.MULTILINE)
    _WHOLE_DOCUMENT = 'Whole Document'
    _PREAMBLE = 'PREAMBLE'

    def __init__(self, header_threshold=0.8, autojunk=False):
        self.header_threshold = header_threshold
        self.autojunk = autojunk

    def _similarity(self, a: str, b: str) -> float:
        """Similarity ratio in [0, 1] between two strings."""
        return SequenceMatcher(None, a, b, autojunk=self.autojunk).ratio()

    @classmethod
    def _full_text(cls, section: dict) -> str:
        """Title plus content of a section (content only for the no-header fallback)."""
        if section['title'] == cls._WHOLE_DOCUMENT:
            return section['content']
        return f"{section['title']} {section['content']}"

    def clean_text(self, text: str) -> str:
        """
        Normalizes text for comparison:
        1. Removes markdown formatting (*, _, `) but KEEPS pipes | for tables.
        2. Normalizes whitespace (any run of whitespace becomes one space).

        Letter case is preserved, so the comparison is case-sensitive.
        """
        # Remove bold/italic/code markers
        text = re.sub(r'[*_`]', '', text)
        # Normalize whitespace (tabs/newlines -> single space)
        return re.sub(r'\s+', ' ', text).strip()

    def parse_markdown(self, text: str) -> dict:
        """
        Parses text into sections.

        Every line that starts with '#' opens a new section; the rest of that
        line is the title and everything up to the next header line is the
        content. Text before the first header is stored as 'PREAMBLE'.
        If no headers (#) are found, the whole text becomes a single
        'Whole Document' section.

        Returns:
            Dict mapping a unique section key to {'title': ..., 'content': ...}
            in document order. The key equals the title, except that repeated
            titles get a " (2)", " (3)", ... suffix so no section is lost.
        """
        headers = list(self._HEADER_LINE.finditer(text))
        sections = {}

        # FAILSAFE: If no headers found, treat entire text as body content
        if not headers:
            sections[self._WHOLE_DOCUMENT] = {
                'title': self._WHOLE_DOCUMENT,
                'content': text.strip()
            }
            return sections

        preamble = text[:headers[0].start()].strip()
        if preamble:
            sections[self._PREAMBLE] = {'title': self._PREAMBLE, 'content': preamble}

        for index, header in enumerate(headers):
            # A section's body runs from the end of its header line to the
            # start of the next header line (or the end of the text).
            if index + 1 < len(headers):
                body_end = headers[index + 1].start()
            else:
                body_end = len(text)

            title = header.group(2).strip()
            content = text[header.end():body_end].strip()

            # Keep repeated headings as separate sections.
            key = title
            occurrence = 2
            while key in sections:
                key = f"{title} ({occurrence})"
                occurrence += 1

            sections[key] = {'title': title, 'content': content}

        return sections

    def get_diff_highlight(self, a: str, b: str) -> str:
        """
        Helper to show exactly where characters differ.

        Returns a " | "-joined list of notes, one per differing span, where
        `a` is the ground truth and `b` the prediction. Empty when equal.
        """
        s = SequenceMatcher(None, a, b, autojunk=self.autojunk)
        diff_out = []
        for tag, i1, i2, j1, j2 in s.get_opcodes():
            if tag == 'replace':
                diff_out.append(f"MISMATCH: '{a[i1:i2]}' vs '{b[j1:j2]}'")
            elif tag == 'delete':
                diff_out.append(f"MISSING in Pred: '{a[i1:i2]}'")
            elif tag == 'insert':
                diff_out.append(f"EXTRA in Pred: '{b[j1:j2]}'")
        return " | ".join(diff_out)

    def evaluate(self, gt_text: str, pred_text: str) -> dict:
        """
        Compares predicted text with ground truth, section by section.

        Returns:
            Dict with:
            'matches'     - one entry per ground-truth section:
                            {'section': key, 'score': similarity,
                             'diff': difference notes, "" if identical, or
                             "Section Missing" when nothing matched};
            'score_sum'   - sum of the section scores;
            'count'       - number of ground-truth sections;
            'final_score' - score_sum / count (0 when count is 0).
            Predicted sections that match no ground-truth section are ignored.
        """
        gt_sections = self.parse_markdown(gt_text)
        pred_sections = self.parse_markdown(pred_text)

        results = {'matches': [], 'score_sum': 0, 'count': 0}

        # Title + content of every predicted section, normalised once.
        pred_clean = {
            key: self.clean_text(self._full_text(section))
            for key, section in pred_sections.items()
        }
        matched_pred_keys = set()

        for gt_key, gt_data in gt_sections.items():
            gt_clean = self.clean_text(self._full_text(gt_data))
            best_match = None
            best_score = 0.0

            # Find best matching section in Pred
            for pred_key, pred_data in pred_sections.items():
                if pred_key in matched_pred_keys:
                    continue

                if gt_key == self._WHOLE_DOCUMENT or pred_key == self._WHOLE_DOCUMENT:
                    # One side has no headers: skip the header check and let
                    # the text similarity decide.
                    header_sim = 1.0
                else:
                    header_sim = self._similarity(gt_data['title'], pred_data['title'])

                if header_sim <= self.header_threshold:
                    continue

                content_sim = self._similarity(gt_clean, pred_clean[pred_key])
                if content_sim > best_score:
                    best_score = content_sim
                    best_match = pred_key

            results['count'] += 1

            # Compare with None: a section key may legitimately be "".
            if best_match is None:
                # Missing section penalty
                results['matches'].append(
                    {'section': gt_key, 'score': 0.0, 'diff': "Section Missing"}
                )
                continue

            matched_pred_keys.add(best_match)

            diff_notes = ""
            if best_score < 1.0:
                diff_notes = self.get_diff_highlight(gt_clean, pred_clean[best_match])

            results['matches'].append({
                'section': gt_key,
                'score': best_score,
                'diff': diff_notes
            })
            results['score_sum'] += best_score

        results['final_score'] = results['score_sum'] / results['count'] if results['count'] > 0 else 0
        return results

# ==========================================
# TEST WITH YOUR SINHALA DATA
# ==========================================

gt = """
03. ‡∂ö‡∑É‡∑ä‡∂ª‡∑î ‡∑Ñ‡∑è ‡∂¢‡∂∏‡∑ä‡∂ã ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏‡∂ß ‡∂ú‡∂∏‡∂±‡∂Ø ‡∂Ø‡∑í‡∂ú ‡∂ú‡∂∏‡∂±‡∂ö‡∑ä ‡∑Ä‡∑í‡∂∫ ‡∂∫‡∑î‡∂≠‡∑î‡∂∫. ‡∂∏‡∑ô‡∂∏ ‡∑Ä‡∂ª‡∂¥‚Äç‡∑ä‚Äç‡∂ª‡∑É‡∑è‡∂Ø‡∂∫ ‡∂ú‡∂∏‡∑ä ‡∂¥‚Äç‡∑ä‚Äç‡∂ª‡∂Ø‡∑ö‡∑Å‡∂∫‡∑ö ‡∑É‡∑ì‡∂∏‡∑è‡∑Ä ‡∂â‡∂ö‡∑ä‡∂∏‡∑Ä‡∑ñ ‡∑Ä‡∑í‡∂ú‡∑É ‡∂∏ ‡∂á‡∂ª‡∂π‡∑ö. ‡∂ë‡∂∏‡∑ô‡∂±‡∑ä ‡∂∏ ‡∂¢‡∂∏‡∑ä‡∂ã ‡∂≠‡∂É‡∂ö‡∑ì‡∂ª‡∑ä ‡∂â‡∂ß‡∑î‡∂ö‡∂ª‡∂±‡∑ä‡∂±‡∑è ‡∂Ö‡∂Ø‡∑è‡∑Ö ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∑ö ‡∑Ä‡∑ö‡∂Ω‡∑è‡∑Ä ‡∂â‡∂ö‡∑ä‡∂∏‡∑Ä‡∑ì‡∂∏‡∂ß ‡∂¥‡∑ô‡∂ª ‡∂Ö‡∂±‡∑ô‡∂ö‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫ ‡∑Ñ‡∑è ‡∂ë‡∂ö‡∑ä ‡∂ö‡∑ú‡∂ß ‡∂â‡∂ß‡∑î ‡∂ö‡∂ª‡∂± ‡∂∂‡∑Ä‡∂ß ‡∂±‡∑í‡∂∫‡∑í‡∂∫‡∂≠‡∂∫ ‡∂≠‡∑ê‡∂∂‡∑í‡∂∫ ‡∂∫‡∑î‡∂≠‡∑î ‡∂∫.

04. ‡∂ú‡∂∏‡∂±‡∑ô‡∑Ñ‡∑í ‡∂∫‡∑ô‡∂Ø‡∑ô‡∂±‡∑ä‡∂±‡∑è ‡∑Ü‡∂¢‡∑ä‡∂ª‡∑ä‡∑Ñ‡∑í ‡∑É‡∑î‡∂±‡∑ä‡∂±‡∂≠‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∂±‡∑ä ‡∑Ñ‡∑è ‡∑Ä‡∑í‡∂≠‡∑ä‡∂ª‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∂±‡∑ä ‡∂±‡∑ú‡∂ö‡∂©‡∑Ä‡∑è ‡∂â‡∂ß‡∑î ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏ ‡∂∫‡∑Ñ‡∂¥‡∂≠‡∑ä ‡∂∫.

05. ‡∂ú‡∂∏‡∂±‡∑ô‡∑Ñ‡∑í ‡∂∫‡∑ô‡∂Ø‡∑ô‡∂±‡∑ä‡∂±‡∂±‡∑ä ‡∂¢‡∂∏‡∑ä‡∂ã ‡∂ö‡∂ª‡∂± ‡∑Ä‡∑í‡∂ß ‡∑É‡∂Ω‡∑è‡∂≠‡∑ä ‡∂Ø‡∑ô‡∂ö‡∂ö‡∂ß ‡∂ë‡∂ö‡∑ä ‡∂Ö‡∂Ø‡∑è‡∂±‡∂∫‡∂ö‡∑ä ‡∂Ø, ‡∂ë‡∂ö‡∑í‡∂±‡∑ô‡∂ö ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∂±‡∑ä ‡∂ë‡∂ö‡∑í‡∂±‡∑ô‡∂ö‡∂ß ‡∑Ä‡∑ô‡∂±‡∑ä ‡∑Ä‡∑ô‡∂±‡∑ä ‡∑Ä‡∑Å‡∂∫‡∑ô‡∂±‡∑ä ‡∂â‡∂ö‡∑è‡∂∏‡∂≠‡∑ä ‡∂Ø ‡∂ö‡∑í‡∑Ä ‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂∫.

06. ‡∂ö‡∑ô‡∂ß‡∑í ‡∂ö‡∂ª ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫ ‡∂â‡∂ß‡∑î ‡∂ö‡∂ª‡∂±‡∑ä‡∂±‡∂±‡∑ä, ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫ ‡∑É‡∂∏‡∑ä‡∂¥‡∑ñ‡∂ª‡∑ä‡∂´ ‡∑Ä‡∑Å‡∂∫‡∑ô‡∂±‡∑ä ‡∂â‡∂ß‡∑î ‡∂ö‡∂ª‡∂± ‡∂â‡∂∏‡∑è‡∂∏‡∑ä‡∑Ä‡∂ª‡∂∫‡∑è ‡∂¥‡∑í‡∑Ö‡∑í‡∂¥‡∑ê‡∂Ø‡∑í‡∂∫ ‡∂±‡∑ú‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂∫.

**‡∂Ö‡∂∑‡∑ä‚Äç‡∂∫‡∑è‡∑É**

‡∂¥‡∑Ñ‡∂≠ ‡∂¥‚Äç‡∑ä‚Äç‡∂ª‡∂ö‡∑è‡∑Å ‡∑Ñ‡∂ª‡∑í ‡∂±‡∂∏‡∑ä ( ‚úî ) ‡∂Ω‡∂ö‡∑î‡∂´ ‡∂Ø, ‡∑Ä‡∑ê‡∂ª‡∂Ø‡∑í ‡∂±‡∂∏‡∑ä ( x ) ‡∂Ω‡∂ö‡∑î‡∂´ ‡∂Ø ‡∑Ä‡∂ª‡∑Ñ‡∂±‡∑ä ‡∂≠‡∑î‡∑Ö ‡∂∫‡∑ú‡∂Ø‡∂±‡∑ä‡∂±.

‡∂Ö) 1. ‡∂ú‡∂∏‡∂±‡∑ô‡∑Ñ‡∑í ‡∂∫‡∑ô‡∂Ø‡∑ô‡∂±‡∑ä‡∂±‡∑ô‡∂ö‡∑î‡∂ß ‡∑É‡∂Ω‡∑è‡∂≠‡∑ä ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏ ‡∂Ö‡∂¥‡∑Ñ‡∑É‡∑î ‡∂±‡∂∏‡∑ä ‡∂ö‡∂Ω‡∑è ‡∂ö‡∑Ö ‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂∫.                                             
   2. ‡∑Ü‡∂¢‡∑ä‡∂ª‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫ ‡∂ö‡∑ô‡∂ß‡∑í ‡∂ö‡∂ª ‡∑Ñ‡∑ù ‡∂ë‡∂ö‡∂≠‡∑î ‡∂ö‡∂ª ‡∂â‡∂ß‡∑î‡∂ö‡∑Ö ‡∂±‡∑ú‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂∫.      	
   3. ‚Äò‡∂¢‡∂∏‡∑ä‡∂ã ‡∂≠‡∂ö‡∑ä‡∂Ø‡∑ì‡∂∏‡∑ä‚Äô ‡∂∫‡∂±‡∑î ‡∂¥‡∑ô‡∂ª‡∂ß‡∑î ‡∂ö‡∂ª ‡∑É‡∂Ω‡∑è‡∂≠‡∑ä ‡∂â‡∂ß‡∑î ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏‡∂∫‡∑í.                           	
   4. ‡∂ú‡∂∏‡∂±‡∑ô‡∑Ñ‡∑í ‡∂∫‡∑ô‡∂Ø‡∑ô‡∂±‡∑ä‡∂±‡∂±‡∑ä ‡∂ö‡∑í‡∑É‡∑í ‡∂∏ ‡∑É‡∑î‡∂±‡∑ä‡∂±‡∂≠‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∂ö‡∑ä ‡∂â‡∂ß‡∑î ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏ ‡∂Ö‡∑Ä‡∑Å‡∑ä‚Äç‡∂∫ ‡∂±‡∑ú‡∑Ä‡∑ö.                                                      		

(‡∂Ü) ‡∂î‡∂∂ ‡∂¢‡∑ì‡∑Ä‡∑í‡∂≠‡∂∫‡∑ö ‡∂ö‡∑É‡∑ä‡∂ª‡∑î ‡∂¢‡∂∏‡∑ä‡∂ã ‡∂ö‡∑Ö ‡∂Ö‡∂≠‡∑ä‡∂Ø‡∑ê‡∂ö‡∑ì‡∂∏‡∂ö‡∑ä ‡∂ö‡∑ô‡∂ß‡∑í‡∂∫‡∑ô‡∂±‡∑ä ‡∑Ä‡∑í‡∂ú‚Äç‡∑ä‚Äç‡∂ª‡∑Ñ ‡∂ö‡∂ª‡∂±‡∑ä‡∂±.
"""

pred = """
03. ‡∂ö‡∑É‡∑ä‡∂ª‡∑î ‡∑Ñ‡∑è ‡∂¢‡∂∏‡∑ä‡∂ã ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏‡∂ß ‡∂ú‡∂∏‡∂± ‡∂Ø ‡∂Ø‡∑í‡∂ú ‡∂ú‡∂∏‡∂±‡∂ö‡∑ä ‡∑Ä‡∑í‡∂∫ ‡∂∫‡∑î‡∂≠‡∑î‡∂∫. ‡∂∏‡∑ô‡∂∏ 
‡∑Ä‡∂ª‡∂¥‚Äç‡∑ä‚Äç‡∂ª‡∑É‡∑è‡∂Ø‡∂∫ ‡∂ú‡∂∏‡∑ä ‡∂¥‚Äç‡∑ä‚Äç‡∂ª‡∂Ø‡∑ö‡∑Å‡∂∫‡∑ö ‡∑É‡∑ì‡∂∏‡∑è‡∑Ä ‡∂â‡∂ö‡∑ä‡∂∏‡∑Ä‡∑ñ ‡∑Ä‡∑í‡∂ú‡∑É ‡∂∏ ‡∂á‡∂ª‡∂π‡∑ö. ‡∂ë‡∂∏‡∑ô‡∂±‡∑ä ‡∂∏ 
‡∂¢‡∂∏‡∑ä‡∂ã ‡∂≠‡∂É‡∂ö‡∑ì‡∂ª‡∑ä ‡∂â‡∂ß‡∑î‡∂ö‡∂ª‡∂±‡∑ä‡∂±‡∑è ‡∂Ö‡∂Ø‡∑è‡∑Ö ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∑ö ‡∑Ä‡∑ö‡∂Ω‡∑è‡∑Ä ‡∂â‡∂ö‡∑ä‡∂∏‡∑Ä‡∑ì‡∂∏‡∂ß ‡∂¥‡∑ô‡∂ª ‡∂Ö‡∂±‡∑ô‡∂ö‡∑ä 
‡∑É‡∂Ω‡∑è‡∂≠‡∂∫ ‡∑Ñ‡∑è ‡∂ë‡∂ö‡∑ä ‡∂ö‡∑ú‡∂ß ‡∂â‡∂ß‡∑î ‡∂ö‡∂ª‡∂± ‡∂∂‡∑Ä‡∂ß ‡∂±‡∑í‡∂∫‡∑í‡∂∫‡∂≠‡∂∫ ‡∂≠‡∑ê‡∂∂‡∑í‡∂∫ ‡∂∫‡∑î‡∂≠‡∑î ‡∂∫.

04. ‡∂ú‡∂∏‡∂±‡∑ô‡∑Ñ‡∑í ‡∂∫‡∑ô‡∂Ø‡∑ô‡∂±‡∑ä‡∂±‡∑è ‡∑Ü‡∂¢‡∑ä‡∂ª‡∑ä‡∑Ñ‡∑í ‡∑É‡∑î‡∂±‡∑ä‡∂±‡∂≠‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∂±‡∑ä ‡∑Ñ‡∑è ‡∑Ä‡∑í‡∂≠‡∑ä‡∂ª‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∂±‡∑ä 
‡∂±‡∑ú‡∂ö‡∂©‡∑Ä‡∑è ‡∂â‡∂ß‡∑î ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏ ‡∂∫‡∑Ñ‡∂¥‡∂≠‡∑ä ‡∂∫.

05. ‡∂ú‡∂∏‡∂±‡∑ô‡∑Ñ‡∑í ‡∂∫‡∑ô‡∂Ø‡∑ô‡∂±‡∑ä‡∂±‡∂±‡∑ä ‡∂¢‡∂∏‡∑ä‡∂ã ‡∂ö‡∂ª‡∂± ‡∑Ä‡∑í‡∂ß 
‡∑É‡∂Ω‡∑è‡∂≠‡∑ä ‡∂Ø‡∑ô‡∂ö‡∂ö‡∂ß ‡∂ë‡∂ö‡∑ä ‡∂Ö‡∂Ø‡∑è‡∂±‡∂∫‡∂ö‡∑ä ‡∂Ø, ‡∂ë‡∂ö‡∑í‡∂±‡∑ô‡∂ö 
‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∂±‡∑ä ‡∂ë‡∂ö‡∑í‡∂±‡∑ô‡∂ö‡∂ß ‡∑Ä‡∑ô‡∂±‡∑ä ‡∑Ä‡∑ô‡∂±‡∑ä ‡∑Ä‡∑Å‡∂∫‡∑ô‡∂±‡∑ä 
‡∂â‡∂ö‡∑è‡∂∏‡∂≠‡∑ä ‡∂Ø ‡∂ö‡∑í‡∑Ä ‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂∫.

06. ‡∂ö‡∑ô‡∂ß‡∑í ‡∂ö‡∂ª ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫ ‡∂â‡∂ß‡∑î ‡∂ö‡∂ª‡∂±‡∑ä‡∂±‡∂±‡∑ä, ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫ 
‡∑É‡∂∏‡∑ä‡∂¥‡∑ñ‡∂ª‡∑ä‡∂´ ‡∑Ä‡∑Å‡∂∫‡∑ô‡∂±‡∑ä ‡∂â‡∂ß‡∑î ‡∂ö‡∂ª‡∂± ‡∂â‡∂∏‡∑è‡∂∏‡∑ä‡∑Ä‡∂ª‡∂∫‡∑è ‡∂¥‡∑í‡∑Ö‡∑í‡∂¥‡∑ê‡∂Ø‡∑í‡∂∫ 
‡∂±‡∑ú‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂∫.

**‡∂Ö‡∂∑‡∑ä‚Äç‡∂∫‡∑è‡∑É**

‡∂¥‡∑Ñ‡∂≠ ‡∂¥‚Äç‡∑ä‚Äç‡∂ª‡∂ö‡∑è‡∑Å ‡∑Ñ‡∂ª‡∑í ‡∂±‡∂∏‡∑ä ( ‚úî ) ‡∂Ω‡∂ö‡∑î‡∂´ ‡∂Ø, ‡∑Ä‡∑ê‡∂ª‡∂Ø‡∑í ‡∂±‡∂∏‡∑ä ( x ) ‡∂Ω‡∂ö‡∑î‡∂´ ‡∂Ø ‡∑Ä‡∂ª‡∑Ñ‡∂±‡∑ä ‡∂≠‡∑î‡∑Ö ‡∂∫‡∑ú‡∂Ø‡∂±‡∑ä‡∂±.

(‡∂Ö)
1. ‡∂ú‡∂∏‡∂±‡∑ô‡∑Ñ‡∑í ‡∂∫‡∑ô‡∂Ø‡∑ô‡∂±‡∑ä‡∂±‡∑ô‡∂ö‡∑î‡∂ß ‡∑É‡∂Ω‡∑è‡∂≠‡∑ä ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏ ‡∂Ö‡∂¥‡∑Ñ‡∑É‡∑î ‡∂±‡∂∏‡∑ä ‡∂ö‡∂Ω‡∑è ‡∂ö‡∑Ö ‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂∫.
2. ‡∑Ü‡∂¢‡∑ä‡∂ª‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫ ‡∂ö‡∑ô‡∂ß‡∑í ‡∂ö‡∂ª ‡∑Ñ‡∑ù ‡∂ë‡∂ö‡∂≠‡∑î ‡∂ö‡∂ª ‡∂â‡∂ß‡∑î‡∂ö‡∑Ö ‡∂±‡∑ú‡∑Ñ‡∑ê‡∂ö‡∑í ‡∂∫.
3. ‚Äò‡∂¢‡∂∏‡∑ä‡∂ã ‡∂≠‡∂ö‡∑ä‡∂Ø‡∑ì‡∂∏‡∑ä‚Äô ‡∂∫‡∂±‡∑î ‡∂¥‡∑ô‡∂ª‡∂ß‡∑î ‡∂ö‡∂ª ‡∑É‡∂Ω‡∑è‡∂≠‡∑ä ‡∂â‡∂ß‡∑î ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏‡∂∫‡∑í.
4. ‡∂ú‡∂∏‡∂±‡∑ô‡∑Ñ‡∑í ‡∂∫‡∑ô‡∂Ø‡∑ô‡∂±‡∑ä‡∂±‡∂±‡∑ä ‡∂ö‡∑í‡∑É‡∑í ‡∂∏ ‡∑É‡∑î‡∂±‡∑ä‡∂±‡∂≠‡∑ä ‡∑É‡∂Ω‡∑è‡∂≠‡∂∫‡∂ö‡∑ä ‡∂â‡∂ß‡∑î ‡∂ö‡∑í‡∂ª‡∑ì‡∂∏ ‡∂Ö‡∑Ä‡∑Å‡∑ä‚Äç‡∂∫ ‡∂±‡∑ú‡∑Ä‡∑ö.

(‡∂Ü) ‡∂î‡∂∂ ‡∂¢‡∑ì‡∑Ä‡∑í‡∂≠‡∂∫‡∑ö ‡∂ö‡∑É‡∑ä‡∂ª‡∑î ‡∂¢‡∂∏‡∑ä‡∂ã ‡∂ö‡∑Ö ‡∂Ö‡∂≠‡∑ä‡∂Ø‡∑ê‡∂ö‡∑ì‡∂∏‡∂ö‡∑ä ‡∂ö‡∑ô‡∂ß‡∑í‡∂∫‡∑ô‡∂±‡∑ä ‡∑Ä‡∑í‡∂ú‚Äç‡∑ä‚Äç‡∂ª‡∑Ñ ‡∂ö‡∂ª‡∂±‡∑ä‡∂±.
"""

# Score the prediction against the ground truth with the default header threshold.
evaluator = MarkdownEvaluator()
metrics = evaluator.evaluate(gt, pred)

# 'final_score' is the mean score over all ground-truth sections.
print(f"Overall Score: {metrics['final_score']:.4f}")
print("-" * 40)
# One entry per ground-truth section: its key, its score and any difference notes.
for m in metrics['matches']:
    print(f"Section: {m['section']}")
    print(f"Score:   {m['score']:.4f}")
    # 'diff' is an empty string when the section matched exactly.
    if m['diff']:
        print(f"Errors:  {m['diff']}")

Overall Score: 0.9989
----------------------------------------
Section: Whole Document
Score:   0.9989
Errors:  EXTRA in Pred: ' ' | EXTRA in Pred: '('
