In [31]:
import os
import json
import cv2
from sklearn.metrics import precision_score, recall_score, f1_score
import difflib
from fuzzywuzzy import fuzz




In [32]:
# Path configurations
# Dataset root; currently disabled together with SCANNED_IMAGES_DIR below.
#DATASET_DIR = "/path/to/ICDAR2019/dataset"
# Directory that main() scans for ground-truth annotation files.
GROUND_TRUTH_DIR = "output/groundtruth"  # Update if structure differs
#SCANNED_IMAGES_DIR = os.path.join(DATASET_DIR, "images")
# Directory where main() looks for "<base_name>_gpt_extracted.json" prediction files.
PREDICTIONS_DIR = "output/extracted"

# Load ground truth
def load_ground_truth(file_path):
    """Load one ground-truth annotation file and return its parsed JSON content.

    Args:
        file_path: Path to a file containing a JSON document.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform's locale encoding, which can mangle or reject
    # non-ASCII text on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def _flatten_prediction_fields(node, parent_key, flattened):
    """Recursively copy the leaf values of the dict ``node`` into ``flattened``.

    Nested keys are joined with ``_`` (``{"a": {"b": 1}}`` -> ``"a_b"``).
    Keys are lower-cased; values are stringified, lower-cased and stripped,
    with ``None`` stored as the empty string.
    """
    for key, value in node.items():
        full_key = f"{parent_key}_{key}" if parent_key else key
        if isinstance(value, dict):
            _flatten_prediction_fields(value, full_key, flattened)
        else:
            flattened[full_key.lower()] = str(value).lower().strip() if value is not None else ""


def load_predictions(file_path):
    """Load a prediction JSON file and flatten it into a single-level dict.

    Args:
        file_path: Path to the prediction JSON file.

    Returns:
        A dict mapping lower-cased, ``_``-joined field names to normalised
        string values, or ``None`` when the file does not exist or its
        top-level JSON value is not an object (so there are no named fields
        to flatten).

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not os.path.exists(file_path):
        return None

    # Decode explicitly as UTF-8 rather than the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # A list/scalar at the top level has no .items(); report it the same way
    # as a missing file instead of crashing with AttributeError.
    if not isinstance(data, dict):
        return None

    flattened = {}
    _flatten_prediction_fields(data, '', flattened)
    return flattened

    

In [33]:
def similarity_score(str1, str2):
    """Return the fuzzy-match score of two strings, rescaled to the [0, 1] range.

    The score comes from ``fuzz.ratio`` and is divided by 100, so small
    formatting differences yield a value close to, but below, 1.0.
    """
    percent_score = fuzz.ratio(str1, str2)
    normalized_score = percent_score / 100.0
    return normalized_score

def _normalise_field_value(value):
    """Lower-case and strip a value's string form; ``None`` becomes ``""``.

    This mirrors the normalisation ``load_predictions`` applies, so a missing
    value compares as empty on both sides instead of as the text "none".
    """
    return "" if value is None else str(value).lower().strip()


def evaluate(ground_truth, predictions, threshold=0.7):
    """Evaluates predictions with leniency for minor formatting issues.

    Field names are compared case-insensitively on the ground-truth side.
    A field present in both mappings counts as a match when
    ``similarity_score`` of the two normalised values is at least
    ``threshold``.

    Args:
        ground_truth: Mapping of field name -> expected value.
        predictions: Mapping of field name -> normalised predicted string,
            as returned by ``load_predictions`` (keys are used as given).
        threshold: Minimum similarity in [0, 1] for a field to count as
            matched.

    Returns:
        A tuple ``(results, avg_precision, avg_recall, avg_f1)``.
        ``results`` maps each field found in both inputs to a dict with the
        keys ``precision``, ``recall``, ``f1``, ``similarity``, ``gt_value``
        and ``pred_value``, in alphabetical field order. ``avg_precision`` is
        the matched share of the compared fields; ``avg_recall`` is the
        matched share of all ground-truth fields, so a field missing from the
        predictions lowers recall. ``avg_f1`` is the harmonic mean of the two
        averages. When no field is shared, ``({}, 0, 0, 0)`` is returned
        after printing a warning.
    """
    gt_processed = {k.lower(): _normalise_field_value(v) for k, v in ground_truth.items()}

    # Sort so the per-field report order is reproducible; set iteration
    # order for strings varies between interpreter runs.
    common_fields = sorted(set(gt_processed) & set(predictions))

    if not common_fields:
        print("Warning: No common fields found between ground truth and predictions")
        return {}, 0, 0, 0

    results = {}
    for field in common_fields:
        gt_value = gt_processed[field]
        pred_value = predictions[field]

        similarity = similarity_score(gt_value, pred_value)
        match = 1 if similarity >= threshold else 0

        # Per field the outcome is binary, so precision, recall and their
        # harmonic mean (F1) all equal the match flag.
        results[field] = {
            'precision': match,
            'recall': match,
            'f1': float(match),
            'similarity': similarity,
            'gt_value': gt_value,
            'pred_value': pred_value
        }

    matched = sum(entry['precision'] for entry in results.values())

    # Precision: correct predictions among the predictions that were compared.
    # Predicted fields with no ground-truth counterpart are not penalised
    # because there is nothing to compare them against.
    avg_precision = matched / len(common_fields)
    # Recall: correct predictions among ALL ground-truth fields, so fields the
    # predictor failed to produce count as misses.
    avg_recall = matched / len(gt_processed)

    # Calculate overall F1 using the averages
    if avg_precision + avg_recall > 0:
        avg_f1 = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall)
    else:
        avg_f1 = 0

    return results, avg_precision, avg_recall, avg_f1

In [34]:
# Main function
def main():
    """Evaluate every ground-truth file against its prediction and print a report.

    For each ``*.txt`` file in ``GROUND_TRUTH_DIR`` the matching
    ``<base_name>_gpt_extracted.json`` in ``PREDICTIONS_DIR`` is loaded and
    scored with ``evaluate``. Files without usable predictions are reported
    and skipped. Per-field details and per-file averages are printed, followed
    by the mean precision, recall and F1 across all evaluated files.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sort so the report is
    # identical from run to run.
    gt_files = sorted(f for f in os.listdir(GROUND_TRUTH_DIR) if f.endswith('.txt'))

    for gt_file in gt_files:
        base_name = os.path.splitext(gt_file)[0]
        gt_path = os.path.join(GROUND_TRUTH_DIR, gt_file)
        pred_path = os.path.join(PREDICTIONS_DIR, f"{base_name}_gpt_extracted.json")

        if not os.path.exists(pred_path):
            print(f"Missing predictions for {base_name}. Skipping...")
            continue

        ground_truth = load_ground_truth(gt_path)
        predictions = load_predictions(pred_path)

        # load_predictions signals an unusable file by returning None.
        if predictions is None:
            print(f"Error loading predictions for {base_name}. Skipping...")
            continue

        field_results, avg_precision, avg_recall, avg_f1 = evaluate(ground_truth, predictions)
        results.append((base_name, field_results, avg_precision, avg_recall, avg_f1))

    print("\nDetailed Results:")
    print("-" * 100)
    for file_name, field_results, avg_p, avg_r, avg_f1 in results:
        print(f"\nFile: {file_name}")
        for field, metrics in field_results.items():
            print(f"\n{field}:")
            print(f"GT:   {metrics['gt_value']}")
            print(f"Pred: {metrics['pred_value']}")
            print(f"Similarity: {metrics['similarity']:.2f}")
            print(f"Metrics: Precision={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}, F1={metrics['f1']:.2f}")
        print(f"\nAverage: Precision={avg_p:.2f}, Recall={avg_r:.2f}, F1={avg_f1:.2f}")
        print("-" * 100)

    # Overall figures are unweighted means of the per-file averages.
    if results:
        overall_precision = sum(r[2] for r in results) / len(results)
        overall_recall = sum(r[3] for r in results) / len(results)
        overall_f1 = sum(r[4] for r in results) / len(results)
        print("\nOverall Metrics:")
        print(f"Precision: {overall_precision:.4f}")
        print(f"Recall: {overall_recall:.4f}")
        print(f"F1 Score: {overall_f1:.4f}")

In [35]:
# Run the evaluation only when this code is executed as the main program
# (which includes running the notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


Detailed Results:
----------------------------------------------------------------------------------------------------

File: X00016469612

date:
GT:   25/12/2018
Pred: 25/12/2018
Similarity: 1.00
Metrics: Precision=1.00, Recall=1.00, F1=1.00

company:
GT:   book ta .k (taman daya) sdn bhd
Pred: book ta ktamandaya sdn bhd
Similarity: 0.91
Metrics: Precision=1.00, Recall=1.00, F1=1.00

address:
GT:   no.53 55,57 & 59, jalan sagu 18, taman daya, 81100 johor bahru, johor.
Pred: no.555,57&59jalansagu18, taman daya, 81100 johor bahru, johor.
Similarity: 0.94
Metrics: Precision=1.00, Recall=1.00, F1=1.00

total:
GT:   9.00
Pred: 9.0
Similarity: 0.86
Metrics: Precision=1.00, Recall=1.00, F1=1.00

Average: Precision=1.00, Recall=1.00, F1=1.00
----------------------------------------------------------------------------------------------------

File: X00016469619

date:
GT:   19/10/2018
Pred: 19/10/2018
Similarity: 1.00
Metrics: Precision=1.00, Recall=1.00, F1=1.00

company:
GT:   indah gift & 