# Evaluation Logic Test

This notebook tests the `HungarianEvaluator` (on a single section) and the `FullResumeEvaluator` (on whole resumes) using the processed Kaggle dataset.

In [None]:
import sys
import os
import json

# Add project root to path
sys.path.append(os.path.abspath("../.."))

from core.parsing.evaluators.evaluator import HungarianEvaluator
from core.parsing.evaluators import  FullResumeEvaluator


In [2]:
def load_json(path):
    """Read the JSON file at ``path`` and return the decoded Python object.

    The file is opened explicitly as UTF-8 so that non-ASCII resume text
    decodes the same way on every platform, instead of depending on the
    locale's default encoding.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Directory holding the sample resume JSON files (relative to the notebook's working directory).
data_dir = "tests_data/resume_and_texts_kaggle/some"

# Collect every *.json file name in that directory, in sorted order so the
# selection made further down is deterministic.
files = sorted(
    entry for entry in os.listdir(data_dir) if entry.endswith(".json")
)

print(f"Found {len(files)} JSON files.")
print(files)

Found 7 JSON files.
['ADVOCATE_14445309.json', 'BUSINESS-DEVELOPMENT_65708020.json', 'DESIGNER_37058472.json', 'HEALTHCARE_23617240.json', 'HR_16852973.json', 'INFORMATION-TECHNOLOGY_36856210.json', 'TEACHER_12467531.json']


In [3]:
# Use the first two files (in sorted order) as ground truth and prediction.
# Fail fast when fewer than two files are available: the cells below need
# both `gt_data` and `pred_data`, and merely printing a message here would
# let them fail later with an unrelated NameError.
if len(files) < 2:
    raise RuntimeError(
        f"Not enough files to compare: found {len(files)} JSON file(s) "
        f"in {data_dir!r}, need at least 2."
    )

# Select two different files to compare
file_gt, file_pred = files[0], files[1]

path_gt = os.path.join(data_dir, file_gt)
path_pred = os.path.join(data_dir, file_pred)

print(f"Ground Truth: {file_gt}")
print(f"Predicted:    {file_pred}")

gt_data = load_json(path_gt)
pred_data = load_json(path_pred)

Ground Truth: ADVOCATE_14445309.json
Predicted:    BUSINESS-DEVELOPMENT_65708020.json


In [4]:
# Comparison settings for the 'work' section.
# "fields" maps each compared field to the name of a comparison method;
# "key_fields" is presumably the subset used to pair ground-truth entries
# with predicted ones — TODO confirm against HungarianEvaluator.
config = dict(
    key_fields=["name", "position"],
    fields=dict(
        name="substring",
        position="substring",
        startDate="date",
        summary="text_similarity",
    ),
)

# Evaluator under test.
evaluator = HungarianEvaluator()

# Pull the 'work' entries from both resumes; a missing section counts as empty.
gt_work, pred_work = (
    resume.get("work", []) for resume in (gt_data, pred_data)
)

print(f"GT Work Items: {len(gt_work)}")
print(f"Pred Work Items: {len(pred_work)}")

GT Work Items: 2
Pred Work Items: 3


In [5]:
# Score the predicted 'work' entries against the ground-truth ones.
metrics = evaluator.evaluate_section(gt_work, pred_work, config)

# Show the result as indented JSON beneath a heading line.
print("Evaluation Metrics:", json.dumps(metrics, indent=2), sep="\n")

Evaluation Metrics:
{
  "precision": 0.6666666666666666,
  "recall": 1.0,
  "f1": 0.8,
  "field_scores": {
    "name": 0.5,
    "position": 0.0,
    "startDate": 0.0,
    "summary": 1.0
  }
}


In [6]:
# Scratch cell: uncomment the line below to display both loaded resumes.
# gt_data, pred_data

In [7]:
# Run full resume evaluation (ground truth vs. a different resume).
# NOTE: this rebinds `evaluator`, which previously held the HungarianEvaluator.
evaluator = FullResumeEvaluator()

results = evaluator.evaluate_resume(gt_data, pred_data)
overall = evaluator.compute_overall_metrics(results)

# Display results for each section.
# The loop variable is deliberately not called `metrics`, so the result of the
# work-section evaluation cell is not overwritten by the last section here.
for section, section_metrics in results.items():
    print(f"\n{section.upper()}:")
    print(f"  Precision: {section_metrics['precision']:.2f}")
    print(f"  Recall:    {section_metrics['recall']:.2f}")
    print(f"  F1:        {section_metrics['f1']:.2f}")

    # Per-field scores are optional; skip the sub-list when absent or empty.
    field_scores = section_metrics.get('field_scores')
    if field_scores:
        print("  Field Scores:")
        for field, score in field_scores.items():
            print(f"    - {field}: {score:.2f}")

print("\n" + "=" * 60)
print("Full resume evaluation complete!")
print("Overall Metrics:", overall)


BASICS:
  Precision: 1.00
  Recall:    1.00
  F1:        0.72
  Field Scores:
    - name: 1.00
    - label: 0.29
    - email: 1.00
    - phone: 1.00
    - url: 1.00
    - summary: 0.06

WORK:
  Precision: 0.67
  Recall:    1.00
  F1:        0.80
  Field Scores:
    - name: 0.50
    - position: 0.00
    - startDate: 0.00
    - endDate: 0.00
    - summary: 1.00

EDUCATION:
  Precision: 1.00
  Recall:    0.33
  F1:        0.50
  Field Scores:
    - institution: 0.00
    - area: 0.00
    - studyType: 0.00
    - startDate: 1.00
    - endDate: 0.00

SKILLS:
  Precision: 1.00
  Recall:    0.86
  F1:        0.92
  Field Scores:
    - name: 0.00
    - level: 1.00

PROJECTS:
  Precision: 0.00
  Recall:    0.00
  F1:        0.00
  Field Scores:
    - name: 0.00
    - description: 0.00
    - startDate: 0.00
    - endDate: 0.00

AWARDS:
  Precision: 1.00
  Recall:    1.00
  F1:        1.00
  Field Scores:
    - title: 0.00
    - date: 1.00
    - awarder: 0.00
    - summary: 1.00

CERTIFICATES:
  P

# Full resume evaluation for two identical JSONs

In [8]:
# Run full resume evaluation with the ground truth compared against itself.
# NOTE(review): in the recorded output PROJECTS still reports 0.00 for
# identical inputs — presumably that section is empty in this resume; confirm
# whether an empty-vs-empty section is meant to score 0 rather than 1.
evaluator = FullResumeEvaluator()

results = evaluator.evaluate_resume(gt_data, gt_data)
overall = evaluator.compute_overall_metrics(results)

# Display results for each section.
# The loop variable is deliberately not called `metrics`, so the result of the
# work-section evaluation cell is not overwritten by the last section here.
for section, section_metrics in results.items():
    print(f"\n{section.upper()}:")
    print(f"  Precision: {section_metrics['precision']:.2f}")
    print(f"  Recall:    {section_metrics['recall']:.2f}")
    print(f"  F1:        {section_metrics['f1']:.2f}")

    # Per-field scores are optional; skip the sub-list when absent or empty.
    field_scores = section_metrics.get('field_scores')
    if field_scores:
        print("  Field Scores:")
        for field, score in field_scores.items():
            print(f"    - {field}: {score:.2f}")

print("\n" + "=" * 60)
print("Full resume evaluation complete!")
print("Overall Metrics:", overall)


BASICS:
  Precision: 1.00
  Recall:    1.00
  F1:        1.00
  Field Scores:
    - name: 1.00
    - label: 1.00
    - email: 1.00
    - phone: 1.00
    - url: 1.00
    - summary: 1.00

WORK:
  Precision: 1.00
  Recall:    1.00
  F1:        1.00
  Field Scores:
    - name: 1.00
    - position: 1.00
    - startDate: 1.00
    - endDate: 1.00
    - summary: 1.00

EDUCATION:
  Precision: 1.00
  Recall:    1.00
  F1:        1.00
  Field Scores:
    - institution: 1.00
    - area: 1.00
    - studyType: 1.00
    - startDate: 1.00
    - endDate: 1.00

SKILLS:
  Precision: 1.00
  Recall:    1.00
  F1:        1.00
  Field Scores:
    - name: 1.00
    - level: 1.00

PROJECTS:
  Precision: 0.00
  Recall:    0.00
  F1:        0.00
  Field Scores:
    - name: 0.00
    - description: 0.00
    - startDate: 0.00
    - endDate: 0.00

AWARDS:
  Precision: 1.00
  Recall:    1.00
  F1:        1.00
  Field Scores:
    - title: 1.00
    - date: 1.00
    - awarder: 1.00
    - summary: 1.00

CERTIFICATES:
  P