In [1]:
import re

def clean_term(term):
    term = re.sub(r'^\d+\.\s*', '', term)
    term = term.strip(" `'\t\n\r")
    http_match = re.search(r'(http[s]?://.+)', term)
    return http_match.group(1).strip() if http_match else term

def extract_equivalence_lines(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return [line.strip() for line in lines if '≡' in line]

def extract_lhs_rhs_from_lines(eq_lines):
    raw_lhs = []
    raw_rhs = []

    for line in eq_lines:
        match = re.search(r'`([^`]+)`\s*≡\s*`([^`]+)`', line)
        if not match:
            match = re.search(r'(.+?)\s*≡\s*(.+)', line)

        if match:
            lhs_raw = match.group(1)
            rhs_raw = match.group(2)
            raw_lhs.append(clean_term(lhs_raw))
            raw_rhs.append(clean_term(rhs_raw))

    return sorted(set(raw_lhs)), sorted(set(raw_rhs))

# file_path = '/content/drive/MyDrive/EnslavedOM/Outputs1/alignment_output_AgeRecordRules_3.txt'  # Use your actual path
# eq_lines = extract_equivalence_lines(file_path)
# lhs_list, rhs_list = extract_lhs_rhs_from_lines(eq_lines)

# # === Clean Output ===
# print("LHS Elements:")
# for item in lhs_list:
#     print(f"- {item}")

# print("\nRHS Elements:")
# for item in rhs_list:
#     print(f"- {item}")


In [2]:
def parse_reference_rule(rule):
    lhs, rhs = rule.split("↔")
    lhs_atoms = [atom.strip() for atom in lhs.strip().split("?")]
    rhs_atoms = [atom.strip() for atom in rhs.strip().split("?")]
    return lhs_atoms, rhs_atoms

In [3]:
def extract_name(atom):
    match = re.search(r':\s*([^(]+)', atom)
    return match.group(1).strip() if match else None

In [4]:
def insert_arrow_before_right_side(rule_line):
    parts = rule_line.strip().split('?')
    left_parts = []
    right_parts = []
    found_right = False

    for part in parts:
        part = part.strip()
        if not found_right and (part.startswith('ed') or part.startswith('ep') or part.startswith('wikibase')):
            found_right = True
        if found_right:
            right_parts.append(part)
        else:
            left_parts.append(part)

    return ' ? '.join(left_parts) + ' ↔ ' + ' ? '.join(right_parts)


In [7]:
import os
import pandas as pd
data = []
df = pd.read_csv('EnslavedR.csv')
textfile_dir = 'Outputs'
col_names = df.columns
print(col_names)
for name in col_names:
  print("col name",name)
  cname = df[name]
  for i in range (cname.count()):
      print(cname[i])
      f = "alignment_output_" +str(name)+"_"+str(i+1)+".txt"
      file_path = os.path.join(textfile_dir, f)
      eq_lines = extract_equivalence_lines(file_path)
      lhs_list, rhs_list = extract_lhs_rhs_from_lines(eq_lines)


      formatted_rule = insert_arrow_before_right_side(cname[i])
      ref_lhs, ref_rhs = parse_reference_rule(formatted_rule)
      print(rhs_list)
      cleaned_lhs = [extract_name(a) for a in ref_lhs]
      cleaned_rhs = [extract_name(a) for a in ref_rhs]

      count = 0
      for i in range(len(cleaned_lhs)):
          if cleaned_lhs[i] in rhs_list:
            count+=1


      data.append({
                'rule_name': cname[i],
                'file_name': f,
                'matched': count,
                'len(generated)': len(rhs_list),
                'len(reference)': len(ref_lhs)
            })
rules_df = pd.DataFrame(data)

rules_df.to_csv('rules_evaluation2.csv', index=False)


Index(['AgeRecordRules', 'SexRecordRules', 'NameRecordRules',
       'PersonStatusRecordRules', 'ParticipantRoleRecordRules',
       'OccupationRecordRules', 'RaceRecordRules',
       'InterAgentRelationshipRecordRules', 'OriginRecordRules',
       'ResearchProjectRules', 'PlaceRules', 'EventRules',
       'EntityWithProvenanceRules'],
      dtype='object')
col name AgeRecordRules
enslaved : Person(x ) ? ed : Q410(Person)(x )
['Person']
enslaved : AgeCategory(x) ? ed : Q412(AgeCategory)(x)
[]
enslaved : Person(x) ? enslaved : hasAgeRecord(x, y) ? enslaved : AgeRecord(y) ? ed : Q410(Person)(x) ? ep : P42(hasAge)(x, y) ? wikibase : Statement(y)
['AgentRecord', 'Person', 'hasAgeRecord']
enslaved : Person(w) ? enslaved : hasAgeRecord(w, x) ? enslaved : AgeRecord(x) ? enslaved : hasAgeValue(x, z) ? ed : Q410(Person)(w) ? ep : P42(hasAge)(w, x) ? wikibase : Statement(x) ? eps : P42(hasAge)(x, y)? ed : Q424(AgeRecord)(y) ? edt : P3(hasAgeValue)(y, z)
['AgeRecord` (Enslaved)', 'AgentRecord` (E

In [9]:
precision = rules_df['matched'].sum() / rules_df['len(generated)'].sum()
recall = rules_df['matched'].sum() / rules_df['len(reference)'].sum()

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Precision: 0.2646
Recall: 0.2831
