# Synthetic tables

In [160]:
import os
import pypdfium2 as pdfium

pdf_dir = "../../benchmark_truth/synthetic_tables/separate_files"
pdf_texts = []

for filename in os.listdir(pdf_dir):
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(pdf_dir, filename)
        doc = pdfium.PdfDocument(file_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.get_page(page_num)
            text += page.get_textpage().get_text_range()
        pdf_texts.append({'filepath': file_path, 'text': text})

# pdf_texts now contains the extracted text for each PDF file



In [161]:
aktiva_structure_hgb = {
    'Anlagevermögen': {
        'Immaterielle Vermögensgegenstände': [
            'Selbst geschaffene gewerbliche Schutzrechte und ähnliche Rechte und Werte',
            'Geschäfts- oder Firmenwert',
            'geleistete Anzahlungen',
            'entgeltlich erworbene Konzessionen, gewerbliche Schutzrechte und ähnliche Rechte und Werte sowie Lizenzen an solchen Rechten und Werten'
        ], 
        'Sachanlagen': [
            'Grundstücke, grundstücksgleiche Rechte und Bauten einschließlich der Bauten auf fremden Grundstücken',
            'Technische Anlagen und Maschinen',
            'Andere Anlagen, Betriebs- und Geschäftsausstattung',
            'geleistete Anzahlungen und Anlagen im Bau'
        ],
        'Finanzanlagen': [
            'Sonstige Finanzanlagen',
            'Anteile an verbundenen Unternehmen',
            'Ausleihungen an verbundene Unternehmen',
            'Beteiligungen',
            'Ausleihungen an Unternehmen, mit denen ein Beteiligungsverhältnis besteht',
            'Wertpapiere des Anlagevermögens',
            'Sonstige Ausleihungen'
        ]
    },
    'Umlaufvermögen': {
        'Vorräte': [
            'Roh-, Hilfs- und Betriebsstoffe',
            'Unfertige Erzeugnisse, unfertige Leistungen',
            'Fertige Erzeugnisse und Waren',
            'Geleistete Anzahlungen'
        ],
        'Forderungen und sonstige Vermögensgegenstände': [
            'Forderungen aus Lieferungen und Leistungen',
            'Forderungen gegen verbundene Unternehmen',
            'Forderungen gegen Unternehmen, mit denen ein Beteiligungsverhältnis besteht',
            'Sonstige Vermögensgegenstände'
        ],
        'Wertpapiere': [
            'Anteile an verbundenen Unternehmen',
            'Sonstige Wertpapiere'
        ],
        'Kassenbestand, Bundesbankguthaben, Guthaben bei Kreditinstituten und Schecks': []
    },
    'Rechnungsabgrenzungsposten': dict(),
    'Aktive latente Steuern': dict(),
    'Aktiver Unterschiedsbetrag aus der Vermögensverrechnung': dict()
}

In [162]:
entry = pdf_texts[0]
first_text = entry['text']
print(first_text)

Aktiva 31.12.2017 31.12.2016
Tsd. € Tsd. € Tsd. € Tsd. €
A. Anlagevermögen
I. Immaterielle Vermögensgegenstände
1. Selbst geschaffene gewerbliche Schutzrechte und ähnliche Rechte und Werte 1.000,23 7.337,60
2. Geschäfts- oder Firmenwert 6.779,06 3.317,23
3. geleistete Anzahlungen 1.019,10 8.777,59
4. entgeltlich erworbene Konzessionen, gewerbliche Schutzrechte und ähnliche Rechte und Werte sowie Lizenzen an
solchen Rechten und Werten
8,41 2.829,97
8.806,80 22.262,39
II. Sachanlagen
1. Grundstücke, grundstücksgleiche Rechte und Bauten einschließlich der Bauten auf fremden Grundstücken 4.705,04 6.603,19
2. Technische Anlagen und Maschinen 8.166,37 4.283,97
3. Andere Anlagen, Betriebs- und Geschäftsausstattung 35,77 2.611,16
4. geleistete Anzahlungen und Anlagen im Bau 6.856,40 9.846,82
19.763,58 23.345,14
III. Finanzanlagen
1. Sonstige Finanzanlagen 5.493,46 8.893,84
2. Anteile an verbundenen Unternehmen 4.366,63 9.122,83
3. Ausleihungen an verbundene Unternehmen 8.879,80 148,39
4. Ausle

In [163]:
import re

def extract_numbers_by_key(text, key):
    # Find all numbers in the text
    # Match numbers with optional thousands separators and decimal part
    # Search for the key followed by numbers, and extract the substring starting from the key
    text = text.replace('\n', ' ').replace('\r', '')  # Replace newlines with spaces for better regex matching
    if key not in text:
        # print(f"Key '{key}' not found in text.")
        pass
    match = re.search(rf'({key}\s*((?:\d{{1,3}}(?:[\s.]\d{{3}})*[,]\d+[\s]*)+))', text)
    if match:
        numbers_str = match.group(1)
        numbers = re.findall(r'\d{1,3}(?:[.,]\d{3})*[.,]\d+', numbers_str)
    else:
        numbers = []
    # print(f"Extracted numbers for key '{key}': {numbers}")
    return [float(num.replace('.', '').replace(',', '.')) for num in numbers]  # Replace comma with dot for float conversion

def add_row(extracted_rows, key1, key2, item, numbers):
    if len(numbers) > 0:
        if len(numbers) > 1:
            extracted_rows.append({'E1': key1, 'E2': key2, 'E3': item, 'year': numbers[0], 'previous_year': numbers[1]})
            if len(numbers) > 2:
                # print(f"Warning: More than two numbers found for {item}. Only the first two will be used.")
                pass
        else:
            extracted_rows.append({'E1': key1, 'E2': key2, 'E3': item, 'year': numbers[0], 'previous_year': None})
    else:
        extracted_rows.append({'E1': key1, 'E2': key2, 'E3': item, 'year': None, 'previous_year': None})

def extract_numbers(text):
    extracted_rows = []

    for key1, value1 in aktiva_structure_hgb.items():
        for key2, value2 in value1.items():
            for item in value2:
                # print(f"{key1} > {key2} > {item}")
                numbers = extract_numbers_by_key(text, item)
                # print(f"Numbers found: {numbers}")
                add_row(extracted_rows, key1, key2, item, numbers)
                numbers = []
            if len(value2) == 0:
                # print(f"{key1} > {key2} > (no items)")
                numbers = extract_numbers_by_key(text, key2)
                # print(f"Numbers found: {numbers}")
                add_row(extracted_rows, key1, key2, None, numbers)
                numbers = []
        if len(value1) == 0:
            # print(f"{key1} > (no subcategories)")
            numbers = extract_numbers_by_key(text, key1)
            # print(f"Numbers found: {numbers}")
            add_row(extracted_rows, key1, None, None, numbers)
            numbers = []

    return extracted_rows

In [164]:
import pandas as pd

extracted_rows = extract_numbers(first_text)
df_result = pd.DataFrame(extracted_rows)
df_result


Unnamed: 0,E1,E2,E3,year,previous_year
0,Anlagevermögen,Immaterielle Vermögensgegenstände,Selbst geschaffene gewerbliche Schutzrechte un...,1000.23,7337.6
1,Anlagevermögen,Immaterielle Vermögensgegenstände,Geschäfts- oder Firmenwert,6779.06,3317.23
2,Anlagevermögen,Immaterielle Vermögensgegenstände,geleistete Anzahlungen,1019.1,8777.59
3,Anlagevermögen,Immaterielle Vermögensgegenstände,"entgeltlich erworbene Konzessionen, gewerblich...",8.41,2829.97
4,Anlagevermögen,Sachanlagen,"Grundstücke, grundstücksgleiche Rechte und Bau...",4705.04,6603.19
5,Anlagevermögen,Sachanlagen,Technische Anlagen und Maschinen,8166.37,4283.97
6,Anlagevermögen,Sachanlagen,"Andere Anlagen, Betriebs- und Geschäftsausstat...",35.77,2611.16
7,Anlagevermögen,Sachanlagen,geleistete Anzahlungen und Anlagen im Bau,6856.4,9846.82
8,Anlagevermögen,Finanzanlagen,Sonstige Finanzanlagen,5493.46,8893.84
9,Anlagevermögen,Finanzanlagen,Anteile an verbundenen Unternehmen,4366.63,9122.83


In [165]:
import json
import nltk

ebnf_rows = pd.read_csv("../../benchmark_truth/real_tables/ebnf_rows.csv")

def parse_json(string):
    # Remove code block markers and extra whitespace, then parse as JSON
    json_str = string.strip()
    if json_str.startswith("```json"):
        json_str = json_str[len("```json"):].strip()
    if json_str.endswith("```"):
        json_str = json_str[:-3].strip()
    parsed_json = json.loads(json_str)
    return parsed_json


def replace_special_characters(text):
    # Replace German special characters in the input text
    return (
        text.replace("ä", "ae")
        .replace("ö", "oe")
        .replace("ü", "ue")
        .replace("Ä", "Ae")
        .replace("Ö", "Oe")
        .replace("Ü", "Ue")
        .replace("ß", "ss")
    )

def get_json_string(df, multiplier=1):
    df_rounded = df.copy()
    df_rounded = ebnf_rows.merge(df_rounded, how="left", on=["E1", "E2", "E3"])
    if len(df_rounded.columns) >= 2:
        last_two = df_rounded.columns[-2:]
        df_rounded = df_rounded.rename(columns={last_two[-2]: "year", last_two[-1]: "previous_year"})
    for col in ["year", "previous_year"]:
        if col in df_rounded.columns:
            df_rounded[col] = pd.to_numeric(df_rounded[col], errors='coerce')
            df_rounded[col] = df_rounded[col].apply(lambda x: f"{x/multiplier:.2f}" if pd.notnull(x) else x)
    # print(df_rounded.head(5))
    # print(df_rounded.shape[0], "rows in the dataframe after merging with EBNF rows.")

    s = df_rounded.to_json(orient='records', indent=0, force_ascii=False)#.replace("null", '"null"')
    s_fixed = re.sub(r'("year":)"([0-9\.\-e]+)"', r'\1\2', s)
    s_fixed = re.sub(r'("previous_year":)"([0-9\.\-e]+)"', r'\1\2', s_fixed)
    json_str = replace_special_characters(s_fixed)

    prev_entry = ",{\"E1\":\"Anlagevermoegen\",\"E2\":\"Finanzanlagen\",\"E3\":\"Ausleihungen an Unternehmen, mit denen ein Beteiligungsverhaeltnis besteht\""
    beteilingungen_entry = ",{\"E1\":\"Anlagevermoegen\",\"E2\":\"Finanzanlagen\",\"E3\":\"Beteiligungen\""

    if beteilingungen_entry not in json_str:
        json_str = json_str.replace(prev_entry, beteilingungen_entry + ',"year":null,"previous_year":null}' + prev_entry)
        # print(f"Ground truth JSON string: \n{json_str}")
    
    return json_str

unit_list = {
    'EUR': 1, 
    '€': 1, 
    'Tsd. EUR': 1000, 
    'Mio. EUR': 1000000, 
    'TEUR': 1000, 
    'T€': 1000, 
    'Tsd. €': 1000, 
    'Mio. €': 1000000
}

def evaluate_single_result(df_result, entry):
    evaluation = {
        'json_error': False,
        'grammar_error': False,
    }

    entry_filepath = entry.get('filepath', 'unknown').replace('.pdf', '.csv')
    if '.csv' in entry_filepath:
        df = pd.read_csv(entry_filepath)
    elif '.xlsx' in entry_filepath:
        df = pd.read_excel(entry_filepath)
        
    df = df.map(lambda x: replace_special_characters(x) if isinstance(x, str) else x)

    unit = entry_filepath.split('_')[-4]
    multiplier = unit_list.get(unit, 1)

    # rounds values matching with the one in the pdfs (synthetic tables important)
    # multiplier = 1  # Default multiplier, can be adjusted based on the unit in the entry
    json_str = get_json_string(df, multiplier=multiplier)

    try:
        truth = parse_json(json_str)
    except json.JSONDecodeError as e:
        raise ValueError(f"Error parsing JSON for truth: {json_str}\nError message: {e}")
    # pprint(truth)
    df_truth = pd.DataFrame(truth)
    # df_truth = df_truth.map(lambda x: x.lower() if isinstance(x, str) else x)
    # print(truth)

    df_result = df_result.copy()
    df_result = df_result.map(lambda x: replace_special_characters(x) if isinstance(x, str) else x)
    # print(df_result)

    # Join the ground truth and result dataframes on E1, E2, and E3 for comparison
    df_joined = pd.merge(
        df_truth,
        df_result,
        on=["E1", "E2", "E3"],
        how="left",
        suffixes=("_truth", "_result"),
        indicator=True
    )
    evaluation['df_joined'] = df_joined
    # print(df_joined[['E1', 'E2', 'E3', 'year_truth', 'year_result',]])

    evaluation['NA'] = {
        "true_positive": df_joined[(df_joined['_merge'] == 'both') & (df_joined['year_truth'].isna() & df_joined['year_result'].isna())].shape[0] + df_joined[(df_joined['_merge'] == 'both') & (df_joined['previous_year_truth'].isna() & df_joined['previous_year_result'].isna())].shape[0],
        "false_positive": df_joined[(df_joined['_merge'] == 'both') & (df_joined['year_truth'].notna() & df_joined['year_result'].isna())].shape[0] + df_joined[(df_joined['_merge'] == 'both') & (df_joined['previous_year_truth'].notna() & df_joined['previous_year_result'].isna())].shape[0],
        "false_negative": df_joined[(df_joined['_merge'] == 'both') & (df_joined['year_truth'].isna() & df_joined['year_result'].notna())].shape[0] + df_joined[(df_joined['_merge'] == 'both') & (df_joined['previous_year_truth'].isna() & df_joined['previous_year_result'].notna())].shape[0],
        "true_negative": df_joined[(df_joined['_merge'] == 'both') & (df_joined['year_truth'].notna() & df_joined['year_result'].notna())].shape[0] + df_joined[(df_joined['_merge'] == 'both') & (df_joined['previous_year_truth'].notna() & df_joined['previous_year_result'].notna())].shape[0]
    }

    df_year_non_na = df_joined[(df_joined['_merge'] == 'both') & (df_joined['year_truth'].notna() & df_joined['year_result'].notna())]
    df_year_non_na = df_year_non_na.drop(
        columns=[col for col in df_year_non_na.columns if 'previous_year' in col],
        errors='ignore'
    )
    try:
        df_year_non_na['relative_numeric_difference'] = (
            (df_year_non_na['year_result'] - df_year_non_na['year_truth']).abs() /
            df_year_non_na['year_truth'].abs().replace(0, 1)
        )
    except OverflowError:
        print(f"OverflowError calculating relative numeric difference")
        
    df_year_non_na['levenstein_distance'] = df_year_non_na.apply(
        lambda row: nltk.edit_distance(
        str(row['year_truth']), str(row['year_result'])
        ),
        axis=1
    )

    df_previous_year_non_na = df_joined[(df_joined['_merge'] == 'both') & (df_joined['previous_year_truth'].notna() & df_joined['previous_year_result'].notna())]
    df_previous_year_non_na = df_previous_year_non_na.drop(
        columns=[col for col in df_previous_year_non_na.columns if col.startswith('year')],
        errors='ignore'
    )
    df_previous_year_non_na['relative_numeric_difference'] = (
        (df_previous_year_non_na['previous_year_result'] - df_previous_year_non_na['previous_year_truth']).abs() /
        df_previous_year_non_na['previous_year_truth'].abs().replace(0, 1)
    )
    df_previous_year_non_na['levenstein_distance'] = df_previous_year_non_na.apply(
        lambda row: nltk.edit_distance(
        str(row['previous_year_truth']), str(row['previous_year_result'])
        ),
        axis=1
    )

    # Attach both columns for detailed analysis
    evaluation['relative_numeric_difference'] = {
        "mean": (df_year_non_na['relative_numeric_difference'].mean() + df_previous_year_non_na['relative_numeric_difference'].mean())/2,
        "median": (pd.concat([df_year_non_na['relative_numeric_difference'], df_previous_year_non_na['relative_numeric_difference']]).median())
    }
    evaluation['levenstein_distance'] = {
        "mean": (df_year_non_na['levenstein_distance'].mean() + df_previous_year_non_na['levenstein_distance'].mean())/2,
        "median": (pd.concat([df_year_non_na['levenstein_distance'], df_previous_year_non_na['levenstein_distance']]).median())
    }
    evaluation['correct_numeric'] = df_year_non_na[df_year_non_na['year_truth'] == df_year_non_na['year_result']].shape[0] + df_previous_year_non_na[df_previous_year_non_na['previous_year_truth'] == df_previous_year_non_na['previous_year_result']].shape[0]
    evaluation['incorrect_numeric'] = df_year_non_na[df_year_non_na['year_truth'] != df_year_non_na['year_result']].shape[0] + df_previous_year_non_na[df_previous_year_non_na['previous_year_truth'] != df_previous_year_non_na['previous_year_result']].shape[0]
    evaluation['total_entries'] = df_joined.shape[0]*2

    return evaluation

In [166]:
evaluation = evaluate_single_result(df_result, entry)

In [167]:
df_joined = evaluation['df_joined']
df_joined['match'] = df_joined.apply(lambda row: row['year_truth'] == row['year_result'], axis=1)
df_joined

Unnamed: 0,E1,E2,E3,year_truth,previous_year_truth,year_result,previous_year_result,_merge,match
0,Anlagevermoegen,Immaterielle Vermoegensgegenstaende,Selbst geschaffene gewerbliche Schutzrechte un...,1000.23,7337.6,1000.23,7337.6,both,True
1,Anlagevermoegen,Immaterielle Vermoegensgegenstaende,Geschaefts- oder Firmenwert,6779.06,3317.23,6779.06,3317.23,both,True
2,Anlagevermoegen,Immaterielle Vermoegensgegenstaende,geleistete Anzahlungen,1019.1,8777.59,1019.1,8777.59,both,True
3,Anlagevermoegen,Immaterielle Vermoegensgegenstaende,"entgeltlich erworbene Konzessionen, gewerblich...",8.41,2829.97,8.41,2829.97,both,True
4,Anlagevermoegen,Sachanlagen,"Grundstuecke, grundstuecksgleiche Rechte und B...",4705.04,6603.19,4705.04,6603.19,both,True
5,Anlagevermoegen,Sachanlagen,Technische Anlagen und Maschinen,8166.37,4283.97,8166.37,4283.97,both,True
6,Anlagevermoegen,Sachanlagen,"Andere Anlagen, Betriebs- und Geschaeftsaussta...",35.77,2611.16,35.77,2611.16,both,True
7,Anlagevermoegen,Sachanlagen,geleistete Anzahlungen und Anlagen im Bau,6856.4,9846.82,6856.4,9846.82,both,True
8,Anlagevermoegen,Finanzanlagen,Sonstige Finanzanlagen,5493.46,8893.84,5493.46,8893.84,both,True
9,Anlagevermoegen,Finanzanlagen,Anteile an verbundenen Unternehmen,4366.63,9122.83,4366.63,9122.83,both,True


In [168]:
evaluations = []

for entry in pdf_texts:
    print(f"Processing file {entry['filepath']}", end='\r')
    extracted_rows = extract_numbers(entry['text'])
    df_result = pd.DataFrame(extracted_rows)
    evaluation = evaluate_single_result(df_result, entry)
    evaluations.append(evaluation)


Processing file ../../benchmark_truth/synthetic_tables/separate_files/aktiva_table_5_columns_span_True_thin_True_year_as_text_unit_in_first_cell_True_Tsd. €_enumeration_False_0.pdfpdfff

  "median": (pd.concat([df_year_non_na['relative_numeric_difference'], df_previous_year_non_na['relative_numeric_difference']]).median())
  "median": (pd.concat([df_year_non_na['levenstein_distance'], df_previous_year_non_na['levenstein_distance']]).median())


Processing file ../../benchmark_truth/synthetic_tables/separate_files/aktiva_table_5_columns_span_True_thin_False_year_as_text_unit_in_first_cell_True_EUR_enumeration_True_0.pdff_1.pdf

  "median": (pd.concat([df_year_non_na['relative_numeric_difference'], df_previous_year_non_na['relative_numeric_difference']]).median())
  "median": (pd.concat([df_year_non_na['levenstein_distance'], df_previous_year_non_na['levenstein_distance']]).median())


Processing file ../../benchmark_truth/synthetic_tables/separate_files/aktiva_table_4_columns_span_False_thin_False_year_as_text_unit_in_first_cell_False_TEUR_enumeration_False_0.pdf.pdf

  "median": (pd.concat([df_year_non_na['relative_numeric_difference'], df_previous_year_non_na['relative_numeric_difference']]).median())
  "median": (pd.concat([df_year_non_na['levenstein_distance'], df_previous_year_non_na['levenstein_distance']]).median())


Processing file ../../benchmark_truth/synthetic_tables/separate_files/aktiva_table_3_columns_span_True_thin_True_year_as_date_unit_in_first_cell_False_Tsd. €_enumeration_True_0.pdfpdfdf

  "median": (pd.concat([df_year_non_na['relative_numeric_difference'], df_previous_year_non_na['relative_numeric_difference']]).median())
  "median": (pd.concat([df_year_non_na['levenstein_distance'], df_previous_year_non_na['levenstein_distance']]).median())


Processing file ../../benchmark_truth/synthetic_tables/separate_files/aktiva_table_4_columns_span_False_thin_True_year_as_text_unit_in_first_cell_False_Mio. €_enumeration_True_0.pdfpdff

  "median": (pd.concat([df_year_non_na['relative_numeric_difference'], df_previous_year_non_na['relative_numeric_difference']]).median())
  "median": (pd.concat([df_year_non_na['levenstein_distance'], df_previous_year_non_na['levenstein_distance']]).median())


Processing file ../../benchmark_truth/synthetic_tables/separate_files/aktiva_table_3_columns_span_True_thin_True_year_as_date_unit_in_first_cell_True_Tsd. €_enumeration_True_0.pdffpdfff

In [169]:
with open(f"../../benchmark_results/table_extraction/regex/evaluation_synth_tables.json", "w") as json_file:
    json.dump(
        [{key: value.to_json(orient='records') if hasattr(value, 'to_json') else value for key, value in entry.items()} for entry in evaluations],
        json_file, 
        indent=4, 
        default=str
    )

# real tables

In [170]:
with open(f"../../Python/pdf_texts_real_tables.json", "r") as json_file:
    real_table_texts = json.load(json_file)

pdf_texts = [{'filepath': '../../benchmark_truth/real_tables/'+key.replace('/pvc/benchmark_truth/real_tables/manual_download/', '').replace('/', '__').replace('.pdf', '.xlsx'), 'text': value} for key, value in real_table_texts.items()]
pdf_texts

[{'filepath': '../../benchmark_truth/real_tables/Amt für Statistik Berlin-Brandenburg__AP_Geschaeftsbericht_DE_2016_BBB.xlsx',
  'text': 'Amt für Statistik Berlin-Brandenburg Anstalt des öffentlichen Rechts, Potsdam\r\nBilanz zum 31. Dezember 2016\r\nA K T I V S E I T E 31.12.2016 Vorjahr \r\nEUR EUR EUR \r\nA. ANLAGEVERMÖGEN \r\nI. Immaterielle Vermögensgegenstände \r\nEntgeltlich erworbene Konzessionen, gewerbliche \r\nSchutzrechte und ähnliche Rechte und Werte sowie Lizenzen \r\nan solchen Rechten und Werten 105\xa0541,00 144\xa0713,00 \r\nII. Sachanlagen \r\n1. Grundstücke, grundstücksgleiche Rechte und Bauten \r\neinschließlich der Bauten auf fremden Grundstücken 21\xa0058,00 38\xa0516,00 \r\n2. Andere Anlagen, Betriebs- und Geschäftsausstattung 367\xa0071,00 133\xa0195,00 \r\n388\xa0129,00 171\xa0711,00 \r\nIII. Finanzanlagen \r\n1. Wertpapiere des Anlagevermögens 0,00 2\xa0000\xa0000,00 \r\n2. Sonstige Finanzanlagen 4\xa0400\xa0000,00 1\xa0800\xa0000,00 \r\n4\xa0400\xa0000,00 3\

In [146]:
evaluations_real_tables = []

for entry in pdf_texts:
    print(f"Processing file {entry['filepath']}", end='\r')
    extracted_rows = extract_numbers(entry['text'])
    df_result = pd.DataFrame(extracted_rows)
    evaluation = evaluate_single_result(df_result, entry)
    evaluations_real_tables.append(evaluation)


Processing file ../../benchmark_truth/real_tables/Zoologischer Garten Berlin AG__ZOO-TIERPARK-Jahresbericht2024.xlsxsxxlsxP 2022 BWB.xlsx

  "median": (pd.concat([df_year_non_na['relative_numeric_difference'], df_previous_year_non_na['relative_numeric_difference']]).median())
  "median": (pd.concat([df_year_non_na['levenstein_distance'], df_previous_year_non_na['levenstein_distance']]).median())


In [148]:
with open(f"../../benchmark_results/table_extraction/regex/evaluation_real_tables.json", "w") as json_file:
    json.dump(
        [{key: value.to_json(orient='records') if hasattr(value, 'to_json') else value for key, value in entry.items()} for entry in evaluations_real_tables],
        json_file, 
        indent=4, 
        default=str
    )
