In [1]:
import os, pandas
import xml.etree.ElementTree as ET

In [2]:
def parse_xml_file(file_path):
    """
    Parse an XML file and return a dataframe.
    """
    # Load and parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Initialize empty dataframe to store parsed data
    df = pandas.DataFrame(columns=["pair_id", "label", "articles", "query", "filename"])

    # Iterate through 'pair' elements
    for pair in root.findall("pair"):
        pair_id = pair.get("id")
        pair_label = pair.get("label")
        # Extract text from 't1' and 't2' elements
        articles = pair.find("t1").text.strip()
        query = pair.find("t2").text.strip()
        filename = file_path.split("/")[-1]
        # Concat to dataframe
        df = pandas.concat(
            [df, pandas.DataFrame([[pair_id, pair_label, articles, query, filename]], columns=["pair_id", "label", "articles", "query", "filename"])]
        )
    return df

In [3]:
df = parse_xml_file("COLIEE2023statute_data-English/TestData_en.xml")
df.set_index("pair_id")
df.to_csv("COLIEE2023statute_data-English/test.tsv", sep="\t")

In [4]:
# parse all XML files and save a single dataframe combining all of them
df = pandas.DataFrame(columns=["pair_id", "label", "articles", "query", "filename"])
for xml in os.listdir("COLIEE2023statute_data-English/train"):
    df = pandas.concat([df, parse_xml_file(f"COLIEE2023statute_data-English/train/{xml}")])
df.set_index("pair_id")
df.to_csv("COLIEE2023statute_data-English/train.tsv", sep="\t")

In [5]:
# parse all XML files and save a single dataframe combining all of them
df = pandas.DataFrame(columns=["pair_id", "label", "articles", "query", "filename"])
for xml in os.listdir("COLIEE2023statute_data-English/val"):
    df = pandas.concat([df, parse_xml_file(f"COLIEE2023statute_data-English/val/{xml}")])
df.set_index("pair_id")
df.to_csv("COLIEE2023statute_data-English/val.tsv", sep="\t")

In [6]:
pandas.read_csv("COLIEE2023statute_data-English/train.tsv", sep="\t", index_col="pair_id")

Unnamed: 0_level_0,Unnamed: 0,label,articles,query,filename
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
H18-1-1,0,Y,Article 572\nEven if the seller makes a specia...,A special provision that releases warranty can...,riteval_H18_en.xml
H18-1-2,0,N,Article 565\nThe provisions of the preceding t...,There is a limitation period on pursuance of w...,riteval_H18_en.xml
H18-1-3,0,N,Article 568\n(1) The successful bidder at an a...,"A compulsory auction is also a sale, so warran...",riteval_H18_en.xml
H18-2-1,0,Y,Article 697\n(1) A person that has begun to ma...,In cases where a person plans to prevent crime...,riteval_H18_en.xml
H18-2-2,0,Y,Article 698\nIf a manager engages in benevolen...,In cases where an individual rescues another p...,riteval_H18_en.xml
...,...,...,...,...,...
H29-29-E,0,N,Article 663\n(1) If the parties have not speci...,Even in cases where the timing of the return o...,riteval_H29_en.xml
H29-29-O,0,N,Article 666\n(1) If a bailee may consume the b...,Even if the timing of the return of the Thing ...,riteval_H29_en.xml
H29-33-A,0,N,Article 11\nThe family court may decide to com...,Even with respect to any person who constantly...,riteval_H29_en.xml
H29-36-1,0,N,Article 675\n(1) A creditor of a partnership m...,The creditor of a partnership may not exercise...,riteval_H29_en.xml


In [7]:
pandas.read_csv("COLIEE2023statute_data-English/val.tsv", sep="\t", index_col="pair_id")

Unnamed: 0_level_0,Unnamed: 0,label,articles,query,filename
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
H30-1-A,0,Y,Article 3\n(1) The enjoyment of private rights...,An unborn child may not be given a gift on the...,riteval_H30_en.xml
H30-2-I,0,N,Article 34\nAn not-for-profit association or f...,A juridical person may not be a partner of a c...,riteval_H30_en.xml
H30-2-U,0,N,Article 34\nAn not-for-profit association or f...,A juridical person may not be entitled to dema...,riteval_H30_en.xml
H30-4-A,0,Y,Article 101\n(1) If the validity of a manifest...,In cases an agent is entrusted to perform any ...,riteval_H30_en.xml
H30-4-I,0,N,Article 103\nAn agent who has no specifically ...,An agent who has no specified authority shall ...,riteval_H30_en.xml
...,...,...,...,...,...
R03-36-I,0,N,"Article 413 (1) If the obligee refuses, or is ...",If the obligor has tendered the performance bu...,riteval_R03_en.xml
R03-36-U,0,Y,Article 559 The provisions of this Section app...,The expenses associated with conclusions of le...,riteval_R03_en.xml
R03-37-E,0,N,Article 587-2 (1) Notwithstanding the provisio...,"A contract of monetary loan for consumption, e...",riteval_R03_en.xml
R03-37-I,0,Y,Article 446 (1) A guarantor has the responsibi...,A guarantee contract becomes effective even if...,riteval_R03_en.xml
