# Old Bailey Corpus Data Parsing - Streamlined

In [8]:
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

## Parse XML and Extract Data

In [9]:
xml_folder = "../dataset/OBC2/"
xml_files = glob.glob(os.path.join(xml_folder, "*.xml"))
data = []

for file in xml_files:
    tree = ET.parse(file)
    root = tree.getroot()

    for trial in root.findall(".//div1[@type='trialAccount']"):
        trial_id = trial.get("id", "Unknown")
        trial_date_element = trial.find("interp[@type='date']")
        trial_date = trial_date_element.get("value", "Unknown") if trial_date_element is not None else "Unknown"

        defendant = trial.find(".//persName[@type='defendantName']")
        surname, given = "Unknown", "Unknown"
        if defendant is not None:
            surname_element = defendant.find("interp[@type='surname']")
            surname = surname_element.get("value", "Unknown") if surname_element is not None else "Unknown"
            given_element = defendant.find("interp[@type='given']")
            given = given_element.get("value", "Unknown") if given_element is not None else "Unknown"

        victim = trial.find(".//persName[@type='victimName']")
        victim_surname, victim_given = "Unknown", "Unknown"
        if victim is not None:
            victim_surname_element = victim.find("interp[@type='surname']")
            victim_surname = victim_surname_element.get("value", "Unknown") if victim_surname_element is not None else "Unknown"
            victim_given_element = victim.find("interp[@type='given']")
            victim_given = victim_given_element.get("value", "Unknown") if victim_given_element is not None else "Unknown"

        verdict = trial.find(".//rs[@type='verdictDescription']/interp[@type='verdictCategory']")
        verdict_text = verdict.get("value", "Unknown") if verdict is not None else "Unknown"

        punishment = trial.find(".//rs[@type='punishmentDescription']/interp[@type='punishmentCategory']")
        punishment_text = punishment.get("value", "Unknown") if punishment is not None else "Unknown"

        offence = trial.find(".//rs[@type='offenceDescription']/interp[@type='offenceCategory']")
        offence_text = offence.get("value", "Unknown") if offence is not None else "Unknown"

        crime_date = trial.find(".//rs[@type='crimeDate']")
        crime_date_text = crime_date.text.strip() if crime_date is not None and crime_date.text is not None else "Unknown"

        trial_text = []
        for u in trial.findall(".//u"):
            speaker_role = u.get("role", "Unknown")
            speaker_text = " ".join([p.text.strip() for p in u.findall(".//p") if p.text])
            if speaker_text:
                trial_text.append(f"[{speaker_role}] {speaker_text}")

        full_trial_text = "\n".join(trial_text)

        data.append([trial_id, trial_date, surname, given, victim_surname, victim_given, verdict_text, punishment_text, offence_text, crime_date_text, full_trial_text])

columns = ["Trial_ID", "Date", "Defendant_Surname", "Defendant_Given", "Victim_Surname", "Victim_Given", "Verdict", "Punishment", "Offence", "Crime_Date", "Trial_Text"]
df = pd.DataFrame(data, columns=columns)
print(f"Parsing selesai! {len(df)} rows extracted")

Parsing selesai! 100088 rows extracted


## Save Trial Texts to Files

In [None]:
output_text_folder = "../dataset/trial_texts/"
os.makedirs(output_text_folder, exist_ok=True)

trial_texts_dict = {}
for file in xml_files:
    try:
        tree = ET.parse(file)
        root = tree.getroot()

        for trial in root.findall(".//div1[@type='trialAccount']"):
            trial_id = trial.get("id", "Unknown")
            trial_text = " ".join(trial.itertext()).strip()
            trial_texts_dict[trial_id] = trial_text
            
            text_file_path = os.path.join(output_text_folder, f"{trial_id}.txt")
            with open(text_file_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(trial_text)
    except Exception as e:
        print(f"Error processing {file}: {e}")

print(f"Saved {len(trial_texts_dict)} text files to {output_text_folder}")

✅ Saved 50044 text files to ../dataset/trial_texts/


## Delete Invalid Trial IDs

In [11]:
patterns_to_delete = [
    "a????????-?.txt",
    "f????????-?.txt",
    "f????????.txt",
    "o????????-?.txt",
    "o????????-??.txt",
    "s????????-?.txt"
]

for pattern in patterns_to_delete:
    files_to_delete = glob.glob(os.path.join(output_text_folder, pattern))
    for file in files_to_delete:
        try:
            os.remove(file)
        except Exception as e:
            print(f"Error deleting {file}: {e}")

print("Invalid files deleted!")

Invalid files deleted!


## Filter DataFrame by Pattern

In [12]:
pattern1 = r"^a\d{8}-\d$"
pattern2 = r"^f\d{8}-\d$"
pattern3 = r"^o\d{8}-\d$"
pattern4 = r"^s\d{8}-\d$"
pattern5 = r"^f\d{8}$"
pattern6 = r"^o\d{8}-\d{2}$"

df = df[~df["Trial_ID"].str.match(pattern1, na=False)]
df = df[~df["Trial_ID"].str.match(pattern2, na=False)]
df = df[~df["Trial_ID"].str.match(pattern3, na=False)]
df = df[~df["Trial_ID"].str.match(pattern4, na=False)]
df = df[~df["Trial_ID"].str.match(pattern5, na=False)]
df = df[~df["Trial_ID"].str.match(pattern6, na=False)]

print(f"After filtering: {len(df)} rows")

After filtering: 100086 rows


## Drop Trial_Text Column

In [13]:
df = df.drop(columns=["Trial_Text"], errors="ignore")

## Clean Text Files

In [None]:
cleaned_texts_folder = "../dataset/cleaned_texts/"
os.makedirs(cleaned_texts_folder, exist_ok=True)

pos_tag_pattern = r"\b(\w+)_\w+\b"
punctuation_pattern = r"[_,:;\"'(){}[\]<>]+"

for file_name in os.listdir(output_text_folder):
    if file_name.endswith(".txt"):
        input_path = os.path.join(output_text_folder, file_name)
        output_path = os.path.join(cleaned_texts_folder, file_name)

        with open(input_path, "r", encoding="utf-8") as f:
            text = f.read()

        cleaned_text = re.sub(pos_tag_pattern, r"\1", text)
        cleaned_text = re.sub(punctuation_pattern, " ", cleaned_text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

final_cleaned_folder = "../dataset/final_cleaned_texts/"
os.makedirs(final_cleaned_folder, exist_ok=True)

pos_tag_pattern = r"\b(NNU|NNB)\b"
punctuation_pattern = r"\.\s+\."

for file_name in os.listdir(cleaned_texts_folder):
    if file_name.endswith(".txt"):
        input_path = os.path.join(cleaned_texts_folder, file_name)
        output_path = os.path.join(final_cleaned_folder, file_name)

        with open(input_path, "r", encoding="utf-8") as f:
            text = f.read()

        cleaned_text = re.sub(pos_tag_pattern, "", text)
        cleaned_text = re.sub(punctuation_pattern, ".", cleaned_text)
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

print("Text files cleaned!")

Text files cleaned!


## Drop Columns

In [15]:
df = df.drop(columns=["Victim_Surname", "Victim_Given", "Witness_Name"], errors="ignore")

## Convert Date Format

In [16]:
df["Date"] = pd.to_datetime(df["Date"].astype(str), format="%Y%m%d")
print(df[["Date"]].head(10))

        Date
0 1750-01-17
1 1750-01-17
2 1750-01-17
3 1750-01-17
4 1750-01-17
5 1750-01-17
6 1750-01-17
7 1750-01-17
8 1750-01-17
9 1750-01-17


## Remove Duplicates

In [17]:
df = df.drop_duplicates(subset=["Trial_ID"], keep="first")
print(f"After dedup: {len(df)} rows")

After dedup: 50043 rows


## Drop More Columns

In [18]:
df = df.drop(columns=["Defendant_Surname", "Defendant_Given", "Punishment", "Crime_Date"], errors="ignore")

## Remove More Duplicates

In [19]:
df = df.drop_duplicates(subset=["Trial_ID","Date", "Verdict", "Offence"], keep="first")

## Add Year Column

In [20]:
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df["Year"] = df["Date"].dt.year

## Add Trial Text from Files

In [21]:
text_folder = "../dataset/final_cleaned_texts/"
trial_texts = {}
for filename in os.listdir(text_folder):
    if filename.endswith(".txt"):
        trial_id = filename.replace(".txt", "")
        file_path = os.path.join(text_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            trial_texts[trial_id] = f.read().strip()

df["Trial_Text"] = df["Trial_ID"].astype(str).map(trial_texts).fillna("")

## More Deduplication

In [22]:
df = df.drop_duplicates(subset="Trial_ID", keep="first")
df["Date"] = pd.to_datetime(df["Date"])
df["Year"] = df["Date"].dt.year

## Replace Unknown with NaN 

In [23]:
df.replace("unknown", np.nan, inplace=True)
df.replace("Unknown", np.nan, inplace=True)

## Drop NaN and Filter Verdicts 

In [24]:
df.dropna(inplace=True)
df = df[df["Verdict"].isin(["guilty", "notGuilty"])]
print(f"Final rows: {len(df)}")

Final rows: 49365


## Show Stats 

In [25]:
print(df["Verdict"].value_counts())
print()
print(df["Offence"].value_counts())
print()
print(df["Year"].value_counts().sort_index())

Verdict
guilty       35487
notGuilty    13878
Name: count, dtype: int64

Offence
theft            37033
deception         2956
violentTheft      2433
royalOffences     2135
sexual            1434
breakingPeace     1319
kill              1205
miscellaneous      654
damage             196
Name: count, dtype: int64

Year
1720    210
1721    370
1722    320
1723    260
1724    449
       ... 
1909     63
1910    206
1911    204
1912    189
1913    151
Name: count, Length: 174, dtype: int64


## Clean Text in DataFrame 

In [26]:
def clean_text(text):
    text = str(text)
    text = re.sub(r'^[\W\d\s]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

df['Trial_Text'] = df['Trial_Text'].apply(clean_text)

def clean_text1(text):
    text = str(text)
    text = re.sub(r'[+*FO]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['Trial_Text'] = df['Trial_Text'].apply(clean_text1)

def clean_text2(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'(?<=\w)\.(?=[A-Z])', '. ', text)
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    text = re.sub(r"Prisoner s", "Prisoner's", text)
    text = re.sub(r'(\d+)\s*d\s*\.', r'\1d.', text)
    return text.strip()

df['Trial_Text'] = df['Trial_Text'].apply(clean_text2)

## Save Final CSV

In [None]:
output_csv_path = "../dataset/OBC_Cleaned.csv"
df.to_csv(output_csv_path, index=False)
print(f"File disimpan di: {output_csv_path}")
print(df.head())

✅ File disimpan di: ../dataset/OBC_Cleaned.csv
      Trial_ID       Date    Verdict Offence  Year  \
0  t17500117-1 1750-01-17  notGuilty   theft  1750   
1  t17500117-2 1750-01-17     guilty   theft  1750   
2  t17500117-3 1750-01-17  notGuilty   theft  1750   
3  t17500117-4 1750-01-17     guilty   theft  1750   
4  t17500117-5 1750-01-17  notGuilty   theft  1750   

                                          Trial_Text  
0  John Bowen was indicted for that he on the 24t...  
1  Nicholas Bond and William Heyden late of ulham...  
2  Thomas Biggs was indicted for stealing one pai...  
3  Elizabeth Wanless otherwise Newbey spinster wa...  
4  Susannah Lowe and Margaret Richards widows wer...  
