# Old Bailey Corpus Data Parsing - Hybrid Pipeline
This notebook extends the standard data parsing pipeline by extracting additional metadata for the hybrid classification model.

In [1]:
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

## Parse XML and Extract Data with Metadata

In [2]:
xml_folder = "../dataset/OBC2/"
# glob yields paths in arbitrary order; sort so the row order (and therefore
# which row a later drop_duplicates(keep="first") retains) is stable between runs.
xml_files = sorted(glob.glob(os.path.join(xml_folder, "*.xml")))
data = []


def _interp_value(element, path):
    """Return the ``value`` attribute of the first node matching ``path``.

    "Unknown" is returned when no node matches or the node has no ``value``.
    """
    node = element.find(path)
    return node.get("value", "Unknown") if node is not None else "Unknown"


def _summarise_persons(trial, person_type):
    """Collect name, gender and head-count for one ``persName`` type in a trial.

    Args:
        trial: the trial element to search (all descendants are considered).
        person_type: value of the ``type`` attribute, e.g. "defendantName".

    Returns:
        Tuple ``(surname, given, primary_gender, count)``.
        * surname/given come from the first person whose surname resolves to
          something other than "Unknown"; if nobody has one, ``given`` is
          whatever the last person examined provided.
        * primary_gender is the first gender value found, else "Unknown".
        * count is the number of matching ``persName`` elements.
    """
    persons = trial.findall(f".//persName[@type='{person_type}']")
    genders = []
    surname, given = "Unknown", "Unknown"

    for person in persons:
        gender_element = person.find("interp[@type='gender']")
        if gender_element is not None:
            genders.append(gender_element.get("value", "Unknown"))
        # Keep looking for a name until a surname has been resolved.
        if surname == "Unknown":
            surname = _interp_value(person, "interp[@type='surname']")
            given = _interp_value(person, "interp[@type='given']")

    primary_gender = genders[0] if genders else "Unknown"
    return surname, given, primary_gender, len(persons)


for file in xml_files:
    # A single unreadable or malformed file should not abort the whole parse;
    # report it the same way the text-export cell does and move on.
    try:
        root = ET.parse(file).getroot()
    except (ET.ParseError, OSError) as e:
        print(f"Error processing {file}: {e}")
        continue

    for trial in root.findall(".//div1[@type='trialAccount']"):
        trial_id = trial.get("id", "Unknown")
        # The date interp is looked up among the trial's direct children only.
        trial_date = _interp_value(trial, "interp[@type='date']")

        # Defendant and victim metadata share the same extraction rules.
        surname, given, defendant_gender, num_defendants = _summarise_persons(
            trial, "defendantName"
        )
        victim_surname, victim_given, victim_gender, num_victims = _summarise_persons(
            trial, "victimName"
        )

        # Categorical outcome / offence labels (first occurrence in the trial).
        verdict_text = _interp_value(
            trial, ".//rs[@type='verdictDescription']/interp[@type='verdictCategory']"
        )
        punishment_text = _interp_value(
            trial, ".//rs[@type='punishmentDescription']/interp[@type='punishmentCategory']"
        )
        offence_text = _interp_value(
            trial, ".//rs[@type='offenceDescription']/interp[@type='offenceCategory']"
        )
        offence_subcategory = _interp_value(
            trial, ".//rs[@type='offenceDescription']/interp[@type='offenceSubcategory']"
        )

        # Crime date is free text inside the element, not a ``value`` attribute.
        crime_date = trial.find(".//rs[@type='crimeDate']")
        if crime_date is not None and crime_date.text is not None:
            crime_date_text = crime_date.text.strip()
        else:
            crime_date_text = "Unknown"

        # Build a speaker-labelled transcript from the utterance elements.
        # Only each paragraph's leading text is used (text after nested child
        # elements is not included).
        trial_text = []
        for u in trial.findall(".//u"):
            speaker_role = u.get("role", "Unknown")
            speaker_text = " ".join([p.text.strip() for p in u.findall(".//p") if p.text])
            if speaker_text:
                trial_text.append(f"[{speaker_role}] {speaker_text}")

        full_trial_text = "\n".join(trial_text)

        data.append([
            trial_id, trial_date, surname, given,
            defendant_gender, num_defendants,
            victim_surname, victim_given, victim_gender, num_victims,
            verdict_text, punishment_text,
            offence_text, offence_subcategory,
            crime_date_text, full_trial_text
        ])

columns = [
    "Trial_ID", "Date", "Defendant_Surname", "Defendant_Given",
    "Defendant_Gender", "Num_Defendants",
    "Victim_Surname", "Victim_Given", "Victim_Gender", "Num_Victims",
    "Verdict", "Punishment",
    "Offence", "Offence_Subcategory",
    "Crime_Date", "Trial_Text"
]
df = pd.DataFrame(data, columns=columns)
print(f"Parsing complete! {len(df)} rows extracted")
print(f"\nMetadata columns: {columns}")

Parsing complete! 100088 rows extracted

Metadata columns: ['Trial_ID', 'Date', 'Defendant_Surname', 'Defendant_Given', 'Defendant_Gender', 'Num_Defendants', 'Victim_Surname', 'Victim_Given', 'Victim_Gender', 'Num_Victims', 'Verdict', 'Punishment', 'Offence', 'Offence_Subcategory', 'Crime_Date', 'Trial_Text']


## Save Trial Texts to Files

In [3]:
output_text_folder = "../dataset/trial_texts/"
os.makedirs(output_text_folder, exist_ok=True)

# Maps trial id -> the text written to that trial's file.
trial_texts_dict = {}
for file in xml_files:
    try:
        root = ET.parse(file).getroot()

        for trial in root.iterfind(".//div1[@type='trialAccount']"):
            trial_id = trial.get("id", "Unknown")
            # Every text fragment under the trial element, space-joined.
            trial_text = " ".join(trial.itertext()).strip()
            trial_texts_dict[trial_id] = trial_text

            # One file per trial id; a repeated id overwrites the earlier file.
            with open(
                os.path.join(output_text_folder, f"{trial_id}.txt"),
                "w",
                encoding="utf-8",
            ) as txt_file:
                txt_file.write(trial_text)
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"Error processing {file}: {e}")

print(f"Saved {len(trial_texts_dict)} text files to {output_text_folder}")

Saved 50044 text files to ../dataset/trial_texts/


## Delete Invalid Trial IDs

In [4]:
# Shell-style filename patterns ('?' matches any single character) for the
# text files that should not be kept.
patterns_to_delete = [
    "a????????-?.txt",
    "f????????-?.txt",
    "f????????.txt",
    "o????????-?.txt",
    "o????????-??.txt",
    "s????????-?.txt"
]

# Resolve every pattern against the export folder first, then remove the matches.
paths_to_delete = [
    matched_path
    for pattern in patterns_to_delete
    for matched_path in glob.glob(os.path.join(output_text_folder, pattern))
]

for file in paths_to_delete:
    try:
        os.remove(file)
    except Exception as e:
        # Report and continue so one failed delete does not stop the rest.
        print(f"Error deleting {file}: {e}")

print("Invalid files deleted!")

Invalid files deleted!


## Filter DataFrame by Pattern

In [5]:
# Regexes describing the Trial_ID shapes to exclude; these mirror the
# filename patterns used when deleting text files.
invalid_id_patterns = [
    r"^a\d{8}-\d$",
    r"^f\d{8}-\d$",
    r"^o\d{8}-\d$",
    r"^s\d{8}-\d$",
    r"^f\d{8}$",
    r"^o\d{8}-\d{2}$",
]

# Accumulate one boolean mask over all patterns, then drop the flagged rows.
is_invalid_id = pd.Series(False, index=df.index)
for id_pattern in invalid_id_patterns:
    is_invalid_id |= df["Trial_ID"].str.match(id_pattern, na=False)

df = df[~is_invalid_id]

print(f"After filtering: {len(df)} rows")

After filtering: 100086 rows


## Drop Trial_Text Column (temporarily)

In [6]:
# The transcript column is removed here and re-attached later from the cleaned text files.
df = df.drop(["Trial_Text"], axis="columns", errors="ignore")

## Clean Text Files

In [7]:
cleaned_texts_folder = "../dataset/cleaned_texts/"
final_cleaned_folder = "../dataset/final_cleaned_texts/"


def _rewrite_text_files(source_folder, target_folder, substitutions):
    """Copy every ``.txt`` file from one folder to another, applying regex edits.

    Args:
        source_folder: folder whose ``.txt`` files are read (UTF-8).
        target_folder: folder that receives the rewritten files under the same
            names; created if missing.
        substitutions: ordered ``(pattern, replacement)`` pairs applied with
            ``re.sub``. After all pairs, whitespace runs are collapsed to a
            single space and the text is stripped.
    """
    os.makedirs(target_folder, exist_ok=True)

    for file_name in os.listdir(source_folder):
        if not file_name.endswith(".txt"):
            continue

        with open(os.path.join(source_folder, file_name), "r", encoding="utf-8") as f:
            text = f.read()

        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        text = re.sub(r"\s+", " ", text).strip()

        with open(os.path.join(target_folder, file_name), "w", encoding="utf-8") as f:
            f.write(text)


# Pass 1: reduce "token_TAG" to "token", then blank out underscores, quotes,
# brackets and the listed separator punctuation.
_rewrite_text_files(
    output_text_folder,
    cleaned_texts_folder,
    [
        (r"\b(\w+)_\w+\b", r"\1"),
        (r"[_,:;\"'(){}[\]<>]+", " "),
    ],
)

# Pass 2: remove standalone NNU / NNB tokens (presumably tag residue left by
# pass 1 - confirm against the corpus) and merge ". ." into a single period.
_rewrite_text_files(
    cleaned_texts_folder,
    final_cleaned_folder,
    [
        (r"\b(NNU|NNB)\b", ""),
        (r"\.\s+\.", "."),
    ],
)

print("Text files cleaned!")

Text files cleaned!


## Drop Unnecessary Name Columns

In [8]:
# Victim names are not carried into the final dataset; only gender and count are kept.
victim_name_columns = ["Victim_Surname", "Victim_Given"]
df = df.drop(victim_name_columns, axis="columns", errors="ignore")

## Convert Date Format

In [9]:
# Dates are parsed as YYYYMMDD. Trials without a date carry the "Unknown"
# placeholder, which would make a strict parse raise; coerce those to NaT so
# they are removed by the later dropna step, consistent with the
# errors="coerce" conversion used when the Year column is derived.
df["Date"] = pd.to_datetime(df["Date"].astype(str), format="%Y%m%d", errors="coerce")
print(df[["Date"]].head(10))

        Date
0 1750-01-17
1 1750-01-17
2 1750-01-17
3 1750-01-17
4 1750-01-17
5 1750-01-17
6 1750-01-17
7 1750-01-17
8 1750-01-17
9 1750-01-17


## Remove Duplicates

In [10]:
# Keep only the first row seen for each Trial_ID.
is_repeat_id = df.duplicated(subset=["Trial_ID"], keep="first")
df = df[~is_repeat_id]
print(f"After dedup: {len(df)} rows")

After dedup: 50043 rows


## Drop More Columns

In [11]:
# Columns not carried into the final dataset.
unused_columns = ["Defendant_Surname", "Defendant_Given", "Punishment", "Crime_Date"]
df = df.drop(unused_columns, axis="columns", errors="ignore")

## Remove More Duplicates

In [12]:
# Drop rows repeating the same (Trial_ID, Date, Verdict, Offence) combination,
# keeping the first occurrence.
composite_key = ["Trial_ID", "Date", "Verdict", "Offence"]
df = df[~df.duplicated(subset=composite_key, keep="first")]

## Add Year Column

In [13]:
# Ensure Date is datetime (unparseable values become NaT), then derive Year from it.
df = df.assign(
    Date=lambda frame: pd.to_datetime(frame["Date"], errors="coerce"),
    Year=lambda frame: frame["Date"].dt.year,
)

## Add Trial Text from Files

In [14]:
text_folder = "../dataset/final_cleaned_texts/"

# Load every cleaned .txt file, keyed by its name with ".txt" removed.
txt_names = [name for name in os.listdir(text_folder) if name.endswith(".txt")]
trial_texts = {}
for txt_name in txt_names:
    with open(os.path.join(text_folder, txt_name), "r", encoding="utf-8") as handle:
        trial_texts[txt_name.replace(".txt", "")] = handle.read().strip()

# Attach the text by Trial_ID; ids without a file get an empty string.
df = df.assign(Trial_Text=df["Trial_ID"].astype(str).map(trial_texts).fillna(""))

## More Deduplication

In [15]:
# Re-apply the Trial_ID de-duplication, then refresh Date and Year.
df = df[~df.duplicated(subset="Trial_ID", keep="first")].assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    Year=lambda frame: frame["Date"].dt.year,
)

## Replace Unknown with NaN

In [16]:
# Turn both spellings of the placeholder into NaN so missing values can be dropped.
df = df.replace("unknown", np.nan).replace("Unknown", np.nan)

## Drop NaN and Filter Verdicts

In [17]:
# Remove every row that has a missing value in any column.
df = df.dropna()
# Keep only the two verdict classes. The explicit copy makes the result an
# independent frame, so the column assignments performed on it afterwards do
# not raise SettingWithCopyWarning.
df = df[df["Verdict"].isin(["guilty", "notGuilty"])].copy()
print(f"Final rows: {len(df)}")

Final rows: 43389


## Show Stats

In [18]:
# (heading, column, sort counts by value instead of by frequency)
distribution_reports = [
    ("Verdict distribution:", "Verdict", False),
    ("Offence distribution:", "Offence", False),
    ("Offence Subcategory distribution:", "Offence_Subcategory", False),
    ("Defendant Gender distribution:", "Defendant_Gender", False),
    ("Victim Gender distribution:", "Victim_Gender", False),
    ("Number of Defendants distribution:", "Num_Defendants", True),
    ("Number of Victims distribution:", "Num_Victims", True),
    ("Year distribution:", "Year", True),
]

for position, (heading, column_name, by_value) in enumerate(distribution_reports):
    if position > 0:
        # Blank line between consecutive reports.
        print()
    print(heading)
    counts = df[column_name].value_counts()
    print(counts.sort_index() if by_value else counts)

Verdict distribution:
Verdict
guilty       31253
notGuilty    12136
Name: count, dtype: int64

Offence distribution:
Offence
theft            35684
violentTheft      2410
deception         1436
breakingPeace     1204
sexual            1069
kill              1050
royalOffences      234
miscellaneous      152
damage             150
Name: count, dtype: int64

Offence Subcategory distribution:
Offence_Subcategory
grandLarceny                    13539
simpleLarceny                    4829
theftFromPlace                   4248
pocketpicking                    2955
burglary                         2290
stealingFromMaster               1765
other                            1669
highwayRobbery                   1469
shoplifting                      1197
animalTheft                      1035
wounding                          953
robbery                           941
fraud                             865
housebreaking                     774
murder                            659
embezzlement     

## Clean Text in DataFrame

In [19]:
def clean_text(text):
    """Trim leading noise from a transcript and normalise its whitespace.

    The input is converted with ``str``. A leading run of non-word characters,
    digits and whitespace is removed, every whitespace run is collapsed to a
    single space, and the result is stripped.
    """
    without_prefix = re.sub(r'^[\W\d\s]+', '', str(text))
    single_spaced = re.sub(r'\s+', ' ', without_prefix)
    return single_spaced.strip()

# Run the leading-noise / whitespace cleanup over every transcript.
df['Trial_Text'] = df['Trial_Text'].map(clean_text)

def clean_text1(text):
    """Remove stray ``+`` / ``*`` marks and standalone ``FO`` tokens.

    The previous character class ``[+*FO]+`` deleted every capital ``F`` and
    ``O`` anywhere in the text (e.g. "Found" -> "ound"). Only runs of ``+`` /
    ``*`` and the whole-word token ``FO`` are removed now.
    NOTE(review): ``FO`` is presumably tag residue left by the earlier
    file-cleaning passes - confirm against the corpus.

    Args:
        text: value to clean; converted with ``str``.

    Returns:
        The cleaned string with whitespace runs collapsed and ends stripped.
    """
    text = str(text)
    text = re.sub(r'[+*]+|\bFO\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the second cleanup pass over every transcript.
df['Trial_Text'] = df['Trial_Text'].map(clean_text1)

def clean_text2(text):
    """Repair spacing and a few known artefacts in a transcript.

    Steps, in order:
      1. insert a space after a period that directly joins a word character
         to a capital letter ("Mr.Smith" -> "Mr. Smith");
      2. remove whitespace before ``. , ! ?``;
      3. restore the possessive in the whole-word sequence "Prisoner s";
      4. tighten pence amounts such as "3 d ." to "3d.".

    Args:
        text: transcript string, or a missing value.

    Returns:
        The cleaned, stripped string; "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    text = re.sub(r'(?<=\w)\.(?=[A-Z])', '. ', text)
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    # Word boundaries keep this from rewriting "Prisoner said" as "Prisoner'said".
    text = re.sub(r"\bPrisoner s\b", "Prisoner's", text)
    text = re.sub(r'(\d+)\s*d\s*\.', r'\1d.', text)
    return text.strip()

# Run the punctuation / artefact repair pass over every transcript.
df['Trial_Text'] = df['Trial_Text'].map(clean_text2)

## Save Final CSV with Metadata

In [20]:
output_csv_path = "../dataset/OBC_Cleaned_Hybrid.csv"
# Write the frame without its index column.
df.to_csv(output_csv_path, index=False)

# Summarise what was written: path, column list, shape and the first rows.
final_columns = df.columns.tolist()
print(
    f"File saved to: {output_csv_path}",
    f"\nColumns in final dataset: {final_columns}",
    f"\nDataset shape: {df.shape}",
    df.head(),
    sep="\n",
)

File saved to: ../dataset/OBC_Cleaned_Hybrid.csv

Columns in final dataset: ['Trial_ID', 'Date', 'Defendant_Gender', 'Num_Defendants', 'Victim_Gender', 'Num_Victims', 'Verdict', 'Offence', 'Offence_Subcategory', 'Year', 'Trial_Text']

Dataset shape: (43389, 11)
      Trial_ID       Date Defendant_Gender  Num_Defendants Victim_Gender  \
0  t17500117-1 1750-01-17             male               1          male   
1  t17500117-2 1750-01-17             male               2          male   
2  t17500117-3 1750-01-17             male               1          male   
3  t17500117-4 1750-01-17           female               1          male   
4  t17500117-5 1750-01-17           female               2          male   

   Num_Victims    Verdict Offence Offence_Subcategory  Year  \
0            1  notGuilty   theft            burglary  1750   
1            1     guilty   theft      theftFromPlace  1750   
2            1  notGuilty   theft        grandLarceny  1750   
3            1     guilty   t