## Step 1: Count XML Files

In [25]:
import os

folder_path = "../dataset/OBC2"
file_count = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])

print(f"Number of XML files: {file_count}")

Number of XML files: 1274


## Step 2: Parse XML Files to DataFrame

In [26]:
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd

xml_folder = "../dataset/OBC2/"
xml_files = glob.glob(os.path.join(xml_folder, "*.xml"))
xml_files = [f for f in xml_files if "POS" not in os.path.basename(f)]
data = []

print(f"Parsing {len(xml_files)} XML files...")

def extract_clean_text(element):
    """Extract only the trial narrative from <p> paragraph tags"""
    import re
    
    # Find all paragraph elements - this is where the actual trial narrative is
    paragraphs = element.findall('.//p')
    
    text_parts = []
    for p in paragraphs:
        # Get all text from paragraph including nested elements
        p_text = ' '.join(p.itertext()).strip()
        if p_text:
            text_parts.append(p_text)
    
    # Join paragraphs and clean up whitespace
    full_text = ' '.join(text_parts)
    full_text = re.sub(r'\s+', ' ', full_text)  # Normalize whitespace
    
    return full_text.strip()

for file in xml_files:
    try:
        tree = ET.parse(file)
        root = tree.getroot()
        
        for trial in root.findall(".//div1[@type='trialAccount']"):
            trial_id = trial.get("id", "Unknown")
            
            trial_date_element = trial.find("interp[@type='date']")
            trial_date = trial_date_element.get("value", "Unknown") if trial_date_element is not None else "Unknown"
            
            defendants = trial.findall(".//persName[@type='defendantName']")
            victims = trial.findall(".//persName[@type='victimName']")
            num_defendants = len(defendants)
            num_victims = len(victims)
            
            defendant = defendants[0] if defendants else None
            surname, given, gender = "Unknown", "Unknown", "Unknown"
            if defendant is not None:
                surname_el = defendant.find("interp[@type='surname']")
                surname = surname_el.get("value", "Unknown") if surname_el is not None else "Unknown"
                given_el = defendant.find("interp[@type='given']")
                given = given_el.get("value", "Unknown") if given_el is not None else "Unknown"
                gender_el = defendant.find("interp[@type='gender']")
                gender = gender_el.get("value", "Unknown") if gender_el is not None else "Unknown"
            
            victim = victims[0] if victims else None
            victim_surname, victim_given, victim_gender = "Unknown", "Unknown", "Unknown"
            if victim is not None:
                vs_el = victim.find("interp[@type='surname']")
                victim_surname = vs_el.get("value", "Unknown") if vs_el is not None else "Unknown"
                vg_el = victim.find("interp[@type='given']")
                victim_given = vg_el.get("value", "Unknown") if vg_el is not None else "Unknown"
                vgen_el = victim.find("interp[@type='gender']")
                victim_gender = vgen_el.get("value", "Unknown") if vgen_el is not None else "Unknown"
            
            offence_el = trial.find(".//rs[@type='offenceDescription']/interp[@type='offenceCategory']")
            offence_category = offence_el.get("value", "Unknown") if offence_el is not None else "Unknown"
            
            offence_sub_el = trial.find(".//rs[@type='offenceDescription']/interp[@type='offenceSubcategory']")
            offence_subcategory = offence_sub_el.get("value", "Unknown") if offence_sub_el is not None else "Unknown"
            
            verdict_el = trial.find(".//rs[@type='verdictDescription']/interp[@type='verdictCategory']")
            verdict_category = verdict_el.get("value", "Unknown") if verdict_el is not None else "Unknown"
            
            trial_text = extract_clean_text(trial)
            text_length = len(trial_text)
            
            data.append([
                trial_id, trial_date, surname, given, gender, num_defendants, victim_surname, victim_given, victim_gender, num_victims,
                offence_category, offence_subcategory, verdict_category, text_length, trial_text
            ])
            
    except Exception as e:
        print(f"Error processing {file}: {e}")

columns = [
    "Trial_ID", "Date", "Defendant_Surname", "Defendant_Given", "Defendant_Gender",
    "Num_Defendants", "Victim_Surname", "Victim_Given", "Victim_Gender", "Num_Victims", "Offence",
    "Offence_Subcategory",  "Verdict", "Text_Length", "Trial_Text"
]

df = pd.DataFrame(data, columns=columns)
print(f"\nParsing completed! Dataset shape: {df.shape}")
display(df.head())

Parsing 637 XML files...

Parsing completed! Dataset shape: (50044, 15)


Unnamed: 0,Trial_ID,Date,Defendant_Surname,Defendant_Given,Defendant_Gender,Num_Defendants,Victim_Surname,Victim_Given,Victim_Gender,Num_Victims,Offence,Offence_Subcategory,Verdict,Text_Length,Trial_Text
0,t18300218-1,18300218,SAGGERS,THOMAS,male,2,Leach,John,male,2,theft,theftFromPlace,guilty,12257,First London Jury. - Before Mr. Recorder. 498....
1,t18300218-2,18300218,WILLIAMS,GEORGE,male,1,Matthews,Henry,male,2,theft,housebreaking,guilty,6849,499. GEORGE WILLIAMS was indicted for feloniou...
2,t18300218-3,18300218,FARTHING,ROBERT,male,1,Slinn,Charles,male,1,theft,theftFromPlace,guilty,4063,First London Jury. - Before Mr. Recorder. 500....
3,t18300218-4,18300218,RICHARDS,HENRY,male,2,Houson,Susannah,female,4,theft,housebreaking,guilty,10693,501. HENRY RICHARDS and THOMAS BURDETT were in...
4,t18300218-5,18300218,JONES,WILLIAM,male,1,Pearson,John Piper,male,1,theft,animalTheft,guilty,2245,Second Middlesex Jury - Before Mr. Justice Bay...


## Step 3: Clean DataFrame

### 3.1: Remove Invalid Trial_IDs and Drop Unnecessary Columns

In [27]:
import numpy as np

print("="*50)
print("DATA CLEANING PIPELINE")
print("="*50)
print(f"\nInitial shape: {df.shape}")

# Remove invalid Trial_ID patterns (non-trial records)
patterns = [r"^a\d{8}-\d$", r"^f\d{8}-\d$", r"^o\d{8}-\d$", r"^s\d{8}-\d$", r"^f\d{8}$", r"^o\d{8}-\d{2}$"]
for pattern in patterns:
    df = df[~df["Trial_ID"].str.match(pattern, na=False)]
print(f"After removing invalid IDs: {df.shape}")

# Drop unnecessary columns
cols_to_drop = ["Defendant_Surname", "Defendant_Given", "Victim_Surname", "Victim_Given"]
df = df.drop(columns=cols_to_drop, errors="ignore")
print(f"After dropping unused columns: {list(df.columns)}")

DATA CLEANING PIPELINE

Initial shape: (50044, 15)
After removing invalid IDs: (50043, 15)
After dropping unused columns: ['Trial_ID', 'Date', 'Defendant_Gender', 'Num_Defendants', 'Victim_Gender', 'Num_Victims', 'Offence', 'Offence_Subcategory', 'Verdict', 'Text_Length', 'Trial_Text']


### 3.2: Convert Date and Extract Year

In [28]:
df["Date"] = pd.to_datetime(df["Date"].astype(str), format="%Y%m%d", errors="coerce")
df["Year"] = df["Date"].dt.year
print(f"Year range: {df['Year'].min()} - {df['Year'].max()}")

Year range: 1720 - 1913


### 3.3: Remove Duplicates and Filter Verdicts

In [29]:
df = df.drop_duplicates(subset="Trial_ID", keep="first")
print(f"After removing duplicates: {df.shape}")

df.replace(["unknown", "Unknown"], np.nan, inplace=True)

df = df.dropna(subset=["Verdict", "Offence", "Trial_Text"])

df = df[df["Verdict"].isin(["guilty", "notGuilty"])]
print(f"After filtering verdicts: {df.shape}")
print(f"\nVerdict distribution:\n{df['Verdict'].value_counts()}")

After removing duplicates: (50043, 12)
After filtering verdicts: (49365, 12)

Verdict distribution:
Verdict
guilty       35487
notGuilty    13878
Name: count, dtype: int64


## Step 4: Final Dataset Summary

In [30]:
print("\n" + "="*50)
print("FINAL DATASET SUMMARY")
print("="*50)
print(f"\nShape: {df.shape}")
print(f"Columns: {list(df.columns)}")

print("\nVerdict distribution:")
print(df["Verdict"].value_counts())

print("\nTop 5 Offences:")
print(df["Offence"].value_counts().head(5))

print("\nYear range:", df['Year'].min(), "-", df['Year'].max())

print("\nSample Trial Text (first 300 chars):")
print(df['Trial_Text'].iloc[0][:300] + "...")

display(df.head())


FINAL DATASET SUMMARY

Shape: (49365, 12)
Columns: ['Trial_ID', 'Date', 'Defendant_Gender', 'Num_Defendants', 'Victim_Gender', 'Num_Victims', 'Offence', 'Offence_Subcategory', 'Verdict', 'Text_Length', 'Trial_Text', 'Year']

Verdict distribution:
Verdict
guilty       35487
notGuilty    13878
Name: count, dtype: int64

Top 5 Offences:
Offence
theft            37033
deception         2956
violentTheft      2433
royalOffences     2135
sexual            1434
Name: count, dtype: int64

Year range: 1720 - 1913

Sample Trial Text (first 300 chars):
First London Jury. - Before Mr. Recorder. 498. THOMAS SAGGERS and JEREMIAH CRANE were indicted for stealing, on the 22nd of January , at St. Alban, Wood-street , 7 pieces of flannel, value 23l., the goods of John Leach and others, the masters and employers of the said Thomas Saggers , in their dwell...


Unnamed: 0,Trial_ID,Date,Defendant_Gender,Num_Defendants,Victim_Gender,Num_Victims,Offence,Offence_Subcategory,Verdict,Text_Length,Trial_Text,Year
0,t18300218-1,1830-02-18,male,2,male,2,theft,theftFromPlace,guilty,12257,First London Jury. - Before Mr. Recorder. 498....,1830
1,t18300218-2,1830-02-18,male,1,male,2,theft,housebreaking,guilty,6849,499. GEORGE WILLIAMS was indicted for feloniou...,1830
2,t18300218-3,1830-02-18,male,1,male,1,theft,theftFromPlace,guilty,4063,First London Jury. - Before Mr. Recorder. 500....,1830
3,t18300218-4,1830-02-18,male,2,female,4,theft,housebreaking,guilty,10693,501. HENRY RICHARDS and THOMAS BURDETT were in...,1830
4,t18300218-5,1830-02-18,male,1,male,1,theft,animalTheft,guilty,2245,Second Middlesex Jury - Before Mr. Justice Bay...,1830


## Step 5: Save Dataset

In [31]:
output_path = "../dataset/OBC_Cleaned.csv"
df.to_csv(output_path, index=False)

print(f"Dataset saved to: {output_path}")
print(f"File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")
print(f"Total rows: {len(df)}")
print("\n✓ Preprocessing complete!")

Dataset saved to: ../dataset/OBC_Cleaned.csv
File size: 150.65 MB
Total rows: 49365

✓ Preprocessing complete!
