## Step 1: Count XML Files

In [1]:
import os

folder_path = "../dataset/OBC2"
file_count = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])

print(f"Number of XML files: {file_count}")

Number of XML files: 1274


## Step 2: Parse XML Files to DataFrame

In [2]:
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd

xml_folder = "../dataset/OBC2/"
xml_files = glob.glob(os.path.join(xml_folder, "*.xml"))
data = []

print(f"Parsing {len(xml_files)} XML files...")

for file in xml_files:
    tree = ET.parse(file)
    root = tree.getroot()

    for trial in root.findall(".//div1[@type='trialAccount']"):
        trial_id = trial.get("id", "Unknown")
        trial_date_element = trial.find("interp[@type='date']")
        trial_date = trial_date_element.get("value", "Unknown") if trial_date_element is not None else "Unknown"

        defendant = trial.find(".//persName[@type='defendantName']")
        surname, given = "Unknown", "Unknown"
        if defendant is not None:
            surname_element = defendant.find("interp[@type='surname']")
            surname = surname_element.get("value", "Unknown") if surname_element is not None else "Unknown"
            given_element = defendant.find("interp[@type='given']")
            given = given_element.get("value", "Unknown") if given_element is not None else "Unknown"

        victim = trial.find(".//persName[@type='victimName']")
        victim_surname, victim_given = "Unknown", "Unknown"
        if victim is not None:
            victim_surname_element = victim.find("interp[@type='surname']")
            victim_surname = victim_surname_element.get("value", "Unknown") if victim_surname_element is not None else "Unknown"
            victim_given_element = victim.find("interp[@type='given']")
            victim_given = victim_given_element.get("value", "Unknown") if victim_given_element is not None else "Unknown"

        verdict = trial.find(".//rs[@type='verdictDescription']/interp[@type='verdictCategory']")
        verdict_text = verdict.get("value", "Unknown") if verdict is not None else "Unknown"

        punishment = trial.find(".//rs[@type='punishmentDescription']/interp[@type='punishmentCategory']")
        punishment_text = punishment.get("value", "Unknown") if punishment is not None else "Unknown"

        offence = trial.find(".//rs[@type='offenceDescription']/interp[@type='offenceCategory']")
        offence_text = offence.get("value", "Unknown") if offence is not None else "Unknown"

        crime_date = trial.find(".//rs[@type='crimeDate']")
        crime_date_text = crime_date.text.strip() if crime_date is not None and crime_date.text is not None else "Unknown"

        trial_text = []
        for u in trial.findall(".//u"):
            speaker_role = u.get("role", "Unknown")
            speaker_text = " ".join([p.text.strip() for p in u.findall(".//p") if p.text])
            if speaker_text:
                trial_text.append(f"[{speaker_role}] {speaker_text}")

        full_trial_text = "\n".join(trial_text)

        data.append([trial_id, trial_date, surname, given, victim_surname, victim_given, verdict_text, punishment_text, offence_text, crime_date_text, full_trial_text])

columns = ["Trial_ID", "Date", "Defendant_Surname", "Defendant_Given", "Victim_Surname", "Victim_Given", "Verdict", "Punishment", "Offence", "Crime_Date", "Trial_Text"]
df = pd.DataFrame(data, columns=columns)

print(f"\nParsing completed! Initial dataset shape: {df.shape}")

display(df.head())

Parsing 1274 XML files...

Parsing completed! Initial dataset shape: (100088, 11)


Unnamed: 0,Trial_ID,Date,Defendant_Surname,Defendant_Given,Victim_Surname,Victim_Given,Verdict,Punishment,Offence,Crime_Date,Trial_Text
0,t17500117-1,17500117,Bowen,John,Gwinn,William,notGuilty,Unknown,theft,24th_MD of_IO February_NPM1,
1,t17500117-2,17500117,Bond,Nicholas,Page,Henry,guilty,transport,theft,Jan._NPM1 9_MC,
2,t17500117-3,17500117,Biggs,Thomas,Gordon,William,notGuilty,Unknown,theft,Unknown,
3,t17500117-4,17500117,Wanless,Elizabeth,Broadhurst,Thomas,guilty,transport,theft,October_NPM1 25_MC,
4,t17500117-5,17500117,Lowe,Susannah,Wolse,Ezekiel,notGuilty,Unknown,theft,Dec._NPM1 20_MC,


## Step 3: Extract and Clean Trial Texts to Files

In [3]:
import os
import glob
import xml.etree.ElementTree as ET

xml_folder = "../dataset/OBC2/"
output_text_folder = "../dataset/trial_texts/"
os.makedirs(output_text_folder, exist_ok=True)
xml_files = glob.glob(os.path.join(xml_folder, "*.xml"))

trial_count = 0
for file in xml_files:
    try:
        tree = ET.parse(file)
        root = tree.getroot()

        for trial in root.findall(".//div1[@type='trialAccount']"):
            trial_id = trial.get("id", "Unknown")
            trial_text = " ".join(trial.itertext()).strip()

            text_file_path = os.path.join(output_text_folder, f"{trial_id}.txt")
            with open(text_file_path, "w", encoding="utf-8") as txt_file:
                txt_file.write(trial_text)
            trial_count += 1
    except Exception as e:
        print(f"Error processing {file}: {e}")

print(f"Extracted {trial_count} trial texts to {output_text_folder}")

Extracted 100088 trial texts to ../dataset/trial_texts/


### Remove Invalid Trial ID Patterns

In [4]:
import os
import glob

output_text_folder = "../dataset/trial_texts/"

patterns = [
    "a????????-?.txt",
    "f????????-?.txt",
    "f????????.txt",
    "o????????-?.txt",
    "o????????-??.txt",
    "s????????-?.txt"
]

total_deleted = 0
for pattern in patterns:
    full_pattern = os.path.join(output_text_folder, pattern)
    files_to_delete = glob.glob(full_pattern)
    
    for file in files_to_delete:
        try:
            os.remove(file)
            total_deleted += 1
        except Exception as e:
            print(f"Error deleting {file}: {e}")

remaining_files = len([f for f in os.listdir(output_text_folder) if os.path.isfile(os.path.join(output_text_folder, f))])
print(f"Deleted {total_deleted} files with invalid patterns")
print(f"Remaining trial text files: {remaining_files}")

Deleted 1 files with invalid patterns
Remaining trial text files: 50043


## Step 4: DataFrame Cleaning Pipeline

### 4.1: Remove Invalid Trial_ID Patterns

In [5]:
import pandas as pd
import numpy as np

print("Starting DataFrame cleaning pipeline...\n")
print(f"Initial shape: {df.shape}")

print("\nRemoving invalid Trial_ID patterns...")
patterns = [r"^a\d{8}-\d$", r"^f\d{8}-\d$", r"^o\d{8}-\d$", r"^s\d{8}-\d$", r"^f\d{8}$", r"^o\d{8}-\d{2}$"]
for pattern in patterns:
    df = df[~df["Trial_ID"].str.match(pattern, na=False)]
print(f"Shape after removing invalid IDs: {df.shape}")

Starting DataFrame cleaning pipeline...

Initial shape: (100088, 11)

Removing invalid Trial_ID patterns...
Shape after removing invalid IDs: (100086, 11)


### 4.2: Drop Trial_Text Column

In [6]:
print("Dropping Trial_Text column...")
df = df.drop(columns=["Trial_Text"], errors="ignore")
print(f"Columns: {list(df.columns)}")

Dropping Trial_Text column...
Columns: ['Trial_ID', 'Date', 'Defendant_Surname', 'Defendant_Given', 'Victim_Surname', 'Victim_Given', 'Verdict', 'Punishment', 'Offence', 'Crime_Date']


### 4.3: Drop Unnecessary Columns

In [7]:
print("Dropping unnecessary columns...")
df = df.drop(columns=["Victim_Surname", "Victim_Given", "Witness_Name"], errors="ignore")
print(f"Columns: {list(df.columns)}")

Dropping unnecessary columns...
Columns: ['Trial_ID', 'Date', 'Defendant_Surname', 'Defendant_Given', 'Verdict', 'Punishment', 'Offence', 'Crime_Date']


### 4.4: Convert Date to Datetime

In [8]:
print("Converting Date to datetime format...")
df["Date"] = pd.to_datetime(df["Date"].astype(str), format="%Y%m%d")
print(f"Date sample: {df['Date'].head(3).tolist()}")

Converting Date to datetime format...
Date sample: [Timestamp('1750-01-17 00:00:00'), Timestamp('1750-01-17 00:00:00'), Timestamp('1750-01-17 00:00:00')]


### 4.5: Remove Duplicate Trial_IDs

In [9]:
print("Checking for duplicates...")
duplicates = df[df.duplicated(subset=["Trial_ID"], keep=False)]
print(f"Found {len(duplicates)} duplicate rows")
df = df.drop_duplicates(subset=["Trial_ID"], keep="first")
print(f"Shape after removing duplicates: {df.shape}")

Checking for duplicates...
Found 100086 duplicate rows
Shape after removing duplicates: (50043, 8)


### 4.6: Drop Defendant and Crime Details

In [10]:
print("Dropping defendant and crime details...")
df = df.drop(columns=["Defendant_Surname", "Defendant_Given", "Punishment", "Crime_Date"], errors="ignore")
print(f"Columns: {list(df.columns)}")

Dropping defendant and crime details...
Columns: ['Trial_ID', 'Date', 'Verdict', 'Offence']


### 4.7: Remove Multi-Column Duplicates

In [11]:
print("Removing duplicates by Trial_ID, Date, Verdict, Offence...")
df = df.drop_duplicates(subset=["Trial_ID", "Date", "Verdict", "Offence"], keep="first")
print(f"Shape: {df.shape}")

Removing duplicates by Trial_ID, Date, Verdict, Offence...
Shape: (50043, 4)


### 4.8: Extract Year from Date

In [12]:
print("Extracting Year from Date...")
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df["Year"] = df["Date"].dt.year
print(f"Year range: {df['Year'].min()} - {df['Year'].max()}")

Extracting Year from Date...
Year range: 1720 - 1913


### 4.9: Load Cleaned Trial Texts from Files

In [13]:
import os
import re

print("Loading and cleaning trial texts from files...")
text_folder = "../dataset/trial_texts/"

def apply_first_pass(text):
    text = re.sub(r"\b(\w+)_\w+\b", r"\1", text)
    text = re.sub(r"[_,:;\"'(){}[\]<>]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def apply_second_pass(text):
    text = re.sub(r"\b(NNU|NNB)\b", "", text)
    text = re.sub(r"\.\s+\.", ".", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

trial_texts = {}
for filename in os.listdir(text_folder):
    if filename.endswith(".txt"):
        trial_id = filename.replace(".txt", "")
        file_path = os.path.join(text_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read().strip()
            text = apply_first_pass(text)
            text = apply_second_pass(text)
            trial_texts[trial_id] = text

df["Trial_Text"] = df["Trial_ID"].astype(str).map(trial_texts).fillna("")
print(f"Trial texts loaded and cleaned: {len(trial_texts)}")
print(f"Rows with text: {(df['Trial_Text'] != '').sum()}")

Loading and cleaning trial texts from files...
Trial texts loaded and cleaned: 50043
Rows with text: 50042


### 4.10: Final Duplicate Removal

In [14]:
print("Final duplicate removal...")
df = df.drop_duplicates(subset="Trial_ID", keep="first")
df["Date"] = pd.to_datetime(df["Date"])
df["Year"] = df["Date"].dt.year
print(f"Shape: {df.shape}")

Final duplicate removal...
Shape: (50043, 6)


### 4.11: Replace Unknown Values with NaN

In [15]:
print("Replacing 'unknown' values with NaN...")
df.replace("unknown", np.nan, inplace=True)
df.replace("Unknown", np.nan, inplace=True)
print(f"Missing values per column:")
print(df.isnull().sum())

Replacing 'unknown' values with NaN...
Missing values per column:
Trial_ID        0
Date            0
Verdict       561
Offence        27
Year            0
Trial_Text      0
dtype: int64


### 4.12: Filter to Valid Verdicts Only

In [16]:
print("Dropping NaN rows and filtering verdicts...")
df.dropna(inplace=True)
df = df[df["Verdict"].isin(["guilty", "notGuilty"])]
print(f"Shape after filtering: {df.shape}")
print(f"\nVerdict distribution:")
print(df["Verdict"].value_counts())
print(f"\nOffence distribution:")
print(df["Offence"].value_counts())

print("\n" + "="*50)
print("DataFrame cleaning complete!")
print(f"Final shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
display(df.head())

Dropping NaN rows and filtering verdicts...
Shape after filtering: (49365, 6)

Verdict distribution:
Verdict
guilty       35487
notGuilty    13878
Name: count, dtype: int64

Offence distribution:
Offence
theft            37033
deception         2956
violentTheft      2433
royalOffences     2135
sexual            1434
breakingPeace     1319
kill              1205
miscellaneous      654
damage             196
Name: count, dtype: int64

DataFrame cleaning complete!
Final shape: (49365, 6)
Columns: ['Trial_ID', 'Date', 'Verdict', 'Offence', 'Year', 'Trial_Text']


Unnamed: 0,Trial_ID,Date,Verdict,Offence,Year,Trial_Text
0,t17500117-1,1750-01-17,notGuilty,theft,1750,109. John Bowen was indicted for that he on th...
1,t17500117-2,1750-01-17,guilty,theft,1750,110 111. Nicholas Bond and William Heyden late...
2,t17500117-3,1750-01-17,notGuilty,theft,1750,112. Thomas Biggs was indicted for stealing on...
3,t17500117-4,1750-01-17,guilty,theft,1750,113. Elizabeth Wanless otherwise Newbey spinst...
4,t17500117-5,1750-01-17,notGuilty,theft,1750,114 115. Susannah Lowe and Margaret Richards w...


## Step 5: Text Cleaning Functions

### 5.1: Remove Leading Non-Alphanumeric Characters

In [17]:
import pandas as pd
import re

def clean_text(text):
    text = str(text)
    text = re.sub(r'^[\W\d\s]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

print("[1/3] Removing leading non-alphanumeric characters...")
df['Trial_Text'] = df['Trial_Text'].apply(clean_text)
print(f"Sample: {df['Trial_Text'].iloc[0][:100]}...")

[1/3] Removing leading non-alphanumeric characters...
Sample: John Bowen was indicted for that he on the 24th of February between the hour of twelve and one in th...


### 5.2: Remove Special Characters

In [18]:
import re

def clean_text1(text):
    text = str(text)
    text = re.sub(r'[+*FO]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("[2/3] Removing special characters (+*FO)...")
df['Trial_Text'] = df['Trial_Text'].apply(clean_text1)
print(f"Sample: {df['Trial_Text'].iloc[0][:100]}...")

[2/3] Removing special characters (+*FO)...
Sample: John Bowen was indicted for that he on the 24th of ebruary between the hour of twelve and one in the...


### 5.3: Fix Spacing and Apostrophes

In [19]:
import pandas as pd
import re

def clean_text2(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'(?<=\w)\.(?=[A-Z])', '. ', text)
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    text = re.sub(r"Prisoner s", "Prisoner's", text)
    text = re.sub(r'(\d+)\s*d\s*\.', r'\1d.', text)
    return text.strip()

print("[3/3] Fixing spacing and apostrophes...")
df['Trial_Text'] = df['Trial_Text'].apply(clean_text2)
print(f"Sample: {df['Trial_Text'].iloc[0][:100]}...")

print("\nText cleaning complete!")
print(f"Average text length: {df['Trial_Text'].str.len().mean():.0f} characters")

[3/3] Fixing spacing and apostrophes...
Sample: John Bowen was indicted for that he on the 24th of ebruary between the hour of twelve and one in the...

Text cleaning complete!
Average text length: 3028 characters


## Step 6: NLTK Text Processing

### 6.1: Download and Setup NLTK Resources

In [20]:
import nltk
import os

nltk_data_dir = "./nltk_data"
nltk.data.path.append(nltk_data_dir)
os.makedirs(nltk_data_dir, exist_ok=True)

nltk_resources = [
    "punkt",
    "punkt_tab",
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger_eng",
    "stopwords"
]

print("Downloading NLTK resources...")
for resource in nltk_resources:
    try:
        nltk.data.find(resource)
        print(f"  {resource}: already present")
    except LookupError:
        print(f"  {resource}: downloading...")
        nltk.download(resource, download_dir=nltk_data_dir)

Downloading NLTK resources...
  punkt: downloading...


[nltk_data] Downloading package punkt to ./nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to ./nltk_data...


  punkt_tab: downloading...


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to ./nltk_data...


  wordnet: downloading...
  omw-1.4: downloading...


[nltk_data] Downloading package omw-1.4 to ./nltk_data...


  averaged_perceptron_tagger_eng: downloading...


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     ./nltk_data...


  stopwords: downloading...


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### 6.2: Setup NLTK Tools

In [21]:
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

print(f"Stopwords loaded: {len(stop_words)}")

Stopwords loaded: 198


### 6.3: Define NLTK Cleaning Function

In [22]:
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag

def get_wordnet_pos(tag):
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("N"):
        return wordnet.NOUN
    elif tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_text_nltk(text):
    try:
        if not isinstance(text, str):
            return ""
        text = re.sub(r"^\s*[TM]\.\s*", "", text)
        text = re.sub(r"[^a-zA-Z\s]", " ", text)
        text = re.sub(r"\b\w{1,2}\b", " ", text)

        sentences = sent_tokenize(text)
        cleaned_sentences = []

        for sentence in sentences:
            words = word_tokenize(sentence.lower())
            words = [word for word in words if word not in stop_words]
            tagged_words = pos_tag(words)
            lemmatized_words = [
                lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                for word, tag in tagged_words
            ]
            cleaned_sentences.append(" ".join(lemmatized_words))

        return " ".join(cleaned_sentences)

    except Exception as e:
        print(f"[ERROR] Failed to process text: {e}")
        return text

print("NLTK cleaning function defined")

NLTK cleaning function defined


### 6.4: Apply NLTK Processing to All Texts

In [23]:
print("Applying NLTK processing (this may take a while)...")
print(f"Processing {len(df)} trials...")

df["Trial_Text"] = df["Trial_Text"].astype(str).apply(clean_text_nltk)

print("\nNLTK processing complete!")
print(f"Sample processed text: {df['Trial_Text'].iloc[0][:200]}...")
print(f"\nAverage processed text length: {df['Trial_Text'].str.len().mean():.0f} characters")

Applying NLTK processing (this may take a while)...
Processing 49365 trials...

NLTK processing complete!
Sample processed text: john bowen indict ebruary hour twelve one morning dwelling house william gwinn break enter steal thence twenty yard linnen cheque thirty cotton handkerchief seventy nine yard stripe cotton six pair wo...

Average processed text length: 1606 characters


## Step 7: Final Processing and Review

### 7.1: Drop Trial_ID Column

In [24]:
print("Dropping Trial_ID column...")
df = df.drop(columns=["Trial_ID"])
print(f"Columns: {list(df.columns)}")

Dropping Trial_ID column...
Columns: ['Date', 'Verdict', 'Offence', 'Year', 'Trial_Text']


### 7.2: Display Final Dataset Summary

In [25]:
print("\n" + "="*50)
print("FINAL DATASET SUMMARY")
print("="*50)
print(f"\nShape: {df.shape}")
print(f"Columns: {list(df.columns)}")

print("\nData types:")
print(df.dtypes)

print("\nVerdict distribution:")
print(df["Verdict"].value_counts())

print("\nTop 10 Offences:")
print(df["Offence"].value_counts().head(10))

print("\nYear range:")
print(f"Min: {df['Year'].min()}, Max: {df['Year'].max()}")

print("\nMissing values:")
print(df.isnull().sum())


FINAL DATASET SUMMARY

Shape: (49365, 5)
Columns: ['Date', 'Verdict', 'Offence', 'Year', 'Trial_Text']

Data types:
Date          datetime64[ns]
Verdict               object
Offence               object
Year                   int32
Trial_Text            object
dtype: object

Verdict distribution:
Verdict
guilty       35487
notGuilty    13878
Name: count, dtype: int64

Top 10 Offences:
Offence
theft            37033
deception         2956
violentTheft      2433
royalOffences     2135
sexual            1434
breakingPeace     1319
kill              1205
miscellaneous      654
damage             196
Name: count, dtype: int64

Year range:
Min: 1720, Max: 1913

Missing values:
Date          0
Verdict       0
Offence       0
Year          0
Trial_Text    0
dtype: int64


### 7.3: Preview Dataset

In [26]:
print("First few rows:")
display(df.head())

print("\nLast few rows:")
display(df.tail())

First few rows:


Unnamed: 0,Date,Verdict,Offence,Year,Trial_Text
0,1750-01-17,notGuilty,theft,1750,john bowen indict ebruary hour twelve one morn...
1,1750-01-17,guilty,theft,1750,nicholas bond william heyden late ulham indict...
2,1750-01-17,notGuilty,theft,1750,thomas biggs indict steal one pair single chan...
3,1750-01-17,guilty,theft,1750,elizabeth wanless otherwise newbey spinster in...
4,1750-01-17,notGuilty,theft,1750,susannah lowe margaret richards widow indict s...



Last few rows:


Unnamed: 0,Date,Verdict,Offence,Year,Trial_Text
76749,1723-05-30,notGuilty,theft,1723,darius humphreys parish whitechapel indict ste...
76750,1723-05-30,guilty,theft,1723,john jones james hix ralph barrow alias arlow ...
76751,1723-05-30,notGuilty,theft,1723,hannah coleman giles ields indict feloniously ...
76752,1723-05-30,guilty,kill,1723,william hawksworth martin ields indict murder ...
76753,1723-05-30,guilty,miscellaneous,1723,john smith indict misdemeanour take four guine...


## Step 8: Save Final Dataset

In [27]:
output_path = "../dataset/OBC_Cleaned.csv"
df.to_csv(output_path, index=False)

print(f"Final dataset saved to: {output_path}")
print(f"File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")
print(f"\nTotal rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")
print("\nPreprocessing pipeline complete!")

Final dataset saved to: ../dataset/OBC_Cleaned.csv
File size: 77.11 MB

Total rows: 49365
Total columns: 5

Preprocessing pipeline complete!
