In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import re
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


In [43]:
# Parse every XML case file in `folder` into one record per case:
# filename, case name, cleaned catchphrases (joined with " | ") and sentence text.
data = []
folder = "dataset"

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# `df` (and anything seeded from it later) the same from run to run.
for file in tqdm(sorted(os.listdir(folder))):
    if not file.endswith(".xml"):
        continue

    path = os.path.join(folder, file)

    # errors="ignore" drops bytes that cannot be decoded as UTF-8 instead of raising.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        raw_text = f.read()

    # Fix malformed catchphrase attributes: remove the stray quote in front of `id=`.
    raw_text = raw_text.replace('"id=', 'id=')

    try:
        soup = BeautifulSoup(raw_text, "xml")
    except Exception as e:
        # Best effort: report the file and carry on with the rest of the corpus.
        print(f"Skipping {file} due to soup parse error: {e}")
        continue

    # Look the <name> tag up once; case_name is None when the tag is missing.
    name_tag = soup.find("name")
    case_name = name_tag.text if name_tag is not None else None

    # Collect raw catchphrases
    raw_catchphrases = [c.text.strip() for c in soup.find_all("catchphrase") if c.text]

    # Clean step: strip leftover `c0">`, `c1">` ... fragments, then split each
    # catchphrase on commas and keep the non-empty pieces.
    cleaned_catchphrases = []
    for cp in raw_catchphrases:
        cleaned = re.sub(r'c\d+">', '', cp)
        cleaned_catchphrases.extend(p.strip() for p in cleaned.split(",") if p.strip())

    # Cases without any usable catchphrase are not recorded, so skip them
    # before building the (potentially long) sentence text.
    if not cleaned_catchphrases:
        continue

    data.append({
        "filename": file,
        "case_name": case_name,
        "catchphrases": " | ".join(cleaned_catchphrases),  # all catchphrases cleaned and joined
        "text": " ".join(s.text.strip() for s in soup.find_all("sentence") if s.text),
    })

df = pd.DataFrame(data)
print("Parsed cases:", len(df))

100%|██████████| 4193/4193 [00:56<00:00, 74.79it/s] 

Parsed cases: 3890





In [44]:
# Preview the first ten parsed cases (filename, case_name, catchphrases, text).
df.head(10)

Unnamed: 0,filename,case_name,catchphrases,text
0,06_1.xml,Sharman Networks Ltd v Universal Music Austral...,application for leave to appeal | authorisatio...,Background to the current application \n \n1 T...
1,06_100.xml,Lawrance v Human Rights and Equal Opportunity ...,no point of principle | administrative law and...,1 These are two applications for orders of rev...
2,06_1001.xml,Citrus Queensland Pty Ltd v Sunstate Orchards ...,discovery | whether inclusion of a document as...,1 I have before me two notices of motion both ...
3,06_1004.xml,Martech International Pty Ltd v Energy World C...,variation | termination | interpretation | man...,Introduction \n \n1 In 1985 Mr Fletcher Brand ...
4,06_1005.xml,Commissioner of Taxation v Milne (with Corrige...,context of liability to income tax | review of...,Context to the present application by way of a...
5,06_1006.xml,SZCCX v Minister for Immigration & Multicultur...,no point in principle | migration,1 This is an appeal against the judgment of Fe...
6,06_1015.xml,Douglas v Queensland [2006] FCA 1015 (8 August...,vacation of trial date | inconvenience to the ...,1 The notice of motion filed by the applicants...
7,06_1017.xml,Regional Publishers Pty Limited v Elkington [2...,compulsory acquisition of shares | shareholder...,"1 The first and second defendants, Gordon Brad..."
8,06_1018.xml,SZFBU v Minister for Immigration and Multicult...,application for extension of time within which...,1 This is an application for extension of time...
9,06_102.xml,"Gidley, in the matter of Aliance Motor Body Pt...",administration | administrator and administrat...,"1 This is an application by the plaintiff, Pau..."


In [45]:

# Sentence-embedding model handed to BERTopic for encoding the documents.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# calculate_probabilities=True asks BERTopic to return per-document
# probabilities alongside the topic assignments.
topic_model = BERTopic(embedding_model=embedding_model, calculate_probabilities=True)

# One document per case: the case name followed by its catchphrases
# (missing values become empty strings).
case_documents = df["case_name"].fillna("") + " " + df["catchphrases"].fillna("")
df["case_input"] = case_documents

# Fit the topic model and get one topic id per document.
topics, probs = topic_model.fit_transform(case_documents.tolist())
df["topic_id"] = topics

# Reduce each entry of `probs` to a single score: the largest value when the
# entry is iterable, otherwise the scalar itself.
# NOTE(review): the maximum over an entry is not necessarily the probability of
# the topic stored in `topic_id` (e.g. for outliers, topic -1) - confirm intent.
document_scores = []
for doc_probs in probs:
    if hasattr(doc_probs, "__iter__"):
        document_scores.append(float(max(doc_probs)))
    else:
        document_scores.append(float(doc_probs))
df["probability"] = document_scores

In [47]:
# Persist the fitted topic model to disk.
topic_model.save("bertopic_model")

# Reload it from disk so the following cells can run without retraining.
# BERTopic is already imported in the notebook's import cell, so it is not
# re-imported here.
topic_model = BERTopic.load("bertopic_model")



In [49]:
# -----------------------------
# Step 3: Assign main label per case (directly)
# -----------------------------
# Work on an independent copy holding only the identification and topic columns.
labels_df = df.loc[:, ["filename", "case_name", "topic_id", "probability"]].copy()

# Topic -1 (outliers) is labelled "Miscellaneous"; every other topic id
# becomes "Topic_<id>".
labels_df["label"] = [
    "Miscellaneous" if topic == -1 else f"Topic_{topic}"
    for topic in labels_df["topic_id"]
]


In [50]:
# -----------------------------
# Step 4: Prepare final dataset
# -----------------------------
# Build the final frame without the helper column used as BERTopic input.
# The drop is non-mutating, so `df` itself is left untouched and this cell can
# be re-run safely; errors="ignore" covers the case where the column is absent.
final_df = df.drop(columns=["case_input"], errors="ignore").copy()

# -----------------------------
# Step 5: Save to CSV
# -----------------------------
final_df.to_csv("case_classification_dataset.csv", index=False)
print("Saved case_classification_dataset.csv")



Saved case_classification_dataset.csv


In [51]:
# Preview the first ten rows of final_df, including topic_id and probability.
final_df.head(10)

Unnamed: 0,filename,case_name,catchphrases,text,topic_id,probability
0,06_1.xml,Sharman Networks Ltd v Universal Music Austral...,application for leave to appeal | authorisatio...,Background to the current application \n \n1 T...,-1,0.024296
1,06_100.xml,Lawrance v Human Rights and Equal Opportunity ...,no point of principle | administrative law and...,1 These are two applications for orders of rev...,13,1.0
2,06_1001.xml,Citrus Queensland Pty Ltd v Sunstate Orchards ...,discovery | whether inclusion of a document as...,1 I have before me two notices of motion both ...,42,0.404301
3,06_1004.xml,Martech International Pty Ltd v Energy World C...,variation | termination | interpretation | man...,Introduction \n \n1 In 1985 Mr Fletcher Brand ...,-1,0.073596
4,06_1005.xml,Commissioner of Taxation v Milne (with Corrige...,context of liability to income tax | review of...,Context to the present application by way of a...,1,0.396194
5,06_1006.xml,SZCCX v Minister for Immigration & Multicultur...,no point in principle | migration,1 This is an appeal against the judgment of Fe...,0,0.94117
6,06_1015.xml,Douglas v Queensland [2006] FCA 1015 (8 August...,vacation of trial date | inconvenience to the ...,1 The notice of motion filed by the applicants...,6,0.018468
7,06_1017.xml,Regional Publishers Pty Limited v Elkington [2...,compulsory acquisition of shares | shareholder...,"1 The first and second defendants, Gordon Brad...",-1,0.05105
8,06_1018.xml,SZFBU v Minister for Immigration and Multicult...,application for extension of time within which...,1 This is an application for extension of time...,0,0.899686
9,06_102.xml,"Gidley, in the matter of Aliance Motor Body Pt...",administration | administrator and administrat...,"1 This is an application by the plaintiff, Pau...",-1,0.055601


In [52]:
# Number of cases per topic_id.
final_df["topic_id"].value_counts()

topic_id
-1     964
 0     946
 1     185
 2     183
 3     128
 4     126
 5     124
 6      90
 7      59
 8      57
 9      56
 10     55
 11     51
 12     48
 13     44
 14     43
 15     36
 16     35
 17     35
 18     34
 19     32
 20     32
 21     31
 22     30
 23     29
 24     25
 25     24
 26     22
 27     22
 28     21
 29     20
 30     20
 31     20
 32     19
 33     19
 34     18
 35     18
 36     17
 37     17
 38     17
 39     17
 40     16
 41     16
 42     15
 43     15
 44     14
 45     14
 46     11
 47     10
 48     10
Name: count, dtype: int64

In [54]:
# Overview table of all topics found by the model.
topic_info = topic_model.get_topic_info()
print(topic_info)

# Print what get_topic returns for the first four topics (ids 0-3).
for topic_id in range(4):
    print(f"\nTopic {topic_id} keywords:")
    print(topic_model.get_topic(topic_id))


    Topic  Count                                               Name  \
0      -1    964                                   -1_of_to_and_the   
1       0    946          0_migration_minister_immigration_tribunal   
2       1    185                 1_tax_taxation_income_commissioner   
3       2    183             2_industrial_union_workplace_relations   
4       3    128                  3_native_title_land_determination   
5       4    126                 4_bankruptcy_bankrupt_1966_trustee   
6       5    124             5_practices_trade_consumer_competition   
7       6     90                     6_administrative_crime_act_the   
8       7     59            7_copyright_design_infringement_damages   
9       8     57           8_veterans_repatriation_war_entitlements   
10      9     56                     9_patent_patents_enantiomer_of   
11     10     55                     10_discovery_documents_ltd_pty   
12     11     51                  11_up_winding_deputy_corporations   
13    

In [55]:
# Hand-curated grouping of BERTopic topic ids into broader legal categories.
# Written as category -> topic ids so that each id is listed once and an
# accidental duplicate is caught below instead of being silently overridden.
#
# NOTE(review): the ids only make sense for the fitted model they were read
# from. Some assignments look inconsistent with the topic names printed by
# get_topic_info() in this notebook (e.g. topic 4 "bankruptcy..." under
# "Property, Land & Native Title", topic 8 "veterans..." under
# "Intellectual Property") - verify against the current model.
category_topics = {
    "Immigration & Citizenship": [0, 17],
    "Taxation & Finance": [1, 5, 16, 23, 24, 31],
    "Employment, Unions & Workplace Relations": [2, 46],
    "Property, Land & Native Title": [3, 4, 25],
    "Intellectual Property": [7, 8, 9, 19, 20, 29, 48],
    # Topic 21 used to be listed here as well as under "Compensation & Insurance".
    # In a dict literal the later entry wins, so it was always treated as
    # "Compensation & Insurance"; that effective behaviour is kept.
    # TODO(review): topic 18 is not assigned anywhere - was one of the two
    # 21 entries meant to be 18?
    "Corporate & Insolvency": [11, 12, 22, 26, 27, 28, 35, 37, 40, 41, 47],
    "Environment & Conservation": [14, 15, 43],
    "Compensation & Insurance": [6, 21, 36, 38, 39, 44],
    "General Civil Procedure": [10, 13, 30, 32, 33, 34, 42, 45],
}

# Flatten to topic id -> category, refusing ids that appear more than once.
topic_mapping = {}
for category, topic_ids in category_topics.items():
    for topic_id in topic_ids:
        if topic_id in topic_mapping:
            raise ValueError(
                f"Topic {topic_id} is mapped to both "
                f"{topic_mapping[topic_id]!r} and {category!r}"
            )
        topic_mapping[topic_id] = category

# Apply mapping; topic ids without an entry (including the outlier id -1)
# are labelled "Other".
final_df["Category"] = final_df["topic_id"].map(topic_mapping).fillna("Other")

# Surface non-outlier topics that fell through to "Other" so the gap is visible.
unmapped_topics = sorted(set(final_df["topic_id"]) - set(topic_mapping) - {-1})
if unmapped_topics:
    print("Topics without a category (labelled 'Other'):", unmapped_topics)

# Check counts per new category
print(final_df["Category"].value_counts())


Category
Other                                       998
Immigration & Citizenship                   981
Taxation & Finance                          418
Property, Land & Native Title               278
Corporate & Insolvency                      271
Intellectual Property                       266
General Civil Procedure                     204
Employment, Unions & Workplace Relations    194
Compensation & Insurance                    186
Environment & Conservation                   94
Name: count, dtype: int64


In [56]:
final_df.head(10)

Unnamed: 0,filename,case_name,catchphrases,text,topic_id,probability,Category
0,06_1.xml,Sharman Networks Ltd v Universal Music Austral...,application for leave to appeal | authorisatio...,Background to the current application \n \n1 T...,-1,0.024296,Other
1,06_100.xml,Lawrance v Human Rights and Equal Opportunity ...,no point of principle | administrative law and...,1 These are two applications for orders of rev...,13,1.0,General Civil Procedure
2,06_1001.xml,Citrus Queensland Pty Ltd v Sunstate Orchards ...,discovery | whether inclusion of a document as...,1 I have before me two notices of motion both ...,42,0.404301,General Civil Procedure
3,06_1004.xml,Martech International Pty Ltd v Energy World C...,variation | termination | interpretation | man...,Introduction \n \n1 In 1985 Mr Fletcher Brand ...,-1,0.073596,Other
4,06_1005.xml,Commissioner of Taxation v Milne (with Corrige...,context of liability to income tax | review of...,Context to the present application by way of a...,1,0.396194,Taxation & Finance
5,06_1006.xml,SZCCX v Minister for Immigration & Multicultur...,no point in principle | migration,1 This is an appeal against the judgment of Fe...,0,0.94117,Immigration & Citizenship
6,06_1015.xml,Douglas v Queensland [2006] FCA 1015 (8 August...,vacation of trial date | inconvenience to the ...,1 The notice of motion filed by the applicants...,6,0.018468,Compensation & Insurance
7,06_1017.xml,Regional Publishers Pty Limited v Elkington [2...,compulsory acquisition of shares | shareholder...,"1 The first and second defendants, Gordon Brad...",-1,0.05105,Other
8,06_1018.xml,SZFBU v Minister for Immigration and Multicult...,application for extension of time within which...,1 This is an application for extension of time...,0,0.899686,Immigration & Citizenship
9,06_102.xml,"Gidley, in the matter of Aliance Motor Body Pt...",administration | administrator and administrat...,"1 This is an application by the plaintiff, Pau...",-1,0.055601,Other


In [60]:
# Draw 10 random rows for a spot check; the fixed random_state makes the
# draw repeatable for the same final_df.
sample = final_df.sample(n=10, random_state=42)  # change n for more/less
print(sample)

         filename                                          case_name  \
3798   09_845.xml  Spencer v Neo Rock Pty Ltd ACN 110 874 283 (In...   
2320  08_1543.xml  Alfred v Wakelin (No. 2) [2008] FCA 1543 (17 O...   
2431  08_1789.xml  SZLBE v Minister for Immigration and Citizensh...   
2533  08_1973.xml  Macks v Garrett [2008] FCA 1973 (15 December 2...   
315   06_1470.xml  SZFMB v Minister for Immigration & Multicultur...   
3642   09_601.xml    Davidova v Murphy [2009] FCA 601 (10 June 2009)   
351   06_1531.xml  Anya Holdings Pty Ltd v Idohage Pty Ltd [2006]...   
149   06_1253.xml  Tomkins v CASA [2006] FCA 1253 (21 September 2...   
2962   08_926.xml  Sebastian v State of Western Australia [2008] ...   
457    06_171.xml  New Guinea Line Pty Limited v Board of Trustee...   

                                           catchphrases  \
3798  consideration of an application for leave purs...   
2320  whether conduct of union delegate made union l...   
2431  whether breach of proced

In [63]:
# Minimum score a case needs to be kept. The printed message is derived from
# this value so the label can no longer disagree with the filter actually
# applied (it previously said 0.6 while the code filtered at 0.4).
MIN_PROBABILITY = 0.4

# Filter only confident predictions
filtered_df = final_df[final_df["probability"] >= MIN_PROBABILITY].copy()

# Count how many per category
category_counts = filtered_df["Category"].value_counts()

print(f"Number of cases per category (prob >= {MIN_PROBABILITY}):")
print(category_counts)

# Optional: save filtered dataset
filtered_df.to_csv("filtered_cases.csv", index=False)


Number of cases per category (prob >= 0.6):
Category
Immigration & Citizenship                   904
Taxation & Finance                          184
Property, Land & Native Title               177
Corporate & Insolvency                      159
Intellectual Property                       131
General Civil Procedure                     105
Employment, Unions & Workplace Relations     88
Compensation & Insurance                     76
Environment & Conservation                   47
Other                                        17
Name: count, dtype: int64
