In [30]:
import os
import fitz  # PyMuPDF for PDFs
from docx import Document
import re

# ---------- PDF Extraction ----------
def extract_pdf(path):
    text = ""
    doc = fitz.open(path)
    for page in doc:
        text += page.get_text("text") + "\n"
    doc.close()
    return text

# ---------- Word Extraction ----------
def extract_docx(path):
    doc = Document(path)
    return "\n".join([para.text for para in doc.paragraphs])

# ---------- Cleaning ----------
def clean_text(text):
    text = re.sub(r'-\n', '', text)    # fix broken words
    text = re.sub(r'\s+', ' ', text)   # normalize spaces
    return text.strip()

# ---------- Section Chunking ----------

def chunk_tribunal_robust(text):
    """
    Splits tribunal text into main sections using keywords.
    Falls back to full text if nothing matches.
    """
    text = re.sub(r'\r\n', '\n', text)
    text = re.sub(r'\s+', ' ', text)

    # Keywords commonly used in rulings
    keywords = [
        r'\b[A-Z]\.\s+[A-Z][A-Za-z0-9 ,\-&()]+',  # A. Background
        'Background', 'Dispute Background',
        'Issues', 'Issues for Determination',
        'Analysis', 'Determination',
        'Orders', 'Conclusion'
    ]

    # Build regex pattern
    pattern = '|'.join(keywords)
    
    matches = list(re.finditer(pattern, text, flags=re.IGNORECASE))
    if not matches:
        # No headings found → treat as one section
        return [{"section": "Full Text", "text": text}]
    
    chunks = []
    for i, match in enumerate(matches):
        start = match.start()
        heading = match.group(0).strip()
        end = matches[i+1].start() if i+1 < len(matches) else len(text)
        section_text = text[start:end].strip()
        chunks.append({"section": heading, "text": section_text})

    return chunks
# ---------- Process Folder ----------
folder_path = "documents_folder"

documents = []   # <-- ADD THIS
processed = 0

for file in os.listdir(folder_path):
    path = os.path.join(folder_path, file)

    if not (file.endswith(".pdf") or file.endswith(".docx")):
        continue

    # Extract text
    if file.endswith(".pdf"):
        text = extract_pdf(path)
    else:
        text = extract_docx(path)

    # Clean text
    text = clean_text(text)

    # Store in documents list  ✅
    documents.append({
        "filename": file,
        "text": text
    })

    processed += 1

print(f"\nProcessed {processed} documents.")
print(f"Total documents stored: {len(documents)}")

print(f"\nProcessed {processed} documents.")
print(f"Total documents stored: {len(documents)}")


Processed 5 documents.
Total documents stored: 5

Processed 5 documents.
Total documents stored: 5


In [31]:
total_sections_all_files = 0

for doc in documents:
    text = doc["text"]
    sections = chunk_tribunal_robust(text)  # split into sections
    num_sections = len(sections)
    total_sections_all_files += num_sections
    print(f"{doc['filename']} → {num_sections} sections")  # optional per-file count

print(f"\nTotal sections across all files: {total_sections_all_files}")

Esmail v Cine Investment Limited  2 others (Tribunal Case E101of2025) 2025KELAT194(KLR) (8October2025) (Ruling).pdf → 7 sections
Kleen Homes Security Services Ltd v Pet Care Services Limited (Tribunal Case E550E360of2025 (Consolidated)) 2025KEBPRT423(KLR) (9October2025) (Ruling).pdf → 14 sections
Ledilas  6 others v National Land Commission Legal Advice Centre ta Kituo Cha Sheria (Amicus Curiae) (Tribunal Case E010of2025) 2026KELAT5(KLR) (9February2026) (Judgment).docx → 29 sections
Munyingi v Thuo (Tribunal Case E105of2024) 2025KEBPRT425(KLR) (9October2025) (Judgment).pdf → 23 sections
T Gas Limited v Baluch  3 others (Tribunal Case E017of2025) 2025KEBPRT424(KLR) (8October2025) (Ruling).pdf → 17 sections

Total sections across all files: 90


In [9]:
for doc in documents:
    text = doc["text"]
    sections = chunk_tribunal_robust(text)  # split text into sections

    print(f"\n=== {doc['filename']} ===")
    print(f"Total sections: {len(sections)}")

    # Print only section names / headings
    for sec in sections:
        print(f"- {sec['section']}")


=== Esmail v Cine Investment Limited  2 others (Tribunal Case E101of2025) 2025KELAT194(KLR) (8October2025) (Ruling).pdf ===
Total sections: 7
- a. The instant suit is similar to another suit which was struck out by this tribunal for lack of jurisdiction being Esmail v Abdeel Enterprises Limited & 3 others
- determination
- b. The present suit similarly invites the Tribunal to venture into matters beyond its jurisdiction
- c. Even though the Claimant purports to
- background
- orders
- M. ORINA PhD CHAIRPERSON Before

=== Kleen Homes Security Services Ltd v Pet Care Services Limited (Tribunal Case E550E360of2025 (Consolidated)) 2025KEBPRT423(KLR) (9October2025) (Ruling).pdf ===
Total sections: 14
- A. Background 1
- B. Issues for Determination 4
- issues
- determination
- i. Whether the Preliminary Objection meets the established legal threshold
- C. Analysis and Determination i
- A. stated
- issues
- t. The provision states that the Tribunal may
- D. Final Orders 23
- a. The Prelimina

In [10]:
# Remove duplicates in the documents list
seen = set()
unique_documents = []

for doc in documents:
    if doc["filename"] not in seen:
        unique_documents.append(doc)
        seen.add(doc["filename"])

documents = unique_documents  # now documents has only unique files

In [11]:
import re
import os

def extract_from_filename(filename):
    name = os.path.splitext(filename)[0]

    # Normalize spacing (insert spaces before capital letters if missing)
    clean = re.sub(r'(\d)([A-Z])', r'\1 \2', name)
    clean = re.sub(r'([a-z])([A-Z])', r'\1 \2', clean)
    clean = re.sub(r'\s+', ' ', clean)

    # Case name (before Tribunal Case)
    case_match = re.search(r'^(.*?)\s+\(Tribunal Case', clean)
    case_name = case_match.group(1).strip() if case_match else None

    # Tribunal case
    tribunal_match = re.search(r'E\d+\w*\s*of\s*\d{4}', clean, re.IGNORECASE)
    tribunal_case = tribunal_match.group(0).upper() if tribunal_match else None

    # Neutral citation
    citation_match = re.search(r'\d{4}\s*[A-Z]+\s*\d+', clean)
    neutral_citation = citation_match.group(0) if citation_match else None

    # Date
    date_match = re.search(r'\d{1,2}\s*[A-Za-z]+\s*\d{4}', clean)
    decision_date = date_match.group(0) if date_match else None

    # Extract parties from case name
    petitioner = None
    respondents = []

    if case_name and " v " in case_name:
        parts = case_name.split(" v ")
        petitioner = parts[0].strip()
        respondents = [parts[1].strip()]

    return {
        "case_name": case_name,
        "tribunal_case": tribunal_case,
        "neutral_citation": neutral_citation,
        "decision_date": decision_date,
        "petitioner": petitioner,
        "respondents": respondents
    }

In [12]:
for doc in documents:
    doc['metadata'] = extract_from_filename(doc["filename"])

    print("\n======================")
    print("File:", doc["filename"])
    print(doc['metadata'])


File: Esmail v Cine Investment Limited  2 others (Tribunal Case E101of2025) 2025KELAT194(KLR) (8October2025) (Ruling).pdf
{'case_name': 'Esmail v Cine Investment Limited 2 others', 'tribunal_case': 'E101OF2025', 'neutral_citation': '2025 KELAT194', 'decision_date': '01of2025', 'petitioner': 'Esmail', 'respondents': ['Cine Investment Limited 2 others']}

File: Kleen Homes Security Services Ltd v Pet Care Services Limited (Tribunal Case E550E360of2025 (Consolidated)) 2025KEBPRT423(KLR) (9October2025) (Ruling).pdf
{'case_name': 'Kleen Homes Security Services Ltd v Pet Care Services Limited', 'tribunal_case': 'E360OF2025', 'neutral_citation': '2025 KEBPRT423', 'decision_date': '60of2025', 'petitioner': 'Kleen Homes Security Services Ltd', 'respondents': ['Pet Care Services Limited']}

File: Ledilas  6 others v National Land Commission Legal Advice Centre ta Kituo Cha Sheria (Amicus Curiae) (Tribunal Case E010of2025) 2026KELAT5(KLR) (9February2026) (Judgment).docx
{'case_name': 'Ledilas 6 

In [13]:
for doc in documents:
    print(doc["filename"])
    print(doc.get("metadata", "No metadata attached"))
    print("---")

Esmail v Cine Investment Limited  2 others (Tribunal Case E101of2025) 2025KELAT194(KLR) (8October2025) (Ruling).pdf
{'case_name': 'Esmail v Cine Investment Limited 2 others', 'tribunal_case': 'E101OF2025', 'neutral_citation': '2025 KELAT194', 'decision_date': '01of2025', 'petitioner': 'Esmail', 'respondents': ['Cine Investment Limited 2 others']}
---
Kleen Homes Security Services Ltd v Pet Care Services Limited (Tribunal Case E550E360of2025 (Consolidated)) 2025KEBPRT423(KLR) (9October2025) (Ruling).pdf
{'case_name': 'Kleen Homes Security Services Ltd v Pet Care Services Limited', 'tribunal_case': 'E360OF2025', 'neutral_citation': '2025 KEBPRT423', 'decision_date': '60of2025', 'petitioner': 'Kleen Homes Security Services Ltd', 'respondents': ['Pet Care Services Limited']}
---
Ledilas  6 others v National Land Commission Legal Advice Centre ta Kituo Cha Sheria (Amicus Curiae) (Tribunal Case E010of2025) 2026KELAT5(KLR) (9February2026) (Judgment).docx
{'case_name': 'Ledilas 6 others v Nati

In [14]:
metadata = doc.get("metadata", {})
case_name = metadata.get("case_name", "Unknown Case")

In [15]:
all_sections = []

for doc in documents:
    text = doc["text"]
    metadata = doc.get("metadata", {})  # get the metadata you just stored

    sections = chunk_tribunal_robust(text)  # or your preferred chunking function

    for sec in sections:
        all_sections.append({
            **metadata,        # attach all metadata fields
            "section": sec["section"],
            "text": sec["text"]
        })

In [16]:
print(f"Total sections across all files: {len(all_sections)}\n")

# Preview first few sections
for sec in sections:
    all_sections.append({
        "case_name": case_name,
        "tribunal_case": metadata.get("tribunal_case", ""),
        "neutral_citation": metadata.get("neutral_citation", ""),
        "decision_date": metadata.get("decision_date", ""),
        "petitioner": metadata.get("petitioner", ""),
        "respondents": metadata.get("respondents", ""),
        "section": sec["section"],
        "text": sec["text"]
    })

Total sections across all files: 90



In [17]:
# Preview first 3 sections
for sec in all_sections[:3]:
    print(f"Case Name: {sec['case_name']}")
    print(f"Tribunal Case: {sec['tribunal_case']}")
    print(f"Decision Date: {sec['decision_date']}")
    print(f"Section: {sec['section']}")
    print(f"Text (first 100 chars): {sec['text'][:100]}...\n")

Case Name: Esmail v Cine Investment Limited 2 others
Tribunal Case: E101OF2025
Decision Date: 01of2025
Section: a. The instant suit is similar to another suit which was struck out by this tribunal for lack of jurisdiction being Esmail v Abdeel Enterprises Limited & 3 others
Text (first 100 chars): a. The instant suit is similar to another suit which was struck out by this tribunal for lack of jur...

Case Name: Esmail v Cine Investment Limited 2 others
Tribunal Case: E101OF2025
Decision Date: 01of2025
Section: determination
Text (first 100 chars): determination that the dispute in the case involved a question of the legal status of the then 3rd R...

Case Name: Esmail v Cine Investment Limited 2 others
Tribunal Case: E101OF2025
Decision Date: 01of2025
Section: b. The present suit similarly invites the Tribunal to venture into matters beyond its jurisdiction
Text (first 100 chars): b. The present suit similarly invites the Tribunal to venture into matters beyond its jurisdiction. ...



In [18]:
for doc in documents:
    print(doc.get("filename"))
    print(doc.get("metadata", "No metadata attached"))
    print("---")

Esmail v Cine Investment Limited  2 others (Tribunal Case E101of2025) 2025KELAT194(KLR) (8October2025) (Ruling).pdf
{'case_name': 'Esmail v Cine Investment Limited 2 others', 'tribunal_case': 'E101OF2025', 'neutral_citation': '2025 KELAT194', 'decision_date': '01of2025', 'petitioner': 'Esmail', 'respondents': ['Cine Investment Limited 2 others']}
---
Kleen Homes Security Services Ltd v Pet Care Services Limited (Tribunal Case E550E360of2025 (Consolidated)) 2025KEBPRT423(KLR) (9October2025) (Ruling).pdf
{'case_name': 'Kleen Homes Security Services Ltd v Pet Care Services Limited', 'tribunal_case': 'E360OF2025', 'neutral_citation': '2025 KEBPRT423', 'decision_date': '60of2025', 'petitioner': 'Kleen Homes Security Services Ltd', 'respondents': ['Pet Care Services Limited']}
---
Ledilas  6 others v National Land Commission Legal Advice Centre ta Kituo Cha Sheria (Amicus Curiae) (Tribunal Case E010of2025) 2026KELAT5(KLR) (9February2026) (Judgment).docx
{'case_name': 'Ledilas 6 others v Nati

In [19]:
# store all sections in a single dataset (all_sections) with metadata attached.
all_sections = []

for doc in documents:
    # doc["metadata"] has case_name, tribunal_case, etc.
    metadata = doc["metadata"]

    # chunk the document into sections
    sections = chunk_tribunal_robust(doc["text"])

    # attach metadata to each section
    for sec in sections:
        all_sections.append({
            **metadata,      # all metadata fields
            "section": sec["section"],
            "text": sec["text"]
        })

In [20]:
# ---------- Verify total sections and sample ----------
print(f"Total sections across all files: {len(all_sections)}")
for s in all_sections[:5]:  # preview first 5 sections
    print(f"\nCase: {s['case_name']} | Section: {s['section']}")
    print(s['text'][:200] + "...")

Total sections across all files: 90

Case: Esmail v Cine Investment Limited 2 others | Section: a. The instant suit is similar to another suit which was struck out by this tribunal for lack of jurisdiction being Esmail v Abdeel Enterprises Limited & 3 others
a. The instant suit is similar to another suit which was struck out by this tribunal for lack of jurisdiction being Esmail v Abdeel Enterprises Limited & 3 others [2025] KELAT 173 (KLR), https://new.k...

Case: Esmail v Cine Investment Limited 2 others | Section: determination
determination that the dispute in the case involved a question of the legal status of the then 3rd Respondent who is now the 1st Respondent....

Case: Esmail v Cine Investment Limited 2 others | Section: b. The present suit similarly invites the Tribunal to venture into matters beyond its jurisdiction
b. The present suit similarly invites the Tribunal to venture into matters beyond its jurisdiction. The Claimant purports that the 1st Respondent is engaged in 

In [21]:
print(text[:1000])

T Gas Limited v Baluch & 3 others (Tribunal Case E017 of 2025) [2025] KEBPRT 424 (KLR) (8 October 2025) (Ruling) Neutral citation: [2025] KEBPRT 424 (KLR) REPUBLIC OF KENYA IN THE BUSINESS PREMISES RENT TRIBUNAL TRIBUNAL CASE E017 OF 2025 J OSODO, CHAIR & GAKUHI CHEGE, MEMBER OCTOBER 8, 2025 BETWEEN T GAS LIMITED ...................................................................................... APPLICANT AND OLE SHERO BALUCH .................................................................. 1ST RESPONDENT PETER MWANGI KIINIKI ......................................................... 2ND RESPONDENT MUSTAFA HUSSEIN BALUCH ................................................... 3RD RESPONDENT MORAN AUCTIONEERS ............................................................ 4TH RESPONDENT RULING A. Dispute Background 1. The Tenant/Applicant, led a Reference dated 29th May 2025 under Section 12(4) of the Landlord and Tenant (Shops, Hotels and Catering Establishments) Act, Cap 301, together wi

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Prepare your data
texts = [section['text'] for section in all_sections]

# 2. Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)

# 3. Create the vectors
tfidf_matrix = vectorizer.fit_transform(texts)
embeddings = tfidf_matrix.toarray() 

# 4. FIX for the "get_feature_names" error
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

print(f"Success! Created a vector matrix of shape: {embeddings.shape}")
print(f"Vocabulary size: {len(feature_names)} unique legal terms.")

Success! Created a vector matrix of shape: (90, 2375)
Vocabulary size: 2375 unique legal terms.


In [23]:
# Create a flat list of all sections across all documents
all_sections = []

for doc in documents:
    # Get the metadata we extracted from the filename earlier
    file_metadata = doc.get('metadata', {}) 
    
    # Extract the chunks for this specific file
    sections = chunk_tribunal_robust(doc['text'])
    
    for sec in sections:
        # Create a combined object for this chunk
        chunk_entry = {
            "heading": sec['section'],
            "text": sec['text'],
            "metadata": file_metadata  # Carry the file metadata into the section
        }
        all_sections.append(chunk_entry)

print(f"Prepared {len(all_sections)} sections with linked metadata.")

Prepared 90 sections with linked metadata.


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

def search_rulings(query, top_n=3):
    # 1. Vectorize the query
    query_vec = vectorizer.transform([query])
    
    # 2. Calculate similarity against our tfidf_matrix
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # 3. Get top indices
    related_indices = similarities.argsort()[::-1][:top_n]
    
    print(f"Results for query: '{query}'\n" + "="*50)
    for idx in related_indices:
        score = similarities[idx]
        if score > 0:
            section = all_sections[idx]
            # Safely get metadata fields to avoid future KeyErrors
            meta = section.get('metadata', {})
            case = meta.get('case_name', 'Unknown Case')
            cit = meta.get('neutral_citation', 'No Citation')
            
            print(f"[Score: {score:.2f}] {case} ({cit})")
            print(f"Heading: {section['heading']}")
            print(f"Snippet: {section['text'][:200]}...")
            print("-" * 30)
        else:
            print("No more relevant results.")
            break

# Test it
search_rulings("preliminary objection threshold")

Results for query: 'preliminary objection threshold'
[Score: 0.44] Kleen Homes Security Services Ltd v Pet Care Services Limited (2025 KEBPRT423)
Heading: a. The Preliminary Objection dated 18th August 2025 does not meet the threshold of a pure point of law
Snippet: a. The Preliminary Objection dated 18th August 2025 does not meet the threshold of a pure point of law....
------------------------------
[Score: 0.33] Kleen Homes Security Services Ltd v Pet Care Services Limited (2025 KEBPRT423)
Heading: i. Whether the Preliminary Objection meets the established legal threshold
Snippet: i. Whether the Preliminary Objection meets the established legal threshold. https://new.kenyalaw.org/akn/ke/judgment/kebprt/2025/423/eng@2025-10-09 1 ii. Whether the tenancies between the parties qual...
------------------------------
[Score: 0.30] Kleen Homes Security Services Ltd v Pet Care Services Limited (2025 KEBPRT423)
Heading: A. stated
Snippet: A. stated: “A preliminary objection consists of a poi

In [25]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 241.69it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [33]:
# Create the 'chunks' list from your processed sections
chunks = [section['text'] for section in all_sections]

# Now this will work
embeddings = model.encode(
    chunks,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True
)

Batches: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it]


In [34]:
print(len(chunks))

90


In [35]:
from sklearn.metrics.pairwise import cosine_similarity

def semantic_search(query, top_n=3):
    # 1. Convert query to the same vector space as your chunks
    query_embedding = model.encode([query])
    
    # 2. Calculate similarity between query and all stored chunks
    # 'embeddings' is the matrix you created in the previous step
    scores = cosine_similarity(query_embedding, embeddings).flatten()
    
    # 3. Get the indices of the highest scores
    top_indices = scores.argsort()[-top_n:][::-1]
    
    return top_indices, scores

In [39]:
query = "What are the rules for termination of a tenancy?"
indices, scores = semantic_search(query)

for idx in indices:
    section = all_sections[idx]
    print(f"Match Found in: {section['metadata']['case_name']}")
    print(f"Section: {section['heading']}")
    print(f"Similarity Score: {scores[idx]:.4f}")
    print(f"Text: {section['text'][:300]}...")
    print("-" * 50)

Match Found in: Kleen Homes Security Services Ltd v Pet Care Services Limited
Section: b. The leases between the parties contain termination clauses exercisable within
Similarity Score: 0.7110
Text: b. The leases between the parties contain termination clauses exercisable within ve years, thereby qualifying as controlled tenancies under Section 2(1)(b)(ii) of the Landlord and Tenant (Shops, Hotels and Catering Establishments) Act (Cap 301)....
--------------------------------------------------
Match Found in: Kleen Homes Security Services Ltd v Pet Care Services Limited
Section: issues
Similarity Score: 0.6214
Text: issues requiring the Tribunal to examine the actual lease documents, their wording, and the intention of the parties. 8. Such examination takes the objection beyond the realm of pure law into factual inquiry, contrary to the principles laid down in Mukisa Biscuit and Oraro v Mbaja. Therefore, the ob...
--------------------------------------------------
Match Found in: Klee

In [37]:
import numpy as np
import pickle

# Save embeddings
np.save('case_embeddings.npy', embeddings)

# Save the text and metadata
with open('processed_sections.pkl', 'wb') as f:
    pickle.dump(all_sections, f)