In [15]:
import pypdf
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import argparse
import google.genai as genai
from google.genai import types
import json
from docx import Document
from typing import List, Dict, Tuple 
import pydantic

# NLTK Downloads
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bhara\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
path = 'DepostionForPersisYu_LinkPDF.pdf'
print(f"Processing PDF: {path}...")

reader = pypdf.PdfReader(path)
num_pages = len(reader.pages)
page_dict = {}

for i in range(num_pages):
    end = False
    page = reader.pages[i]
    text = page.extract_text()
    
    if text is None:
        page_dict[i + 1] = "" # Use 1-based indexing for pages
        continue

    # Clean the text
    text = text.lower()
    if re.search(pattern='witness signature', string=text):
        end = True
    text = re.sub(pattern=r'\b\d{2}:\d{2}\b', string=text, repl='')
    text = re.sub(pattern=r"page (\d+)", repl='', string=text)
    
    page_dict[i + 1] = text
    if(end):
        break

print(f"Done. Created a dictionary with {len(page_dict)} pages.")

print(f"Loading Table of Contents from: {'toc.json'}...")

try:
    with open('toc.json', 'r') as f:
        toc_data = json.load(f)
    print(f"Done. Loaded {len(toc_data)} topics.")
except (FileNotFoundError, json.JSONDecodeError) as e:
    print(f"[!] ERROR: Could not read or parse the JSON file. {e}")
    toc_data = [] # Ensure toc_data exists even on failure


Processing PDF: DepostionForPersisYu_LinkPDF.pdf...
Done. Created a dictionary with 89 pages.
Loading Table of Contents from: toc.json...
Done. Loaded 60 topics.


In [29]:
print("Starting validation...")

# --- Configuration ---
# A topic is considered valid if at least 50% of its keywords are found.
# You can adjust this threshold to be more or less strict.

stop_words = set(stopwords.words('english'))
validation_score_pages = []
for i, item in enumerate(toc_data):
    topic_str = item.get("topic")
    page_num = item.get("page_start")

    # Skip any malformed entries in the JSON
    if not all([topic_str, page_num]):
        continue

    # Get the text for the specified page
    page_text = page_dict.get(page_num)
    if page_text is None:
        continue

    # Extract keywords from the topic string (e.g., "Analysis of ITT" -> ["analysis", "itt"])
    tokens = word_tokenize(topic_str.lower())
    keywords = [word for word in tokens if word.isalpha() and word not in stop_words]

    if not keywords:
        continue

    # Check for the presence of each keyword on the page
    found_count = sum(1 for keyword in keywords if keyword in page_text)
    
    # Determine if the topic is valid based on the threshold
    found_ratio = found_count / len(keywords)
    validation_score_pages.append(found_ratio)

print("...Validation logic complete.")

Starting validation...
...Validation logic complete.


In [32]:
# --- Final Report ---
KEYWORD_MATCH_THRESHOLD = 0.5 
print(f"The validation ratios are: {validation_score_pages}")
total_found_ratio = (sum(validation_score_pages) / len(validation_score_pages)) * 100
match_accuracy = (sum(1 for i in validation_score_pages if i>=KEYWORD_MATCH_THRESHOLD)/len(validation_score_pages))*100
print(f'The total found ration is: {total_found_ratio}')
print(f'The final validation accuracy is: {match_accuracy}')

The validation ratios are: [1.0, 0.5, 0.5, 0.0, 0.3333333333333333, 0.6666666666666666, 0.3333333333333333, 0.0, 0.3333333333333333, 0.4, 0.8, 0.8333333333333334, 1.0, 1.0, 0.8, 0.75, 1.0, 1.0, 1.0, 0.5, 0.75, 1.0, 0.8, 1.0, 0.8333333333333334, 1.0, 1.0, 1.0, 1.0, 0.8571428571428571, 0.8, 1.0, 1.0, 0.8333333333333334, 1.0, 0.5, 0.8, 0.8, 0.8571428571428571, 0.6666666666666666, 1.0, 0.6666666666666666, 1.0, 1.0, 0.6666666666666666, 1.0, 1.0, 0.8, 1.0, 0.75, 0.6, 0.6666666666666666, 1.0, 1.0, 0.75, 1.0, 1.0, 1.0, 1.0, 0.6666666666666666]
The total found ration is: 79.69047619047619
The final validation accuracy is: 90.0


In [33]:
print(len(validation_score_pages))

60
