In [17]:
#removes part_of_speech
#adds entry_id
import json
from collections import OrderedDict
from copy import deepcopy

# Read the scraped dictionary entries from disk
with open('berntsen_dictionary.json', 'r', encoding='utf-8') as f:
    original_data = json.load(f)

print(f"Loaded {len(original_data)} entries")

# Rebuild every record with entry_id leading the kept fields.
# part_of_speech is deliberately not copied over, which is how it gets removed.
data = []
for entry in original_data:
    new_entry = OrderedDict([
        ('entry_id', f"berntsen_{entry['headword_devanagari']}_{entry['source_page']}"),
        ('headword_devanagari', entry['headword_devanagari']),
        ('headword_romanized', entry['headword_romanized']),
        ('full_entry', entry['full_entry']),
        ('source_page', entry['source_page']),
    ])
    data.append(new_entry)

# Sanity check: entry count plus one sample record
print(f"\nProcessed {len(data)} entries")
print("\nExample entry (index 300):")
print(json.dumps(data[300], ensure_ascii=False, indent=2))

Loaded 9836 entries

Processed 9836 entries

Example entry (index 300):
{
  "entry_id": "berntsen_अवलंबून_5",
  "headword_devanagari": "अवलंबून",
  "headword_romanized": "avalambūna",
  "full_entry": "अवलंबून avalambūna , असणे asaṇē to be dependent on.",
  "source_page": 5
}


In [18]:
#Berntsen Dictionary, how part of speech is represented
# Pattern 1: Sequential POS sections (most common)
#       form: headword romanized POS1 definition(s). POS2 definition(s).
#       example: "अति ati adv. too much. pref. extremely, too much, over-."
#       description: Each POS gets its own definition section which is separated by periods
# Pattern 2: Numbered definitions within same POS
#       form: headword romanized POS 1. def1. 2. def2. 3. def3.
#       example: "अंक aṅka m. 1. number. 2. issue (of a magazine, newspaper). 3. act (of a play)."
#       description: Numbers belong to the SAME POS. All numbered definitions share the first POS mentioned
# Pattern 3: Mixed - numbered defs THEN new POS
#       form: headword romanized POS1 1. def1. 2. def2. POS2 def3.
#       example: "अखंड akhaṇḍa adj. inv. 1. entire, in one piece. 2. continuous. adv. continuously."
#       explanation of example: Definitions 1 and 2 belong to adj. inv. 
#       When adv. appears, it starts a NEW section with its own definition

In [23]:
#definitions[]
import re
from collections import OrderedDict

# POS abbreviations recognised in an entry's text. Several compound markers
# contain shorter markers (e.g. 'adj. suff.' contains both 'adj.' and 'suff.'),
# so matching must prefer the longest marker and must never overlap.
_POS_MARKERS = (
    'adj. inv.', 'adv. suff.', 'adj. suff. inv.', 'adj. suff.',
    'n. suff.', 'n.suff.', 'm. suff.', 'f. suff.',
    'v. t.', 'v. i.', 'v.i.', 'v.t.',  # both spaced and unspaced formats
    'adj.', 'adv.', 'pref.', 'suff.',
    'conj.', 'post.', 'pron.', 'interj.', 'poss.',
    'f.(i)', 'f.(e)', 'm.(i)', 'm.(e)',
    'n.', 'm.', 'f.',
)

# A marker counts only when it sits at the start of the text or right after a
# space/period, and is followed by a space or the end of the text. Alternatives
# are ordered longest first; finditer() yields non-overlapping matches, so a
# compound marker is never re-read as one of its shorter components.
_POS_MARKER_RE = re.compile(
    r'(?<![^ .])(?:'
    + '|'.join(re.escape(marker) for marker in sorted(_POS_MARKERS, key=len, reverse=True))
    + r')(?= |\Z)'
)

# A definition number such as "1." or "2.": digits at the start of a token,
# followed by a period. "(?!\d)" keeps decimals like "3.5" from being read as a
# number marker; "(?=\s*\S)" requires some text after it, so a sentence that
# merely ends in a number ("... in 1947.") is not treated as a numbered item.
_NUMBER_MARKER_RE = re.compile(r'(?<!\S)(\d+)\.(?!\d)(?=\s*\S)')


def _clean_definition(text):
    """Strip surrounding whitespace and one trailing period from a definition."""
    text = text.strip()
    if text.endswith('.'):
        text = text[:-1].strip()
    return text


def parse_definitions(entry):
    """
    Parse the full_entry to extract definitions with their POS and numbers.

    The text after the first occurrence of the romanized headword is split
    into sections at each POS marker. Within a section, numbered items
    ("1. ... 2. ...") each become their own definition; otherwise the whole
    section is one unnumbered definition. Text between the headword and the
    first POS marker, and text between a POS marker and the first number of
    its section, is not emitted.

    Args:
        entry: Dictionary containing 'full_entry', 'headword_romanized'

    Returns:
        List of definition dictionaries with 'definition', 'pos', 'number'.
        'pos' is None when no POS marker is found in the text; 'number' is
        None for unnumbered definitions. Returns an empty list when the
        romanized headword is empty or absent from full_entry, or when no
        definition text follows it.
    """
    full_entry = entry['full_entry']
    romanized = entry['headword_romanized']

    # An empty separator cannot be searched for; treat it like "not found".
    if not romanized:
        return []

    # Step 1: keep everything after the first occurrence of the romanized headword
    _, found, definition_text = full_entry.partition(romanized)
    if not found:
        return []

    definition_text = definition_text.strip()

    # Sub-entries look like "headword , other-word ..."; drop the leading comma
    if definition_text.startswith(','):
        definition_text = definition_text[1:].strip()

    if not definition_text:
        return []

    # Step 2: locate POS markers (longest match wins, no overlaps)
    pos_matches = list(_POS_MARKER_RE.finditer(definition_text))

    # Step 3: no POS at all -> the whole text is one definition with null POS
    if not pos_matches:
        cleaned_text = _clean_definition(definition_text)
        if not cleaned_text:
            return []
        return [{
            'definition': cleaned_text,
            'pos': None,
            'number': None
        }]

    # Step 4: each POS section runs from the end of its marker to the start of
    # the next marker (or to the end of the text for the last one)
    definitions = []
    section_ends = [m.start() for m in pos_matches[1:]] + [len(definition_text)]

    for pos_match, section_end in zip(pos_matches, section_ends):
        current_pos = pos_match.group(0)
        section_text = definition_text[pos_match.end():section_end].strip()

        # Step 5: split the section on number markers, if it has any
        number_matches = list(_NUMBER_MARKER_RE.finditer(section_text))

        if number_matches:
            # Each numbered definition runs up to the next number marker, so
            # digits inside the definition text itself are preserved.
            text_ends = [m.start() for m in number_matches[1:]] + [len(section_text)]
            for number_match, text_end in zip(number_matches, text_ends):
                def_text = _clean_definition(section_text[number_match.end():text_end])
                if def_text:
                    definitions.append({
                        'definition': def_text,
                        'pos': current_pos,
                        'number': int(number_match.group(1))
                    })
        else:
            # No numbers - a single unnumbered definition for this POS
            def_text = _clean_definition(section_text)
            if def_text:
                definitions.append({
                    'definition': def_text,
                    'pos': current_pos,
                    'number': None
                })

    return definitions


# Attach the parsed definitions to every entry (updates the entries in `data`)
for entry in data:
    entry['definitions'] = parse_definitions(entry)

# Entries in which at least one parsed definition carries no POS
null_pos_entries = [
    e for e in data
    if e['definitions'] and any(d['pos'] is None for d in e['definitions'])
]

# Summary metrics
total_entries = len(data)
entries_with_defs = sum(bool(e['definitions']) for e in data)
entries_without_defs = total_entries - entries_with_defs
total_defs = sum(map(len, (e['definitions'] for e in data)))
entries_with_null_pos = len(null_pos_entries)

print(f"Total entries: {total_entries}")
print(f"Entries with definitions: {entries_with_defs} ({entries_with_defs/total_entries*100:.1f}%)")
print(f"Entries without definitions: {entries_without_defs} ({entries_without_defs/total_entries*100:.1f}%)")
print(f"Entries with null POS: {entries_with_null_pos} ({entries_with_null_pos/total_entries*100:.1f}%)")
print(f"Total definitions parsed: {total_defs}")
print(f"Average definitions per entry: {total_defs/total_entries:.2f}")

# List the first ten entries that have a null POS, with their parsed definitions
print("\n" + "=" * 80)
print("EXAMPLES OF ENTRIES WITH NULL POS:")
print("=" * 80)
for entry in null_pos_entries[:10]:
    print(f"\n{entry['headword_devanagari']}: {entry['full_entry']}")
    for d in entry['definitions']:
        print(f"  POS: {d['pos']}, Definition: {d['definition']}")


Total entries: 9836
Entries with definitions: 9836 (100.0%)
Entries without definitions: 0 (0.0%)
Entries with null POS: 348 (3.5%)
Total definitions parsed: 12167
Average definitions per entry: 1.24

EXAMPLES OF ENTRIES WITH NULL POS:

अडीच: अडीच aḍīca two and one half.
  POS: None, Definition: two and one half

अदपाव: अदपाव adapāva one-eighth of a शेर or kilo.
  POS: None, Definition: one-eighth of a शेर or kilo

अलीकडे: अलीकडे alīkaḍē See अलिकडे .
  POS: None, Definition: See अलिकडे

अवलंबून: अवलंबून avalambūna , असणे asaṇē to be dependent on.
  POS: None, Definition: असणे asaṇē to be dependent on

असेना: असेना asēnā , का kā so what.
  POS: None, Definition: का kā so what

अहोजाहो: अहोजाहो ahōjāhō respectful address.
  POS: None, Definition: respectful address

आ: आ ā , करणे karaṇē to open one's mouth (to say `ah').
  POS: None, Definition: करणे karaṇē to open one's mouth (to say `ah')

आटपाट: आटपाट āṭapāṭa , नगर nagara a big town.
  POS: None, Definition: नगर nagara a big town

आटो

In [19]:
# *1.headword_devanagari
# *2.headword_romanized 
# *4.full_entry: everything within the div from the webpage, including devanagari, romanized, and the full text
#   of the definition
# *5.source_page
# *6.entry_id: a unique id to represent this particular word, e.g. berntsen_पाणी_5. The format is
#   dictionaryname_headword_devanagari_pagenumber
# 7.definitions (array): an array of the parsed definitions; each item has 'definition', 'pos', and 'number'
# 8.pos_normalized: e.g. noun.feminine; Use for metadata filtering in vector DB
# 9.gender: ;Optional fine-grained filters
# 10.declension_class: ;Optional fine-grained filters
# 11.pos_display: the symbol for the part of speech like adj...; 
#    Include in content shown to LLM (keeps linguistic accuracy)  