In [84]:
# TODO
# 1. Remove भोकणे and सुरुवात (their "See" targets, भुकंणे and सुरवात, are reported as not found in the Cell 4 output)

In [85]:
# ==============================================================================
# CELL 1: Import and Schema Setup (FIXED - handles duplicate headwords)
# ==============================================================================
import json
from collections import OrderedDict
from copy import deepcopy

# Load the dictionary
with open('berntsen_dictionary.json', 'r', encoding='utf-8') as f:
    original_data = json.load(f)

print(f"Loaded {len(original_data)} entries")

# Number of times each base_id has been produced so far.
# The first occurrence keeps the bare base_id; later ones get "_2", "_3", ...
seen_base_ids = {}

# Rebuild every entry with a fixed field order and the complete schema
data = []
for entry in original_data:
    new_entry = OrderedDict()

    # base_id is derived from headword + page, so homographs on the same
    # page produce the same base_id and need a sequence suffix.
    base_id = f"berntsen_{entry['headword_devanagari']}_{entry['source_page']}"

    if base_id in seen_base_ids:
        seen_base_ids[base_id] += 1
        entry_id = f"{base_id}_{seen_base_ids[base_id]}"
    else:
        seen_base_ids[base_id] = 1
        entry_id = base_id

    # Core identification fields
    new_entry['entry_id'] = entry_id
    new_entry['headword_devanagari'] = entry['headword_devanagari']
    new_entry['headword_romanized'] = entry['headword_romanized']
    new_entry['full_entry'] = entry['full_entry']
    new_entry['source_page'] = entry['source_page']
    new_entry['entry_type'] = 'headword'
    # Placeholders; nothing in this cell fills them in
    new_entry['base_word'] = None
    new_entry['search_text'] = None

    data.append(new_entry)

# Verify no duplicates
entry_ids = [e['entry_id'] for e in data]
duplicates = len(entry_ids) - len(set(entry_ids))
print(f"\nProcessed {len(data)} entries")
print(f"Duplicate entry_ids: {duplicates}")
if duplicates == 0:
    print("✅ All entry_ids are unique")

# Show one sample entry. Index 300 is the preferred sample; it is clamped to
# the last entry so that a smaller dictionary does not raise IndexError, and
# skipped entirely when nothing was loaded.
if data:
    example_index = min(300, len(data) - 1)
    print(f"\nExample entry (index {example_index}):")
    print(json.dumps(data[example_index], ensure_ascii=False, indent=2))

Loaded 9836 entries

Processed 9836 entries
Duplicate entry_ids: 0
✅ All entry_ids are unique

Example entry (index 300):
{
  "entry_id": "berntsen_अवलंबून_5",
  "headword_devanagari": "अवलंबून",
  "headword_romanized": "avalambūna",
  "full_entry": "अवलंबून avalambūna , असणे asaṇē to be dependent on.",
  "source_page": 5,
  "entry_type": "headword",
  "base_word": null,
  "search_text": null
}


In [86]:
# How the Berntsen dictionary represents part of speech (POS)
# Pattern 1: Sequential POS sections (most common)
#       form: headword romanized POS1 definition(s). POS2 definition(s).
#       example: "अति ati adv. too much. pref. extremely, too much, over-."
#       description: Each POS has its own definition section; sections are separated by periods.
# Pattern 2: Numbered definitions within same POS
#       form: headword romanized POS 1. def1. 2. def2. 3. def3.
#       example: "अंक aṅka m. 1. number. 2. issue (of a magazine, newspaper). 3. act (of a play)."
#       description: Numbers belong to the SAME POS. All numbered definitions share the first POS mentioned
# Pattern 3: Mixed - numbered defs THEN new POS
#       form: headword romanized POS1 1. def1. 2. def2. POS2 def3.
#       example: "अखंड akhaṇḍa adj. inv. 1. entire, in one piece. 2. continuous. adv. continuously."
#       explanation of example: Definitions 1 and 2 belong to "adj. inv.".
#       When "adv." appears, it starts a NEW section with its own definition.

In [87]:
# ==============================================================================
# CELL 2: Parse Definitions
# ==============================================================================
# This cell:
# - Parses full_entry to extract individual definitions
# - Identifies POS markers and numbered definitions
# - Creates definitions[] array with full schema:
#   definition, pos_display, number, pos, gender, declension_class, referenced_entry
# - referenced_entry is initialized as None, populated in later cells

import re

# POS markers recognised in the entry text. Several compound markers contain
# shorter ones ('adj. inv.' contains 'adj.'; 'n. suff.' contains 'n.' and
# 'suff.'); _locate_pos_markers resolves those overlaps, so the order of this
# tuple does not affect the result.
_POS_MARKERS = (
    'adj. suff. inv.', 'adj. inv.', 'adj. suff.', 'adv. suff.',
    'n. suff.', 'n.suff.', 'm. suff.', 'f. suff.',
    'v. t.', 'v. i.', 'v.i.', 'v.t.',
    'adj.', 'adv.', 'pref.', 'suff.',
    'conj.', 'post.', 'pron.', 'interj.', 'poss.',
    'f.(i)', 'f.(e)', 'm.(i)', 'm.(e)',
    'n.', 'm.', 'f.',
)

# "<number>. <text>" items inside one POS section. The text may not contain
# digits and runs up to the next "<number>." or the end of the section.
_NUMBERED_DEF_RE = re.compile(r'(\d+)\.\s*([^0-9]+?)(?=\s*\d+\.|$)')


def _locate_pos_markers(definition_text, pos_markers=_POS_MARKERS):
    """Return the non-overlapping POS marker hits in definition_text, in text order.

    A hit counts only when the marker starts the text or follows a space or
    period, and is followed by a space or the end of the text.  When several
    markers match at the same place the longest one is kept, and any marker
    starting inside an already accepted marker is discarded.
    """
    candidates = []
    for pos in pos_markers:
        index = definition_text.find(pos)
        while index != -1:
            end_index = index + len(pos)
            preceded_ok = index == 0 or definition_text[index - 1] in ' .'
            followed_ok = (end_index >= len(definition_text)
                           or definition_text[end_index] == ' ')
            if preceded_ok and followed_ok:
                candidates.append({
                    'pos_display': pos,
                    'start': index,
                    'end': end_index
                })
            index = definition_text.find(pos, index + 1)

    # Earliest start first; for equal starts the longest marker first.
    candidates.sort(key=lambda hit: (hit['start'], hit['start'] - hit['end']))

    resolved = []
    covered_until = 0
    for hit in candidates:
        if hit['start'] < covered_until:
            # Lies inside a marker that was already accepted (e.g. the
            # 'adj.' or 'suff.' inside 'adj. suff. inv.').
            continue
        resolved.append(hit)
        covered_until = hit['end']
    return resolved


def _strip_final_period(text):
    """Trim whitespace and at most one trailing period."""
    text = text.strip()
    if text.endswith('.'):
        text = text[:-1].strip()
    return text


def _new_definition(text, pos_display, number):
    """Build one definition record with the full schema; later cells fill the None fields."""
    return {
        'definition': text,
        'pos_display': pos_display,
        'number': number,
        'pos': None,
        'gender': None,
        'declension_class': None,
        'referenced_entry': None
    }


def parse_definitions(entry):
    """
    Parse the full_entry to extract definitions with their POS and numbers.

    The text after the first occurrence of the romanized headword is split
    into sections at each POS marker.  A section containing "1. ... 2. ..."
    items yields one definition per item, all sharing the section's POS;
    any other section yields a single unnumbered definition.  Text without
    any POS marker becomes one definition whose pos_display is None.
    Text that precedes the first POS marker is not emitted.

    Args:
        entry: Dictionary containing 'full_entry', 'headword_romanized'

    Returns:
        List of definition dictionaries with full schema.  Empty when the
        romanized headword is empty or not found in full_entry, or when
        nothing follows it.
    """
    full_entry = entry['full_entry']
    romanized = entry['headword_romanized']

    # str.split('') raises ValueError; an empty romanized headword gives no
    # anchor to split on, so treat it like "headword not found".
    if romanized == '':
        return []

    # Step 1: Extract the definition part (everything after romanized headword)
    parts = full_entry.split(romanized, 1)
    if len(parts) <= 1:
        return []

    definition_text = parts[1].strip()

    # A comma may directly follow the headword; drop it
    if definition_text.startswith(','):
        definition_text = definition_text[1:].strip()

    if not definition_text:
        return []

    # Steps 2-3: Find the POS markers and their positions
    pos_locations = _locate_pos_markers(definition_text)

    # Step 4: If no POS found, treat entire text as definition with null POS
    if not pos_locations:
        cleaned_text = _strip_final_period(definition_text)
        return [_new_definition(cleaned_text, None, None)] if cleaned_text else []

    # Step 5: Split text into sections by POS
    definitions = []
    for i, pos_info in enumerate(pos_locations):
        current_pos = pos_info['pos_display']

        # A section runs from the end of its marker to the next marker
        if i + 1 < len(pos_locations):
            section_end = pos_locations[i + 1]['start']
        else:
            section_end = len(definition_text)
        section_text = definition_text[pos_info['end']:section_end].strip()

        # Step 6: Check if this section has numbered definitions
        numbered_matches = list(_NUMBERED_DEF_RE.finditer(section_text))

        if numbered_matches:
            for match in numbered_matches:
                def_text = _strip_final_period(match.group(2))
                if def_text:
                    definitions.append(
                        _new_definition(def_text, current_pos, int(match.group(1)))
                    )
        else:
            def_text = _strip_final_period(section_text)
            if def_text:
                definitions.append(_new_definition(def_text, current_pos, None))

    return definitions


# Parse every entry in place: each entry gains a 'definitions' list
for entry in data:
    entry['definitions'] = parse_definitions(entry)

# Summary metrics over the parsed data
total_entries = len(data)
total_defs = sum(len(e['definitions']) for e in data)
entries_with_defs = len([entry for entry in data if entry['definitions']])
entries_without_defs = total_entries - entries_with_defs
# any() over an empty list is False, so entries without definitions never count here
entries_with_null_pos = sum(
    any(d['pos_display'] is None for d in entry['definitions'])
    for entry in data
)

print(f"Total entries: {total_entries}")
print(f"Entries with definitions: {entries_with_defs} ({entries_with_defs/total_entries*100:.1f}%)")
print(f"Entries without definitions: {entries_without_defs} ({entries_without_defs/total_entries*100:.1f}%)")
print(f"Entries with null POS: {entries_with_null_pos} ({entries_with_null_pos/total_entries*100:.1f}%)")
print(f"Total definitions parsed: {total_defs}")
print(f"Average definitions per entry: {total_defs/total_entries:.2f}")

Total entries: 9836
Entries with definitions: 9836 (100.0%)
Entries without definitions: 0 (0.0%)
Entries with null POS: 348 (3.5%)
Total definitions parsed: 12167
Average definitions per entry: 1.24


In [88]:
# ==============================================================================
# CELL 3: Normalize POS and Extract Gender
# ==============================================================================
# This cell:
# - Converts pos_display (e.g., 'm.', 'f.(i)') to standardized pos (e.g., 'noun.masculine')
# - Extracts gender from noun POS markers
# - Extracts declension_class from markers like 'f.(i)', 'f.(e)'

def normalize_pos(pos_display):
    """
    Translate a dictionary POS marker into its standardized form.

    Args:
        pos_display: Marker as it appears in the entry (e.g. 'm.', 'f.(i)',
            'adj. inv.'), or None.

    Returns:
        tuple: (pos, gender, declension_class). All three are None when
        pos_display is None or is not a known marker.
    """
    nothing = (None, None, None)
    if pos_display is None:
        return nothing

    # Markers that carry a gender and/or declension class: full triples
    gendered = {
        'm.': ('noun.masculine', 'masculine', None),
        'f.': ('noun.feminine', 'feminine', None),
        'n.': ('noun.neuter', 'neuter', None),
        'f.(i)': ('noun.feminine.i', 'feminine', 'i'),
        'f.(e)': ('noun.feminine.e', 'feminine', 'e'),
        'm.(i)': ('noun.masculine.i', 'masculine', 'i'),
        'm.(e)': ('noun.masculine.e', 'masculine', 'e'),
        'm. suff.': ('noun.masculine.suffix', 'masculine', None),
        'f. suff.': ('noun.feminine.suffix', 'feminine', None),
    }
    if pos_display in gendered:
        return gendered[pos_display]

    # Markers with neither gender nor declension class: label only
    plain = {
        # Verbs (spaced and unspaced spellings)
        'v.t.': 'verb.transitive',
        'v. t.': 'verb.transitive',
        'v.i.': 'verb.intransitive',
        'v. i.': 'verb.intransitive',
        # Adjectives
        'adj.': 'adjective',
        'adj. inv.': 'adjective.invariant',
        'adj. suff.': 'adjective.suffix',
        'adj. suff. inv.': 'adjective.suffix.invariant',
        # Adverbs
        'adv.': 'adverb',
        'adv. suff.': 'adverb.suffix',
        # Affixes
        'pref.': 'prefix',
        'suff.': 'suffix',
        'n. suff.': 'noun.suffix',
        'n.suff.': 'noun.suffix',
        # Remaining word classes
        'conj.': 'conjunction',
        'post.': 'postposition',
        'pron.': 'pronoun',
        'interj.': 'interjection',
        'poss.': 'possessive',
    }
    if pos_display in plain:
        return (plain[pos_display], None, None)

    return nothing


# Apply normalization to all definitions (fills pos / gender /
# declension_class on every definition in place)
normalized_count = 0
unnormalized_count = 0
for entry in data:
    for defn in entry['definitions']:
        pos, gender, declension = normalize_pos(defn['pos_display'])
        defn['pos'] = pos
        defn['gender'] = gender
        defn['declension_class'] = declension
        if pos is not None:
            normalized_count += 1
        else:
            unnormalized_count += 1

print(f"Normalized {normalized_count} definitions with POS")
# Counted in this loop rather than derived from a total computed in another
# cell, so the figure is right even if that total is stale.
print(f"Definitions without POS: {unnormalized_count}")

# Show example: first entry whose first definition has a normalized POS.
# A default is supplied so an input with no such entry does not raise
# StopIteration.
print("\nExample normalized entry:")
example = next((e for e in data if e['definitions'] and e['definitions'][0]['pos']), None)
if example:
    print(json.dumps(example, ensure_ascii=False, indent=2))

Normalized 11819 definitions with POS
Definitions without POS: 348

Example normalized entry:
{
  "entry_id": "berntsen_अ_1",
  "headword_devanagari": "अ",
  "headword_romanized": "a",
  "full_entry": "अ a pref. negative.",
  "source_page": 1,
  "entry_type": "headword",
  "base_word": null,
  "search_text": null,
  "definitions": [
    {
      "definition": "negative",
      "pos_display": "pref.",
      "number": null,
      "pos": "prefix",
      "gender": null,
      "declension_class": null,
      "referenced_entry": null
    }
  ]
}


In [89]:
# ==============================================================================
# CELL 4: Process Cross-References ("See X")
# ==============================================================================
# This cell:
# - Finds definitions that are cross-references (e.g., "See हुजत")
# - Looks up the referenced entry in the dictionary
# - Populates referenced_entry with relationship: 'cross_reference'
# - Embeds the full referenced entry for self-contained RAG chunks

import re

def find_entry_by_headword(headword_devanagari, data):
    """Return the first entry in data whose 'headword_devanagari' equals the given headword, or None."""
    candidates = (
        candidate for candidate in data
        if candidate['headword_devanagari'] == headword_devanagari
    )
    return next(candidates, None)


def create_referenced_entry(source_entry, relationship, grammatical_form=None):
    """
    Build the embedded snapshot of an entry that another definition points to.

    Args:
        source_entry: The entry being referenced, or None
        relationship: Type of relationship (cross_reference, base_form, etc.)
        grammatical_form: Optional grammatical form (perfective, plural, etc.)

    Returns:
        None when source_entry is None.  Otherwise an OrderedDict holding, in
        order: relationship, grammatical_form, the identifying fields copied
        from source_entry, and a deep copy of its 'definitions' as they are
        at call time.
    """
    if source_entry is None:
        return None

    ref = OrderedDict([
        ('relationship', relationship),
        ('grammatical_form', grammatical_form),
    ])

    # Identifying fields are carried over unchanged, in this order
    for field in ('entry_id', 'headword_devanagari', 'headword_romanized',
                  'full_entry', 'source_page'):
        ref[field] = source_entry[field]

    # Deep copy so the snapshot does not share definition dicts with the source entry
    ref['definitions'] = deepcopy(source_entry['definitions'])

    return ref


# A cross-reference definition reads "See X", X being one Devanagari word.
# An optional ", def. N" suffix and a final period are accepted as well.
see_pattern = re.compile(r'^See\s+([\u0900-\u097F]+)(?:\s*,\s*def\.\s*(\d+))?\.?$')

cross_ref_count = 0
cross_ref_not_found = []

for entry in data:
    for defn in entry['definitions']:
        match = see_pattern.match(defn['definition'])
        if not match:
            continue

        # def_number is None when no ", def. N" suffix is present; it is
        # captured here but not used when building the reference.
        referenced_headword, def_number = match.groups()

        # Look the target up among all entries
        referenced_entry = find_entry_by_headword(referenced_headword, data)
        if not referenced_entry:
            cross_ref_not_found.append({
                'entry': entry['headword_devanagari'],
                'looking_for': referenced_headword
            })
            continue

        defn['referenced_entry'] = create_referenced_entry(
            referenced_entry,
            relationship='cross_reference'
        )
        cross_ref_count += 1

print(f"Cross-references processed: {cross_ref_count}")
print(f"Cross-references not found: {len(cross_ref_not_found)}")

if cross_ref_not_found:
    print("\nCross-references not found (first 10):")
    for item in cross_ref_not_found[:10]:
        print(f"  {item['entry']} -> {item['looking_for']}")


def _leads_with_cross_reference(candidate):
    """True when the entry's first definition carries a 'cross_reference' reference."""
    defs = candidate['definitions']
    if not defs:
        return False
    first_ref = defs[0].get('referenced_entry')
    return bool(first_ref) and first_ref['relationship'] == 'cross_reference'


# Verify with example
print("\nExample cross-reference:")
example = next((e for e in data if _leads_with_cross_reference(e)), None)
if example:
    print(json.dumps(example, ensure_ascii=False, indent=2))

Cross-references processed: 81
Cross-references not found: 2

Cross-references not found (first 10):
  भोकणे -> भुकंणे
  सुरुवात -> सुरवात

Example cross-reference:
{
  "entry_id": "berntsen_अलीकडे_5",
  "headword_devanagari": "अलीकडे",
  "headword_romanized": "alīkaḍē",
  "full_entry": "अलीकडे alīkaḍē See अलिकडे .",
  "source_page": 5,
  "entry_type": "headword",
  "base_word": null,
  "search_text": null,
  "definitions": [
    {
      "definition": "See अलिकडे",
      "pos_display": null,
      "number": null,
      "pos": null,
      "gender": null,
      "declension_class": null,
      "referenced_entry": {
        "relationship": "cross_reference",
        "grammatical_form": null,
        "entry_id": "berntsen_अलिकडे_5",
        "headword_devanagari": "अलिकडे",
        "headword_romanized": "alikaḍē",
        "full_entry": "अलिकडे alikaḍē adv. 1. on this side. 2. recently. post. on this side.",
        "source_page": 5,
        "definitions": [
          {
            "definitio

In [90]:
# ==============================================================================
# CELL 5: Process Grammatical Forms
# ==============================================================================
# This cell:
# - Finds definitions that reference base forms (e.g., "perf. of गाणे")
# - Handles: perf. of, past tense of, pres. of, hab. of, negative of,
#   obl. of, instr. of, loc. of, voc. sg. of, pl. of
# - Populates referenced_entry with relationship: 'base_form'
# - Includes grammatical_form field (e.g., 'perfective', 'plural')

import re

# Each row pairs a regex for a "<indicator> of <Devanagari word>" definition
# with the grammatical_form label stored on the resulting reference.
grammatical_patterns = [
    (r'^perf\.\s+of\s+([\u0900-\u097F]+)\.?$', 'perfective'),
    (r'^past tense of\s+([\u0900-\u097F]+)\.?$', 'past_tense'),
    (r'^pres\.\s+of\s+([\u0900-\u097F]+)\.?$', 'present_tense'),
    (r'^hab\.\s+of\s+([\u0900-\u097F]+)\.?$', 'habitual'),
    (r'^negative of\s+([\u0900-\u097F]+)\.?$', 'negative'),
    (r'^obl\.?\s*(?:form\s+)?of\s+([\u0900-\u097F]+)\.?$', 'oblique'),
    (r'^instr\.\s+of\s+([\u0900-\u097F]+)\.?$', 'instrumental'),
    (r'^loc\.\s+of\s+([\u0900-\u097F]+)\.?$', 'locative'),
    (r'^voc\.\s+sg\.\s+of\s+([\u0900-\u097F]+)\.?$', 'vocative_singular'),
    (r'^pl\.\s+of\s+([\u0900-\u097F]+)\.?$', 'plural'),
    (r'^pl\.\s+and\s+obl\.\s+of\s+([\u0900-\u097F]+)\.?$', 'plural_oblique'),
    (r'^of\s+([\u0900-\u097F]+)$', 'possessive_of'),  # For possessives like "of आम्ही"
]

# Case-insensitive compiled versions, same order as above
compiled_patterns = [
    (re.compile(source, re.IGNORECASE), label)
    for source, label in grammatical_patterns
]


def _first_grammatical_match(text):
    """Return (referenced headword, form label) for the first pattern matching text, else None."""
    for regex, label in compiled_patterns:
        found = regex.match(text)
        if found:
            return found.group(1), label
    return None


grammatical_count = 0
grammatical_not_found = []

for entry in data:
    for defn in entry['definitions']:
        # Leave definitions alone once they carry a reference (e.g. cross-reference)
        if defn['referenced_entry'] is not None:
            continue

        hit = _first_grammatical_match(defn['definition'])
        if hit is None:
            continue
        referenced_headword, grammatical_form = hit

        # Look the base form up among all entries
        referenced_entry = find_entry_by_headword(referenced_headword, data)

        if referenced_entry:
            defn['referenced_entry'] = create_referenced_entry(
                referenced_entry,
                relationship='base_form',
                grammatical_form=grammatical_form
            )
            grammatical_count += 1
        else:
            grammatical_not_found.append({
                'entry': entry['headword_devanagari'],
                'looking_for': referenced_headword,
                'form': grammatical_form
            })

print(f"Grammatical forms processed: {grammatical_count}")
print(f"Grammatical forms not found: {len(grammatical_not_found)}")

if grammatical_not_found:
    print("\nGrammatical forms not found (first 10):")
    for item in grammatical_not_found[:10]:
        print(f"  {item['entry']} -> {item['looking_for']} ({item['form']})")

# Tally every 'base_form' reference by its grammatical form
form_counts = {}
for entry in data:
    for defn in entry['definitions']:
        ref = defn.get('referenced_entry')
        if not ref or ref['relationship'] != 'base_form':
            continue
        label = ref['grammatical_form']
        form_counts[label] = form_counts.setdefault(label, 0) + 1

print("\nGrammatical forms by type:")
# Highest count first; the sort is stable, so ties keep first-seen order
for form, count in sorted(form_counts.items(), key=lambda pair: pair[1], reverse=True):
    print(f"  {form}: {count}")

Grammatical forms processed: 39
Grammatical forms not found: 0

Grammatical forms by type:
  perfective: 18
  possessive_of: 8
  plural: 3
  oblique: 3
  past_tense: 2
  present_tense: 1
  instrumental: 1
  vocative_singular: 1
  negative: 1
  plural_oblique: 1


In [91]:
# ==============================================================================
# CELL 6: Process Reduplications
# ==============================================================================
# This cell:
# - Finds definitions that are reduplications (e.g., "redupl. of खरे")
# - Handles: redupl. of, redupl. loc. of
# - Populates referenced_entry with relationship: 'reduplication_of'

import re

# (regex, grammatical_form) pairs for "redupl. of X" / "redupl. loc. of X";
# trailing text after the Devanagari word is tolerated.
redupl_patterns = [
    (r'^redupl\.\s+of\s+([\u0900-\u097F]+)\.?(?:\s+.*)?$', None),
    (r'^redupl\.\s+loc\.\s+of\s+([\u0900-\u097F]+)\.?(?:\s+.*)?$', 'locative'),
]

compiled_redupl = [
    (re.compile(source, re.IGNORECASE), label)
    for source, label in redupl_patterns
]


def _first_redupl_match(text):
    """Return (referenced headword, form label) for the first reduplication pattern matching text, else None."""
    for regex, label in compiled_redupl:
        found = regex.match(text)
        if found:
            return found.group(1), label
    return None


redupl_count = 0
redupl_not_found = []

for entry in data:
    for defn in entry['definitions']:
        # Definitions that already carry a reference are left untouched
        if defn['referenced_entry'] is not None:
            continue

        hit = _first_redupl_match(defn['definition'])
        if hit is None:
            continue
        referenced_headword, grammatical_form = hit

        referenced_entry = find_entry_by_headword(referenced_headword, data)

        if referenced_entry:
            defn['referenced_entry'] = create_referenced_entry(
                referenced_entry,
                relationship='reduplication_of',
                grammatical_form=grammatical_form
            )
            redupl_count += 1
        else:
            redupl_not_found.append({
                'entry': entry['headword_devanagari'],
                'looking_for': referenced_headword
            })

print(f"Reduplications processed: {redupl_count}")
print(f"Reduplications not found: {len(redupl_not_found)}")

if redupl_not_found:
    print("\nReduplications not found:")
    for item in redupl_not_found:
        print(f"  {item['entry']} -> {item['looking_for']}")

Reduplications processed: 16
Reduplications not found: 1

Reduplications not found:
  निजानीज -> नीज


In [92]:
# ==============================================================================
# CELL 7: Process Abbreviations
# ==============================================================================
# This cell:
# - Finds definitions that are abbreviations (e.g., "abbrev. of इत्यादी")
# - Populates referenced_entry with relationship: 'abbreviation_of'

import re

# "abbrev. of X" with both periods optional, X being one Devanagari word
abbrev_pattern = re.compile(r'^abbrev\.?\s+of\.?\s+([\u0900-\u097F]+)\.?$', re.IGNORECASE)

abbrev_count = 0
abbrev_not_found = []

for entry in data:
    for defn in entry['definitions']:
        # Only definitions without a reference are candidates
        match = None
        if defn['referenced_entry'] is None:
            match = abbrev_pattern.match(defn['definition'])
        if not match:
            continue

        referenced_headword = match.group(1)
        referenced_entry = find_entry_by_headword(referenced_headword, data)

        if not referenced_entry:
            abbrev_not_found.append({
                'entry': entry['headword_devanagari'],
                'looking_for': referenced_headword
            })
            continue

        defn['referenced_entry'] = create_referenced_entry(
            referenced_entry,
            relationship='abbreviation_of'
        )
        abbrev_count += 1

print(f"Abbreviations processed: {abbrev_count}")
print(f"Abbreviations not found: {len(abbrev_not_found)}")

if abbrev_not_found:
    print("\nAbbreviations not found:")
    for item in abbrev_not_found:
        print(f"  {item['entry']} -> {item['looking_for']}")

Abbreviations processed: 4
Abbreviations not found: 0


In [93]:
# ==============================================================================
# CELL 8: Process Variants
# ==============================================================================
# This cell:
# - Finds definitions that are variants (e.g., "var. of माझ्या")
# - Populates referenced_entry with relationship: 'variant_of'

import re

# "var. of X" (period after "var" optional), X being one Devanagari word
variant_pattern = re.compile(r'^var\.?\s+of\s+([\u0900-\u097F]+)\.?$', re.IGNORECASE)

variant_count = 0
variant_not_found = []

for entry in data:
    for defn in entry['definitions']:
        # Only definitions without a reference are candidates
        match = None
        if defn['referenced_entry'] is None:
            match = variant_pattern.match(defn['definition'])
        if not match:
            continue

        referenced_headword = match.group(1)
        referenced_entry = find_entry_by_headword(referenced_headword, data)

        if not referenced_entry:
            variant_not_found.append({
                'entry': entry['headword_devanagari'],
                'looking_for': referenced_headword
            })
            continue

        defn['referenced_entry'] = create_referenced_entry(
            referenced_entry,
            relationship='variant_of'
        )
        variant_count += 1

print(f"Variants processed: {variant_count}")
print(f"Variants not found: {len(variant_not_found)}")

if variant_not_found:
    print("\nVariants not found:")
    for item in variant_not_found:
        print(f"  {item['entry']} -> {item['looking_for']}")

Variants processed: 0
Variants not found: 1

Variants not found:
  मज -> माझ्या


In [94]:
# ==============================================================================
# CELL 9: Process Collocations (Create New Entries) - FIXED VERSION
# ==============================================================================
# This cell:
# - Finds definitions with collocation patterns (e.g., "० करणे to urge")
# - Creates NEW entries for each collocation with entry_type: 'collocation'
# - Populates base_word with the parent entry's information
# - Handles complex patterns:
#   1. Simple: '० करणे to urge'
#   2. Multi-verb same meaning: '० पडणे , पाडणे , घालणे to frown'
#   3. Multi-० same meaning: '० होणे , ० पावणे to contract'
#   4. Multiple separate collocations: '० करणे to X. ० होणे to Y'
#   5. Attributive: '० चे final'
#   6. Multi-word verbs: '० करून देणे to introduce'
# - Adds new collocation entries to the data list

import re
from collections import OrderedDict
from copy import deepcopy


def parse_collocations(definition_text, headword_devanagari):
    """
    Extract collocations from a definition that uses ० as a stand-in for the headword.

    The text is cut at every ०; the part before the first ० is ignored.  In
    each remaining piece, everything up to the last Devanagari character is
    the verb part (comma-separated verbs share one meaning) and the rest is
    the meaning.  A piece that ends in a comma and has no meaning of its own
    borrows the first non-empty meaning found in the pieces after it.

    Args:
        definition_text: The definition string containing ० markers
        headword_devanagari: The headword in Devanagari script

    Returns:
        List of collocation dicts with: phrase_devanagari, verb, meaning.
        Pieces without Devanagari text or without a meaning contribute nothing.
    """
    if '०' not in definition_text:
        return []

    def last_devanagari_index(text):
        """Index of the last character in U+0900..U+097F, or -1 if there is none."""
        return max(
            (pos for pos, ch in enumerate(text) if '\u0900' <= ch <= '\u097F'),
            default=-1,
        )

    def borrowed_meaning(later_pieces):
        """First non-empty meaning among the given pieces, or None."""
        for later in later_pieces:
            later = later.strip()
            cut = last_devanagari_index(later)
            if cut < 0:
                continue
            candidate = later[cut + 1:].strip().lstrip('.').strip()
            if candidate:
                return candidate.rstrip('.').strip()
        return None

    # Everything before the first ० is the base definition, not a collocation
    pieces = definition_text.split('०')[1:]

    found = []
    for position, raw_piece in enumerate(pieces):
        piece = raw_piece.strip()
        if not piece:
            continue

        # A trailing comma means the meaning is supplied by a later piece
        trailing_comma = piece.endswith(',')
        body = piece[:-1].strip() if trailing_comma else piece

        cut = last_devanagari_index(body)
        if cut == -1:
            # No Devanagari in this piece, nothing to pair the headword with
            continue

        verb_text = body[:cut + 1].strip()
        gloss = body[cut + 1:].strip().lstrip('.').strip()
        if gloss.endswith('.'):
            gloss = gloss[:-1].strip()

        # 'पडणे , पाडणे , घालणे' -> three verbs; a single verb -> one-item list
        verbs = [v for v in (part.strip() for part in verb_text.split(',')) if v]

        if gloss:
            meaning = gloss
        elif trailing_comma:
            meaning = borrowed_meaning(pieces[position + 1:])
        else:
            meaning = None

        if not meaning:
            continue

        found.extend(
            {
                'phrase_devanagari': f'{headword_devanagari} {verb}',
                'verb': verb,
                'meaning': meaning
            }
            for verb in verbs
        )

    return found


def get_clean_base_definition(entry):
    """
    Build the parent entry's definition text without its collocation material.

    Every definition is cut at its first '०' marker and only the text in
    front of the marker is kept (whitespace-trimmed, trailing periods
    removed). Definitions that contain no marker are kept verbatim. The kept
    pieces are joined with '; '.

    Args:
        entry: Mapping with a 'definitions' list; each item carries a
            'definition' string.

    Returns:
        The joined text with trailing semicolons/whitespace removed. When
        that text is empty, the unmodified definitions joined with '; ' are
        returned instead.
    """
    raw_texts = [item['definition'] for item in entry['definitions']]

    kept = []
    for text in raw_texts:
        if '०' not in text:
            # No collocation marker: the definition is used as-is
            kept.append(text)
            continue
        before_marker = text.split('०', 1)[0].strip()
        if before_marker:
            kept.append(before_marker.rstrip('.').strip())

    # Drop separators/whitespace left dangling at the end of the joined text
    cleaned = re.sub(r'[;\s]+$', '', '; '.join(kept))
    if cleaned:
        return cleaned
    # Nothing survived the cleanup: fall back to the raw definitions
    return '; '.join(raw_texts)


# Track statistics
collocation_entries_created = 0
entries_with_collocations = 0

# Store new collocation entries to add
new_collocation_entries = []

# Track which collocations we've created to avoid duplicates
seen_collocations = set()

# Snapshot of the entry_ids already present in `data`. Re-running this cell
# regenerates the same collocation ids; the snapshot keeps them from being
# appended to `data` a second time.
existing_entry_ids = {e['entry_id'] for e in data}

for entry in data:
    # Collocation entries appended by an earlier run of this cell are derived
    # data and must not be expanded again.
    if entry['entry_type'] == 'collocation':
        continue

    entry_has_collocation = False
    entry_collocations = []

    for defn in entry['definitions']:
        definition = defn['definition']

        # Only definitions carrying the ० marker are parsed
        if '०' not in definition:
            continue

        entry_has_collocation = True

        # Parse collocations from this definition
        entry_collocations.extend(
            parse_collocations(definition, entry['headword_devanagari'])
        )

    # The cleaned parent definition is the same for every collocation of this
    # entry, so it is computed at most once (on first use).
    base_def_clean = None

    # Create new entries for unique collocations
    colloc_counter = 1
    for colloc in entry_collocations:
        # Create unique key to avoid duplicates within the same parent entry
        colloc_key = (entry['entry_id'], colloc['phrase_devanagari'])
        if colloc_key in seen_collocations:
            continue
        seen_collocations.add(colloc_key)

        # Skip collocations with very short/invalid meanings
        if len(colloc['meaning']) < 2:
            continue

        if base_def_clean is None:
            base_def_clean = get_clean_base_definition(entry)

        new_entry = OrderedDict()

        # Generate unique entry_id: parent id plus a per-parent sequence number
        new_entry['entry_id'] = f"{entry['entry_id']}_c{colloc_counter}"
        new_entry['headword_devanagari'] = colloc['phrase_devanagari']
        new_entry['headword_romanized'] = None  # Would need transliteration
        new_entry['full_entry'] = f"{colloc['phrase_devanagari']} {colloc['meaning']}"
        new_entry['source_page'] = entry['source_page']
        new_entry['entry_type'] = 'collocation'

        # Create definition for the collocation
        new_entry['definitions'] = [{
            'definition': colloc['meaning'],
            'pos_display': None,
            'number': None,
            'pos': None,
            'gender': None,
            'declension_class': None,
            'referenced_entry': None
        }]

        # Populate base_word with parent entry info (a fresh mapping per
        # collocation so entries never share a mutable object)
        new_entry['base_word'] = OrderedDict()
        new_entry['base_word']['entry_id'] = entry['entry_id']
        new_entry['base_word']['headword_devanagari'] = entry['headword_devanagari']
        new_entry['base_word']['headword_romanized'] = entry['headword_romanized']
        new_entry['base_word']['definition'] = base_def_clean
        new_entry['base_word']['pos'] = entry['definitions'][0]['pos'] if entry['definitions'] else None

        new_entry['search_text'] = None  # Computed later in Cell 10

        new_collocation_entries.append(new_entry)
        collocation_entries_created += 1
        colloc_counter += 1

    if entry_has_collocation:
        entries_with_collocations += 1

# Add new collocation entries to data. Ids that were already in `data` before
# this run are skipped, which makes re-running the cell a no-op for `data`.
data.extend(
    e for e in new_collocation_entries
    if e['entry_id'] not in existing_entry_ids
)

print(f"Entries containing collocations: {entries_with_collocations}")
print(f"Collocation entries created: {collocation_entries_created}")
print(f"Total entries now: {len(data)}")

# Show examples
print("\nExample collocation entries:")
# Every check below looks only at collocation entries, so filter `data` once
collocation_entries = [e for e in data if e['entry_type'] == 'collocation']
colloc_examples = collocation_entries[:5]
for ex in colloc_examples:
    print(f"  {ex['headword_devanagari']}: \"{ex['definitions'][0]['definition']}\"")
    print(f"    Base: {ex['base_word']['headword_devanagari']} ({ex['base_word']['definition'][:40]}...)")

# Verify quality
print("\n--- Quality Check ---")
# Meanings shorter than 3 characters are reported as suspicious
short_meanings = [
    e for e in collocation_entries
    if len(e['definitions'][0]['definition']) < 3
]
print(f"Collocations with very short meanings (<3 chars): {len(short_meanings)}")

# A meaning that still contains Devanagari characters was probably split wrongly
deva_pattern = re.compile(r'[\u0900-\u097F]')
deva_in_meaning = list(filter(
    lambda e: deva_pattern.search(e['definitions'][0]['definition']),
    collocation_entries,
))
print(f"Collocations with Devanagari in meaning: {len(deva_in_meaning)}")

Entries containing collocations: 486
Collocation entries created: 624
Total entries now: 10460

Example collocation entries:
  अखेर चे: "final"
    Base: अखेर (end; finally...)
  अंग चे: "natural, inborn"
    Base: अंग (body; part; side; capacity, skill...)
  अंग चोरणे: "to contract one's body, shirk"
    Base: अंग (body; part; side; capacity, skill...)
  अंग धरणे: "to have rheumatic pains, gain weight"
    Base: अंग (body; part; side; capacity, skill...)
  अंग वळणी असणे to be used to. अंगात येणे: "to be possessed (by a god, evil spirit)"
    Base: अंग (body; part; side; capacity, skill...)

--- Quality Check ---
Collocations with very short meanings (<3 chars): 0
Collocations with Devanagari in meaning: 0


In [95]:
# ==============================================================================
# CELL 10: Build search_text
# ==============================================================================
# This cell:
# - Constructs the search_text field for each entry
# - Concatenates all relevant fields for vector embedding
# - For entries with referenced_entry, includes the referenced entry's info
# - For collocations, includes base_word info

def build_search_text(entry):
    """
    Assemble the text that is embedded for vector search.

    The fields are emitted in this order:
    - headword (Devanagari, then romanized) and the entry type
    - for every definition: its text, POS and gender, followed by the
      referenced entry's fields when one is attached
    - for collocations: the base word's headword, definition and POS

    Empty/None fields are skipped, the rest are joined with single spaces and
    runs of whitespace are collapsed.

    Args:
        entry: Dictionary entry mapping (headword or collocation).

    Returns:
        The whitespace-normalized search string.
    """
    def _reference_fields(ref):
        # Fields contributed by a referenced entry, in output order
        yield ref['relationship']
        yield ref['grammatical_form']
        yield ref['headword_devanagari']
        yield ref['headword_romanized']
        for ref_sense in ref.get('definitions', []):
            yield ref_sense['definition']
            yield ref_sense.get('pos')

    # Collect every candidate field first; falsy ones are dropped at the end
    candidates = [
        entry['headword_devanagari'],
        entry['headword_romanized'],
        entry['entry_type'],
    ]

    for sense in entry['definitions']:
        candidates += [sense['definition'], sense['pos'], sense['gender']]
        linked = sense.get('referenced_entry')
        if linked:
            candidates.extend(_reference_fields(linked))

    # Collocations additionally carry their parent headword's info
    parent = entry.get('base_word')
    if parent:
        candidates += [
            parent['headword_devanagari'],
            parent['headword_romanized'],
            parent['definition'],
            parent['pos'],
        ]

    joined = ' '.join(str(field) for field in candidates if field)
    return re.sub(r'\s+', ' ', joined).strip()


# Build search_text for all entries (overwrites any previous value, so the
# cell can be re-run safely)
for entry in data:
    entry['search_text'] = build_search_text(entry)

# Verify
print("search_text built for all entries")
print(f"\nTotal entries: {len(data)}")

# Show examples
print("\n" + "="*80)
print("Example search_text (headword entry):")
print("="*80)
# Like the two lookups below, fall back to None instead of raising
# StopIteration when no matching entry exists
headword_example = next(
    (e for e in data if e['entry_type'] == 'headword' and e['definitions']),
    None,
)
if headword_example:
    print(f"Entry: {headword_example['headword_devanagari']}")
    print(f"search_text: {headword_example['search_text'][:200]}...")

print("\n" + "="*80)
print("Example search_text (entry with referenced_entry):")
print("="*80)
# First entry whose first definition carries a referenced_entry
ref_example = next((e for e in data if e['definitions'] and 
                    e['definitions'][0].get('referenced_entry')), None)
if ref_example:
    print(f"Entry: {ref_example['headword_devanagari']}")
    print(f"search_text: {ref_example['search_text'][:300]}...")

print("\n" + "="*80)
print("Example search_text (collocation entry):")
print("="*80)
colloc_example = next((e for e in data if e['entry_type'] == 'collocation'), None)
if colloc_example:
    print(f"Entry: {colloc_example['headword_devanagari']}")
    print(f"search_text: {colloc_example['search_text']}")

search_text built for all entries

Total entries: 10460

Example search_text (headword entry):
Entry: अ
search_text: अ a headword negative prefix...

Example search_text (entry with referenced_entry):
Entry: अलीकडे
search_text: अलीकडे alīkaḍē headword See अलिकडे cross_reference अलिकडे alikaḍē on this side adverb recently adverb on this side postposition...

Example search_text (collocation entry):
Entry: अखेर चे
search_text: अखेर चे collocation final अखेर akhēra end; finally noun.feminine.i


In [96]:
# ==============================================================================
# CELL 11: Summary Statistics
# ==============================================================================
# This cell:
# - Provides final statistics on processed data
# - Counts entries by type
# - Counts referenced_entry by relationship type

print("="*80)
print("FINAL PROCESSING SUMMARY")
print("="*80)

# Entry type counts
headword_count = sum(1 for e in data if e['entry_type'] == 'headword')
collocation_count = sum(1 for e in data if e['entry_type'] == 'collocation')

print(f"\nTotal entries: {len(data)}")
print(f"  - Headword entries: {headword_count}")
print(f"  - Collocation entries: {collocation_count}")

# Referenced entry counts by relationship (one count per definition that
# carries a referenced_entry)
relationship_counts = {}
for entry in data:
    for defn in entry['definitions']:
        ref = defn.get('referenced_entry')
        if ref:
            rel = ref['relationship']
            relationship_counts[rel] = relationship_counts.get(rel, 0) + 1

print(f"\nReferenced entries by relationship:")
# Most frequent relationship first
for rel, count in sorted(relationship_counts.items(), key=lambda x: -x[1]):
    print(f"  - {rel}: {count}")

# Entries with search_text
has_search_text = sum(1 for e in data if e['search_text'])
print(f"\nEntries with search_text: {has_search_text}")

# Average search_text length over entries that have one; report 0.0 instead
# of raising ZeroDivisionError when no entry has search_text yet
total_search_text_len = sum(len(e['search_text']) for e in data if e['search_text'])
avg_len = total_search_text_len / has_search_text if has_search_text else 0.0
print(f"Average search_text length: {avg_len:.1f} characters")

# Entries with base_word (collocations)
has_base_word = sum(1 for e in data if e['base_word'])
print(f"\nEntries with base_word: {has_base_word}")

print("\n" + "="*80)
print("SCHEMA VERIFICATION")
print("="*80)

# Verify all entries have required fields
required_fields = ['entry_id', 'headword_devanagari', 'headword_romanized', 
                   'full_entry', 'source_page', 'entry_type', 'definitions',
                   'base_word', 'search_text']

# Collected as (entry_id, field) pairs, one per missing field
missing_fields = []
for entry in data:
    for field in required_fields:
        if field not in entry:
            # 'entry_id' is itself a required field, so it may be the one that
            # is missing: look it up defensively so the check reports the
            # problem instead of raising KeyError
            missing_fields.append((entry.get('entry_id', '<no entry_id>'), field))

if missing_fields:
    print(f"WARNING: {len(missing_fields)} entries missing required fields")
    for entry_id, field in missing_fields[:10]:
        print(f"  {entry_id} missing {field}")
else:
    print("✓ All entries have required fields")

# Verify definition schema
def_required = ['definition', 'pos_display', 'number', 'pos', 'gender', 
                'declension_class', 'referenced_entry']

# Collected as (entry_id, definition index, field) triples
def_missing = []
for entry in data:
    # An entry without 'definitions' is already reported above; treat it as
    # having no definitions here rather than crashing
    for i, defn in enumerate(entry.get('definitions') or []):
        for field in def_required:
            if field not in defn:
                def_missing.append((entry.get('entry_id', '<no entry_id>'), i, field))

if def_missing:
    print(f"WARNING: {len(def_missing)} definitions missing required fields")
    for entry_id, idx, field in def_missing[:10]:
        print(f"  {entry_id} def[{idx}] missing {field}")
else:
    print("✓ All definitions have required fields")

FINAL PROCESSING SUMMARY

Total entries: 10460
  - Headword entries: 9836
  - Collocation entries: 624

Referenced entries by relationship:
  - cross_reference: 81
  - base_form: 39
  - reduplication_of: 16
  - abbreviation_of: 4

Entries with search_text: 10460
Average search_text length: 69.7 characters

Entries with base_word: 624

SCHEMA VERIFICATION
✓ All entries have required fields
✓ All definitions have required fields


In [97]:
# ==============================================================================
# CELL 12: Export to JSON
# ==============================================================================
# This cell:
# - Exports the processed data to a JSON file
# - Ready for import into vector database

import json

def _write_json(payload, path):
    """Write payload to path as indented UTF-8 JSON, keeping non-ASCII text unescaped."""
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)


# Full export of the processed data
output_filename = 'berntsen_dictionary_processed.json'
_write_json(data, output_filename)
print(f"Exported {len(data)} entries to {output_filename}")

# Also export a sample for inspection
sample_filename = 'berntsen_dictionary_sample.json'
sample_data = data[:50]  # First 50 entries
_write_json(sample_data, sample_filename)
print(f"Exported {len(sample_data)} sample entries to {sample_filename}")

Exported 10460 entries to berntsen_dictionary_processed.json
Exported 50 sample entries to berntsen_dictionary_sample.json


In [98]:
## Classification of all entries containing Devanagari script

#| Category | Count (approx) | Action | `entry_type` | Needs `referenced_entry`? |
#|----------|----------------|--------|--------------|---------------------------|
#| **A. Cross-references** ("See X") | ~50 | Keep as-is, add relationship | `headword` | Yes: `relationship: 'cross_reference'` |
#| **B. Collocations** (० patterns) | ~400 | Create NEW entries | `collocation` | No - uses `base_word` instead |
#| **C. Grammatical forms** (perf. of, past tense of, etc.) | ~40 | Add referenced_entry | `headword` | Yes: `relationship: 'base_form'` |
#| **D. Reduplication** (redupl. of) | ~20 | Add referenced_entry | `headword` | Yes: `relationship: 'reduplication_of'` |
#| **E. Abbreviations** (abbrev. of) | ~15 | Add referenced_entry | `headword` | Yes: `relationship: 'abbreviation_of'` |
#| **F. Variants** (var. of) | ~5 | Add referenced_entry | `headword` | Yes: `relationship: 'variant_of'` |
#| **G. Self-contained definitions** (measurements, cultural terms, suffixes, intensifiers, etc.) | ~400+ | No structural change | `headword` | No - Devanagari is just part of definition text |