In [49]:
# Todo
#1.Remove भोकणे and सुरुवात

In [51]:
# Import and schema setup
# - Removes part_of_speech
# - Adds entry_id
# - Adds entry_type
# - Adds base_word (for collocations)
# - Adds search_text

import json
from collections import OrderedDict
from copy import deepcopy

# Load the dictionary
with open('berntsen_dictionary.json', 'r', encoding='utf-8') as f:
    original_data = json.load(f)

print(f"Loaded {len(original_data)} entries")

# Create new data with proper field ordering
data = []
for entry in original_data:
    # Create new ordered dictionary with entry_id first
    new_entry = OrderedDict()
    new_entry['entry_id'] = f"berntsen_{entry['headword_devanagari']}_{entry['source_page']}"
    new_entry['entry_type'] = 'headword'  # default; collocations created separately
    # Note: part_of_speech field is intentionally not included (removed)
    # Note: definitions[] added in next cell by parse_definitions()
    # Note: base_word will be set for collocations only
    new_entry['headword_devanagari'] = entry['headword_devanagari']
    new_entry['headword_romanized'] = entry['headword_romanized']
    new_entry['full_entry'] = entry['full_entry']
    new_entry['source_page'] = entry['source_page']
    new_entry['base_word'] = None
    # Note: search_text computed at end
    new_entry['search_text'] = None
    
    data.append(new_entry)

# Verify the result
print(f"\nProcessed {len(data)} entries")
print("\nExample entry (index 300):")
print(json.dumps(data[300], ensure_ascii=False, indent=2))

Loaded 9836 entries

Processed 9836 entries

Example entry (index 300):
{
  "entry_id": "berntsen_अवलंबून_5",
  "entry_type": "headword",
  "headword_devanagari": "अवलंबून",
  "headword_romanized": "avalambūna",
  "full_entry": "अवलंबून avalambūna , असणे asaṇē to be dependent on.",
  "source_page": 5,
  "base_word": null,
  "search_text": null
}


In [13]:
#Berntsen Dictionary, how part of speech is represented
# Pattern 1: Sequential POS sections (most common)
#       form: headword romanized POS1 definition(s). POS2 definition(s).
#       example: "अति ati adv. too much. pref. extremely, too much, over-."
#       description: Each POS gets its own definition section which is separated by periods
# Pattern 2: Numbered definitions within same POS
#       form: headword romanized POS 1. def1. 2. def2. 3. def3.
#       example: "अंक aṅka m. 1. number. 2. issue (of a magazine, newspaper). 3. act (of a play)."
#       description: Numbers belong to the SAME POS. All numbered definitions share the first POS mentioned
# Pattern 3: Mixed - numbered defs THEN new POS
#       form: headword romanized POS1 1. def1. 2. def2. POS2 def3.
#       example: "अखंड akhaṇḍa adj. inv. 1. entire, in one piece. 2. continuous. adv. continuously."
#       explanation of example: Definitions 1 and 2 belong to adj. inv. 
#       When adv. appears, it starts a NEW section with its own definition

In [53]:
#definitions[]
import re
from collections import OrderedDict

def parse_definitions(entry):
    """
    Parse the full_entry to extract definitions with their POS and numbers.
    
    Args:
        entry: Dictionary containing 'full_entry', 'headword_romanized'
    
    Returns:
        List of definition dictionaries with 'definition', 'pos_display', 'number',
        'pos', 'gender', 'declension_class', 'referenced_entry'
    """
    full_entry = entry['full_entry']
    romanized = entry['headword_romanized']
    
    # Step 1: Extract the definition part (everything after romanized headword)
    parts = full_entry.split(romanized, 1)
    if len(parts) <= 1:
        # Edge case: romanized not found, return empty
        return []
    
    definition_text = parts[1].strip()
    
    # If definition_text starts with comma, remove it
    if definition_text.startswith(','):
        definition_text = definition_text[1:].strip()
    
    # If there's no text at all, return empty
    if not definition_text:
        return []
    
    # Step 2: Define all possible POS markers (order matters - longest first to avoid partial matches)
    # IMPORTANT: Include both "v. t." and "v.t." formats
    pos_markers = [
        'adj. inv.', 'adv. suff.', 'adj. suff. inv.', 'adj. suff.',
        'n. suff.', 'n.suff.', 'm. suff.', 'f. suff.', 
        'v. t.', 'v. i.', 'v.i.', 'v.t.',  # Added both formats with and without space
        'adj.', 'adv.', 'pref.', 'suff.',
        'conj.', 'post.', 'pron.', 'interj.', 'poss.',
        'f.(i)', 'f.(e)', 'm.(i)', 'm.(e)',
        'n.', 'm.', 'f.'
    ]
    
    # Step 3: Find all POS markers and their positions in the text
    pos_locations = []
    for pos in pos_markers:
        # Find all occurrences of this POS marker
        index = 0
        while True:
            index = definition_text.find(pos, index)
            if index == -1:
                break
            # Check if this is at the start or preceded by space/period
            if index == 0 or definition_text[index-1] in [' ', '.']:
                # Check if followed by space or end of string
                end_index = index + len(pos)
                if end_index >= len(definition_text) or definition_text[end_index] == ' ':
                    pos_locations.append({
                        'pos_display': pos,
                        'start': index,
                        'end': end_index
                    })
            index += 1
    
    # Sort by position
    pos_locations.sort(key=lambda x: x['start'])
    
    # Step 4: If no POS found, treat entire text as definition with null POS
    if not pos_locations:
        # Remove trailing period
        cleaned_text = definition_text.strip()
        if cleaned_text.endswith('.'):
            cleaned_text = cleaned_text[:-1].strip()
        
        if cleaned_text:
            return [{
                'definition': cleaned_text,
                'pos_display': None,
                'number': None,
                'pos': None,
                'gender': None,
                'declension_class': None,
                'referenced_entry': None
            }]
        else:
            return []
    
    # Step 5: Split text into sections by POS
    definitions = []
    
    for i, pos_info in enumerate(pos_locations):
        current_pos = pos_info['pos_display']
        start = pos_info['end']
        
        # Find where this POS section ends (where next POS starts, or end of text)
        if i + 1 < len(pos_locations):
            end = pos_locations[i + 1]['start']
        else:
            end = len(definition_text)
        
        # Extract the definition text for this POS
        section_text = definition_text[start:end].strip()
        
        # Step 6: Check if this section has numbered definitions (1., 2., 3., etc.)
        numbered_pattern = r'(\d+)\.\s*([^0-9]+?)(?=\s*\d+\.|$)'
        numbered_matches = list(re.finditer(numbered_pattern, section_text))
        
        if numbered_matches:
            # Parse numbered definitions
            for match in numbered_matches:
                number = int(match.group(1))
                def_text = match.group(2).strip()
                
                # Clean up: remove trailing period if present
                if def_text.endswith('.'):
                    def_text = def_text[:-1].strip()
                
                # Skip empty definitions
                if def_text:
                    definitions.append({
                        'definition': def_text,
                        'pos_display': current_pos,
                        'number': number,
                        'pos': None,
                        'gender': None,
                        'declension_class': None,
                        'referenced_entry': None
                    })
        else:
            # No numbers - this is a single unnumbered definition
            # Clean the section text
            section_text = section_text.strip()
            
            # Remove trailing period
            if section_text.endswith('.'):
                section_text = section_text[:-1].strip()
            
            # Skip empty definitions
            if section_text:
                definitions.append({
                    'definition': section_text,
                    'pos_display': current_pos,
                    'number': None,
                    'pos': None,
                    'gender': None,
                    'declension_class': None,
                    'referenced_entry': None
                })
    
    return definitions


# Add definitions to all entries
for entry in data:
    entry['definitions'] = parse_definitions(entry)

# Calculate metrics
total_entries = len(data)
entries_with_defs = sum(1 for entry in data if entry['definitions'])
entries_without_defs = total_entries - entries_with_defs
total_defs = sum(len(e['definitions']) for e in data)
entries_with_null_pos = sum(1 for entry in data if entry['definitions'] and any(d['pos_display'] is None for d in entry['definitions']))

print(f"Total entries: {total_entries}")
print(f"Entries with definitions: {entries_with_defs} ({entries_with_defs/total_entries*100:.1f}%)")
print(f"Entries without definitions: {entries_without_defs} ({entries_without_defs/total_entries*100:.1f}%)")
print(f"Entries with null POS: {entries_with_null_pos} ({entries_with_null_pos/total_entries*100:.1f}%)")
print(f"Total definitions parsed: {total_defs}")
print(f"Average definitions per entry: {total_defs/total_entries:.2f}")

Total entries: 9836
Entries with definitions: 9836 (100.0%)
Entries without definitions: 0 (0.0%)
Entries with null POS: 348 (3.5%)
Total definitions parsed: 12167
Average definitions per entry: 1.24


In [54]:
from collections import Counter

# Count POS frequency across all definitions
pos_counter = Counter()

for entry in data:
    for definition in entry['definitions']:
        pos = definition['pos_display']
        pos_counter[pos] += 1

# Sort by frequency (descending)
sorted_pos = sorted(pos_counter.items(), key=lambda x: x[1], reverse=True)

print("POS FREQUENCY DISTRIBUTION")
print("=" * 80)
print(f"Total unique POS values: {len(pos_counter)}")
print(f"Total definitions: {sum(pos_counter.values())}")
print("\n")

# Display all POS with their frequencies
for pos, count in sorted_pos:
    percentage = (count / sum(pos_counter.values())) * 100
    pos_display = f"'{pos}'" if pos is not None else "None (no POS)"
    print(f"{pos_display:30s} {count:6d} ({percentage:5.2f}%)")

POS FREQUENCY DISTRIBUTION
Total unique POS values: 18
Total definitions: 12167


'm.'                             3051 (25.08%)
'adj.'                           2219 (18.24%)
'f.'                             1632 (13.41%)
'n.'                             1583 (13.01%)
'v.i.'                            780 ( 6.41%)
'v.t.'                            745 ( 6.12%)
'f.(i)'                           672 ( 5.52%)
'adv.'                            525 ( 4.31%)
None (no POS)                     348 ( 2.86%)
'f.(e)'                           207 ( 1.70%)
'post.'                           123 ( 1.01%)
'suff.'                           103 ( 0.85%)
'pref.'                            90 ( 0.74%)
'pron.'                            31 ( 0.25%)
'interj.'                          29 ( 0.24%)
'conj.'                            23 ( 0.19%)
'poss.'                             5 ( 0.04%)
'v. t.'                             1 ( 0.01%)


In [55]:
#Normalizing the one entry that is 'v. t.' to 'v.t.' 

for entry in data:
    for definition in entry['definitions']:
        if definition['pos_display'] == 'v. t.':
            definition['pos_display'] = 'v.t.'

In [56]:
# POS Mapping:
# m. → noun.masculine
# f. → noun.feminine
# f.(i) → noun.feminine.i
# f.(e) → noun.feminine.e
# n. → noun.neuter
# v.t. → verb.transitive
# v.i. → verb.intransitive
# adj. → adjective
# adj. inv. → adjective.invariable
# adv. → adverb
# post. → postposition
# pref. → prefix
# suff. → suffix
# conj. → conjunction
# pron. → pronoun
# interj. → interjection
# poss. → possessive

for entry in data:
    for definition in entry['definitions']:
        pos_display = definition.get('pos_display') or definition.get('pos')
        
        # Map to new pos format
        if pos_display == 'm.':
            definition['pos'] = 'noun.masculine'
            definition['gender'] = 'masculine'
            definition['declension_class'] = None
        elif pos_display == 'f.':
            definition['pos'] = 'noun.feminine'
            definition['gender'] = 'feminine'
            definition['declension_class'] = None
        elif pos_display == 'f.(i)':
            definition['pos'] = 'noun.feminine.i'
            definition['gender'] = 'feminine'
            definition['declension_class'] = 'i'
        elif pos_display == 'f.(e)':
            definition['pos'] = 'noun.feminine.e'
            definition['gender'] = 'feminine'
            definition['declension_class'] = 'e'
        elif pos_display == 'n.':
            definition['pos'] = 'noun.neuter'
            definition['gender'] = 'neuter'
            definition['declension_class'] = None
        elif pos_display == 'v.t.':
            definition['pos'] = 'verb.transitive'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'v.i.':
            definition['pos'] = 'verb.intransitive'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'adj.':
            definition['pos'] = 'adjective'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'adj. inv.':
            definition['pos'] = 'adjective.invariable'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'adv.':
            definition['pos'] = 'adverb'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'post.':
            definition['pos'] = 'postposition'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'pref.':
            definition['pos'] = 'prefix'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'suff.':
            definition['pos'] = 'suffix'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'conj.':
            definition['pos'] = 'conjunction'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'pron.':
            definition['pos'] = 'pronoun'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'interj.':
            definition['pos'] = 'interjection'
            definition['gender'] = None
            definition['declension_class'] = None
        elif pos_display == 'poss.':
            definition['pos'] = 'possessive'
            definition['gender'] = None
            definition['declension_class'] = None
        else:
            definition['pos'] = None
            definition['gender'] = None
            definition['declension_class'] = None

In [48]:
## Classification of All 948 Entries

#| Category | Count (approx) | Action | `entry_type` | Needs `referenced_entry`? |
#|----------|----------------|--------|--------------|---------------------------|
#| **A. Cross-references** ("See X") | ~50 | Keep as-is, add relationship | `headword` | Yes: `relationship: 'cross_reference'` |
#| **B. Collocations** (० patterns) | ~400 | Create NEW entries | `collocation` | No - uses `base_word` instead |
#| **C. Grammatical forms** (perf. of, past tense of, etc.) | ~40 | Add referenced_entry | `headword` | Yes: `relationship: 'base_form'` |
#| **D. Reduplication** (redupl. of) | ~20 | Add referenced_entry | `headword` | Yes: `relationship: 'reduplication_of'` |
#| **E. Abbreviations** (abbrev. of) | ~15 | Add referenced_entry | `headword` | Yes: `relationship: 'abbreviation_of'` |
#| **F. Variants** (var. of) | ~5 | Add referenced_entry | `headword` | Yes: `relationship: 'variant_of'` |
#| **G. Self-contained definitions** (measurements, cultural terms, suffixes, intensifiers, etc.) | ~400+ | No structural change | `headword` | No - Devanagari is just part of definition text |

In [35]:
import re

def extract_see_reference(definition_text):
    """Extract the Devanagari word that follows 'See' in a definition"""
    # Look for Devanagari characters after "See"
    match = re.search(r'See\s+([\u0900-\u097F]+)', definition_text)
    if match:
        return match.group(1)
    return None

# Add inline see_also definitions
failures = []

for entry in data:
    for definition in entry['definitions']:
        see_ref = extract_see_reference(definition['definition'])
        if see_ref:
            # Look up by DEVANAGARI headword, not romanized
            referenced_entry = next((e for e in data 
                                    if e['headword_devanagari'] == see_ref), None)
            
            if referenced_entry:
                # Successful lookup - embed the entire entry
                definition['see_also'] = {
                    'entry_id': referenced_entry['entry_id'],
                    'headword_devanagari': referenced_entry['headword_devanagari'],
                    'headword_romanized': referenced_entry['headword_romanized'],
                    'full_entry': referenced_entry['full_entry'],
                    'source_page': referenced_entry['source_page'],
                    'definitions': referenced_entry['definitions']
                }
            else:
                # Lookup failed
                definition['see_also'] = None
                failures.append({
                    'entry': entry['headword_devanagari'],
                    'romanized': entry['headword_romanized'],
                    'looking_for': see_ref,
                    'definition': definition['definition']
                })
        else:
            definition['see_also'] = None

# Report results
total_see_refs = sum(1 for e in data for d in e['definitions'] 
                     if 'See' in d['definition'])
successful = sum(1 for e in data for d in e['definitions'] 
                 if d.get('see_also') is not None)

print(f"Total 'See' references found: {total_see_refs}")
print(f"Successfully resolved: {successful}")
print(f"Failed lookups: {len(failures)}")

if failures:
    print("\n⚠️ FAILED LOOKUPS:")
    print("=" * 80)
    for f in failures:
        print(f"\nEntry: {f['entry']} ({f['romanized']})")
        print(f"Looking for: '{f['looking_for']}'")
        print(f"Definition: {f['definition']}")

Total 'See' references found: 92
Successfully resolved: 87
Failed lookups: 3

⚠️ FAILED LOOKUPS:

Entry: भागुबाई (bhāgubāī)
Looking for: 'मित्री'
Definition: See मित्री भागुबाई

Entry: भोकणे (bhōkaṇē)
Looking for: 'भुकंणे'
Definition: See भुकंणे

Entry: सुरुवात (suruvāta)
Looking for: 'सुरवात'
Definition: See सुरवात


In [41]:
# Fixing Edge case
bhagubai_entry = next((e for e in data if e['headword_devanagari'] == 'भागुबाई'), None)
bhitri_entry = next((e for e in data if e['headword_devanagari'] == 'भित्री'), None)

if bhagubai_entry and bhitri_entry:
    # Point भागुबाई to भित्री (which has the actual definition)
    bhagubai_entry['definitions'][0]['see_also'] = {
        'entry_id': bhitri_entry['entry_id'],
        'headword_devanagari': bhitri_entry['headword_devanagari'],
        'headword_romanized': bhitri_entry['headword_romanized'],
        'full_entry': bhitri_entry['full_entry'],
        'source_page': bhitri_entry['source_page'],
        'definitions': bhitri_entry['definitions']
    }
    print("✓ Fixed भागुबाई see_also reference to भित्री")
else:
    print("✗ Could not find one of the entries")


✓ Fixed भागुबाई see_also reference to भित्री


In [44]:
import re

# Find all definitions containing Devanagari script
devanagari_pattern = re.compile(r'[\u0900-\u097F]+')

entries_with_devanagari = []

for entry in data:
    for definition in entry['definitions']:
        if devanagari_pattern.search(definition['definition']):
            entries_with_devanagari.append({
                'headword': entry['headword_devanagari'],
                'romanized': entry['headword_romanized'],
                'definition': definition['definition'],
                'pos': definition['pos']
            })

print(f"Total definitions with Devanagari: {len(entries_with_devanagari)}")
print("\n" + "=" * 80)
print("DEFINITIONS CONTAINING DEVANAGARI SCRIPT:")
print("=" * 80)

for i, item in enumerate(entries_with_devanagari, 1):
    print(f"\n{i}. {item['headword']} ({item['romanized']})")
    print(f"   POS: {item['pos']}")
    print(f"   Definition: {item['definition']}")

Total definitions with Devanagari: 948

DEFINITIONS CONTAINING DEVANAGARI SCRIPT:

1. अखेर (akhēra)
   POS: adverb
   Definition: finally. ० चे final

2. अंग (aṅga)
   POS: noun.neuter
   Definition: capacity, skill. ० चे natural, inborn. ० चोरणे to contract one's body, shirk. ० धरणे to have rheumatic pains, gain weight. ० वळणी असणे to be used to. अंगात येणे to be possessed (by a god, evil spirit)

3. अग्रक्रम (agrakrama)
   POS: noun.masculine
   Definition: priority. अग्रक्रमाने giving priority, primarily

4. अट (aṭa)
   POS: noun.feminine.i
   Definition: condition, stipulation. ० घालणे to make a condition

5. अट्टाहास (aṭṭāhāsa)
   POS: noun.masculine
   Definition: obstinacy, dogmatic claim. ० बाळगणे , धरणे to be obstinate, make a dogmatic claim

6. अंत (anta)
   POS: noun.masculine
   Definition: end, death. ० पाहणे to try one's patience

7. अंतर (antara)
   POS: noun.neuter
   Definition: distance. ० तोडणे to cover a distance. ० देणे to part from, leave, abandon

8. अदपाव (adapā

   Definition: longing. ० लागणे to be anxious, long for

943. हुलकावणी (hulakāvaṇī)
   POS: noun.feminine
   Definition: dodging. ० देणे to dodge

944. हूल (hūla)
   POS: noun.feminine.i
   Definition: rumor. ० उठणे for a rumor to start

945. हेलकावा (hēlakāvā)
   POS: noun.masculine
   Definition: swing, oscillation. ० देणे , खाणे to swing

946. होकार (hōkāra)
   POS: noun.masculine
   Definition: assent. ० देणे to give assent

947. होते (hōtē)
   POS: None
   Definition: past tense of असणे

948. होळी (hōḷī)
   POS: noun.feminine
   Definition: See शिमगा


In [46]:
find_see = next((e for e in data if e['headword_devanagari'] == 'हरिण'), None)
find_see

OrderedDict([('entry_id', 'berntsen_हरिण_166'),
             ('headword_devanagari', 'हरिण'),
             ('headword_romanized', 'hariṇa'),
             ('full_entry', 'हरिण hariṇa See हरण , def. 1.'),
             ('source_page', 166),
             ('definitions',
              [{'definition': 'See हरण , def. 1',
                'pos_display': None,
                'number': None,
                'pos': None,
                'gender': None,
                'declension_class': None,
                'see_also': {'entry_id': 'berntsen_हरण_166',
                 'headword_devanagari': 'हरण',
                 'headword_romanized': 'haraṇa',
                 'full_entry': 'हरण haraṇa n. 1. deer. 2. making away with.',
                 'source_page': 166,
                 'definitions': [{'definition': 'deer',
                   'pos_display': 'n.',
                   'number': 1,
                   'pos': 'noun.neuter',
                   'gender': 'neuter',
                   'declension_

In [42]:
print(json.dumps(data[695], ensure_ascii=False, indent=2))

{
  "entry_id": "berntsen_उंचावणे_12",
  "headword_devanagari": "उंचावणे",
  "headword_romanized": "uñcāvaṇē",
  "full_entry": "उंचावणे uñcāvaṇē v.i. to rise, be raised. v.t. to raise.",
  "source_page": 12,
  "definitions": [
    {
      "definition": "to rise, be raised",
      "pos_display": "v.i.",
      "number": null,
      "pos": "verb.intransitive",
      "gender": null,
      "declension_class": null,
      "see_also": null
    },
    {
      "definition": "to raise",
      "pos_display": "v.t.",
      "number": null,
      "pos": "verb.transitive",
      "gender": null,
      "declension_class": null,
      "see_also": null
    }
  ]
}
