In [1]:
from pathlib import Path
import sanskrit as sks
import pandas as pd
import numpy as np

# Show the names of the data-directory entries that start with 'Ṛgv'.
for entry in sks.data_dir.glob('Ṛgv*'):
    print(entry.name)

Ṛgvedavedāṅgajyotiṣa-all.conllu
Ṛgveda
Ṛgvedakhilāni-all.conllu


In [13]:
# Peek at the beginning of one .conllu file to see how it is laid out.
file = sks.data_dir / 'Ṛgvedavedāṅgajyotiṣa-all.conllu'
# The text is full of non-ASCII characters, so decode it as UTF-8 explicitly
# instead of relying on the platform's default encoding.
text = file.read_text(encoding='utf-8')
lines = text.split('\n')
# Slicing instead of indexing 0..14 also works when the file has fewer than 15 lines.
for line in lines[:15]:
    print(line)

## text: Ṛgvedavedāṅgajyotiṣa
## text_id: 33
## chapter: ṚVJ, 1
## chapter_id: 166

# text_line: pañcasaṃvatsaramayaṃ yugādhyakṣaṃ prajāpatim
# text_line_id: 47058
# text_line_counter: 1
# text_line_subcounter: 1
1-3	pañcasaṃvatsaramayaṃ	_	_	_	_	_	_	_	_
1	_	pañcan	NUM	NUM	Case=Cpd	_	_	_	_	165692	pañca	_
2	_	saṃvatsara	NOUN	NC	Case=Cpd	_	_	_	_	42265	saṃvatsara	_
3	_	maya	ADJ	JJ	Case=Acc|Gender=Masc|Number=Sing	_	_	_	_	109021	mayam	_
4-5	yugādhyakṣaṃ	_	_	_	_	_	_	_	_
4	_	yuga	NOUN	NC	Case=Cpd	_	_	_	_	64702	yuga	_


In [None]:
# Columns of the text-line table, in the order process_lines fills them.
text_line_fields = ['text_id', 'chapter_id', 'text_line', 'text_line_id',
                    'text_line_counter', 'text_line_subcounter']

# Columns of the word table: the tab-separated fields of a token row,
# followed by the id of the text line the token belongs to.
word_fields = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD',
               'DEPREL', 'DEPS', 'MISC', 'LEMMA_ID', 'unsandhied_form', 'semantic_id',
               'text_line_id']
# NOTE(review): this mapping is not applied by process_lines; every parsed
# value is stored as the string read from the file.
word_fields_dtype = {'LEMMA_ID':'int32', 'semantic_id':'int32',
                    'text_line_id':'int32'}

def process_lines(lines):
    """Parse the lines of one file into a text-line table and a word table.

    Lines starting with '#' are headers of the form ``<marker> <key>: <value>``.
    ``text_id`` and ``chapter_id`` are remembered and copied into every
    following text line. A ``text_line`` header starts a new text-line record,
    the other keys listed in ``text_line_fields`` are appended to it, and the
    record is stored when ``text_line_subcounter`` is seen. Every other
    non-empty line is a tab-separated token row.

    Parameters
    ----------
    lines : iterable of str
        The lines of one file, without trailing newlines. Empty lines are skipped.

    Returns
    -------
    (text_lines, words) : tuple of pandas.DataFrame
        ``text_lines`` has the columns of ``text_line_fields``; ``words`` has
        the columns of ``word_fields``. All parsed values are kept as strings.
        Token rows with only 10 fields get NaN in ``LEMMA_ID``,
        ``unsandhied_form`` and ``semantic_id``.

    Raises
    ------
    IndexError
        If a token row appears before the first complete text-line header.
    """
    text_lines = []
    words = []

    text_line = []
    chapter_id = None
    text_id = None
    for line in lines:
        if not line:
            continue
        if line.startswith('#'):
            # '<marker> <key>: <value>' -- the value may itself contain spaces.
            parts = line.split(' ', maxsplit=2)
            if len(parts) < 2:
                # a bare marker without a key carries no information
                continue
            key = parts[1][:-1] # remove ':'
            # a key without a value is recorded as an empty string
            value = parts[2] if len(parts) == 3 else ''

            if key == 'text_id':
                text_id = value
            elif key == 'chapter_id':
                chapter_id = value
            elif key == 'text_line':
                text_line = [text_id, chapter_id, value]
            elif key in text_line_fields:
                text_line.append(value)

            # text_line_subcounter is the last field of a record: store it
            if key == 'text_line_subcounter':
                text_lines.append(text_line)
            continue

        row = line.split('\t')
        # composite words do not have the last 3 fields
        if len(row) == 10:
            # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
            row = row + [np.nan, np.nan, np.nan]
        # add text_line_id (4th field of the most recently stored text line)
        row.append(text_lines[-1][3])
        words.append(row)

    text_lines = pd.DataFrame(text_lines, columns=text_line_fields)
    words = pd.DataFrame(words, columns=word_fields)
    return text_lines, words

In [None]:
def process_files(dir:Path):
    """Parse every ``*.conllu`` file located directly inside ``dir``.

    Each file is read as UTF-8, split into lines and passed to
    ``process_lines``; the per-file tables are then concatenated.
    The number of files found is printed.

    Parameters
    ----------
    dir : Path
        Directory to scan (not recursive).

    Returns
    -------
    (total_text_lines, total_words) : tuple of pandas.DataFrame
        The concatenated text-line and word tables. Each file's own row index
        is kept, so the index is not unique across files. Two empty
        DataFrames are returned when the directory has no ``*.conllu`` file.
    """
    # sorted() makes the row order independent of the file-system listing order
    files = sorted(dir.glob('*.conllu'))
    print(f"n. files:{len(files)}")

    text_line_frames = []
    word_frames = []
    for file in files:
        # explicit encoding: the texts contain non-ASCII characters
        lines = file.read_text(encoding='utf-8').split('\n')
        text_lines, words = process_lines(lines)
        text_line_frames.append(text_lines)
        word_frames.append(words)

    if not files:
        # pd.concat refuses an empty list
        return pd.DataFrame(), pd.DataFrame()

    # one concat per table instead of re-copying the accumulated frame per file
    return pd.concat(text_line_frames), pd.concat(word_frames)

In [None]:
# Convert the .conllu files that sit directly in the data directory and
# store both resulting tables as feather files in that same directory.
text_lines, words = process_files(sks.data_dir)

out_txt_ls = sks.data_dir / 'text_lines.feather'
out_words = sks.data_dir / 'words.feather'

for frame, target in ((text_lines, out_txt_ls), (words, out_words)):
    # replace the per-file row index with a fresh 0..n-1 index before writing
    frame.reset_index(drop=True).to_feather(target)

In [None]:
# Repeat the conversion for every sub-directory of the data directory,
# writing the two feather files into the sub-directory itself.
target_dirs = [f for f in sks.data_dir.iterdir() if f.is_dir()]

# `target_dir` rather than `dir`, so the builtin dir() stays usable in later cells.
for i, target_dir in enumerate(target_dirs):
    text_lines, words = process_files(target_dir)
    out_txt_ls = target_dir / 'text_lines.feather'
    out_words = target_dir / 'words.feather'

    text_lines.reset_index(drop=True).to_feather(out_txt_ls)
    words.reset_index(drop=True).to_feather(out_words)
    print(i)

In [None]:
# Merge the text-line table of the top-level files with the one stored in
# each sub-directory, drop duplicate rows and save the result.
target_dirs = [f for f in sks.data_dir.iterdir() if f.is_dir()]

text_line_frames = [pd.read_feather(sks.data_dir / 'text_lines.feather')]
text_line_frames += [pd.read_feather(d / 'text_lines.feather') for d in target_dirs]
# a single concat avoids re-copying the accumulated frame once per directory
text_lines_all = pd.concat(text_line_frames)
text_lines_all.drop_duplicates().reset_index(drop=True).to_feather('text_lines_all.feather')

In [2]:
# Merge the word table of the top-level files with the one stored in
# each sub-directory, drop duplicate rows and save the result.
target_dirs = [f for f in sks.data_dir.iterdir() if f.is_dir()]

word_frames = [pd.read_feather(sks.data_dir / 'words.feather')]
word_frames += [pd.read_feather(d / 'words.feather') for d in target_dirs]
# a single concat avoids re-copying the accumulated frame once per directory
words_all = pd.concat(word_frames)

words_all.drop_duplicates().reset_index(drop=True).to_feather('words_all.feather')

In [3]:
# Reload the merged tables and cast text_line_id, the column the query
# below matches words to text lines on, to integer in both of them.
id_as_int = {'text_line_id':'int'}
text_lines_all = pd.read_feather('text_lines_all.feather').astype(id_as_int)
words_all = pd.read_feather('words_all.feather').astype(id_as_int)

In [4]:
# Load the two CSV tables from the current working directory.
texts, chapters = map(pd.read_csv, ('texts.csv', 'chapters.csv'))

In [5]:
# DuckDB runs SQL directly against the tables built above.
# Example: find the text lines that contain both
#   * a word whose lemma is 'vajra' with 'Acc' (accusative) among its features, and
#   * a word tagged VERB with 'Tense=Pres' among its features,
# and report each line together with the name of its chapter.

import pandas as pd
import duckdb

query = """
select
    c.name as chapter, t.text_line_id, t.text_line
from
    text_lines_all t, chapters c
where
    c.id = t.chapter_id and
    exists
    (
        select *
        from words_all w 
        where w.LEMMA = 'vajra' and w.FEATS like '%Acc%' and w.text_line_id = t.text_line_id
    )
    and
    exists
    (
        select *
        from words_all w2
        where w2.UPOS = 'VERB' and w2.FEATS like '%Tense=Pres%' and w2.text_line_id = t.text_line_id
    )
"""

# run the query, then convert the result to a DataFrame for display
matches = duckdb.query(query)
matches.to_df()

Unnamed: 0,chapter,text_line_id,text_line
0,"YRā, 1",430829,vyāghrīkandagataṃ vajraṃ dolāyantreṇa pācayet
1,"YRā, 1",430833,tadgolake kṣipedvajraṃ ruddhvā gajapuṭe pacet
2,"YRā, 1",430842,sa bhīto mūtrayettatra tanmūtre vajramāvapet
3,"YRā, 1",430857,nīlaṃ nīlīrasair vajraṃ vinā śudhyati dolayā
4,"YRā, 1",430861,vajraṃ vinānyaratnāni mriyante'ṣṭapuṭaiḥ khalu
...,...,...,...
249,"ĀK, 2, 8",402908,piṣṭvā tadgolake vajraṃ pūrvapakvaṃ vinikṣipet
250,"ĀK, 2, 8",402912,tadgole nikṣipedvajraṃ sūtreṇāveṣṭayedbahiḥ
251,"ĀK, 2, 8",402918,tadgole nikṣipedvajraṃ nimbakārpāsakodravaiḥ
252,"ĀK, 2, 8",402924,vajraṃ tittirimāṃsena veṣṭitaṃ nikṣipenmukhe
