In [3]:
import re
import string
import urllib.parse

import requests
import requests_html

In [24]:
# Wiktionary category whose member pages are scraped.
CATEGORY = "Category:Hindi_terms_with_IPA_pronunciation"
# Sent as `cmlimit`: the number of category members requested per API call.
LIMIT = 500
# First categorymembers request (no continuation token yet).
INITIAL_QUERY = (
    "https://en.wiktionary.org/w/api.php"
    "?action=query&format=json&list=categorymembers"
    f"&cmtitle={CATEGORY}&cmlimit={LIMIT}"
)
# Follow-up request: the first query plus a continuation token; $cmcontinue is
# the placeholder filled in through Template.substitute().
CONTINUE_TEMPLATE = string.Template(f"{INITIAL_QUERY}&cmcontinue=$cmcontinue")

# URL of the entry page for a single title; $word is the placeholder.
PAGE_TEMPLATE = string.Template("https://en.wiktionary.org/wiki/$word")

# Language specific: <li> items holding both a link to the Hindi pronunciation
# appendix (inside a <sup>) and an IPA <span>.
LI_SELECTOR = (
    '//li[sup[a[@title = "Appendix:Hindi pronunciation"]]'
    ' and span[@class = "IPA"]]'
)
# Run against each matched <li> to pick out its IPA <span> elements.
SPAN_SELECTOR = '//span[@class = "IPA"]'
# Language specific: captures the shortest text found between two slashes.
PHONEMES = r"/(.+?)/"


In [23]:
def _yield_phn(request):
    for li in request.html.xpath(LI_SELECTOR):
        for span in li.xpath(SPAN_SELECTOR):
            m = re.search(PHONEMES, span.text)
            if m:
                yield m


In [38]:
def _print_data(data):
    session = requests_html.HTMLSession()
    """" <example of data>
query":{"categorymembers":[{"pageid":81811,"ns":0,"title":"Homo sapiens"},
{"pageid":3072481,"ns":0,"title":"\u0901"}...."""

    for member in data["query"]["categorymembers"]:
        word = member["title"]   
        # Skips multiword examples.
        if " " in word:
            continue
        query = PAGE_TEMPLATE.substitute(word=word)
        request = session.get(query)  #Grab a list of all links on the page
        for m in _yield_phn(request):
            pron = m.group(1)   #IPA without /IPA/. m.group(0) =  /IPA/ 
            print(f"{word}\t{pron}")
            
            


In [39]:
def main():
    """Walk the whole category listing and print the pronunciations of each page.

    Starts with INITIAL_QUERY and, for as long as the decoded response has a
    "continue" entry, requests the next page through CONTINUE_TEMPLATE. Every
    decoded response is handed to _print_data().

    Raises:
        requests.HTTPError: if the API answers with an error status.
        Exception from ``Response.json()`` if a body cannot be decoded as JSON.
    """
    next_query = INITIAL_QUERY
    while True:
        response = requests.get(next_query)
        response.raise_for_status()
        data = response.json()
        _print_data(data)
        # No "continue" entry: this was the last page. The check runs for the
        # first response too, so a category that fits in one page ends cleanly.
        if "continue" not in data:
            break
        # Example of the entry being read:
        #   "continue": {"cmcontinue": "page|4142...5245|6478110", "continue": "-||"}
        code = data["continue"]["cmcontinue"]  # token identifying the next page
        next_query = CONTINUE_TEMPLATE.substitute(cmcontinue=code)
        
# Entry point: start the scrape only when this code runs as the main module.
if __name__ == "__main__":
    main()

None
None
None
None
None
None
<_sre.SRE_Match object; span=(0, 3), match='/ə/'>
अ	ə
<_sre.SRE_Match object; span=(0, 4), match='/ə̃/'>
अँ	ə̃
<_sre.SRE_Match object; span=(0, 13), match='/ə̃ɡ.ɾeː.ziː/'>
अँग्रेज़ी	ə̃ɡ.ɾeː.ziː
<_sre.SRE_Match object; span=(0, 9), match='/ə̃ʈ.nɑː/'>
अँटना	ə̃ʈ.nɑː
<_sre.SRE_Match object; span=(0, 4), match='/ə̃/'>
अं	ə̃
<_sre.SRE_Match object; span=(0, 5), match='/əŋk/'>
अंक	əŋk
<_sre.SRE_Match object; span=(0, 8), match='/əŋ.kən/'>
अंकन	əŋ.kən
<_sre.SRE_Match object; span=(0, 8), match='/əŋ.kəl/'>
अंकल	əŋ.kəl
<_sre.SRE_Match object; span=(0, 9), match='/əŋ.kɪt̪/'>
अंकित	əŋ.kɪt̪
<_sre.SRE_Match object; span=(0, 8), match='/əŋ.kʊɾ/'>
अंकुर	əŋ.kʊɾ
<_sre.SRE_Match object; span=(0, 12), match='/əŋ.kʊ.ɾɪt̪/'>
अंकुरित	əŋ.kʊ.ɾɪt̪
<_sre.SRE_Match object; span=(0, 8), match='/əŋ.kʊʃ/'>
अंकुश	əŋ.kʊʃ
<_sre.SRE_Match object; span=(0, 5), match='/əŋɡ/'>
अंग	əŋɡ
<_sre.SRE_Match object; span=(0, 19), match='/əŋ.ɡət͡ʃ.t͡ʃʰeːd̪/'>
अंगच्छेद	əŋ.ɡət͡ʃ.t͡ʃʰeːd̪
<_sre.SRE_Match 

KeyboardInterrupt: 