In [1]:
import re
import requests
import requests_html
import string
import pandas as pd

In [2]:
CATEGORY = 'Category:Hindi_terms_with_IPA_pronunciation'
LIMIT = 500    #number of vocab or 'TITLE'
INITIAL_QUERY = f'https://en.wiktionary.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle={CATEGORY}&cmlimit={LIMIT}'
CONTINUE_TEMPLATE = string.Template(INITIAL_QUERY + "&cmcontinue=$cmcontinue")

PAGE_TEMPLATE = string.Template("https://en.wiktionary.org/wiki/$word")     # $_ -> placeholder for values of a dictionary. Template class of string library.

LI_SELECTOR = '//li[sup[a[@title = "Appendix:Hindi pronunciation"]] and span[@class = "IPA"]]' #language specific
SPAN_SELECTOR = '//span[@class = "IPA"]'       # what is this 'span' doing? 
PHONEMES = r"/(.+?)/"   #language specific


In [16]:
def _yield_phn(request):
    for li in request.html.xpath(LI_SELECTOR):
        for span in li.xpath(SPAN_SELECTOR):
            print(span.text, '\n\n')
            m = re.search(PHONEMES, span.text)
            if m:
                yield m

                
                
            #(<a href="/wiki/Appendix:Hindi_pronunciation" title="Appendix:Hindi pronunciation">key</a>)</sup>:&#32;<span class="IPA">/ə.səm.bʱɑː.ʋɪt̪/</span></li></ul>

In [18]:
def _print_data(data):
    session = requests_html.HTMLSession()
    """" <example of data>
query":{"categorymembers":[{"pageid":81811,"ns":0,"title":"Homo sapiens"},
{"pageid":3072481,"ns":0,"title":"\u0901"}...."""
    
    for member in data["query"]["categorymembers"]:
        word = member["title"]   
        # Skips multiword examples.
        if " " in word:
            continue
        query = PAGE_TEMPLATE.substitute(word=word)
        request = session.get(query)  #Grab a list of all links on the page
        for m in _yield_phn(request):
            pron = m.group(1)   #IPA without //. m.group(0) =  /IPA/ 
            #print(f"{word}\t{pron}")
          

In [19]:
def main():
    data = requests.get(INITIAL_QUERY).json()
    print('Data is requested, and now put to _print_data function.')
    _print_data(data)
    print('Initial query is processed.')
    code = data["continue"]["cmcontinue"]  
    next_query = CONTINUE_TEMPLATE.substitute(cmcontinue=code)
    i = 1
    while True:
        print('while loop is started.')
        data = requests.get(next_query).json()
        print(f'{i+1} request is done')
        _print_data(data)
        print(f'{i+1} request query is processed.')
        i += 1    
        # Then this is the last one.
        
        if not "continue" in data:
            break
        code = data["continue"]["cmcontinue"]   
        next_query = CONTINUE_TEMPLATE.substitute(cmcontinue=code)
        
if __name__ == "__main__":
    main()

Data is requested, and now put to _print_data function.
[ə̃] 


[ə̃ŋ] 


[ə̃ɲ] 


[ə̃ɳ] 


[ə̃n] 


[ə̃m] 


/ə/ 


/ə̃/ 


/ə̃ɡ.ɾeː.ziː/ 


/ə̃ʈ.nɑː/ 


/ə̃/ 


/əŋk/ 


/əŋ.kən/ 


/əŋ.kəl/ 


/əŋ.kɪt̪/ 


/əŋ.kʊɾ/ 


/əŋ.kʊ.ɾɪt̪/ 


/əŋ.kʊʃ/ 


/əŋɡ/ 


/əŋ.ɡət͡ʃ.t͡ʃʰeːd̪/ 


/əŋɡ.ɽɑːiː/ 


/əŋɡ.d̪ɑːn/ 


/əŋ.ɡə.nɑː/ 


/əŋ.ɡʱiːn/ 


/əŋ.ɡɑːɾ/ 


/əŋ.ɡɑː.ɾɑː/ 


/əŋ.ɡiː.jɑː/ 


/əŋ.ɡiː.ʈʰiː/ 


/əŋ.ɡʊl/ 


/əŋ.ɡʊ.liː/ 


/əŋ.ɡuː.ʈʰɑː/ 


/əŋ.ɡuːɾ/ 


/əŋ.ɡoː.t͡ʃʰɑː/ 


/əŋ.ɡoːʈ/ 


/əŋɡ.ɾeːz/ 


/əŋɡ.ɾeː.ziː.jət̪/ 


/əŋɡ.ɾeː.ziː/ 


/əŋɡ.ɾeː.d͡ʒiː/ 


/əɲ.t͡ʃəl/ 


/əɲ.t͡ʃʰəɾ/ 


/əɲ.d͡ʒən/ 


/əɲ.d͡ʒɑːm/ 


/əɲ.d͡ʒiːɾ/ 


/əɲ.d͡ʒuː/ 


/ənʈ.sənʈ/ 


/ən.ʈɑːɾ.kə.ʈɪ.kɑː/ 


[ə̃ɳɖəkoʃ] 


/ən.ɖɑː/ 


/ən.ɖoː.ɾɑː/ 


/ənt̪/ 


/ən.t̪əʰk.ʃeːp/ 


/ənt̪.ɽiː/ 


/ən.t̪ə.ɾəŋɡ.t̪ɑː/ 


/ən.t̪əɾ.d͡ʒɑːl/ 


/ən.t̪əɾ.ɾɑːʃʈ.ɾiː.jə.t̪ɑː/ 


/ənt̪.ɾɑː/ 


/ən.t̪ə.ɾɑː/ 


/ən.t̪ə.ɾɑːl/ 


/ənt̪.ɾɪkʃ/ 


/ənt̪.ɾɪm/ 


/ən.t̪əɾ.ɡət̪/ 


/ən.t̪əɾ.d͡ʒɑː.t̪iːj/ 


/ən.t̪əɾ.d͡ʒɑːl/ 


/ən.t̪əɾ.d̪eː.ʃiː

/əs.t̪ʰɪ.pəɲ.d͡ʒəɾ/ 


/əs.pə.t̪ɑːl/ 


/əˈspɾɪʃ.jə/ 


/əs.mɪ.t̪ɑː/ 


/əs.siː/ 


/ɛːʱm/ 


/eː.ɦeːŋ.kɑːɾ/ 


/ɛːʱ.bɑːb/ 


/ɛːʱm/ 


/ɛːʱ.miː.jət̪/ 


/ə.ɦɑː.t̪ɑː/ 


/ə.ɦɪt͡ʃ.t͡ʃʰət̪ɾ/ 


/ə.ɦɪ.nɑːʱ/ 


/ə.ɦɪbʱ.ɾɪt̪/ 


/ə.ɦɪɾ.bʊʃn/ 


/ə.ɦɪ.ʋɑːt̪/ 


/ə.ɦɪ.ʋɑːd̪/ 


/ɑː/ 


/aː/ 


/a/ 


/ɑː/ 


[äː] 


/ɑ̃ːkʰ/ 


/ɑ̃ː.ɡən/ 


/ɑ̃ːt͡ʃ/ 


/ɑ̃ː.suː/ 


/ɑːŋk.ɽɑː/ 


/ɑːŋ.ɡən/ 


/ɑːnt̪.ɾɪk/ 


/ɑːn.d̪oː.lən/ 


/ɑ̃ːʋ.lɑː/ 


/ɑːn.ʃɪk/ 


/ɑːɪn.d̪ɑː/ 


/ɑːɪ.zoːl/ 


/ɑːɪs.kɾiːm/ 


/ɑːɪsɖ/ 


/ɑːiː/ 


/ɑːiː.lɑː/ 


/ɑːuː/ 


/ɑːk/ 


/ɑː.kəɾ.ʃək/ 


/ɑː.kəɾ.ʃən/ 


/ɑː.kəɾ.ʃɪt̪/ 


/ɑː.kə.lən/ 


/ɑː.kəs.mɪk/ 


/ɑː.kɑːŋk.ʃɑː/ 


/ɑː.kɑːŋk.ʃiː/ 


/ɑː.kɑːɾ/ 


[aːkaːʃ] 


/ɑː.kɑːʃ.ʋɑː.niː/ 


/ɑːk.ɾɪ.t̪iː/ 


/ɑːk.ɾə.mən/ 


/ɑːk.ɾə.mən.kɑː.ɾiː/ 


/ɑːk.ɾɑː.mək/ 


/ɑːk.ɾɑː.mək.t̪ɑː/ 


/ɑːk.ɾoːʃ/ 


/ɑː.kʰɪɾ/ 


/ɑː.xɪɾ/ 


/ɑː.kʰɪɾ.kɑːɾ/ 


/ɑː.xɪɾ.kɑːɾ/ 


/ɑːkʰ.ɾiː/ 


/ɑːx.ɾiː/ 


/ɑː.kʰɪ.ɾiː/ 


/ɑː.xɪ.ɾiː/ 


/ɑːɡ/ 


/aːɡ.zə.niː/ 


/ɑː.ɡɑːz/ 


/ɑː.ɣɑːz/ 


/ɑː.ɡoːʃ/ 




/ʊɾf/ 


/ʊɾ.ʋə.ɾək/ 


/ʊl.d͡ʒʱən/ 


/ʊ.ləd͡ʒʱ.nɑː/ 


/ʊl.d͡ʒʱɑː.nɑː/ 


/ʊ.ləʈ.nɑː/ 


/ʊl.ʈɑː/ 


/ʊl.ʈiː/ 


[ʊlfət̪] 


/ˈʊl.ɦnɑː/ 


/ʊ.liːt͡ʃ.nɑː/ 


/ʊ.luːk/ 


/ʊl.kɑː/ 


/ʊl.ʈɑː/ 


/ʊl.ʈiː/ 


/ʊl.ləŋ.ɡʱən/ 


/ʊl.leːkʰ/ 


/ʊl.leːkʰ.niːj/ 


[ʊskaː] 


/ʊs.koː/ 


/ʊs.mɑːn/ 


/ʊs.seː/ 


/ʊ.suːl/ 


/ʊ.seː/ 


/ʊs.t̪ɑːd̪/ 


/uː/ 


/ũː.t͡ʃɑː/ 


/uː.t͡ʃɑː/ 


/ũːʈ/ 


/ũˈt͡ʃa/ 


/uːkʰ/ 


/uːd̪.bɪ.lɑːʋ/ 


/uːn/ 


/uː.nɑː/ 


/uː.niː/ 


/uː.pəɾ.ʋɑː.lɑː/ 


/uːp.ɾiː/ 


/uː.fɑː/ 


/uːb/ 


/uː.bəɽ.kʰɑː.bəɽ/ 


/uːb.nɑː/ 


/uːɾ.d͡ʒɑː/ 


/uːʃ.miːk.ɾən/ 


/uː.səɾ/ 


/ɾiː/ 


[ˈɾɪɡʋed̪] 


/ɾɪn/ 


/ɾɪn.d̪ɑː.t̪ɑː/ 


/ɾɪn.d̪ɑː.t̪ɑːoː/ 


/ɾɪn.d̪ɑː.t̪ɑːoːn/ 


/ɾɪ.nɑːɡɾ/ 


/ɾɪ.nɑː.jən/ 


/ɾɪ.ʃiː/ 


/eː/ 


/eː/ 


/eːnɖ.ɾɔːɪɖ/ 


/eːk.eːk/ 


/eːk.ɡʱɑː.t̪iːj/ 


/eːk.d͡ʒʊʈ/ 


/eːk.d͡ʒʊʈ.t̪ɑː/ 


/eː.kəɽ/ 


/eː.kəɾ/ 


/eːk.t̪əɾ.fɑː/ 


/eː.kət̪ɾ/ 


/eːk.ɾuːp.t̪ɑː/ 


/eːk.ʋə.t͡ʃən/ 


/eːk.ʋəɾ.niːj/ 


/eː.kɑːnt̪/ 


/eː.kɑːeːk/ 


/eː.kɑːd̪ʱ/ 


/eː.kɑː.d̪ʱɪ.k

KeyboardInterrupt: 

In [8]:
data = requests.get(INITIAL_QUERY).json() 

In [None]:
_print_data(data)   #check wheather the html format. 


In [48]:
def main():
    data = requests.get(INITIAL_QUERY).json() #In case the JSON decoding fails, r.json() raises an exception
    _print_data(data)
    code = data["continue"]["cmcontinue"]     # next page's number 
    """ <example>
    continue":{"cmcontinue":"page|414245494c4c45204d4552450a414245494c4c45204dc3885245|6478110","continue":"-||"}
    """ 
    next_query = CONTINUE_TEMPLATE.substitute(cmcontinue=code)
    while True:
        
        data = requests.get(next_query).json()
        _print_data(data)
        # Then this is the last one.
        
        if not "continue" in data:
            break
        code = data["continue"]["cmcontinue"]    #assign a new next page's number to code
        next_query = CONTINUE_TEMPLATE.substitute(cmcontinue=code)
        
if __name__ == "__main__":
    main()