In [2]:
import re
import requests
import requests_html
import string
import pandas as pd

In [3]:
CATEGORY = 'Category:Hindi_terms_with_IPA_pronunciation'
LIMIT = 500    #number of vocab or 'TITLE'
INITIAL_QUERY = f'https://en.wiktionary.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle={CATEGORY}&cmlimit={LIMIT}'
CONTINUE_TEMPLATE = string.Template(INITIAL_QUERY + "&cmcontinue=$cmcontinue")

PAGE_TEMPLATE = string.Template("https://en.wiktionary.org/wiki/$word")     # $_ -> placeholder for values of a dictionary. Template class of string library.

LI_SELECTOR = '//li[sup[a[@title = "Appendix:Hindi pronunciation"]] and span[@class = "IPA"]]' #language specific
SPAN_SELECTOR = '//span[@class = "IPA"]'       # what is this 'span' doing? 
PHONEMES = r"/(.+?)/"   #language specific


In [17]:
def _yield_phn(request):
    for li in request.html.xpath(LI_SELECTOR):
        for span in li.xpath(SPAN_SELECTOR):
            m = re.search(PHONEMES, span.text)
            print(m)
            if m:
                yield m


In [12]:
def _print_data(data):
    session = requests_html.HTMLSession()
    """" <example of data>
query":{"categorymembers":[{"pageid":81811,"ns":0,"title":"Homo sapiens"},
{"pageid":3072481,"ns":0,"title":"\u0901"}...."""
    
    for member in data["query"]["categorymembers"]:
        word = member["title"]   
        # Skips multiword examples.
        if " " in word:
            continue
        query = PAGE_TEMPLATE.substitute(word=word)
        request = session.get(query)  #Grab a list of all links on the page
        print(request)
        for m in _yield_phn(request):
            pron = m.group(1)   #IPA without //. m.group(0) =  /IPA/ 
            print(f"{word}\t{pron}")
          

In [48]:
def main():
    data = requests.get(INITIAL_QUERY).json() #In case the JSON decoding fails, r.json() raises an exception
    _print_data(data)
    code = data["continue"]["cmcontinue"]     # next page's number 
    """ <example>
    continue":{"cmcontinue":"page|414245494c4c45204d4552450a414245494c4c45204dc3885245|6478110","continue":"-||"}
    """ 
    next_query = CONTINUE_TEMPLATE.substitute(cmcontinue=code)
    while True:
        
        data = requests.get(next_query).json()
        _print_data(data)
        # Then this is the last one.
        
        if not "continue" in data:
            break
        code = data["continue"]["cmcontinue"]    #assign a new next page's number to code
        next_query = CONTINUE_TEMPLATE.substitute(cmcontinue=code)
        
if __name__ == "__main__":
    main()

In [8]:
data = requests.get(INITIAL_QUERY).json() 

In [18]:
_print_data(data)

<Response [200]>
None
<Response [200]>
None
None
None
None
None
<Response [200]>
<_sre.SRE_Match object; span=(0, 3), match='/ə/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 4), match='/ə̃/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 13), match='/ə̃ɡ.ɾeː.ziː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 9), match='/ə̃ʈ.nɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 4), match='/ə̃/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 5), match='/əŋk/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/əŋ.kən/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/əŋ.kəl/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 9), match='/əŋ.kɪt̪/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/əŋ.kʊɾ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 12), match='/əŋ.kʊ.ɾɪt̪/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/əŋ.kʊʃ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 5), match='/əŋɡ/'>
<Respo

<_sre.SRE_Match object; span=(0, 17), match='/əx.bɑːɾ.ʋɑː.lɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 10), match='/əkʰ.ɾoːʈ/'>
<_sre.SRE_Match object; span=(0, 9), match='/əx.ɾoːʈ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 12), match='/ə.kʰɑː.ɽɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/ə.kʰɪl/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 11), match='/ə.ɡə.nɪt̪/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 7), match='/ə.ɡəɾ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 13), match='/ə.ɡəɾ.t͡ʃeː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 15), match='/ə.ɡəɾ.t̪ə.lɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 17), match='/ə.ɡəɾ.bət̪.t̪iː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/əɡ.lɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 9), match='/ə.ɡəst̪/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 7), match='/ə.ɡɪn/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8)

<Response [200]>
<_sre.SRE_Match object; span=(0, 26), match='/ə.d̪ʱoː.ɦəs.t̪ɑːk.ʃə.ɾiː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 16), match='/əd̪ʱ.jəkʃ.t̪ɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 13), match='/əd̪ʱ.jə.jən/'>
<_sre.SRE_Match object; span=(0, 10), match='/əd̪ʱ.jən/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 16), match='/əd̪ʱ.jɑː.d̪eːʃ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 14), match='/əd̪ʱ.jɑː.pək/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 4), match='/ən/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 11), match='/ən.kə.ɦɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 12), match='/ən.ɡɪ.nət̪/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 14), match='/ən.t͡ʃɑː.ɦɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 14), match='/ən.d͡ʒɑː.nɑː/'>
<Response [200]>
None
<Response [200]>
<_sre.SRE_Match object; span=(0, 10), match='/ən.pət͡ʃ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 9), ma

<Response [200]>
<_sre.SRE_Match object; span=(0, 12), match='/ə.bɪ.t͡ʃəl/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 14), match='/ə.bɪ.ʋɑːt̪t̪/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/ə.biːɾ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/əb.bɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 5), match='/əbɾ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/əb.ɾuː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 9), match='/ə.bʱɑːʋ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 16), match='/ə.bʱɪ.kə.lɪt̪ɾ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 15), match='/ə.bʱɪ.kɑː.ɾək/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 16), match='/ə.bʱɪk.ɾiː.jɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 11), match='/ə.bʱɪ.ɡəm/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 17), match='/ə.bʱɪd̪.ɾɪʃ.jək/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 11), match='/ə.bʱɪ.nəj/'>
<Response 

<Response [200]>
<_sre.SRE_Match object; span=(0, 12), match='/əʋ.loː.kən/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 12), match='/əʋ.ʃoː.ʃən/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 13), match='/əʋ.ʃoː.ʃɪt̪/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/ə.ʋəʃj/'>
<Response [200]>
<Response [200]>
<_sre.SRE_Match object; span=(0, 10), match='/əʋ.ɦəʈʈʰ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 13), match='/əʋ.ɦeːl.nɑː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 9), match='/ə.ʋɑːiː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 8), match='/ə.ʋɑːm/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 9), match='/ə.ʋɑːɾɖ/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 14), match='/ə.ʋɪ.kɑː.ɾiː/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 12), match='/ə.ʋɪ.t͡ʃəl/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 10), match='/ə.ʋɪ.nəj/'>
<Response [200]>
<_sre.SRE_Match object; span=(0, 11), match='/ə.ʋɪ.ləmb/'>
<