In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
def get_webpage(page_url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup document.

    Parameters
    ----------
    page_url : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so a stalled connection cannot hang the notebook
        forever (``requests`` applies no timeout unless one is given).

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``'html.parser'``.

    Raises
    ------
    Exception
        If the server answers with any status code other than 200.
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection errors or timeouts.
    """
    # Get the page from the given URL, bounded by the timeout
    response = requests.get(page_url, timeout=timeout)
    # Anything other than 200 is treated as a failure; report the status too
    if response.status_code != 200:
        raise Exception(
            'Failed to load page {} (HTTP status {})'.format(
                page_url, response.status_code
            )
        )
    # Parse the web page to beautiful soup
    return BeautifulSoup(response.text, 'html.parser')

In [None]:
# Source page: Wikipedia's list of Indian cities by population
wiki_url = 'https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population'
# Fetch and parse the page; `doc` is a notebook-level name read by later cells
doc = get_webpage(wiki_url)
# doc  (uncomment to display the whole parsed page)

In [16]:
# Select the <table> elements whose inline style is 'text-align:right'
tables = doc.find_all('table', style = 'text-align:right')

In [18]:
# How many tables matched the style filter (the recorded run shows 2)
len(tables)

2

In [49]:
# Exploration: collect every <tr> of the second matched table
datarow = tables[1].find_all('tr')
# Not the last expression of the cell, so this value is not displayed
len(datarow)
# Inspect one sample row (index 263) to see how a row is laid out
datarow[263]

<tr>
<td>309</td>
<td><a href="/wiki/Aurangabad,_Bihar" title="Aurangabad, Bihar">Aurangabad</a></td>
<td>101,520</td>
<td>79,393<sup class="reference" id="cite_ref-30"><a href="#cite_note-30"><span class="cite-bracket">[</span>28<span class="cite-bracket">]</span></a></sup></td>
<td><a href="/wiki/Bihar" title="Bihar">Bihar</a></td>
<td>
</td></tr>

In [53]:
# Exploration: anchors of the sample row. In the recorded output for this row
# they are, in order: the city link, a citation link ("[28]"), the state link.
taglist = datarow[263].find_all('a')
# First anchor holds the city name
city = taglist[0].text
city

# For this particular row the state is the third anchor, because the
# citation link sits in between
state = taglist[2].text
type(state)
# Only this last expression is displayed by the cell
state

'Bihar'

In [77]:
import pandas as pd
import numpy as np
def extract_data(webdoc):
    """Build a ``state -> [city, ...]`` mapping from a parsed page.

    Every ``<table>`` in ``webdoc`` whose style is ``'text-align:right'`` is
    scanned. The first ``<tr>`` of each table is skipped. For each remaining
    row the first ``<a>`` supplies the city name, and the first later ``<a>``
    whose text does not start with ``'['`` supplies the state name (anchors
    starting with ``'['`` are citation links such as ``[28]``).

    Parameters
    ----------
    webdoc : bs4.BeautifulSoup
        Parsed document to read; any object offering a compatible
        ``find_all`` works.

    Returns
    -------
    dict
        Maps each state name to the list of its cities, in page order.
        Cities whose state link is missing or empty are collected under the
        key ``np.nan``.
    """
    city_state_map = dict()

    # Read from the document that was passed in, not from a notebook global
    tables = webdoc.find_all('table', style='text-align:right')
    for table in tables:
        datarow = table.find_all('tr')
        # Skip the first row of every table (presumably the header row)
        for row in datarow[1:]:
            taglist = row.find_all('a')
            if not taglist:
                # No links at all: there is no city to record for this row
                continue
            city = taglist[0].text.strip()

            # Default when no usable state link follows the city link
            state = np.nan
            for tag in taglist[1:]:
                text = tag.text.strip()
                if text.startswith('['):
                    # Citation marker, keep looking for the state link
                    continue
                if text:
                    state = text
                break

            city_state_map.setdefault(state, []).append(city)

    return city_state_map
        

In [80]:
# Build the state -> cities mapping; despite its name, `json_file` holds the
# dict returned by extract_data, not a file object or path
json_file = extract_data(doc)

In [81]:
# Display the full state -> cities dict for inspection
json_file

{'Maharashtra': ['Mumbai',
  'Pune',
  'Nagpur',
  'Thane',
  'Pimpri-Chinchwad',
  'Nashik',
  'Kalyan-Dombivli',
  'Vasai-Virar',
  'Aurangabad',
  'Navi Mumbai',
  'Solapur',
  'Mira-Bhayandar',
  'Bhiwandi',
  'Amravati',
  'Nanded',
  'Kolhapur',
  'Ulhasnagar',
  'Sangli-Miraj & Kupwad',
  'Malegaon',
  'Jalgaon',
  'Akola',
  'Latur',
  'Dhule',
  'Ahmednagar',
  'Chandrapur',
  'Parbhani',
  'Ichalkaranji',
  'Jalna',
  'Ambarnath',
  'Bhusawal',
  'Panvel',
  'Badlapur',
  'Beed',
  'Gondia',
  'Satara',
  'Barshi',
  'Yavatmal',
  'Achalpur',
  'Osmanabad',
  'Nandurbar',
  'Wardha',
  'Udgir',
  'Hinganghat'],
 'Delhi': ['Delhi',
  'Kirari Suleman Nagar',
  'New Delhi',
  'Karawal Nagar',
  'Nangloi Jat',
  'Bhalswa Jahangir Pur',
  'Sultan Pur Majra'],
 'Karnataka': ['Bengaluru',
  'Hubli–Dharwad',
  'Mysore',
  'Gulbarga',
  'Mangalore',
  'Belgaum',
  'Davanagere',
  'Bellary',
  'Bijapur',
  'Shivamogga',
  'Tumkur',
  'Raichur',
  'Bidar',
  'Hospet',
  'Gadag-Betageri'

In [83]:
import json
# Write the mapping to disk as indented UTF-8 JSON; non-ASCII characters
# are written as-is rather than escaped
with open('city_state_map.json', mode='w', encoding='utf-8') as out_file:
    json.dump(json_file, out_file, indent=4, ensure_ascii=False)