In [1]:
import requests as rq
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import time

In [2]:
# Request headers shared by every `rq.get` call in this notebook; they carry a
# browser-style User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

In [3]:
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed as a ``BeautifulSoup`` tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    """
    r = rq.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page as
    # if it were real content.
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html.parser')

In [4]:
def get_all_links(soup):
    """Return every hyperlink in ``soup`` as an absolute villageinfo.in URL.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page whose ``<a>`` elements are scanned.

    Returns
    -------
    list of str
        One ``https://villageinfo.in/...`` URL per anchor that has an
        ``href`` attribute, in document order.
    """
    listlink = []
    for link in soup.find_all('a'):
        part = link.get('href')
        if part is None:
            # Anchors without an href (e.g. named anchors) carry no target.
            continue
        # Insert the separating slash only when the href does not bring its
        # own, so 'x.html' and '/x.html' both map to '.../x.html'.
        separator = '' if part.startswith('/') else '/'
        listlink.append(f'https://villageinfo.in{separator}{part}')
    return listlink

In [5]:
def get_all_links_table(soup):
    """Collect the links found in the cells of the first table of ``soup``.

    For every ``<td>`` of every row, the first ``<a>`` inside the cell (if
    any) is turned into an absolute villageinfo.in URL.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page.

    Returns
    -------
    list of str
        Absolute URLs in row/cell order. Empty when the page has no table.
    """
    table = soup.find('table')
    if table is None:
        # Nothing to scan; report "no links" rather than failing on None.
        return []

    # Some tables are written without an explicit <tbody>; in that case the
    # rows are looked up on the table element itself.
    container = table.find('tbody')
    if container is None:
        container = table

    listlink = []
    for tr in container.find_all('tr'):
        for td in tr.find_all('td'):
            link = td.find('a')
            if link is None:
                continue
            part = link.get('href')
            if part is None:
                # Anchor without a target.
                continue
            # Add the separating slash only when the href lacks one.
            separator = '' if part.startswith('/') else '/'
            listlink.append(f'https://villageinfo.in{separator}{part}')
    return listlink

In [6]:
def construct_table(table):
    """Convert a parsed ``<table>`` element into a list of rows of cell text.

    Parameters
    ----------
    table : bs4.element.Tag
        The ``<table>`` element to read.

    Returns
    -------
    list of list of str
        If the table has a ``<thead>`` containing ``<th>`` cells, the first
        row holds their texts; the remaining rows hold the ``<td>`` texts of
        each body row. Cell text is returned as-is (not stripped).
    """
    head = table.find('thead')
    headlist = [th.text for th in head.find_all('th')] if head is not None else []

    body = table.find('tbody')
    if body is not None:
        table_body = [
            [td.text for td in tr.find_all('td')]
            for tr in body.find_all('tr')
        ]
    else:
        # No <tbody>: read the rows straight from the table. Rows without any
        # <td> (such as a header row made only of <th>) are skipped so the
        # header is not repeated as an empty data row.
        table_body = []
        for tr in table.find_all('tr'):
            cells = [td.text for td in tr.find_all('td')]
            if cells:
                table_body.append(cells)

    if headlist:
        return [headlist] + table_body
    return table_body

# get states

In [7]:
# Fetch the site's landing page and turn every anchor on it into an absolute URL.
url = 'https://villageinfo.in/'
soup = get_soup(url)
states_list = get_all_links(soup)
# Bare expression: shown as the cell output.
states_list

['https://villageinfo.in/andaman-&-nicobar-islands.html',
 'https://villageinfo.in/andhra-pradesh.html',
 'https://villageinfo.in/arunachal-pradesh.html',
 'https://villageinfo.in/assam.html',
 'https://villageinfo.in/bihar.html',
 'https://villageinfo.in/chandigarh.html',
 'https://villageinfo.in/chhattisgarh.html',
 'https://villageinfo.in/dadra-&-nagar-haveli.html',
 'https://villageinfo.in/daman-&-diu.html',
 'https://villageinfo.in/delhi.html',
 'https://villageinfo.in/goa.html',
 'https://villageinfo.in/gujarat.html',
 'https://villageinfo.in/haryana.html',
 'https://villageinfo.in/himachal-pradesh.html',
 'https://villageinfo.in/jammu-&-kashmir.html',
 'https://villageinfo.in/jharkhand.html',
 'https://villageinfo.in/karnataka.html',
 'https://villageinfo.in/kerala.html',
 'https://villageinfo.in/lakshadweep.html',
 'https://villageinfo.in/madhya-pradesh.html',
 'https://villageinfo.in/maharashtra.html',
 'https://villageinfo.in/manipur.html',
 'https://villageinfo.in/meghalaya.

# get districts in a state

In [8]:
# Example for one state: the links in the first table of the Maharashtra page.
url2 = 'https://villageinfo.in/maharashtra.html'
soup2 = get_soup(url2)
dist_list = get_all_links_table(soup2)
# Bare expression: shown as the cell output.
dist_list

['https://villageinfo.in/maharashtra/ahmadnagar.html',
 'https://villageinfo.in/maharashtra/akola.html',
 'https://villageinfo.in/maharashtra/amravati.html',
 'https://villageinfo.in/maharashtra/aurangabad.html',
 'https://villageinfo.in/maharashtra/bhandara.html',
 'https://villageinfo.in/maharashtra/bid.html',
 'https://villageinfo.in/maharashtra/buldana.html',
 'https://villageinfo.in/maharashtra/chandrapur.html',
 'https://villageinfo.in/maharashtra/dhule.html',
 'https://villageinfo.in/maharashtra/gadchiroli.html',
 'https://villageinfo.in/maharashtra/gondiya.html',
 'https://villageinfo.in/maharashtra/hingoli.html',
 'https://villageinfo.in/maharashtra/jalgaon.html',
 'https://villageinfo.in/maharashtra/jalna.html',
 'https://villageinfo.in/maharashtra/kolhapur.html',
 'https://villageinfo.in/maharashtra/latur.html',
 'https://villageinfo.in/maharashtra/mumbai.html',
 'https://villageinfo.in/maharashtra/mumbai-suburban.html',
 'https://villageinfo.in/maharashtra/nagpur.html',
 'h

# Get talukas in a district

In [9]:
# Example for one district: the links in the first table of the Nashik page.
# Note that `url2` and `soup2` from the state example are overwritten here.
url2 = 'https://villageinfo.in/maharashtra/nashik.html'
soup2 = get_soup(url2)
talukas_list = get_all_links_table(soup2)
# Bare expression: shown as the cell output.
talukas_list

['https://villageinfo.in/maharashtra/nashik/baglan.html',
 'https://villageinfo.in/maharashtra/nashik/chandvad.html',
 'https://villageinfo.in/maharashtra/nashik/deola.html',
 'https://villageinfo.in/maharashtra/nashik/dindori.html',
 'https://villageinfo.in/maharashtra/nashik/igatpuri.html',
 'https://villageinfo.in/maharashtra/nashik/kalwan.html',
 'https://villageinfo.in/maharashtra/nashik/malegaon.html',
 'https://villageinfo.in/maharashtra/nashik/nandgaon.html',
 'https://villageinfo.in/maharashtra/nashik/nashik.html',
 'https://villageinfo.in/maharashtra/nashik/niphad.html',
 'https://villageinfo.in/maharashtra/nashik/peint.html',
 'https://villageinfo.in/maharashtra/nashik/sinnar.html',
 'https://villageinfo.in/maharashtra/nashik/surgana.html',
 'https://villageinfo.in/maharashtra/nashik/trimbakeshwar.html',
 'https://villageinfo.in/maharashtra/nashik/yevla.html']

# get villages in a taluka

In [10]:
# Example for one taluka: the links in the first table of the Niphad page.
url3 = 'https://villageinfo.in/maharashtra/nashik/niphad.html'
soup3 = get_soup(url3)
vill_list = get_all_links_table(soup3)
# Bare expression: shown as the cell output.
vill_list

['https://villageinfo.in/maharashtra/nashik/niphad/ahergaon.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/amrut-nagar.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/antarweli.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/aurangpur.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/banganganagar.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/behed.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/bharwas.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/bhendali.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/bhuse.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/bokaddare.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/bramhangaon-vanas.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/bramhangaon-vinchur.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/bramhanwade.html',
 'https://villageinfo.in/maharashtra/nashik/niphad/chandori.html',
 'https://villageinfo.in/maharashtr

# filter out now 

## states

In [11]:
# Keep everything except the last four links collected from the landing page.
# NOTE(review): presumably those four are non-state links (footer/navigation) —
# TODO confirm against the full `states_list` output; a positional cut breaks
# silently if the page layout changes.
states_list_f = states_list[:-4]
states_list_f

['https://villageinfo.in/andaman-&-nicobar-islands.html',
 'https://villageinfo.in/andhra-pradesh.html',
 'https://villageinfo.in/arunachal-pradesh.html',
 'https://villageinfo.in/assam.html',
 'https://villageinfo.in/bihar.html',
 'https://villageinfo.in/chandigarh.html',
 'https://villageinfo.in/chhattisgarh.html',
 'https://villageinfo.in/dadra-&-nagar-haveli.html',
 'https://villageinfo.in/daman-&-diu.html',
 'https://villageinfo.in/delhi.html',
 'https://villageinfo.in/goa.html',
 'https://villageinfo.in/gujarat.html',
 'https://villageinfo.in/haryana.html',
 'https://villageinfo.in/himachal-pradesh.html',
 'https://villageinfo.in/jammu-&-kashmir.html',
 'https://villageinfo.in/jharkhand.html',
 'https://villageinfo.in/karnataka.html',
 'https://villageinfo.in/kerala.html',
 'https://villageinfo.in/lakshadweep.html',
 'https://villageinfo.in/madhya-pradesh.html',
 'https://villageinfo.in/maharashtra.html',
 'https://villageinfo.in/manipur.html',
 'https://villageinfo.in/meghalaya.

# mass scraping

In [12]:
# NOTE: the whole crawl below is commented out, so nothing in this cell runs.
# It walks state -> district -> taluka pages, gathers the village links and
# appends them to one text file per state.
# NOTE(review): `villages_list_mega` is never reset between states and the file
# is opened in append mode, so as written each state's file would also receive
# the villages of every state processed before it. Fix before re-enabling.

# district_list_mega = []
# talukas_list_mega = []
# villages_list_mega = []

# for state_link in states_list_f:
#     # print(state_link)
#     soup = get_soup(state_link)
#     district_list = get_all_links_table(soup)
#     district_list_mega.extend(district_list)  

#     for district_link in district_list:
#         # print(district_link)
#         soup = get_soup(district_link)
#         talukas_list = get_all_links_table(soup)
#         talukas_list_mega.extend(talukas_list) 

#         for talukas_link in talukas_list:
#             # print(talukas_link)
#             soup = get_soup(talukas_link)
#             villages_list = get_all_links_table(soup)
#             villages_list_mega.extend(villages_list)  
        
#         time.sleep(5)
#     time.sleep(10)
#     filename = state_link.replace('/','.')
#     filename = filename.split('.')[-2]
#     with open(f'{filename}.txt', 'a') as file:  
#         for village in villages_list_mega:
#             file.write(f"{village}\n")
#     print(f'file : {filename} written successfully .\n')

# print(len(district_list_mega))
# print(len(talukas_list_mega))
# print(len(villages_list_mega))

# code to collect data in detail 

In [13]:
# Sample village page used by the following cells to explore the detail data.
url = "https://villageinfo.in/maharashtra/nashik/niphad/ahergaon.html"

In [14]:
# Download the sample page with the shared request headers.
r = rq.get(url,headers = headers)

In [15]:
# Parse the downloaded HTML; this rebinds the notebook-level `soup`.
soup = BeautifulSoup(r.text,'html.parser')

In [16]:
# Despite the plural name, `find` returns only the first <table> of the page.
tables = soup.find('table')

In [17]:
# Store the extracted rows under their own name instead of rebinding `tables`:
# the cell can then be re-run without first re-running the cell that selects
# the table (previously the second run passed a list to `construct_table`).
table_rows = construct_table(tables)
tables_df = pd.DataFrame(table_rows)
tables_df

Unnamed: 0,0,1,2,3
0,Particulars,Total,Male,Female
1,Total Population,3107,1618,1489
2,Literate Population,2328,1276,1052
3,Illiterate Population,779,342,437


In [18]:
# Rebind `tables` to every <table> on the page and print each one as a list of rows.
tables = soup.find_all('table')

for i in tables:
    tab = construct_table(i)
    # NOTE(review): `tables_df` is overwritten on every pass and not read inside
    # the loop, so only the last table's frame remains after the cell finishes.
    tables_df = pd.DataFrame(tab)
    print(tab,'\n\n')

[['Particulars', 'Total', 'Male', 'Female'], ['Total Population', '3,107', '1,618', '1,489'], ['Literate Population', '2,328', '1,276', '1,052'], ['Illiterate Population', '779', '342', '437']] 


[['Ahergaon - Village Overview'], ['Gram Panchayat : ', 'Ahergaon'], ['Block / Tehsil : ', 'Niphad'], ['District : ', 'Nashik'], ['State : ', 'Maharashtra'], ['Pincode : ', 'N/A'], ['Area : ', '852 hectares'], ['Population : ', '3,107'], ['Households : ', '594'], ['Assembly Constituency : ', 'Niphad'], ['Parliament Constituency : ', 'Dindori'], ['Nearest Town : ', 'Nashik (37 km)']] 


[['Related Pages'], ['List of Villages in Niphad'], ['List of Tehsils in Nashik'], ['List of Districts in Maharashtra']] 




In [19]:
# Exploration: all <div> elements whose data-label attribute is "Connectivity Type".
soup.find_all('div',{'data-label':'Connectivity Type'})

[<div class="column" data-label="Connectivity Type">Public Bus Service</div>,
 <div class="column" data-label="Connectivity Type">Private Bus Service</div>,
 <div class="column" data-label="Connectivity Type">Railway Station</div>]

In [20]:
# Exploration: all <div> elements whose data-label attribute is "Status".
# In the recorded output the first match is the column header itself ("Status"),
# so this list is one element longer than the "Connectivity Type" list.
soup.find_all('div',{'data-label':'Status'})

[<div class="column" data-label="Status">Status</div>,
 <div class="column" data-label="Status">Available within village</div>,
 <div class="column" data-label="Status">Available within 5 - 10 km distance</div>,
 <div class="column" data-label="Status">Available within 10+ km distance</div>]

In [21]:
# Empty dict (despite the "_list" suffix).
# NOTE(review): not read or filled by any cell visible in this notebook — TODO confirm whether it is still needed.
final_data_list = {}

# Need to get latitude and longitude

In [22]:
# https://www.geonames.org/search.html?q=vinchur%2Cnashik%2Cmaharashtra&country=IN

In [23]:
# Load the village database CSV.
# NOTE(review): the absolute /kaggle/input path presumably only resolves inside a
# Kaggle kernel with that dataset attached — verify before running elsewhere.
df = pd.read_csv('/kaggle/input/indian-villages-database/indian_villages_database.csv')
df

Unnamed: 0.1,Unnamed: 0,country,state,district,taluka,village
0,0,India,andaman-&-nicobar-islands,nicobars,car-nicobar,arong
1,1,India,andaman-&-nicobar-islands,nicobars,car-nicobar,big-lapati
2,2,India,andaman-&-nicobar-islands,nicobars,car-nicobar,chuckchucha
3,3,India,andaman-&-nicobar-islands,nicobars,car-nicobar,iaf-camp
4,4,India,andaman-&-nicobar-islands,nicobars,car-nicobar,kakana
...,...,...,...,...,...,...
619285,619285,India,west-bengal,uttar-dinajpur,raiganj,teghara
619286,619286,India,west-bengal,uttar-dinajpur,raiganj,tenahari
619287,619287,India,west-bengal,uttar-dinajpur,raiganj,tenra
619288,619288,India,west-bengal,uttar-dinajpur,raiganj,udaypur


In [24]:
# Hand-built geonames.org search URL for one village; the commas in the
# query are already percent-encoded as %2C.
url_to_hit = "https://www.geonames.org/advanced-search.html?q=Vinchur%2Cnashik%2Cmaharashtra&country=IN&featureClass=&continentCode="

In [25]:
# Run the sample search; this rebinds `r`, which earlier held the village page response.
r = rq.get(url_to_hit,headers = headers)

In [26]:
# Parse the search result page; `soup` no longer refers to the village page after this.
soup = BeautifulSoup(r.text,'html.parser')

In [27]:
# First <span class="latitude"> on the result page (None when there is no such element).
soup.find('span',{'class':'latitude'})

<span class="latitude">20.104085</span>

In [28]:
# First <span class="longitude"> on the result page (None when there is no such element).
soup.find('span',{'class':'longitude'})

<span class="longitude">74.226091</span>

In [29]:
# Latitude / longitude text for each looked-up village, in row order;
# None where the search page offered no coordinate.
latlist = []
lonlist = []

# Only the first ten rows are looked up here.
sample = df.head(10)

# `total` lets tqdm draw a real progress bar; iterrows() has no length.
for index, row in tqdm(sample.iterrows(), total=len(sample)):
    # Build the search text from the named columns rather than by position,
    # so an extra or re-ordered column cannot silently shift the fields.
    query = ','.join(str(row[col]) for col in ('village', 'taluka', 'district', 'state'))
    # Pass the query via `params` so requests percent-encodes it. Interpolating
    # the raw values into the URL broke on names containing '&' (for example
    # "andaman-&-nicobar-islands"), which cut the `q` parameter short.
    r = rq.get(
        "https://www.geonames.org/advanced-search.html",
        params={'q': query, 'country': 'IN', 'featureClass': '', 'continentCode': ''},
        headers=headers,
        timeout=30,  # do not let a stalled connection hang the loop
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # The first latitude/longitude spans on the page are taken as the result.
    lat = soup.find('span', {'class': 'latitude'})
    lon = soup.find('span', {'class': 'longitude'})
    if lat is not None and lon is not None:
        latlist.append(lat.text)
        lonlist.append(lon.text)
        print(lat)
        print(lon)
    else:
        # Keep the lists aligned with the rows even when nothing was found.
        latlist.append(None)
        lonlist.append(None)

9it [00:05,  1.58it/s]

<span class="latitude">9.1666667</span>
<span class="longitude">92.8166667</span>


10it [00:06,  1.58it/s]

<span class="latitude">9.2333333</span>
<span class="longitude">92.7833333</span>





In [30]:
# Inspect the collected latitudes: one entry per looked-up row, None where no coordinate was found.
latlist

[None, None, None, None, None, None, None, None, '9.1666667', '9.2333333']