In [None]:
import requests as rq
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import time
from kaggle_secrets import UserSecretsClient
import os
import subprocess

In [None]:
# HTTP headers passed along with each geonames.org request made further down.
# The User-Agent value is split over several literals purely for readability.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    ),
}

# Need to get latitude and longitude

In [None]:
# Read both input tables: the full address list first, then villages.csv
# (presumably the output uploaded by an earlier run of this notebook).
original_df, yeasterday_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/indian-villages-database/addresses.csv',
        '/kaggle/input/indian-villages-database/villages.csv',
    )
)

# Decide where today's run should start

In [None]:
# Keep only the columns we need. Selecting by name also discards the unnamed
# index column that `to_csv` writes into both files in the save cell.
address_columns = ['country', 'state', 'district', 'taluka', 'village']
original_df = original_df[address_columns]

# villages.csv is expected to already hold the coordinates gathered by earlier
# runs. Keep those columns when they exist; dropping them here would turn
# every previously collected latitude/longitude into NaN after the concat
# with today's rows. If the file has no coordinate columns yet, this reduces
# to the plain address-column selection.
coordinate_columns = [
    column for column in ('latitude', 'longitude')
    if column in yeasterday_df.columns
]
yeasterday_df = yeasterday_df[address_columns + coordinate_columns]

In [None]:
# Maximum number of new rows to look up in a single run.
limit = 40000

# Rows already processed by previous runs == position where today starts.
yeasterday_collection = len(yeasterday_df)
print(yeasterday_collection)
original_length = len(original_df)

# `target_for_today` is used as the END position (exclusive) of the slice
# taken from original_df in the next cell, so it must be an absolute row
# position, never a row count. Advance by `limit` rows but stop at the end of
# the table when fewer than `limit` rows remain.
target_for_today = min(yeasterday_collection + limit, original_length)
print(target_for_today)

In [None]:
# Positional window of rows to process today: starts right after the rows
# already collected and ends just before position `target_for_today`.
target_df = original_df.iloc[yeasterday_collection:target_for_today]
target_df

In [None]:
# Look up coordinates on geonames.org for every row of target_df.
# latlist / lonlist receive exactly one entry per row, in row order, so they
# can be attached to a copy of target_df afterwards. A row whose lookup fails
# or returns no coordinates gets None in both lists.
latlist = []
lonlist = []

search_url = "https://www.geonames.org/advanced-search.html"
request_timeout = 30  # seconds; without it a stalled connection blocks the run forever
failed_requests = 0

for row in tqdm(target_df.itertuples(index=False), total=len(target_df)):
    # Let requests build the query string: it percent-encodes spaces, '&',
    # '#', commas etc. in the place names, which raw f-string interpolation
    # into the URL does not.
    query = {
        "q": f"{row.village},{row.taluka},{row.district},{row.state}",
        "country": "IN",
        "featureClass": "",
        "continentCode": "",
    }

    lat = lon = None
    try:
        r = rq.get(search_url, params=query, headers=headers, timeout=request_timeout)
        r.raise_for_status()
    except rq.RequestException:
        # One bad request must not discard the results collected so far;
        # record the row as "not found" and continue with the next one.
        failed_requests += 1
    else:
        soup = BeautifulSoup(r.text, 'html.parser')
        # The first latitude/longitude spans on the result page are used.
        lat = soup.find('span', {'class': 'latitude'})
        lon = soup.find('span', {'class': 'longitude'})

    if lat is not None and lon is not None:
        latlist.append(lat.text)
        lonlist.append(lon.text)
    else:
        latlist.append(None)
        lonlist.append(None)

if failed_requests:
    print(f"{failed_requests} lookups failed and were recorded as missing")

In [None]:
# Take an independent (deep) copy so that adding columns to new_df does not
# touch target_df.
new_df = target_df.copy(deep=True)
new_df

In [None]:
# Attach the scraped values as two new columns; each list holds one entry per
# row of new_df, in the same order.
for column_name, column_values in (('latitude', latlist), ('longitude', lonlist)):
    new_df[column_name] = column_values
new_df

# Update data

In [None]:
# Stack the previously collected rows on top of today's rows and renumber the
# result with a fresh 0..n-1 index.
frames_to_stack = [yeasterday_df, new_df]
updated_df = pd.concat(frames_to_stack, ignore_index=True)
updated_df

In [None]:
# Create the output folder (relative to the current working directory).
# `exist_ok=True` keeps the cell re-runnable: plain os.mkdir raises
# FileExistsError when the folder is already there from an earlier execution.
os.makedirs('datasets', exist_ok=True)
# Today's updated dataset.
# Note: to_csv also writes the DataFrame index as an unnamed first column;
# the loading cells drop it again by selecting columns by name.
updated_df.to_csv('datasets/villages.csv')
# Original dataset, re-saved so it is part of the uploaded folder.
original_df.to_csv('datasets/addresses.csv')

# Upload 

In [None]:
# Fetch the Kaggle API credentials from the notebook's secret store and
# publish them as environment variables, so they never appear in the code.
user_secrets = UserSecretsClient()
kaggle_apikey = user_secrets.get_secret("kaggle_apikey")
kaggle_username = user_secrets.get_secret("kaggle_username")

os.environ.update({
    'KAGGLE_USERNAME': kaggle_username,
    'KAGGLE_KEY': kaggle_apikey,
})

In [None]:
# Write the dataset-metadata.json file into the upload folder; it carries the
# id of the dataset that the new version belongs to.
print('Starting upload >>>>>>')
print('Creating metadata file >>>>')
data = {"id": "ayushkhaire/indian-villages-database"}
metadata_file_location = '/kaggle/working/datasets/dataset-metadata.json'
metadata_text = json.dumps(data)
with open(metadata_file_location, 'w', encoding='utf-8') as metadata_file:
    metadata_file.write(metadata_text)
print('Metadata file created')

In [None]:
# Publish the folder as a new dataset version, retrying a few times because
# the upload can fail transiently.
max_retries = 5
retry_delay = 5  # seconds to wait between attempts

# Argument list instead of a shell string: no shell is needed for a fixed
# command, and no quoting rules apply to the path or message.
command = [
    "kaggle", "datasets", "version",
    "-p", "/kaggle/working/datasets",
    "-m", "Update",
    "-r", "zip",
]

retries = 0
while retries < max_retries:
    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(command, check=True)
        print("Upload complete")
        break
    except (subprocess.CalledProcessError, OSError) as error:
        # OSError covers the case where the kaggle executable cannot be started.
        retries += 1
        print(f"Error from Kaggle: {error}")
        if retries < max_retries:
            time.sleep(retry_delay)
else:
    # Reached only when every attempt failed (no `break`). Fail loudly so a
    # scheduled run is not reported as successful without an upload.
    raise RuntimeError(f"Kaggle upload failed after {max_retries} attempts")