# Web scraping de escuelas - India

In [2]:
import re
import time

import numpy as np
import pandas as pd
import requests

## Funciones

In [3]:
def get_school_df(URL, newId, counter):
    """Fetch one school's attributes from the query API as a one-row DataFrame.

    Parameters
    ----------
    URL : str
        Template query URL containing an ``objectIds=<digits>`` parameter.
    newId : int
        Object id to query; it replaces the id present in ``URL``.
    counter : int
        Label used as the index of the single returned row.

    Returns
    -------
    pandas.DataFrame
        One row holding the selected attribute columns, with ``schcd`` and
        ``schname`` renamed to ``school_code`` and ``school_name``.

    Raises
    ------
    IndexError
        If the response carries an empty ``features`` list for ``newId``.
    KeyError
        If the response has no ``features`` key or a selected column is missing.
    requests.exceptions.RequestException
        On timeout, connection failure or a non-2xx HTTP status.
    """

    # Replace the whole numeric id rather than a literal "20" prefix, so that
    # templates such as "objectIds=0" or "objectIds=205" are rewritten correctly.
    new_API = re.sub(r"objectIds=\d+", lambda _match: f"objectIds={newId}", URL)

    # Without a timeout a stalled server would block the caller indefinitely.
    r = requests.get(new_API, timeout=30)
    r.raise_for_status()
    content = r.json()

    features = content['features']
    if not features:
        # Same exception type as plain indexing would raise, but with a usable message.
        raise IndexError(f"list index out of range: no feature returned for objectIds={newId}")

    school_info = features[0]['attributes']
    school_df = pd.DataFrame(school_info, index=[counter])
    selected_columns = [
        'objectid_1', 'fid_school', 'objectid', 'schcd', 'schname', 'schcat',
        'school_cat', 'schtype', 'school_typ', 'schmgt', 'management', 'rururb', 'location',
        'pincode', 'dtname', 'udise_stco', 'stname', 'longitude', 'latitude', 'stcode11', 'dtcode11',
    ]

    # rename() returns a new frame; renaming a column-sliced frame in place can
    # trigger pandas' SettingWithCopyWarning.
    school_df = school_df[selected_columns].rename(
        columns={"schcd": "school_code", "schname": "school_name"}
    )

    return school_df


## Parámetros

In [4]:
# Template query URL for layer 0 of the SCHOOLGIS/Schooldata MapServer.
# The `objectIds=20` parameter is the placeholder that get_school_df rewrites
# with the id of the school being requested.
URL = """https://geoportal.nic.in/nicgis/rest/services/SCHOOLGIS/Schooldata/MapServer/0/query?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects&objectIds=20&outFields=*&outSR=102100&quantizationParameters=%7B%22mode%22%3A%22view%22%2C%22originPosition%22%3A%22upperLeft%22%2C%22tolerance%22%3A4.777314267948289%2C%22extent%22%3A%7B%22xmin%22%3A68.5015470000764%2C%22ymin%22%3A6.8114540002900235%2C%22xmax%22%3A97.02722199976724%2C%22ymax%22%3A35.032117999820365%2C%22spatialReference%22%3A%7B%22wkid%22%3A4326%2C%22latestWkid%22%3A4326%7D%7D%7D"""

In [5]:
# Trimmed variant of the query URL: it stops after `objectIds=20`, omitting the
# outFields, outSR and quantizationParameters arguments.
# NOTE(review): not referenced by any other cell visible here - confirm whether it is still needed.
short_URL = "https://geoportal.nic.in/nicgis/rest/services/SCHOOLGIS/Schooldata/MapServer/0/query?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects&objectIds=20"

## Pipeline

In [6]:
# Same query as the full template URL, but with `objectIds=0` instead of `objectIds=20`.
URL_2 = """https://geoportal.nic.in/nicgis/rest/services/SCHOOLGIS/Schooldata/MapServer/0/query?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects&objectIds=0&outFields=*&outSR=102100&quantizationParameters=%7B%22mode%22%3A%22view%22%2C%22originPosition%22%3A%22upperLeft%22%2C%22tolerance%22%3A4.777314267948289%2C%22extent%22%3A%7B%22xmin%22%3A68.5015470000764%2C%22ymin%22%3A6.8114540002900235%2C%22xmax%22%3A97.02722199976724%2C%22ymax%22%3A35.032117999820365%2C%22spatialReference%22%3A%7B%22wkid%22%3A4326%2C%22latestWkid%22%3A4326%7D%7D%7D"""

In [7]:
# Issue the objectIds=0 query and display the decoded JSON payload.
# A timeout keeps a stalled server from blocking the kernel indefinitely, and
# raise_for_status() surfaces HTTP errors before the body is parsed as JSON.
r = requests.get(URL_2, timeout=30)
r.raise_for_status()
r.json()

{'displayFieldName': 'vilname',
 'transform': {'originPosition': 'upperLeft',
  'scale': [4.777314267948289, 4.777314267948289],
  'translate': [68.5015470000764, 35.032117999820365]},
 'fieldAliases': {'objectid_1': 'objectid_1',
  'fid_school': 'fid_school',
  'objectid': 'objectid',
  'schcd': 'schcd',
  'schname': 'School',
  'schcat': 'Cat_Code',
  'school_cat': 'Category',
  'schtype': 'Type Code',
  'school_typ': 'School Type',
  'schmgt': 'Mngt Code',
  'management': 'Management',
  'rururb': 'Rural_Urban',
  'location': 'Location',
  'pincode': 'Pincode',
  'dtname': 'District',
  'udise_stco': 'udise_stco',
  'stname': 'State',
  'vilname': 'Village',
  'longitude': 'longitude',
  'latitude': 'latitude',
  'stcode11': 'stcode11',
  'dtcode11': 'dtcode11'},
 'geometryType': 'esriGeometryPoint',
 'spatialReference': {'wkid': 102100, 'latestWkid': 3857},
 'fields': [{'name': 'objectid_1',
   'type': 'esriFieldTypeOID',
   'alias': 'objectid_1'},
  {'name': 'fid_school',
   'type

In [9]:
# Smoke-test the helper on one known object id; the row is labelled with index 1.
school_df = get_school_df(URL, newId=1489554, counter=1)
school_df

Unnamed: 0,objectid_1,fid_school,objectid,school_code,school_name,schcat,school_cat,schtype,school_typ,schmgt,...,rururb,location,pincode,dtname,udise_stco,stname,longitude,latitude,stcode11,dtcode11
1,1489554,1025827,1033102,37070500910,YARMA GONPA,1,Primary,1,Boys,4,...,1,,194401,Leh,37,LADAKH,77.341913,35.032118,38,3


## Probar con cierto número de escuelas

In [21]:
# Crawl a contiguous block of object ids and checkpoint the accumulated rows to CSV.
school_frames = []  # one single-row frame per successfully fetched school
schools_df = pd.DataFrame([])
school_numbers = np.arange(1495001, 1500000)  # upper bound is exclusive
try:
    for school_number in school_numbers:
        try:
            school_df = get_school_df(URL, school_number, school_number)
            # Collect frames in a list and concatenate on demand; concatenating
            # inside the loop re-copies every previous row on each iteration.
            school_frames.append(school_df)
            time.sleep(np.random.randint(0, 2))  # pause 0 or 1 second between requests

        except Exception as err:
            # Best effort: report the failure for this id and continue with the next one.
            print(str(err))
            print("Unable to get data")
            time.sleep(2)

        if school_number % 1000 == 0:
            time.sleep(4)
            if school_frames:
                schools_df = pd.concat(school_frames, ignore_index=True)
            # Report the real number of rows collected, not the object id.
            print(f"Reached id {school_number}: {len(schools_df)} schools obtained")
            schools_df.to_csv(f"schools_df_{school_number}.csv", index=False, encoding='utf-8-sig')
finally:
    # Runs on normal completion and on KeyboardInterrupt alike, so schools_df
    # always reflects everything fetched so far.
    if school_frames:
        schools_df = pd.concat(school_frames, ignore_index=True)

# The range does not end on a multiple of 1000, so without this final write the
# ids after the last checkpoint would never reach disk.
if len(school_numbers) and school_numbers[-1] % 1000 != 0:
    schools_df.to_csv(f"schools_df_{school_numbers[-1]}.csv", index=False, encoding='utf-8-sig')

list index out of range
Unable to get data
list index out of range
Unable to get data


KeyboardInterrupt: 

In [22]:
# Display the rows accumulated by the scraping loop.
schools_df