# Webscrapeo de escuelas - INDIA

In [1]:
import time
import requests
import numpy as np
import pandas as pd

## Funciones

In [2]:
def get_school_df(URL, newId, counter, timeout=30):
    """Fetch one school's attributes from the query endpoint as a one-row DataFrame.

    Parameters
    ----------
    URL : str
        Query URL containing an ``objectIds=<value>`` parameter. Whatever
        value it carries is replaced with ``newId``.
    newId : int
        Object id of the school to request.
    counter : int
        Label used as the index of the single returned row.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 30). Without it a stalled connection would block forever.

    Returns
    -------
    pandas.DataFrame
        One row holding the selected attribute columns, with ``schcd`` and
        ``schname`` renamed to ``school_code`` and ``school_name``.

    Raises
    ------
    ValueError
        If ``URL`` has no ``objectIds=`` parameter to substitute.
    requests.HTTPError
        If the server answers with an HTTP error status.
    KeyError
        If the payload has no ``features`` entry or an expected column is
        missing from the returned attributes.
    IndexError
        If the payload contains no feature for ``newId``.
    """
    wanted_columns = [
        'objectid_1', 'fid_school', 'objectid', 'schcd', 'schname', 'schcat',
        'school_cat', 'schtype', 'school_typ', 'schmgt', 'management', 'rururb', 'location',
        'pincode', 'dtname', 'udise_stco', 'stname', 'longitude', 'latitude', 'stcode11', 'dtcode11',
    ]

    # Swap the value of the objectIds parameter, whatever it currently is.
    # A literal replace of "objectIds=20" would silently request the wrong
    # record when the template carries another id (or an id starting with 20).
    head, marker, tail = URL.partition("objectIds=")
    if not marker:
        raise ValueError("URL has no 'objectIds=' parameter to replace")
    _, separator, remainder = tail.partition("&")
    new_API = f"{head}{marker}{newId}{separator}{remainder}"

    r = requests.get(new_API, timeout=timeout)
    r.raise_for_status()  # fail on HTTP errors before trying to parse JSON
    content = r.json()

    features = content['features']
    if not features:
        raise IndexError(f"No school returned for objectIds={newId}")

    school_info = features[0]['attributes']
    school_df = pd.DataFrame(school_info, index=[counter])

    # Select first, then rename on the resulting copy (no in-place mutation).
    return school_df[wanted_columns].rename(
        columns={"schcd": "school_code", "schname": "school_name"}
    )


## Parámetros

In [3]:
# Query URL used as the template for per-school requests; get_school_df
# substitutes the objectIds value. The quantizationParameters value is
# URL-encoded JSON (mode, originPosition, tolerance, extent).
URL = (
    "https://geoportal.nic.in/nicgis/rest/services/SCHOOLGIS/Schooldata/MapServer/0/query"
    "?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects"
    "&objectIds=20&outFields=*&outSR=102100"
    "&quantizationParameters="
    "%7B%22mode%22%3A%22view%22"
    "%2C%22originPosition%22%3A%22upperLeft%22"
    "%2C%22tolerance%22%3A4.777314267948289"
    "%2C%22extent%22%3A%7B"
    "%22xmin%22%3A68.5015470000764"
    "%2C%22ymin%22%3A6.8114540002900235"
    "%2C%22xmax%22%3A97.02722199976724"
    "%2C%22ymax%22%3A35.032117999820365"
    "%2C%22spatialReference%22%3A%7B%22wkid%22%3A4326%2C%22latestWkid%22%3A4326%7D%7D%7D"
)

In [4]:
# Shorter variant of the query URL (no outFields / outSR / quantization
# parameters). Not referenced by any other cell visible in this notebook.
short_URL = (
    "https://geoportal.nic.in/nicgis/rest/services/SCHOOLGIS/Schooldata/MapServer/0/query"
    "?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects"
    "&objectIds=20"
)

## Pipeline

In [5]:
# Same query as URL but with objectIds=0; used below to inspect the raw
# JSON structure returned by the endpoint.
URL_2 = (
    "https://geoportal.nic.in/nicgis/rest/services/SCHOOLGIS/Schooldata/MapServer/0/query"
    "?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects"
    "&objectIds=0&outFields=*&outSR=102100"
    "&quantizationParameters="
    "%7B%22mode%22%3A%22view%22"
    "%2C%22originPosition%22%3A%22upperLeft%22"
    "%2C%22tolerance%22%3A4.777314267948289"
    "%2C%22extent%22%3A%7B"
    "%22xmin%22%3A68.5015470000764"
    "%2C%22ymin%22%3A6.8114540002900235"
    "%2C%22xmax%22%3A97.02722199976724"
    "%2C%22ymax%22%3A35.032117999820365"
    "%2C%22spatialReference%22%3A%7B%22wkid%22%3A4326%2C%22latestWkid%22%3A4326%7D%7D%7D"
)

In [6]:
# Probe the endpoint once to inspect the structure of the raw JSON response.
# The timeout keeps the cell from blocking indefinitely on a stalled
# connection, and raise_for_status reports HTTP errors before JSON parsing.
r = requests.get(URL_2, timeout=30)
r.raise_for_status()
r.json()

{'displayFieldName': 'vilname',
 'transform': {'originPosition': 'upperLeft',
  'scale': [4.777314267948289, 4.777314267948289],
  'translate': [68.5015470000764, 35.032117999820365]},
 'fieldAliases': {'objectid_1': 'objectid_1',
  'fid_school': 'fid_school',
  'objectid': 'objectid',
  'schcd': 'schcd',
  'schname': 'School',
  'schcat': 'Cat_Code',
  'school_cat': 'Category',
  'schtype': 'Type Code',
  'school_typ': 'School Type',
  'schmgt': 'Mngt Code',
  'management': 'Management',
  'rururb': 'Rural_Urban',
  'location': 'Location',
  'pincode': 'Pincode',
  'dtname': 'District',
  'udise_stco': 'udise_stco',
  'stname': 'State',
  'vilname': 'Village',
  'longitude': 'longitude',
  'latitude': 'latitude',
  'stcode11': 'stcode11',
  'dtcode11': 'dtcode11'},
 'geometryType': 'esriGeometryPoint',
 'spatialReference': {'wkid': 102100, 'latestWkid': 3857},
 'fields': [{'name': 'objectid_1',
   'type': 'esriFieldTypeOID',
   'alias': 'objectid_1'},
  {'name': 'fid_school',
   'type

In [7]:
# Smoke test: fetch a single school by object id and show the resulting row.
school_df = get_school_df(URL, newId=1488900, counter=1)
school_df

Unnamed: 0,objectid_1,fid_school,objectid,school_code,school_name,schcat,school_cat,schtype,school_typ,schmgt,...,rururb,location,pincode,dtname,udise_stco,stname,longitude,latitude,stcode11,dtcode11
1,1488900,531525,535786,1010302403,UPS AWATHKOOLI,2,Upper Primary,3,Co-Ed,1,...,1,,193224,Kupwara,1,JAMMU & KASHMIR,74.0756,34.5476,1,1


## Probar con cierto número de escuelas

In [9]:
# Scrape a contiguous range of object ids, writing a cumulative CSV
# checkpoint every time the id is a multiple of 1000.
school_numbers = np.arange(1280001, 1285001)
school_frames = []             # one single-row frame per successful request
schools_df = pd.DataFrame([])  # rebuilt from school_frames at checkpoints and at the end

for school_number in school_numbers:
    try:
        school_df = get_school_df(URL, school_number, school_number)
        # Collect rows in a list; concatenating inside the loop copies the
        # whole accumulated frame on every iteration.
        school_frames.append(school_df)
        time.sleep(np.random.randint(0, 2))  # pause 0 or 1 second between requests

    except Exception as err:
        # Best-effort scrape: report the failing id and move on to the next one.
        print(str(err))
        print(f"Unable to get data for school {school_number}")
        time.sleep(2)

    if school_number % 1000 == 0:
        time.sleep(4)
        print(f"{school_number} schools obtained")
        if school_frames:
            schools_df = pd.concat(school_frames, ignore_index=True)
        schools_df.to_csv(f"schools_df_{school_number}.csv", index=False, encoding='utf-8-sig')

# Make sure rows fetched after the last checkpoint are included as well.
if school_frames:
    schools_df = pd.concat(school_frames, ignore_index=True)

1281000 schools obtained
1282000 schools obtained
1283000 schools obtained
1284000 schools obtained
1285000 schools obtained


In [13]:
# Display the accumulated schools table as the cell output.
# NOTE(review): the recorded output below starts at objectid_1 = 1 and runs
# past 30,000 rows, which does not match the 1280001-1285000 id range of the
# loop cell above; it presumably comes from an earlier run - confirm with a
# Restart & Run All.
schools_df

Unnamed: 0,objectid_1,fid_school,objectid,school_code,school_name,schcat,school_cat,schtype,school_typ,schmgt,...,rururb,location,pincode,dtname,udise_stco,stname,longitude,latitude,stcode11,dtcode11
0,1,233143,235240,35020300702,GOVT PRIMARY SCHOOL SHASTRI NAGAR,1,Primary,3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.892304,6.811454,35,638
1,2,747816,753170,35020300201,GOVT SECONDARY SCHOOL GANDHI NAGAR,6,Secondary (1-10),3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.890508,6.838517,35,638
2,3,55885,56582,35020300703,GOVT PRIMARY SCHOOL LAXMI NAGAR,1,Primary,3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.886408,6.893214,35,638
3,4,639587,644125,35020300701,GOVT SENIOR SECONDARY SCHOOL VIJAY NAGAR,3,Higher Secondary (1-12),3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.919901,6.961255,35,638
4,5,223993,226006,35020300404,GOVT PRIMARY SCHOOL CHINGAN,1,Primary,3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.912472,6.972397,35,638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30043,30220,1320813,1335153,32070502903,ANSAR ENGLISH SCHOOL,3,Higher Secondary (1-12),3,Co-Ed,5,...,1,,680519,Thrissur,32,KERALA,76.089887,10.699364,32,594
30044,30221,198499,200320,33200704101,"PUPS, KUDITHANGICHERI",1,Primary,3,Co-Ed,1,...,1,,614102,Thiruvarur,33,TAMIL NADU,79.499640,10.699370,33,619
30045,30222,1057599,1064993,33150402609,"ST.JOSEPH'S HR. SEC. SCHOOL, NAGAMANGALAM",3,Higher Secondary (1-12),3,Co-Ed,4,...,1,,620012,Tiruchirappalli,33,TAMIL NADU,78.621328,10.699393,33,614
30046,30223,579554,583943,33190303103,PUMS KERALANTHAN,2,Upper Primary,3,Co-Ed,1,...,1,,610106,Thiruvarur,33,TAMIL NADU,79.652170,10.699410,33,619
