# Concatenar los distintos DataFrames y sacar datos faltantes

## Paquetes

In [3]:
import os
import time
import requests
import numpy as np
import pandas as pd

## Parámetros

In [4]:
URL = """https://geoportal.nic.in/nicgis/rest/services/SCHOOLGIS/Schooldata/MapServer/0/query?f=json&returnGeometry=true&spatialRel=esriSpatialRelIntersects&objectIds=20&outFields=*&outSR=102100&quantizationParameters=%7B%22mode%22%3A%22view%22%2C%22originPosition%22%3A%22upperLeft%22%2C%22tolerance%22%3A4.777314267948289%2C%22extent%22%3A%7B%22xmin%22%3A68.5015470000764%2C%22ymin%22%3A6.8114540002900235%2C%22xmax%22%3A97.02722199976724%2C%22ymax%22%3A35.032117999820365%2C%22spatialReference%22%3A%7B%22wkid%22%3A4326%2C%22latestWkid%22%3A4326%7D%7D%7D"""

## Funciones - Leer dataframes del directorio

In [5]:
def read_all_csvs(files):
    """[Lee todos los csv's del directorio]

    Parameters
    ----------
    files : [list]
        [lista con nombres de los archivos en
        el directorio actual]

    Returns
    -------
    all_schools : [DataFrame]
        [Df con datos de todas las escuelas]
    """
    all_schools = pd.DataFrame([])
    for file in files:
        if '.csv' in file:
            df = pd.read_csv(file)
            all_schools = pd.concat([all_schools, df], ignore_index=True)
            
    else:
        next
    
    # Quitar duplicados
    all_schools.drop_duplicates(inplace=True)
    all_schools.sort_values(by='objectid_1', ascending=True, inplace=True)
    all_schools.reset_index(drop=True, inplace=True)
    
    return all_schools

In [6]:
def get_missing_ids(all_schools):
    """
    Obtiene IDs de las escuelas con datos faltantes 
    """
    
    max_id = all_schools['objectid_1'].max()
    min_id = all_schools['objectid_1'].min()

    # Sacar todas las id's que deberíamos tener
    all_ids = np.arange(min_id, max_id + 1)
    # Sacar id's que tenemos
    existent_ids = list(all_schools['objectid_1'])

    # Sacar id's que nos faltan
    missing_ids = list(set(all_ids) - set(existent_ids))

    return missing_ids

## Funciones - Webscrapear escuelas faltantes

In [7]:
def get_school_df(URL, newId, counter):
    """[Get School DataFrame by calling its API]

    Parameters
    ----------
    URL : [str]
        [String with the original URL]
    newId : [int]
        [Integer belonging to certain school]

    Returns
    -------
    [school_df]
        [DataFrame with the school's info]
    """
    
    new_API = URL.replace("objectIds=20", f"objectIds={newId}")
    r = requests.get(new_API)
    content = r.json()
    school_info = content['features'][0]['attributes']
    school_df = pd.DataFrame(school_info, index=[counter])
    school_df = school_df[['objectid_1', 'fid_school', 'objectid', 'schcd', 'schname', 'schcat',
       'school_cat', 'schtype', 'school_typ', 'schmgt', 'management', 'rururb', 'location', 
       'pincode', 'dtname', 'udise_stco', 'stname', 'longitude', 'latitude', 'stcode11', 'dtcode11']]
    
    school_df.rename(columns={"schcd": "school_code", "schname": "school_name"}, inplace=True)

    return school_df

## Pipeline

In [8]:
# Definir df final   y sacar lista con nombres de todos los archivos
files = os.listdir("C:/Users/fdmol/Desktop/Webscraping-India-Schools")

In [9]:
all_schools = read_all_csvs(files)

In [10]:
all_schools

Unnamed: 0,objectid_1,fid_school,objectid,school_code,school_name,schcat,school_cat,schtype,school_typ,schmgt,...,rururb,location,pincode,dtname,udise_stco,stname,longitude,latitude,stcode11,dtcode11
0,1255001,687647,692507,9231408402,PMV. NARAYANPUR AHLAD,4,Upper Primary Only,3,Co-Ed,1,...,1,,242301,Hardoi,9,UTTAR PRADESH,80.330819,27.502598,9,155
1,1255002,1177097,1188097,9231800105,NATIONAL MONTESSORI SCHOOL,1,Primary,3,Co-Ed,5,...,2,,262804,Hardoi,9,UTTAR PRADESH,80.271100,27.502600,9,155
2,1255003,1282312,1296031,18140105705,SNEHALAYA,2,Upper Primary,3,Co-Ed,5,...,1,,786125,Tinsukia,18,ASSAM,95.354000,27.502600,18,309
3,1255004,19565,19951,9251207601,PRY. SCH. MUNDER,1,Primary,3,Co-Ed,1,...,1,,241123,Hardoi,9,UTTAR PRADESH,79.780957,27.502602,9,155
4,1255005,413654,417172,8061007602,GOVT. PS HADA YALI,1,Primary,3,Co-Ed,3,...,1,,301024,Alwar,8,RAJASTHAN,76.325725,27.502605,8,104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234243,1489550,516180,520404,1021703201,MS KHANPETH,2,Upper Primary,3,Co-Ed,1,...,1,,193121,Muzaffarabad,1,JAMMU & KASHMIR,74.372000,34.927000,1,992
234244,1489551,10708,11039,37070500905,P/S KHEMI,1,Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.428660,34.944427,38,3
234245,1489552,587820,592231,37070500901,M/S TONGSTED,2,Upper Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.397833,34.977283,38,3
234246,1489553,89211,90186,37070500701,P/S NUNGSTATE,1,Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.379175,34.984490,38,3


### Guardar el csv concatenado

In [11]:
max_id = all_schools['objectid_1'].max()
all_schools.to_csv(f"C:/Users/fdmol/Desktop/Webscraping-India-Schools/all_schools/all_schools_{max_id}.csv", index=False, encoding='utf-8-sig')

In [12]:
# Sacar ids de las escuelas faltantes
missing_ids = get_missing_ids(all_schools)

In [13]:
len(missing_ids)

306

### Obtener info. de las escuelas faltantes

In [14]:
missing_schools_df = pd.DataFrame([])
counter = 0

for missing_id in missing_ids:
    try:
        missing_school_df = get_school_df(URL, missing_id, missing_id)
        missing_schools_df = pd.concat([missing_schools_df, missing_school_df], ignore_index=True)
        time.sleep(np.random.randint(0, 2))
        counter += 1

    except Exception as err:
        print(str(err))
        print("Unable to get data")
        missing_school_df = pd.DataFrame([])
        missing_schools_df = pd.concat([missing_schools_df, missing_school_df], ignore_index=True)
        counter += 1
        time.sleep(5)
    
    if counter % 500 == 0:
        print(f"{counter} missing schools obtained")

print("Missing schools obtained, saving file")
missing_schools_df.to_csv(f"C:/Users/fdmol/Desktop/Webscraping-India-Schools/all_schools/missing_schools_df_{missing_id}.csv", index=False, encoding='utf-8-sig')

Missing schools obtained, saving file


In [16]:
missing_schools_df

Unnamed: 0,objectid_1,fid_school,objectid,school_code,school_name,schcat,school_cat,schtype,school_typ,schmgt,...,rururb,location,pincode,dtname,udise_stco,stname,longitude,latitude,stcode11,dtcode11
0,1411073,1220047,1232401,5050530577,K.P. MEMORIAL,2,Upper Primary,3,Co-Ed,5,...,2,,248001,Dehradun,05,UTTARAKHAND,78.061504,30.342436,05,060
1,1417222,105649,106753,3180120401,GPS MANOLI SURAT,1,Primary,3,Co-Ed,1,...,1,,140601,Patiala,03,PUNJAB,76.764607,30.520606,03,048
2,1417223,1312442,1326685,3210404203,BABA NAND SINGH MIDDLE SCHOOL CHAK DUMAL,3,Higher Secondary (1-12),3,Co-Ed,5,...,1,,152024,Fazilka,03,PUNJAB,74.217020,30.520614,03,701
3,1417224,711906,716981,3160106102,GMS MEHBOOBPURA,4,Upper Primary Only,3,Co-Ed,1,...,1,,148023,Sangrur,03,PUNJAB,75.827698,30.520643,03,053
4,1417225,171802,173442,5050201601,GOVT. P.S. BOSAAN,1,Primary,3,Co-Ed,1,...,1,,248159,Dehradun,05,UTTARAKHAND,77.875100,30.520700,05,060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,1413622,657683,662269,5030120102,UPS JAHANGI,4,Upper Primary Only,3,Co-Ed,1,...,1,,246425,Rudraprayag,05,UTTARAKHAND,79.087500,30.407100,05,058
302,1413623,355849,358869,5020108402,PS NAIGWAD,1,Primary,3,Co-Ed,1,...,1,,246443,Chamoli,05,UTTARAKHAND,79.324000,30.407100,05,057
303,1413624,371039,374172,3120108102,GPS RUPANA GIRLS,1,Primary,3,Co-Ed,1,...,1,,152032,Sri Muktsar Sahib,03,PUNJAB,74.525053,30.407118,03,044
304,1413625,1172852,1183666,3200300904,DASHMESH PUBLIC SCHOOL CHEEMA,1,Primary,3,Co-Ed,5,...,1,,148103,Barnala,03,PUNJAB,75.461223,30.407200,03,054


### Juntar df de missing schools y el ya concatenado para agregar los datos faltantes

In [17]:
complete_schools = pd.concat([all_schools, missing_schools_df], ignore_index=True)
complete_schools.sort_values(by='objectid_1', ascending=True, inplace=True)
complete_schools.reset_index(drop=True, inplace=True)
complete_schools

Unnamed: 0,objectid_1,fid_school,objectid,school_code,school_name,schcat,school_cat,schtype,school_typ,schmgt,...,rururb,location,pincode,dtname,udise_stco,stname,longitude,latitude,stcode11,dtcode11
0,1255001,687647,692507,9231408402,PMV. NARAYANPUR AHLAD,4,Upper Primary Only,3,Co-Ed,1,...,1,,242301,Hardoi,9,UTTAR PRADESH,80.330819,27.502598,9,155
1,1255002,1177097,1188097,9231800105,NATIONAL MONTESSORI SCHOOL,1,Primary,3,Co-Ed,5,...,2,,262804,Hardoi,9,UTTAR PRADESH,80.271100,27.502600,9,155
2,1255003,1282312,1296031,18140105705,SNEHALAYA,2,Upper Primary,3,Co-Ed,5,...,1,,786125,Tinsukia,18,ASSAM,95.354000,27.502600,18,309
3,1255004,19565,19951,9251207601,PRY. SCH. MUNDER,1,Primary,3,Co-Ed,1,...,1,,241123,Hardoi,9,UTTAR PRADESH,79.780957,27.502602,9,155
4,1255005,413654,417172,8061007602,GOVT. PS HADA YALI,1,Primary,3,Co-Ed,3,...,1,,301024,Alwar,8,RAJASTHAN,76.325725,27.502605,8,104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234549,1489550,516180,520404,1021703201,MS KHANPETH,2,Upper Primary,3,Co-Ed,1,...,1,,193121,Muzaffarabad,1,JAMMU & KASHMIR,74.372000,34.927000,1,992
234550,1489551,10708,11039,37070500905,P/S KHEMI,1,Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.428660,34.944427,38,3
234551,1489552,587820,592231,37070500901,M/S TONGSTED,2,Upper Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.397833,34.977283,38,3
234552,1489553,89211,90186,37070500701,P/S NUNGSTATE,1,Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.379175,34.984490,38,3


In [18]:
# Guardar nuevo csv
max_id = complete_schools['objectid_1'].max()
complete_schools.to_csv(f"C:/Users/fdmol/Desktop/Webscraping-India-Schools/all_schools/all_schools_{max_id}.csv", index=False, encoding='utf-8-sig')

## Unir _todas_ las escuelas

In [25]:
all_files = os.listdir("C:/Users/fdmol/Desktop/Webscraping-India-Schools/all_schools")
all_files

['all_schools_1015000.csv',
 'all_schools_1255000.csv',
 'all_schools_134000.csv',
 'all_schools_1489554.csv',
 'all_schools_195000.csv',
 'all_schools_235000.csv',
 'all_schools_291000.csv',
 'all_schools_375000.csv',
 'all_schools_455000.csv',
 'all_schools_575000.csv',
 'all_schools_655000.csv',
 'all_schools_815000.csv',
 'all_schools_895000.csv',
 'all_schools_951000.csv']

In [72]:
all_dfs = pd.DataFrame([])

for file in all_files:

    df = pd.read_csv(f"C:/Users/fdmol/Desktop/Webscraping-India-Schools/all_schools/{file}")
    all_dfs = pd.concat([all_dfs, df], ignore_index=True) 

In [73]:
missing_ids = get_missing_ids(all_dfs)

In [74]:
missing_ids

[1002530, 226171]

In [75]:
missing_schools_df = pd.DataFrame([])
counter = 0

for missing_id in missing_ids:
    try:
        missing_school_df = get_school_df(URL, missing_id, missing_id)
        missing_schools_df = pd.concat([missing_schools_df, missing_school_df], ignore_index=True)
        time.sleep(np.random.randint(0, 2))
        counter += 1

    except Exception as err:
        print(str(err))
        print("Unable to get data")
        missing_school_df = pd.DataFrame([])
        missing_schools_df = pd.concat([missing_schools_df, missing_school_df], ignore_index=True)
        counter += 1
        time.sleep(5)
    
    if counter % 500 == 0:
        print(f"{counter} missing schools obtained")

print("Missing schools obtained, saving file")
missing_schools_df.to_csv(f"C:/Users/fdmol/Desktop/Webscraping-India-Schools/all_schools/missing_schools_df_{missing_id}.csv", index=False, encoding='utf-8-sig')

Missing schools obtained, saving file


In [82]:
complete_schools = pd.concat([all_dfs, missing_schools_df], ignore_index=True)
complete_schools.sort_values(by='objectid_1', ascending=True, inplace=True)
complete_schools.reset_index(drop=True, inplace=True)
complete_schools

Unnamed: 0,objectid_1,fid_school,objectid,school_code,school_name,schcat,school_cat,schtype,school_typ,schmgt,...,rururb,location,pincode,dtname,udise_stco,stname,longitude,latitude,stcode11,dtcode11
0,1,233143,235240,35020300702,GOVT PRIMARY SCHOOL SHASTRI NAGAR,1,Primary,3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.892304,6.811454,35,638
1,2,747816,753170,35020300201,GOVT SECONDARY SCHOOL GANDHI NAGAR,6,Secondary (1-10),3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.890508,6.838517,35,638
2,3,55885,56582,35020300703,GOVT PRIMARY SCHOOL LAXMI NAGAR,1,Primary,3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.886408,6.893214,35,638
3,4,639587,644125,35020300701,GOVT SENIOR SECONDARY SCHOOL VIJAY NAGAR,3,Higher Secondary (1-12),3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.919901,6.961255,35,638
4,5,223993,226006,35020300404,GOVT PRIMARY SCHOOL CHINGAN,1,Primary,3,Co-Ed,1,...,1,,744302,Nicobars,35,ANDAMAN & NICOBAR,93.912472,6.972397,35,638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489549,1489550,516180,520404,1021703201,MS KHANPETH,2,Upper Primary,3,Co-Ed,1,...,1,,193121,Muzaffarabad,1,JAMMU & KASHMIR,74.372000,34.927000,1,992
1489550,1489551,10708,11039,37070500905,P/S KHEMI,1,Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.428660,34.944427,38,3
1489551,1489552,587820,592231,37070500901,M/S TONGSTED,2,Upper Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.397833,34.977283,38,3
1489552,1489553,89211,90186,37070500701,P/S NUNGSTATE,1,Primary,3,Co-Ed,1,...,1,,194401,Leh,37,LADAKH,77.379175,34.984490,38,3


In [83]:
complete_schools.sort_values(by='objectid_1', ascending=True, inplace=True)
complete_schools.drop(columns=["objectid", "objectid_1", "pincode"], inplace=True)
complete_schools.reset_index(drop=True, inplace=True)

In [84]:
complete_schools.columns

Index(['fid_school', 'school_code', 'school_name', 'schcat', 'school_cat',
       'schtype', 'school_typ', 'schmgt', 'management', 'rururb', 'location',
       'dtname', 'udise_stco', 'stname', 'longitude', 'latitude', 'stcode11',
       'dtcode11'],
      dtype='object')

In [85]:
# Guardar en local
complete_schools.to_csv("india_schools.csv", index=False, encoding='utf-8-sig')

In [86]:
complete_schools

Unnamed: 0,fid_school,school_code,school_name,schcat,school_cat,schtype,school_typ,schmgt,management,rururb,location,dtname,udise_stco,stname,longitude,latitude,stcode11,dtcode11
0,233143,35020300702,GOVT PRIMARY SCHOOL SHASTRI NAGAR,1,Primary,3,Co-Ed,1,Department of Education,1,,Nicobars,35,ANDAMAN & NICOBAR,93.892304,6.811454,35,638
1,747816,35020300201,GOVT SECONDARY SCHOOL GANDHI NAGAR,6,Secondary (1-10),3,Co-Ed,1,Department of Education,1,,Nicobars,35,ANDAMAN & NICOBAR,93.890508,6.838517,35,638
2,55885,35020300703,GOVT PRIMARY SCHOOL LAXMI NAGAR,1,Primary,3,Co-Ed,1,Department of Education,1,,Nicobars,35,ANDAMAN & NICOBAR,93.886408,6.893214,35,638
3,639587,35020300701,GOVT SENIOR SECONDARY SCHOOL VIJAY NAGAR,3,Higher Secondary (1-12),3,Co-Ed,1,Department of Education,1,,Nicobars,35,ANDAMAN & NICOBAR,93.919901,6.961255,35,638
4,223993,35020300404,GOVT PRIMARY SCHOOL CHINGAN,1,Primary,3,Co-Ed,1,Department of Education,1,,Nicobars,35,ANDAMAN & NICOBAR,93.912472,6.972397,35,638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489549,516180,1021703201,MS KHANPETH,2,Upper Primary,3,Co-Ed,1,Department of Education,1,,Muzaffarabad,1,JAMMU & KASHMIR,74.372000,34.927000,1,992
1489550,10708,37070500905,P/S KHEMI,1,Primary,3,Co-Ed,1,Department of Education,1,,Leh,37,LADAKH,77.428660,34.944427,38,3
1489551,587820,37070500901,M/S TONGSTED,2,Upper Primary,3,Co-Ed,1,Department of Education,1,,Leh,37,LADAKH,77.397833,34.977283,38,3
1489552,89211,37070500701,P/S NUNGSTATE,1,Primary,3,Co-Ed,1,Department of Education,1,,Leh,37,LADAKH,77.379175,34.984490,38,3
