# Extract Data from Schools

In [47]:
# Global Packages
import time
from urllib.parse import urljoin

import certifi
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


### Parameters

In [2]:
# Listing page of schools; the query string leaves the name filter empty and
# asks for all localities.
BASE_URL = "https://www.redacademica.edu.co/colegios?name=&field_localidad_target_id=All"

# Browser-like headers passed to every requests.get call in this notebook.
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.114 Safari/537.36"
    ),
    "Host": "www.redacademica.edu.co",
}

In [21]:
def get_urls_dict(soup, base_url="https://www.redacademica.edu.co"):
    """
    Collect school page URLs from a parsed listing page.

    Every ``<a>`` with an ``href`` containing ``"colegios/"`` (case-insensitive)
    is treated as a link to a school page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page. Only ``soup.find_all("a", href=True)`` is used.
    base_url : str, optional
        Site root that relative hrefs are resolved against.

    Returns
    -------
    urls_dict : dict
        Maps the anchor text (as found in the page) to the absolute school URL.
        If the same text appears more than once, the last link wins.
    """
    urls_dict = {}
    for a_object in soup.find_all("a", href=True):
        href = a_object["href"]
        if "colegios/" not in href.lower():
            continue

        school_name = a_object.text
        # Anchors with whitespace-only text carry no school name to key on
        # (the listing produces at least one such key, "\n\n\n\n").
        if not school_name.strip():
            continue

        # urljoin leaves absolute hrefs untouched and inserts the missing "/"
        # for relative ones, which plain string concatenation does not.
        urls_dict[school_name] = urljoin(base_url, href)

    return urls_dict

In [50]:
def _extract_school_info(info_container):
    """Return a ``{label: value}`` dict built from the ``<li>`` items of a school info list."""
    school_info = {}
    for list_element in info_container.find_all("li"):
        # Take the label from inside this very item, so an item without an
        # <h4> cannot shift the remaining values under the wrong column.
        column_name = list_element.find("h4")
        if column_name is None:
            continue
        clean_col_name = column_name.text
        # Remove only the first occurrence (the label, assuming the <h4> comes
        # before the value); the value itself may repeat the same word.
        raw_text = list_element.text.replace(clean_col_name, "", 1)
        # Remove new lines and strip
        school_info[clean_col_name] = raw_text.replace("\n", " ").strip()
    return school_info


def get_schools_df(urls_dict, delay=1.5, timeout=30):
    """
    Takes a dictionary of school names and urls and returns a dataframe with all the info

    Each school page is downloaded with ``requests.get`` using the module-level
    ``HEADERS``. Fields are read from the ``<ul class="school-info__list">``
    element; schools whose page has no such element are reported and left out
    of the result. Request errors are not caught and propagate to the caller.

    Parameters
    ----------
    urls_dict : dict
        Dictionary of school names and urls
    delay : float, optional
        Seconds to wait after every request, including skipped schools.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` so a stalled connection
        raises instead of blocking the whole run.

    Returns
    -------
    schools_df : pd.DataFrame
        One row per scraped school with ``school_name``, ``school_url`` and one
        column per field label found on the school pages.
    """
    school_dicts = []

    for school_name, school_url in urls_dict.items():
        print(f"Getting info from {school_name}...")
        school_r = requests.get(school_url, headers=HEADERS, timeout=timeout)
        school_soup = BeautifulSoup(school_r.content, "html.parser")
        info_container = school_soup.find("ul", {"class": "school-info__list"})

        if info_container is None:
            print(f"Skipping {school_name}...")
        else:
            school_dict = {"school_name": school_name, "school_url": school_url}
            school_dict.update(_extract_school_info(info_container))
            school_dicts.append(school_dict)

        # Pause after every request, whether or not the school was kept.
        time.sleep(delay)

    # Create schools df
    schools_df = pd.DataFrame(school_dicts)

    return schools_df

### Playground

In [3]:
# Download the listing page. The timeout keeps a stalled connection from
# blocking the kernel, and raise_for_status stops an HTTP error page from
# being parsed as if it were the school listing.
r = requests.get(BASE_URL, headers=HEADERS, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

In [23]:
# Build the mapping returned by get_urls_dict for the listing page and display it.
urls_dict = get_urls_dict(soup)
urls_dict

{'Colegio Aquileo Parra': 'https://www.redacademica.edu.co/colegios/colegio-aquileo-parra',
 'IED Colegio El Verjon': 'https://www.redacademica.edu.co/colegios/ied-colegio-el-verjon',
 'Colegio Altamira Sur Oriental (IED)': 'https://www.redacademica.edu.co/colegios/colegio-altamira-sur-oriental-ied',
 'Colegio Montebello (IED)': 'https://www.redacademica.edu.co/colegios/colegio-montebello-ied',
 'Colegio Atenas (IED)': 'https://www.redacademica.edu.co/colegios/colegio-atenas-ied',
 'Colegio Jose Joaquin Castro Martinez (IED)': 'https://www.redacademica.edu.co/colegios/colegio-jose-joaquin-castro-martinez-ied',
 'Colegio Entre Nubes Sur Oriental (IED)': 'https://www.redacademica.edu.co/colegios/colegio-entre-nubes-sur-oriental-ied',
 'Colegio Diego Montaña Cuellar (IED)': 'https://www.redacademica.edu.co/colegios/colegio-diego-montana-cuellar-ied',
 'Colegio Gran Yomasa I. E. D.': 'https://www.redacademica.edu.co/colegios/colegio-gran-yomasa-i-e-d',
 'Colegio Marco Fidel Suárez (IED)': 

In [49]:
# Scrape every school page and preview the resulting table. `school_dicts`
# is a local of get_schools_df in the visible source, so this cell goes
# through the function instead of relying on leftover kernel state.
schools_df = get_schools_df(urls_dict)
schools_df.head(10)

Unnamed: 0,school_name,school_url,Jornada,Dirección,Correo,Teléfono,Horario de Atención,DANE
0,Colegio Aquileo Parra,https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde Nocturna,Carrera 18A # 187-75,insdiaquileoparrav1@educacionbogota.edu.co,6745973 - 6745973,8:00 a.m a 4:00 p.m.,
1,IED Colegio El Verjon,https://www.redacademica.edu.co/colegios/ied-c...,única,km 13 de la vía Bogotá - Choachí,escdielverjonbajoe2@redp.edu.co,,,
2,Colegio Altamira Sur Oriental (IED),https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde,Calle 42 A Sur # 12 A - 27 Este,escdialtamirasuror4@educacionbogota.edu.co,2068596 / 3043999116,M y V 9:00 a.m. a 11:00 a.m. - 2:00 p.m. a 4:0...,
3,Colegio Montebello (IED),https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde,Calle 24A Sur No 1A- 95 Este,cedmontebello4@educacionbogota.edu.co,2067680 - 3002067382,8:00 a.m. - 4:00 p.m.,
4,Colegio Atenas (IED),https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde,Diagonal 34 Sur # 2A-05 Este Barrio: Atenas.,,2069970,Lunes a Viernes de 8:00 am - 4:30pm.,
5,Colegio Jose Joaquin Castro Martinez (IED),https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde,Sede A Calle 31 D Bis Sur No. 2 - 24 Este,coldijosejoaquinca4@educacionbogota.edu.co,2065563,,
6,Colegio Entre Nubes Sur Oriental (IED),https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde,CL 43 B Sur N. 1 D - 03 Este,cedanibalfernande4@educacionbogota.edu.co,2068676 - 3644228,8:00 a.m. - 2:00 p.m.,
7,Colegio Diego Montaña Cuellar (IED),https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde,Transversal 6b # 100c 55 sur,cedmonteblanco5@redp.edu.co,,8:00 a.m. - 4:00 p.m.,
8,Colegio Gran Yomasa I. E. D.,https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde,CL 81C SUR # 8A - 06 ESTE,cedgranyomasa5@educacionbogota.edu.co,3002068876,Lunes a viernes de 8:00 a. m. a 12:00 m. y de ...,211850000051.0
9,Colegio Marco Fidel Suárez (IED),https://www.redacademica.edu.co/colegios/coleg...,Mañana Tarde,Carrera 25Sur # 52 C - 92,coldimarcofidelsua6@educacionbogota.edu.co,7412113 / 2700366 / 7410022,L - V 7:00 a.m. a 4:30 p.m. en jornada continua.,


In [43]:
# NOTE(review): scratch cell. In the visible source `school_dict` is only
# assigned inside get_schools_df, so this presumably relies on leftover
# kernel state; confirm it survives Restart & Run All, or remove it.
school_dict

{'Jornada': 'Mañana Tarde Nocturna',
 'Dirección': 'Carrera 18A # 187-75',
 'Correo': 'insdiaquileoparrav1@educacionbogota.edu.co',
 'Teléfono': '6745973 - 6745973',
 'Horario de Atención': '8:00 a.m a 4:00 p.m.'}

In [31]:
# NOTE(review): scratch cell. `li` is not defined anywhere in the visible
# source; presumably a single <li> taken from a school info list. Confirm
# where it comes from, or remove the cell.
li.text.split("\n")

['', 'Horario de Atención', '8:00 a.m a 4:00 p.m.', '']