# Extract Data from Schools

In [47]:
# Global Packages
import time
from urllib.parse import urljoin

import certifi
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


### Parameters

In [2]:
# School listing page, queried with an empty name filter and the "All" locality option.
BASE_URL = (
    "https://www.redacademica.edu.co/colegios"
    "?name=&field_localidad_target_id=All"
)

# Headers sent with every request: a desktop-browser user agent plus the site's host name.
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.114 Safari/537.36"
    ),
    "Host": "www.redacademica.edu.co",
}

In [21]:
# Collect every <a href=...> that points at a school page
def get_urls_dict(soup, base_url="https://www.redacademica.edu.co"):
    """
    Collect the school page URLs linked from a parsed listing page.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``find_all("a", href=True)`` (for example a
        BeautifulSoup object). Each returned anchor must support
        ``anchor["href"]`` and ``anchor.text``.
    base_url : str, optional
        Site root used to resolve relative hrefs into absolute URLs.

    Returns
    -------
    dict
        Maps each link's visible text (surrounding whitespace removed) to the
        absolute URL of the school page. When the same text appears more than
        once, the last link wins.
    """
    urls_dict = {}
    for a_object in soup.find_all("a", href=True):
        href = a_object["href"]
        if "colegios/" not in href.lower():
            continue

        # Links with no visible text (e.g. ones that only wrap an image) carry
        # whitespace-only text; skip all of them, not one specific newline run.
        school_name = a_object.text.strip()
        if not school_name:
            continue

        # urljoin leaves absolute hrefs untouched and resolves relative ones,
        # which plain string concatenation would turn into malformed URLs.
        urls_dict[school_name] = urljoin(base_url, href)

    return urls_dict

### Playground

In [3]:
# Download the listing page. The timeout stops a stalled connection from
# hanging the kernel, and raise_for_status() ensures an HTTP error page is
# never parsed as if it were the school list.
r = requests.get(BASE_URL, headers=HEADERS, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

In [23]:
# Map each school link's text on the listing page to its URL, then display the mapping.
urls_dict = get_urls_dict(soup)
urls_dict

{'Colegio Aquileo Parra': 'https://www.redacademica.edu.co/colegios/colegio-aquileo-parra',
 'IED Colegio El Verjon': 'https://www.redacademica.edu.co/colegios/ied-colegio-el-verjon',
 'Colegio Altamira Sur Oriental (IED)': 'https://www.redacademica.edu.co/colegios/colegio-altamira-sur-oriental-ied',
 'Colegio Montebello (IED)': 'https://www.redacademica.edu.co/colegios/colegio-montebello-ied',
 'Colegio Atenas (IED)': 'https://www.redacademica.edu.co/colegios/colegio-atenas-ied',
 'Colegio Jose Joaquin Castro Martinez (IED)': 'https://www.redacademica.edu.co/colegios/colegio-jose-joaquin-castro-martinez-ied',
 'Colegio Entre Nubes Sur Oriental (IED)': 'https://www.redacademica.edu.co/colegios/colegio-entre-nubes-sur-oriental-ied',
 'Colegio Diego Montaña Cuellar (IED)': 'https://www.redacademica.edu.co/colegios/colegio-diego-montana-cuellar-ied',
 'Colegio Gran Yomasa I. E. D.': 'https://www.redacademica.edu.co/colegios/colegio-gran-yomasa-i-e-d',
 'Colegio Marco Fidel Suárez (IED)': 

In [48]:
# Getting info from all schools
REQUEST_TIMEOUT = 30  # seconds before a stalled request is abandoned
REQUEST_DELAY = 3  # seconds to pause between requests so the site is not hammered


def _parse_school_info(school_soup):
    """Return the {label: value} pairs found in a school page's info list.

    Returns an empty dict when the page has no ``ul.school-info__list``.
    """
    info_container = school_soup.find("ul", {"class": "school-info__list"})
    if info_container is None:
        return {}

    school_info = {}
    for list_element in info_container.find_all("li"):
        # Take the label from inside its own <li>: pairing two independent
        # lists positionally would shift every later value onto the wrong
        # column as soon as one item lacks a label.
        column_name = list_element.find("h4")
        if column_name is None:
            continue
        clean_col_name = column_name.text
        # Remove only the first occurrence of the label so a value that
        # happens to repeat the label's wording is kept intact.
        # NOTE(review): assumes the label precedes the value inside the <li> — confirm.
        raw_text = list_element.text.replace(clean_col_name, "", 1)
        school_info[clean_col_name] = raw_text.replace("\n", " ").strip()

    return school_info


school_dicts = []

for school_name, school_url in urls_dict.items():
    print(f"Getting info from {school_name}...")
    # Create school dict
    school_dict = {"school_name": school_name, "school_url": school_url}

    # Get school info
    try:
        school_r = requests.get(school_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        school_r.raise_for_status()
    except requests.RequestException as error:
        # Keep the name/URL record so one failing page shows up as missing
        # fields instead of aborting the whole crawl and losing earlier work.
        print(f"  Could not fetch {school_url}: {error}")
    else:
        school_soup = BeautifulSoup(school_r.content, "html.parser")
        school_dict.update(_parse_school_info(school_soup))

    school_dicts.append(school_dict)

    time.sleep(REQUEST_DELAY)

Getting info from Colegio Aquileo Parra...
Getting info from IED Colegio El Verjon...
Getting info from Colegio Altamira Sur Oriental (IED)...


In [43]:
# Display the most recently built record (`school_dict` is rebound on every loop iteration).
school_dict

{'Jornada': 'Mañana Tarde Nocturna',
 'Dirección': 'Carrera 18A # 187-75',
 'Correo': 'insdiaquileoparrav1@educacionbogota.edu.co',
 'Teléfono': '6745973 - 6745973',
 'Horario de Atención': '8:00 a.m a 4:00 p.m.'}

In [31]:
# NOTE(review): `li` is not defined in any cell shown here — presumably a leftover
# from interactively inspecting one <li> of the info list; confirm or delete this cell.
li.text.split("\n")

['', 'Horario de Atención', '8:00 a.m a 4:00 p.m.', '']