# Extract Data from Schools

In [63]:
# Global Packages
import re
import time
import requests
import certifi
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


### Parameters

In [51]:
# URL of the first listing page; the scraping loop rewrites the trailing
# "page=0" query parameter to reach the other pages.
BASE_URL = "https://www.redacademica.edu.co/colegios?name=&field_localidad_target_id=All&page=0"

# HTTP headers sent with every request: a desktop Chrome user agent string
# and an explicit Host header.
HEADERS = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Host": "www.redacademica.edu.co",
}

# Used as the exclusive upper bound of the page range, so pages
# 0 .. LAST_PAGE - 1 are requested.
LAST_PAGE = 13

In [52]:
# Find all <a> tags that have an href attribute
def get_urls_dict(soup):
    """
    Collect school page URLs from a parsed listing page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page. Only ``soup.find_all("a", href=True)`` is used.

    Returns
    -------
    urls_dict : dict
        Maps the text of each matching anchor to the absolute URL of the
        school page. If several anchors share the same text, the last one
        wins.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    site_root = "https://www.redacademica.edu.co"
    urls_dict = {}

    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        # Only links that point below "colegios/" are school pages.
        if "colegios/" not in href.lower():
            continue

        name = anchor.text
        # Anchors without visible text (for example the "\n\n\n\n" entry that
        # used to be deleted afterwards) do not name a school.
        if not name.strip():
            continue

        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the site root, instead of blindly concatenating strings.
        urls_dict[name] = urljoin(site_root, href)

    return urls_dict

In [64]:
def get_schools_df(urls_dict):
    """
    Download every school page and collect its info list into a dataframe.

    Parameters
    ----------
    urls_dict : dict
        Dictionary of school names and urls

    Returns
    -------
    schools_df : pd.DataFrame
        One row per school whose page contains a ``ul.school-info__list``
        element. Columns are ``school_name``, ``school_url`` and one column
        per ``<h4>`` heading found in that list. Schools without the list
        are reported and left out. An empty ``urls_dict`` yields an empty
        dataframe.
    """
    school_dicts = []

    for school_name, school_url in urls_dict.items():
        print(f"Getting info from {school_name}...")
        # requests waits indefinitely unless a timeout is given; cap it so a
        # stalled connection cannot hang the whole scrape.
        school_r = requests.get(school_url, headers=HEADERS, timeout=30)
        school_soup = BeautifulSoup(school_r.content, "html.parser")
        info_container = school_soup.find("ul", {"class": "school-info__list"})

        if info_container is None:
            print(f"Skipping {school_name}...")     
        else:
            school_dict = {
                "school_name": school_name,
                "school_url": school_url,
            }
            column_names = info_container.find_all("h4")
            list_elements = info_container.find_all("li")
            for column_name, list_element in zip(column_names, list_elements):
                # The heading text is part of the <li> text; drop it so only
                # the value remains.
                clean_col_name = column_name.text
                raw_text = list_element.text.replace(clean_col_name, "")
                # Collapse newlines and runs of whitespace into single spaces
                # (re.sub, since the re module has no "replace" function).
                school_dict[clean_col_name] = re.sub(r"\s+", " ", raw_text).strip()
            school_dicts.append(school_dict)

        # Pause after every request, including skipped schools, so the
        # server is never hit back-to-back.
        time.sleep(0.5)

    # Create schools df
    schools_df = pd.DataFrame(school_dicts)

    return schools_df

### Playground

In [61]:
# Scrape every listing page, save one CSV per page, and accumulate all rows
# in all_schools_df. The accumulator is updated inside the loop so partial
# results remain available if the cell is interrupted.
all_schools_df = pd.DataFrame()
pages = list(range(LAST_PAGE))

for page in pages:
    print(f" ================ Getting page {page}... ==================")
    page_url = BASE_URL.replace("page=0", f"page={page}")
    # requests waits indefinitely unless a timeout is given; cap it so a
    # stalled connection cannot hang the loop.
    r = requests.get(page_url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(r.content, "html.parser")
    urls_dict = get_urls_dict(soup)
    print(f"Found {len(urls_dict)} schools in page {page}...")
    schools_df = get_schools_df(urls_dict)
    # Per-page checkpoint file.
    schools_df.to_csv(f"schools_{page}.csv", encoding='utf-8-sig',
                       index=False)
    all_schools_df = pd.concat([all_schools_df, schools_df], ignore_index=True)
    # Long pause between listing pages to keep the request rate low.
    time.sleep(10)

Found 20 schools in page 0...
Getting info from Colegio Aquileo Parra...
Getting info from IED Colegio El Verjon...
Getting info from Colegio Altamira Sur Oriental (IED)...
Getting info from Colegio Montebello (IED)...
Getting info from Colegio Atenas (IED)...
Getting info from Colegio Jose Joaquin Castro Martinez (IED)...
Getting info from Colegio Entre Nubes Sur Oriental (IED)...
Getting info from Colegio Diego Montaña Cuellar (IED)...
Getting info from Colegio Gran Yomasa I. E. D....
Getting info from Colegio Marco Fidel Suárez (IED)...
Getting info from Colegio Motorista (CED)...
Getting info from Colegio Orlando Higuita Rojas (IED)...
Getting info from Colegio Carlos Pizarro Leongómez (IED)...
Getting info from Colegio Jose Francisco Socarras (IED)...
Getting info from Colegio Debora Arango Perez (IED)...
Getting info from Colegio Luis Lopez De Mesa (IED)...
Getting info from Colegio El Porvenir (IED)...
Getting info from Colegio Carlos Arango Velez (IED)...
Getting info from Cole

In [62]:
# Write the combined table to a single CSV without the index column; the
# "utf-8-sig" codec prepends a byte-order mark to the file.
all_schools_df.to_csv(
    "all_schools.csv",
    index=False,
    encoding="utf-8-sig",
)