# Tanzania - Webscraping 2021

### Packages (imports)

In [1]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Functions

In [2]:
# Entry page of the results site; every other page is reached from its links.
base_url = 'https://matokeo.necta.go.tz/psle/psle.htm'

# Request headers carrying a desktop-browser user-agent string.
# The literal is split over several lines only for readability; adjacent
# string literals are concatenated into a single value.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/100.0.4896.75 Safari/537.36"
    ),
}

In [3]:
def get_regions_dict(base_url) -> dict:
    """
    Get the regions dictionary from the base url

    Downloads the page at ``base_url`` and maps the text of every link on
    it to the absolute URL that link points to.

    Parameters
    ----------
    base_url : str
        URL of the index page whose links are collected.

    Returns
    -------
    regions_dict : dict
        Link text (with carriage returns and newlines removed) mapped to
        the absolute URL of the link. If two links share the same text,
        the later one wins.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    """

    # Connect to the base url. The timeout keeps a stalled server from
    # hanging the notebook; raise_for_status stops an error page from being
    # parsed as if it were the index.
    r = requests.get(base_url, headers=headers, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    regions_dict = {}

    # href=True skips anchors that have no href attribute; indexing
    # a_object['href'] on those would raise KeyError.
    for a_object in soup.find_all('a', href=True):
        # Remove unwanted characters
        region_name = a_object.text.replace('\r', '').replace('\n', '')

        # Resolve the link against the page it was found on instead of
        # relying on the URL containing a particular file name.
        regions_dict[region_name] = urljoin(base_url, a_object['href'])

    return regions_dict

In [4]:
def get_districts_dict(region_url) -> dict:
    """
    Get the districts dictionary from a region url

    Downloads the page at ``region_url`` and maps the text of every link on
    it to the absolute URL that link points to.

    Parameters
    ----------
    region_url : str
        URL of a region page, e.g. one of the values returned by
        ``get_regions_dict``.

    Returns
    -------
    distr_dict : dict
        Link text (with carriage returns and newlines removed) mapped to
        the absolute URL of the link. If two links share the same text,
        the later one wins.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    # Send the same headers as the index request so every level of the
    # crawl identifies itself consistently; fail loudly on error pages.
    r = requests.get(region_url, headers=headers, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    distr_dict = {}

    # Get district dictionary from the region page.
    # href=True skips anchors without an href (would raise KeyError).
    for a_object in soup.find_all('a', href=True):
        distr_name = a_object.text.replace('\r', '').replace('\n', '')

        # Links are resolved relative to the page they appear on, which
        # removes the dependency on the global base url and a hard-coded
        # "results/" path segment.
        distr_dict[distr_name] = urljoin(region_url, a_object['href'])

    return distr_dict

In [13]:
def get_schools_dict(distr_url) -> dict:
    """
    Get the schools dictionary from a district url

    Downloads the page at ``distr_url`` and maps the text of every link on
    it to the absolute URL that link points to.

    Parameters
    ----------
    distr_url : str
        URL of a district page, e.g. one of the values returned by
        ``get_districts_dict``.

    Returns
    -------
    schools_dict : dict
        Link text (with carriage returns and newlines removed) mapped to
        the absolute URL of the link. If two links share the same text,
        the later one wins.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    """

    # Send the same headers as the index request so every level of the
    # crawl identifies itself consistently; fail loudly on error pages.
    r = requests.get(distr_url, headers=headers, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    schools_dict = {}

    # Get schools dictionary from the district page.
    # href=True skips anchors without an href (would raise KeyError).
    for a_object in soup.find_all('a', href=True):
        school_name = a_object.text.replace('\r', '').replace('\n', '')

        # Links are resolved relative to the page they appear on, which
        # removes the dependency on the global base url and a hard-coded
        # "results/" path segment.
        schools_dict[school_name] = urljoin(distr_url, a_object['href'])

    return schools_dict

### Pipeline

In [16]:
# Example walk down the hierarchy: index page -> one region -> one district.
REGION_NAME = 'ARUSHA'
DISTRICT_NAME = 'NGORONGORO'

# Level 1: every link on the index page.
regions_dict = get_regions_dict(base_url)

# Level 2: links on the selected region's page.
region_page_url = regions_dict[REGION_NAME]
distr_dict = get_districts_dict(region_page_url)

# Level 3: links on the selected district's page.
district_page_url = distr_dict[DISTRICT_NAME]
schools_dict = get_schools_dict(district_page_url)

In [19]:
# Display the mapping returned by get_schools_dict for the selected district.
schools_dict

{'ALAILELAI PRIMARY SCHOOL - PS0107001': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107001.htm',
 'ARASH PRIMARY SCHOOL - PS0107002': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107002.htm',
 'BRIGHT PRIMARY SCHOOL - PS0107066': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107066.htm',
 'BULATI PRIMARY SCHOOL - PS0107003': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107003.htm',
 'BUTEMINE PRIMARY SCHOOL - PS0107080': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107080.htm',
 'DIGODIGO GCCT PRIMARY SCHOOL - PS0107079': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107079.htm',
 'DIGODIGO PRIMARY SCHOOL - PS0107004': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107004.htm',
 'EMBAASI PRIMARY SCHOOL - PS0107078': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107078.htm',
 'ENDULEN PRIMARY SCHOOL - PS0107005': 'https://matokeo.necta.go.tz/psle/results/shl_ps0107005.htm',
 'ENGARESERO PRIMARY SCHOOL - PS0107030': 'https://matokeo.necta.go.tz/psle/results/s