# Tanzania - Webscraping 2021

### Packages (imports)

In [4]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Functions

In [66]:
# Index page of the results site; the scraper starts from here.
base_url = "https://matokeo.necta.go.tz/psle/psle.htm"

# Browser-like User-Agent header passed to requests.get.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/100.0.4896.75 Safari/537.36"
    ),
}

In [67]:
def get_regions_dict(base_url):
    """
    Get the regions dictionary from the base url

    Fetches the page at ``base_url`` and maps the text of every link found
    on it to the absolute URL that the link points to.

    Parameters
    ----------
    base_url : str
        URL of the index page whose links are collected.

    Returns
    -------
    regions_dict : dict
        Link text (with carriage returns and newlines removed) mapped to
        the link target resolved against ``base_url``. If two links share
        the same text, the later one wins.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    """

    # Connect to the base url; fail loudly instead of parsing an error page
    r = requests.get(base_url, headers=headers, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    regions_dict = {}

    # href=True skips anchors without a target, which would otherwise
    # raise KeyError on a_object['href']
    for a_object in soup.find_all('a', href=True):
        # Remove unwanted characters
        region_name = a_object.text.replace('\r', '').replace('\n', '')

        # Resolve the link against the page it came from, so this works
        # for any index URL and not only ones ending in "psle.htm"
        regions_dict[region_name] = urljoin(base_url, a_object['href'])

    return regions_dict

In [95]:
def get_districts_dict(region_url):
    """
    Get the districts dictionary from a region url

    Fetches the page at ``region_url`` and maps the text of every link
    found on it to the absolute URL that the link points to.

    Parameters
    ----------
    region_url : str
        URL of a region page, e.g. one of the values of the dictionary
        returned by ``get_regions_dict``.

    Returns
    -------
    distr_dict : dict
        Link text (with carriage returns and newlines removed) mapped to
        the link target resolved against ``region_url``. If two links
        share the same text, the later one wins.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    # Send the module-level browser-like headers, as get_regions_dict does,
    # and fail loudly instead of parsing an error page
    r = requests.get(region_url, headers=headers, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    distr_dict = {}

    # Get district dictionary from the region url.
    # href=True skips anchors without a target, which would otherwise
    # raise KeyError on a_object['href']
    for a_object in soup.find_all('a', href=True):
        distr_name = a_object.text.replace('\r', '').replace('\n', '')

        # Relative links are resolved against the page they appear on,
        # so no global base url or hard-coded "results/" prefix is needed
        distr_dict[distr_name] = urljoin(region_url, a_object['href'])

    return distr_dict

### Pipeline

In [99]:
# Scrape the region index, then collect the district links of the KATAVI region
regions_dict = get_regions_dict(base_url)
distr_dict = get_districts_dict(regions_dict['KATAVI'])
# Bare expression: displayed as the cell output in the notebook
distr_dict

{'MLELE': 'https://matokeo.necta.go.tz/psle/results/distr_2501.htm',
 'MPANDA MC': 'https://matokeo.necta.go.tz/psle/results/distr_2502.htm',
 'MPIMBWE': 'https://matokeo.necta.go.tz/psle/results/distr_2505.htm',
 'NSIMBO': 'https://matokeo.necta.go.tz/psle/results/distr_2504.htm',
 'TANGANYIKA': 'https://matokeo.necta.go.tz/psle/results/distr_2503.htm'}