# Tanzania - Webscraping 2021

### Paquetes (imports)

In [178]:
import re
import time
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#### Functions

In [2]:
base_url = 'https://matokeo.necta.go.tz/psle/psle.htm'
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"}

#### Funciones para sacar urls

In [3]:
def get_regions_dict(base_url):
    """
    Get the regions dictionary from the base url

    Parameters
    ----------
    base_url : str
    """

    # Connect to the base url
    r = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Parse html and get regions
    a_objects = soup.find_all('a')
    unwanted_chars = ['\r', '\n']
    regions_dict = {}
    
    for a_object in a_objects:
        href = a_object['href']
        region_name = a_object.text

        # Remove unwanted characters
        for unwanted_char in unwanted_chars:
            region_name = region_name.replace(unwanted_char, '')

        
        region_url = base_url.replace("psle.htm", href)
        regions_dict[region_name] = region_url

    return regions_dict

In [4]:
def get_districts_dict(region_url):
    """
    Get the districts dictionary from the regions dictionary

    Parameters
    ----------
    regions_dict : dict

    Returns
    -------

    districts_dict : dict
    """
    r = requests.get(region_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    a_objects = soup.find_all('a')
    
    unwanted_chars = ['\r', '\n']
    distr_dict = {}

    # Get district dictionary from the region url
    for a_object in a_objects:
        href = a_object['href']
        distr_name = a_object.text

        for unwanted_char in unwanted_chars:
            distr_name = distr_name.replace(unwanted_char, '')

        distr_url = base_url.replace("psle.htm", f"results/{href}")
        distr_dict[distr_name] = distr_url

    return distr_dict

In [13]:
def get_schools_dict(distr_url):
    """
    Get the schools dictionary from the districts dictionary
    """
    
    r = requests.get(distr_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    a_objects = soup.find_all('a')

    unwanted_chars = ['\r', '\n']
    schools_dict = {}

    # Get district dictionary from the region url
    for a_object in a_objects:
        href = a_object['href']
        school_name = a_object.text

        for unwanted_char in unwanted_chars:
            school_name = school_name.replace(unwanted_char, '')

        school_url = base_url.replace("psle.htm", f"results/{href}")
        schools_dict[school_name] = school_url

    return schools_dict

### Funciones - Parsear los datos

In [231]:
def get_raw_table(school_url):

    """
    Gets table with raw data from the school url
    """
    r = requests.get(school_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    raw_tables = soup.find_all('table')

    if len(raw_tables) == 0:
        raw_table = pd.DataFrame([])

    else:
        tables = pd.read_html(r.content)
        raw_table = tables[1]
        raw_table.rename(columns = {0: "CAND_NO", 1: "PREM_NO", 2: "SEX", 
                                    3: "CAND_NAME", 4: "SUBJECTS"}, inplace=True)
        raw_table = raw_table.iloc[1:,:]
        raw_table.reset_index(drop=True, inplace=True)

    return raw_table

In [232]:
def get_subjects(raw_table):
    """
    Gets the subjects from the raw table
    """        
    unique_subjects = raw_table['SUBJECTS'].unique()

    # Asegurarnos de que las materias no sean "*R" o una cosa así
    for unique_subject in unique_subjects:
        if len(unique_subject) > 3:
            str_subjects = unique_subject
            break
    
    list_subjects = str_subjects.split(",")
    clean_subjects = []

    for subject in list_subjects:
        raw_subject = subject.split("-")
        clean_subject = raw_subject[0].replace(" ", "")
        clean_subjects.append(clean_subject)

    return clean_subjects

In [233]:
def get_grades(raw_table, num_subjects):
    """
    Gets the grades from the raw table
    """
    str_subjects = raw_table['SUBJECTS']

    # Casos especiales cuando viene una sola letra, poner un vector de NA's
    if (len(str_subjects) <= 3) or ("*" in str_subjects):
            clean_grades = ["NA"] * num_subjects
    else:
        raw_grades = re.findall( r'\s-\s\w+' , str_subjects)
        clean_grades = []
        
        for raw_grade in raw_grades:
            clean_grade = re.findall( r'\w+' , raw_grade)
            clean_grade = clean_grade[0]
            clean_grades.append(clean_grade)

    return clean_grades

### Pipeline

In [234]:
regions_dict = get_regions_dict(base_url)
region_names = list(regions_dict.keys())

regions_df = pd.DataFrame([])

for region_name in region_names:

    distr_dict = get_districts_dict(regions_dict[region_name])
    distr_names = list(distr_dict.keys())

    district_df = pd.DataFrame([])

    for distr_name in distr_names:
        schools_dict = get_schools_dict(distr_dict[distr_name])

        schools_names = list(schools_dict.keys())

        students_df = pd.DataFrame([])

        for school_name in schools_names:

            school_url = schools_dict[school_name]
            raw_table = get_raw_table(school_url)
            
            if len(raw_table) == 0: # Si no hubo tabla, continuar con otra escuela
                print("Skipping school:", school_name)
                continue

            subjects = get_subjects(raw_table)
            num_subjects = len(subjects)
            
            raw_table[subjects] = raw_table.apply(get_grades, args=[num_subjects], 
                                                    axis=1, result_type='expand')

            raw_table['SCHOOL_NAME'] = school_name
            raw_table['DISTRICT_NAME'] = distr_name
            raw_table['REGION_NAME'] = region_name

            students_df = pd.concat([students_df, raw_table],
                                    ignore_index=True)
        
        print(f"Done with district {distr_name}")
        time.sleep(2)
        district_df = pd.concat([district_df, students_df],
                                ignore_index=True)
    
    regions_df = pd.concat([regions_df, district_df], 
                            ignore_index=True)

    print(f" =============== Done with region {region_name} ============")
    time.sleep(5)
        

Done with district ARUSHA
Done with district ARUSHA CC
Done with district KARATU
Done with district LONGIDO
Done with district MERU
Done with district MONDULI
Done with district NGORONGORO
Done with district ILALA
Skipping school: SEGEREA ADVENTIST PRIMARY SCHOOL - PS0202045
Skipping school: ST.ANTHONY OF PADUA PRIMARY SCHOOL - PS0202135
Done with district ILALA CC
Done with district KIGAMBONI MC
Done with district KINONDONI MC
Done with district TEMEKE MC
Done with district UBUNGO MC
Done with district BAHI
Done with district CHAMWINO
Done with district CHEMBA
Done with district DODOMA CC
Done with district KONDOA
Done with district KONDOA TC
Done with district KONGWA
Done with district MPWAPWA
Done with district IRINGA
Done with district IRINGA MC
Done with district KILOLO
Done with district MAFINGA TC
Done with district MUFINDI
Done with district BIHARAMULO
Done with district BUKOBA
Done with district BUKOBA MC
Done with district KARAGWE
Done with district KYERWA
Done with district 

UnboundLocalError: local variable 'str_subjects' referenced before assignment

### Debugging

In [203]:
school_url
raw_table = get_raw_table(school_url)
raw_table
subjects = get_subjects(raw_table)
num_subjects = len(subjects)
subjects

['*R']

In [206]:
raw_table['SUBJECTS'].unique()[1]

'Kiswahili - A, English - A, Maarifa - A, Hisabati - A, Science - A, Uraia - B, Average Grade - A'

In [194]:
raw_table['SUBJECTS'][0]

'*R'

In [142]:
students_df['SUBJECTS'].unique()

array(['Kiswahili - B, English - A, Maarifa - C, Hisabati - C, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - C, Hisabati - B, Science - B, Uraia - B, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - D, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - A, Maarifa - C, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - C, Hisabati - A, Science - C, Uraia - B, Average Grade - B',
       'Kiswahili - B, English - B, Maarifa - C, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - B, Maarifa - D, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - B, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - A, Maarifa - C, Hisabati - A, Science - B, Uraia - B, Average Grade - B',
       'Kiswahili - A, English - A, M

In [128]:
schools_dict["ALAILELAI PRIMARY SCHOOL - PS0107001"]

'https://matokeo.necta.go.tz/psle/results/shl_ps0107001.htm'

In [116]:
school_url = schools_dict['ALAILELAI PRIMARY SCHOOL - PS0107001']
raw_table = get_raw_table(school_url)

In [117]:
subjects = get_subjects(raw_table)

In [118]:
raw_table[subjects] = raw_table.apply(get_grades, axis=1, result_type='expand')