# Tanzania - Webscraping 2021

### Paquetes (imports)

In [83]:
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#### Functions

In [2]:
base_url = 'https://matokeo.necta.go.tz/psle/psle.htm'
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"}

#### Funciones para sacar urls

In [3]:
def get_regions_dict(base_url):
    """
    Get the regions dictionary from the base url

    Parameters
    ----------
    base_url : str
    """

    # Connect to the base url
    r = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Parse html and get regions
    a_objects = soup.find_all('a')
    unwanted_chars = ['\r', '\n']
    regions_dict = {}
    
    for a_object in a_objects:
        href = a_object['href']
        region_name = a_object.text

        # Remove unwanted characters
        for unwanted_char in unwanted_chars:
            region_name = region_name.replace(unwanted_char, '')

        
        region_url = base_url.replace("psle.htm", href)
        regions_dict[region_name] = region_url

    return regions_dict

In [4]:
def get_districts_dict(region_url):
    """
    Get the districts dictionary from the regions dictionary

    Parameters
    ----------
    regions_dict : dict

    Returns
    -------

    districts_dict : dict
    """
    r = requests.get(region_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    a_objects = soup.find_all('a')
    
    unwanted_chars = ['\r', '\n']
    distr_dict = {}

    # Get district dictionary from the region url
    for a_object in a_objects:
        href = a_object['href']
        distr_name = a_object.text

        for unwanted_char in unwanted_chars:
            distr_name = distr_name.replace(unwanted_char, '')

        distr_url = base_url.replace("psle.htm", f"results/{href}")
        distr_dict[distr_name] = distr_url

    return distr_dict

In [13]:
def get_schools_dict(distr_url):
    """
    Get the schools dictionary from the districts dictionary
    """
    
    r = requests.get(distr_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    a_objects = soup.find_all('a')

    unwanted_chars = ['\r', '\n']
    schools_dict = {}

    # Get district dictionary from the region url
    for a_object in a_objects:
        href = a_object['href']
        school_name = a_object.text

        for unwanted_char in unwanted_chars:
            school_name = school_name.replace(unwanted_char, '')

        school_url = base_url.replace("psle.htm", f"results/{href}")
        schools_dict[school_name] = school_url

    return schools_dict

### Funciones - Parsear los datos

In [107]:
def get_raw_table(school_url):

    """
    Gets table with raw data from the school url
    """
    r = requests.get(school_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    tables = pd.read_html(r.content)

    raw_table = tables[1]
    raw_table.rename(columns = {0: "CAND_NO", 1: "PREM_NO", 2: "SEX", 
                                3: "CAND_NAME", 4: "SUBJECTS"}, inplace=True)
    raw_table = raw_table.iloc[1:,:]
    raw_table.reset_index(drop=True, inplace=True)

    return raw_table

In [156]:
def get_subjects(raw_table):
    """
    Gets the subjects from the raw table
    """        
    str_subjects = raw_table['SUBJECTS'][0]
    list_subjects = str_subjects.split(",")
    clean_subjects = []

    for subject in list_subjects:
        raw_subject = subject.split("-")
        clean_subject = raw_subject[0].replace(" ", "")
        clean_subjects.append(clean_subject)

    return clean_subjects

In [157]:
def get_grades(raw_table, clean_subjects):
    """
    Gets the grades from the raw table
    """
    str_subjects = raw_table['SUBJECTS']
    num_subjects = len(clean_subjects) # num de materias

    # Casos especiales cuando viene una sola letra
    if len(str_subjects) <= 3:
            clean_grades = ["NA" for i in range(num_subjects)]
    
    raw_grades = re.findall( r'\s-\s\w+' , str_subjects)
    clean_grades = []
    
    for raw_grade in raw_grades:
        clean_grade = re.findall( r'\w+' , raw_grade)
        clean_grade = clean_grade[0]
        clean_grades.append(clean_grade)

    return clean_grades

### Pipeline

In [158]:
regions_dict = get_regions_dict(base_url)
distr_dict = get_districts_dict(regions_dict['ARUSHA'])
distr_names = list(distr_dict.keys())

district_df = pd.DataFrame([])



schools_dict = get_schools_dict(distr_dict[distr_names[0]])

schools_names = list(schools_dict.keys())

students_df = pd.DataFrame([])

for school_name in schools_names:

    school_url = schools_dict[school_name]
    raw_table = get_raw_table(school_url)
    subjects = get_subjects(raw_table)
    
    
    raw_table[subjects] = raw_table.apply(get_grades, axis=1, result_type='expand')

    # raw_table['SCHOOL_NAME'] = school_name
    # raw_table['DISTRICT_NAME'] = distr_name
    # raw_table['REGION_NAME'] = region_name

    students_df = pd.concat([students_df, raw_table],
                            ignore_index=True)
    
# district_df = pd.concat([district_df, students_df],
#                             ignore_index=True)

ValueError: arrays must all be same length

In [153]:
raw_table.iloc[29]['SUBJECTS']

'*W'

In [142]:
students_df['SUBJECTS'].unique()

array(['Kiswahili - B, English - A, Maarifa - C, Hisabati - C, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - C, Hisabati - B, Science - B, Uraia - B, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - D, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - A, Maarifa - C, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - C, Hisabati - A, Science - C, Uraia - B, Average Grade - B',
       'Kiswahili - B, English - B, Maarifa - C, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - B, Maarifa - D, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - B, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - A, Maarifa - C, Hisabati - A, Science - B, Uraia - B, Average Grade - B',
       'Kiswahili - A, English - A, M

In [128]:
schools_dict["ALAILELAI PRIMARY SCHOOL - PS0107001"]

'https://matokeo.necta.go.tz/psle/results/shl_ps0107001.htm'

In [116]:
school_url = schools_dict['ALAILELAI PRIMARY SCHOOL - PS0107001']
raw_table = get_raw_table(school_url)

In [117]:
subjects = get_subjects(raw_table)

In [118]:
raw_table[subjects] = raw_table.apply(get_grades, axis=1, result_type='expand')