# Tanzania - Webscraping 2021

### Paquetes (imports)

In [83]:
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#### Configuration (base url and request headers)

In [2]:
# Index page that links to every region's results
base_url = 'https://matokeo.necta.go.tz/psle/psle.htm'

# Browser-like user agent sent along with the requests
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/100.0.4896.75 Safari/537.36"
    )
}

#### Funciones para sacar urls

In [3]:
def get_regions_dict(base_url):
    """
    Get the regions dictionary from the base url

    Downloads the page at ``base_url`` and maps the text of every link
    found on it to a url built by replacing ``psle.htm`` in ``base_url``
    with that link's ``href``.

    Parameters
    ----------
    base_url : str
        Url of the index page; must contain ``psle.htm``, which is the
        part that gets swapped for each link target.

    Returns
    -------
    regions_dict : dict
        Link text (carriage returns and newlines removed) -> region url.
        Links that share the same text overwrite each other; the last
        one wins.
    """

    # Connect to the base url (sends the module-level `headers`)
    r = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # href=True skips anchors that carry no href attribute; indexing such
    # a tag with ['href'] would raise KeyError and abort the whole scrape
    regions_dict = {}
    for a_object in soup.find_all('a', href=True):
        # Remove unwanted characters from the visible link text
        region_name = a_object.text.replace('\r', '').replace('\n', '')

        region_url = base_url.replace("psle.htm", a_object['href'])
        regions_dict[region_name] = region_url

    return regions_dict

In [4]:
def get_districts_dict(region_url):
    """
    Get the districts dictionary from a region url

    Downloads the page at ``region_url`` and maps the text of every link
    found on it to a url built by replacing ``psle.htm`` in the
    module-level ``base_url`` with ``results/<href>``.

    Parameters
    ----------
    region_url : str
        Url of one region page (a value of the regions dictionary).

    Returns
    -------
    distr_dict : dict
        Link text (carriage returns and newlines removed) -> district url.
    """
    # Send the same headers as get_regions_dict so every request to the
    # site looks alike
    r = requests.get(region_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # href=True skips anchors without an href attribute, which would
    # otherwise raise KeyError below
    distr_dict = {}
    for a_object in soup.find_all('a', href=True):
        distr_name = a_object.text.replace('\r', '').replace('\n', '')

        # District pages are addressed relative to the results/ folder
        # next to the index page
        distr_url = base_url.replace("psle.htm", f"results/{a_object['href']}")
        distr_dict[distr_name] = distr_url

    return distr_dict

In [13]:
def get_schools_dict(distr_url):
    """
    Get the schools dictionary from a district url

    Downloads the page at ``distr_url`` and maps the text of every link
    found on it to a url built by replacing ``psle.htm`` in the
    module-level ``base_url`` with ``results/<href>``.

    Parameters
    ----------
    distr_url : str
        Url of one district page (a value of the districts dictionary).

    Returns
    -------
    schools_dict : dict
        Link text (carriage returns and newlines removed) -> school url.
    """
    # Send the same headers as get_regions_dict so every request to the
    # site looks alike
    r = requests.get(distr_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Get the schools dictionary from the district page; href=True skips
    # anchors without an href attribute, which would raise KeyError below
    schools_dict = {}
    for a_object in soup.find_all('a', href=True):
        school_name = a_object.text.replace('\r', '').replace('\n', '')

        school_url = base_url.replace("psle.htm", f"results/{a_object['href']}")
        schools_dict[school_name] = school_url

    return schools_dict

### Funciones - Parsear los datos

In [107]:
def get_raw_table(school_url):
    """
    Gets table with raw data from the school url

    Parameters
    ----------
    school_url : str
        Url of one school's results page.

    Returns
    -------
    raw_table : pandas.DataFrame
        The second table found on the page, with columns 0-4 renamed to
        CAND_NO, PREM_NO, SEX, CAND_NAME and SUBJECTS, its first row
        dropped and the index renumbered from 0.

    Raises
    ------
    IndexError
        If the page holds fewer than two tables.
    """
    # Send the same headers as get_regions_dict so every request to the
    # site looks alike
    r = requests.get(school_url, headers=headers)
    tables = pd.read_html(r.content)

    column_names = {0: "CAND_NO", 1: "PREM_NO", 2: "SEX",
                    3: "CAND_NAME", 4: "SUBJECTS"}

    # Build the result as one chain of new objects instead of using
    # inplace=True on a slice, which can trigger pandas copy warnings.
    # NOTE(review): the dropped first row presumably repeats the header
    # labels - confirm against the page.
    raw_table = (
        tables[1]
        .rename(columns=column_names)
        .iloc[1:, :]
        .reset_index(drop=True)
    )

    return raw_table

In [109]:
def get_subjects(raw_table):
    """
    Gets the subjects from the raw table

    Only the first row is inspected: its SUBJECTS string is split on
    commas, and for each piece the text before the first "-" is kept
    with spaces removed (so "Average Grade - C" yields "AverageGrade").

    Parameters
    ----------
    raw_table : pandas.DataFrame
        Table with a 'SUBJECTS' column of strings shaped like
        "Kiswahili - E, English - B, ...".

    Returns
    -------
    clean_subjects : list of str
        Subject names in the order they appear in the string.

    Raises
    ------
    IndexError
        If the table has no rows.
    """
    # .iloc[0] takes the first row by position, so this also works when
    # the index does not contain the label 0 (e.g. a filtered table)
    str_subjects = raw_table['SUBJECTS'].iloc[0]

    return [
        entry.split("-")[0].replace(" ", "")
        for entry in str_subjects.split(",")
    ]

In [113]:
def get_grades(raw_table):
    """
    Gets the grades from one row of the raw table

    Despite the parameter name, the value looked up under 'SUBJECTS' is
    passed straight to re.findall, so the argument has to be a single
    row (or any mapping) whose 'SUBJECTS' entry is one string.

    Parameters
    ----------
    raw_table : mapping
        Object whose 'SUBJECTS' item is a string shaped like
        "Kiswahili - E, English - B, ...".

    Returns
    -------
    clean_grades : list of str
        One grade per " - <grade>" occurrence, in order of appearance.
    """
    # First pass finds every " - <grade>" chunk; second pass keeps only
    # the word characters of each chunk, i.e. the grade itself
    grade_chunks = re.findall( r'\s-\s\w+' , raw_table['SUBJECTS'])
    return [re.findall( r'\w+' , chunk)[0] for chunk in grade_chunks]

### Pipeline

In [16]:
# Walk the site one level at a time: all regions, then the districts of
# the ARUSHA region, then the schools of its NGORONGORO district
regions_dict = get_regions_dict(base_url)
distr_dict = get_districts_dict(regions_dict['ARUSHA'])
schools_dict = get_schools_dict(distr_dict['NGORONGORO'])

In [108]:
# Fetch the raw results table of a single school and display it
school_url = schools_dict['ALAILELAI PRIMARY SCHOOL - PS0107001']
raw_table = get_raw_table(school_url)
raw_table

Unnamed: 0,CAND_NO,PREM_NO,SEX,CAND_NAME,SUBJECTS
0,PS0107001-0001,20150394747,M,ALAIDEDIA SARINGE TAUWO,"Kiswahili - E, English - E, Maarifa - D, Hisab..."
1,PS0107001-0002,20150394749,M,ASHAMU PARMET LULUNGENI,"Kiswahili - B, English - B, Maarifa - B, Hisab..."
2,PS0107001-0003,20150985362,M,KIDIRI ALAANDARE OLOPONO,"Kiswahili - X, English - X, Maarifa - X, Hisab..."
3,PS0107001-0004,20150394752,M,LEMENDIYA NGAAKA NGAKURU,"Kiswahili - B, English - B, Maarifa - C, Hisab..."
4,PS0107001-0005,20150394753,M,LENGAI PARSAN LEBANGUTI,"Kiswahili - C, English - C, Maarifa - D, Hisab..."
5,PS0107001-0006,20150394755,M,LENGAKWAI OHORI OLONJUS,"Kiswahili - C, English - B, Maarifa - C, Hisab..."
6,PS0107001-0007,20150394756,M,LESI MOPORO LENGILALA,"Kiswahili - C, English - C, Maarifa - C, Hisab..."
7,PS0107001-0008,20150394758,M,LONYORI MENG'ORU PARSAN,"Kiswahili - E, English - D, Maarifa - E, Hisab..."
8,PS0107001-0009,20150394759,M,MUNDELEI SEMBETA LEBARUTU,"Kiswahili - C, English - B, Maarifa - C, Hisab..."
9,PS0107001-0010,20150394760,M,NGAYENI STEVEN OLESAMBU,"Kiswahili - D, English - B, Maarifa - C, Hisab..."


In [115]:
# Subject names parsed from the SUBJECTS string of the table; they become
# the names of the per-subject grade columns
subjects = get_subjects(raw_table)
subjects

['Kiswahili',
 'English',
 'Maarifa',
 'Hisabati',
 'Science',
 'Uraia',
 'AverageGrade']

In [114]:
# Add one column per subject: get_grades runs on every row (axis=1) and
# result_type='expand' spreads each returned list across the new columns.
# NOTE(review): this adds the columns to raw_table in place.
raw_table[subjects] = raw_table.apply(get_grades, axis=1, result_type='expand')
raw_table

Unnamed: 0,CAND_NO,PREM_NO,SEX,CAND_NAME,SUBJECTS,Kiswahili,English,Maarifa,Hisabati,Science,Uraia,AverageGrade
0,PS0107001-0001,20150394747,M,ALAIDEDIA SARINGE TAUWO,"Kiswahili - E, English - E, Maarifa - D, Hisab...",E,E,D,D,D,E,E
1,PS0107001-0002,20150394749,M,ASHAMU PARMET LULUNGENI,"Kiswahili - B, English - B, Maarifa - B, Hisab...",B,B,B,B,B,B,B
2,PS0107001-0003,20150985362,M,KIDIRI ALAANDARE OLOPONO,"Kiswahili - X, English - X, Maarifa - X, Hisab...",X,X,X,X,X,X,X
3,PS0107001-0004,20150394752,M,LEMENDIYA NGAAKA NGAKURU,"Kiswahili - B, English - B, Maarifa - C, Hisab...",B,B,C,B,B,B,B
4,PS0107001-0005,20150394753,M,LENGAI PARSAN LEBANGUTI,"Kiswahili - C, English - C, Maarifa - D, Hisab...",C,C,D,D,D,D,D
5,PS0107001-0006,20150394755,M,LENGAKWAI OHORI OLONJUS,"Kiswahili - C, English - B, Maarifa - C, Hisab...",C,B,C,C,B,D,C
6,PS0107001-0007,20150394756,M,LESI MOPORO LENGILALA,"Kiswahili - C, English - C, Maarifa - C, Hisab...",C,C,C,C,C,D,C
7,PS0107001-0008,20150394758,M,LONYORI MENG'ORU PARSAN,"Kiswahili - E, English - D, Maarifa - E, Hisab...",E,D,E,E,D,E,D
8,PS0107001-0009,20150394759,M,MUNDELEI SEMBETA LEBARUTU,"Kiswahili - C, English - B, Maarifa - C, Hisab...",C,B,C,C,B,C,C
9,PS0107001-0010,20150394760,M,NGAYENI STEVEN OLESAMBU,"Kiswahili - D, English - B, Maarifa - C, Hisab...",D,B,C,B,B,C,C
