# Tanzania - Webscraping 2021

### Paquetes (imports)

In [23]:
import re
import time
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#### Parameters

In [29]:
base_url = 'https://onlinesys.necta.go.tz/results/2021/psle/psle.htm'
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"}

##### Helper Functions

In [30]:
def create_session(base_url):
    """
    Create Session with base_url
    """
    with requests.Session() as session:
        session.get(base_url, headers=headers)

    return session

#### Funciones para sacar urls

In [31]:
def get_regions_dict(base_url):
    """
    Get the regions dictionary from the base url

    Parameters
    ----------
    base_url : str
    """

    # Connect to the base url
    r = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Parse html and get regions
    a_objects = soup.find_all('a')
    unwanted_chars = ['\r', '\n']
    regions_dict = {}
    
    for a_object in a_objects:
        href = a_object['href']
        region_name = a_object.text

        # Remove unwanted characters
        for unwanted_char in unwanted_chars:
            region_name = region_name.replace(unwanted_char, '')

        
        region_url = base_url.replace("psle.htm", href)
        regions_dict[region_name] = region_url

    return regions_dict

In [32]:
def get_districts_dict(region_url, session):
    """
    Get the districts dictionary from the regions dictionary

    Parameters
    ----------
    regions_dict : dict

    Returns
    -------

    districts_dict : dict
    """
    success = 0
    while success == 0:
        try:
            r = session.get(region_url, headers=headers, timeout=5)
            success = 1
        except Exception as err:
            time.sleep(1.5)
            print(f"Retrying request: {err}")
            
    soup = BeautifulSoup(r.content, 'html.parser')
    a_objects = soup.find_all('a')
    
    unwanted_chars = ['\r', '\n']
    distr_dict = {}

    # Get district dictionary from the region url
    for a_object in a_objects:
        href = a_object['href']
        distr_name = a_object.text

        for unwanted_char in unwanted_chars:
            distr_name = distr_name.replace(unwanted_char, '')

        # if "results" not in base_url:
        distr_url = base_url.replace("psle.htm", f"results/{href}")
        #else:
        #distr_url = base_url.replace("psle.htm", href)
            
        distr_dict[distr_name] = distr_url

    return distr_dict

In [33]:
def get_schools_dict(distr_url, session):
    """
    Get the schools dictionary from the districts dictionary
    """
    success = 0
    while success == 0:
        try:
            r = session.get(distr_url, headers=headers, timeout=5)
            success = 1
        except Exception as err:
            time.sleep(1.5)
            print(f"Retrying request: {err}")
    
    soup = BeautifulSoup(r.content, 'html.parser')
    a_objects = soup.find_all('a')

    unwanted_chars = ['\r', '\n']
    schools_dict = {}

    # Get district dictionary from the region url
    for a_object in a_objects:
        href = a_object['href']
        school_name = a_object.text

        for unwanted_char in unwanted_chars:
            school_name = school_name.replace(unwanted_char, '')

        school_url = base_url.replace("psle.htm", f"results/{href}")
        
        schools_dict[school_name] = school_url

    return schools_dict

### Funciones - Parsear los datos

In [53]:
def get_raw_table(school_url, session, year):

    """
    Gets table with raw data from the school url
    """
    
    success = 0
    while success == 0:
        try:
            r = session.get(school_url, headers=headers, timeout=5)
            success = 1
        except Exception as err:
            time.sleep(1.5)
            print(f"Retrying request: {err}")

    soup = BeautifulSoup(r.content, 'html.parser')
    raw_tables = soup.find_all('table')

    if len(raw_tables) == 0:
        raw_table = pd.DataFrame([])

    else:
        tables = pd.read_html(r.content)
        raw_table = tables[1]
        if year == "2021":
            raw_table.rename(columns = {0: "CAND_NO", 1: "PREM_NO", 2: "SEX", 
                                    3: "CAND_NAME", 4: "SUBJECTS"}, inplace=True)
        else:
            raw_table.rename(columns = {0: "CAND_NO", 1: "PREM_NO", 2: "SEX", 
                                       3: "PREM_NO", 4: "SUBJECTS"}, inplace=True)
        raw_table = raw_table.iloc[1:,:]
        raw_table.reset_index(drop=True, inplace=True)

    return raw_table, soup

In [54]:
def get_subjects(raw_table):
    """
    Gets the subjects from the raw table
    """        
    unique_subjects = raw_table['SUBJECTS'].unique()

    # Asegurarnos de que las materias no sean "*R" o una cosa así
    for unique_subject in unique_subjects:
        if len(unique_subject) > 3:
            str_subjects = unique_subject
            break
    
    # Caso cuando no hay materias, hardcodearlas
    if (len(unique_subjects) == 1) and ( "*" in unique_subjects[0]):
        clean_subjects = ["Kiswahili", "English", "Maarifa", "Hisabati", "Science", "Uraia", "AverageGrade"]
    
    else:
        list_subjects = str_subjects.split(",")
        clean_subjects = []

        for subject in list_subjects:
            raw_subject = subject.split("-")
            clean_subject = raw_subject[0].replace(" ", "")
            clean_subjects.append(clean_subject)

    return clean_subjects

In [55]:
def get_grades(raw_table, num_subjects):
    """
    Gets the grades from the raw table
    """
    str_subjects = raw_table['SUBJECTS']
    
    # Casos especiales cuando viene una sola letra, poner un vector de NA's
    if (len(str_subjects) <= 3) or ("*" in str_subjects):
            clean_grades = ["NA"] * num_subjects
    else:
        raw_grades = re.findall( r'\s-\s\w+' , str_subjects)
        clean_grades = []
        
        for raw_grade in raw_grades:
            clean_grade = re.findall( r'\w+' , raw_grade)
            clean_grade = clean_grade[0]
            clean_grades.append(clean_grade)

    return clean_grades

In [56]:
def get_second_table(soup, school_name, distr_name, region_name):
    """
    Gets the second table from the school url
    
    Parameters:
    -----------
    soup: BeautifulSoup object
    school_name: string
        Name of the school

    Returns:
    --------
    second_table: pandas.DataFrame
        Table with the second table
    """

    text = soup.find_all("p", {"align": "LEFT"})[2]
    raw_contents = text.contents
    dicts_list = []
    for raw_content in raw_contents:
        if isinstance(raw_content, str):
            temp_dict = {}
            clean_content = raw_content.replace("\n", "").replace("\r", "")
            clean_content = clean_content.replace("  ", " ")
            content_list = clean_content.split(":")
            
            if content_list[0] != '':
                if len(content_list) == 2:
                    temp_dict["Column 0"] = content_list[0].strip()
                    temp_dict["Column 1"] = content_list[1].strip()
                else:
                    temp_dict["Column 0"] = content_list[0].strip()
                    temp_dict["Column 1"] = ''
                
                dicts_list.append(temp_dict)
    
    second_table = pd.DataFrame.from_dict(dicts_list)
    second_table['SCHOOL_NAME'] = school_name
    second_table['DISTRICT_NAME'] = distr_name
    second_table['REGION_NAME'] = region_name

    return second_table

### Pipeline

In [59]:
new_base_url = base_url.replace("2021", "2020")

regions_dict = get_regions_dict(new_base_url)
region_names = list(regions_dict.keys())

regions_df = pd.DataFrame([])
second_tables = pd.DataFrame([])


for region_name in region_names[:1]:
    
    session = create_session(base_url)
    distr_dict = get_districts_dict(regions_dict[region_name], session)
    distr_names = list(distr_dict.keys())

    district_df = pd.DataFrame([])

    for distr_name in distr_names:

        schools_dict = get_schools_dict(distr_dict[distr_name], session)
        schools_names = list(schools_dict.keys())
        students_df = pd.DataFrame([])

        for school_name in schools_names:
            school_url = schools_dict[school_name]
            raw_table, soup = get_raw_table(school_url, session, year="2020")
            if len(raw_table) == 0: # Si no hubo tabla, continuar con otra escuela
                print("Skipping school:", school_name)
                continue

            else:
                subjects = get_subjects(raw_table)
                num_subjects = len(subjects)
                
                raw_table[subjects] = raw_table.apply(get_grades, args=[num_subjects], 
                                                        axis=1, result_type='expand')
                raw_table['SCHOOL_NAME'] = school_name
                raw_table['DISTRICT_NAME'] = distr_name
                raw_table['REGION_NAME'] = region_name

                students_df = pd.concat([students_df, raw_table], ignore_index=True)

                second_table = get_second_table(soup, school_name, distr_name, region_name)
                second_tables = pd.concat([second_tables, second_table], ignore_index=True)

            
        print(f"Done with district {distr_name}")
        time.sleep(2)
        district_df = pd.concat([district_df, students_df],
                                ignore_index=True)
     
    regions_df = pd.concat([regions_df, district_df], 
                            ignore_index=True)

     # close requests session
    requests.post(base_url, headers={'Connection':'close'})

    print(f" =============== Done with region {region_name} ==============")


    time.sleep(15)

Retrying request: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying request: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying request: HTTPSConnectionPool(host='onlinesys.necta.go.tz', port=443): Max retries exceeded with url: /results/2021/psle/results/shl_ps0101017.htm (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001B493B10D30>, 'Connection to onlinesys.necta.go.tz timed out. (connect timeout=5)'))
Done with district ARUSHA
Retrying request: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying request: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying request: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying request: ('Connection aborted.', RemoteDisconnected('Remote end closed connection with

In [51]:
students_df

In [52]:
raw_table

Unnamed: 0,CAND_NO,SEX,CAND_NAME,SUBJECTS,4
0,PS0101114-0001,20150348195,M,DANIEL JAMES KAGUO,"Kiswahili - B, English - A, Maarifa - C, Hisab..."
1,PS0101114-0002,20156273910,M,DAVIS ROBERT LUCAS,"Kiswahili - A, English - A, Maarifa - C, Hisab..."
2,PS0101114-0003,20150348196,M,ELISANTE GABRIEL NKYA,"Kiswahili - A, English - A, Maarifa - D, Hisab..."
3,PS0101114-0004,20156555361,M,FESTUS RENASTUS CHIMOLA,"Kiswahili - B, English - A, Maarifa - C, Hisab..."
4,PS0101114-0005,20156272958,M,IAN INNOCENT GEOFREY,"Kiswahili - A, English - A, Maarifa - C, Hisab..."
5,PS0101114-0006,20150316147,M,MESHACK THOMAS NATHANAEL,"Kiswahili - B, English - B, Maarifa - C, Hisab..."
6,PS0101114-0007,20150348198,M,PHILIPO AYUBU MBWANA,"Kiswahili - B, English - B, Maarifa - D, Hisab..."
7,PS0101114-0008,20150348199,M,SALIM IDDY RASHID,"Kiswahili - A, English - A, Maarifa - B, Hisab..."
8,PS0101114-0009,20150377806,F,DORA SALVATORY THADEI,"Kiswahili - B, English - A, Maarifa - C, Hisab..."
9,PS0101114-0010,20152878842,F,EVALYNE PERFECT ELIAS,"Kiswahili - A, English - A, Maarifa - B, Hisab..."


In [105]:
regions_df

Unnamed: 0,CAND_NO,PREM_NO,SEX,CAND_NAME,SUBJECTS,Kiswahili,English,Maarifa,Hisabati,Science,Uraia,AverageGrade,SCHOOL_NAME,DISTRICT_NAME,REGION_NAME
0,PS0101114-0001,20150348195,M,DANIEL JAMES KAGUO,"Kiswahili - B, English - A, Maarifa - C, Hisab...",B,A,C,C,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
1,PS0101114-0002,20156273910,M,DAVIS ROBERT LUCAS,"Kiswahili - A, English - A, Maarifa - C, Hisab...",A,A,C,B,B,B,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
2,PS0101114-0003,20150348196,M,ELISANTE GABRIEL NKYA,"Kiswahili - A, English - A, Maarifa - D, Hisab...",A,A,D,B,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
3,PS0101114-0004,20156555361,M,FESTUS RENASTUS CHIMOLA,"Kiswahili - B, English - A, Maarifa - C, Hisab...",B,A,C,B,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
4,PS0101114-0005,20156272958,M,IAN INNOCENT GEOFREY,"Kiswahili - A, English - A, Maarifa - C, Hisab...",A,A,C,A,C,B,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543101,PS1105057-0072,20170567216,F,TERESIA DENIS LIGAMBAZI,"Kiswahili - A, English - C, Maarifa - B, Hisab...",A,C,B,B,A,B,B,UPONERA PRIMARY SCHOOL - PS1105057,ULANGA,MOROGORO
543102,PS1105057-0073,20170567217,F,VAILETH KRISANTUS MATANDU,"Kiswahili - A, English - D, Maarifa - B, Hisab...",A,D,B,C,C,C,C,UPONERA PRIMARY SCHOOL - PS1105057,ULANGA,MOROGORO
543103,PS1105057-0074,20170567218,F,VAILETH MBOPA MWALIMA,"Kiswahili - B, English - D, Maarifa - B, Hisab...",B,D,B,D,B,C,C,UPONERA PRIMARY SCHOOL - PS1105057,ULANGA,MOROGORO
543104,PS1105057-0075,20170567219,F,WINFRIDA RENATUS BILALI,"Kiswahili - A, English - D, Maarifa - C, Hisab...",A,D,C,D,C,D,C,UPONERA PRIMARY SCHOOL - PS1105057,ULANGA,MOROGORO


In [106]:
second_tables

Unnamed: 0,Column 0,Column 1,SCHOOL_NAME,DISTRICT_NAME,REGION_NAME
0,WALIOFANYA MTIHANI,16,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
1,WASTANI WA SHULE,217.3750,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
2,KUNDI LA SHULE,Wanafunzi chini ya 40,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
3,NAFASI YA SHULE KWENYE KUNDI LAKE KIHALMASHAURI,22 kati ya 46,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
4,NAFASI YA SHULE KWENYE KUNDI LAKE KIMKOA,74 kati ya 290,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
...,...,...,...,...,...
58228,KUNDI LA SHULE,Wanafunzi chini ya 40,MITESA PRIMARY SCHOOL - PS1201056,MASASI,MTWARA
58229,NAFASI YA SHULE KWENYE KUNDI LAKE KIHALMASHAURI,41 kati ya 42,MITESA PRIMARY SCHOOL - PS1201056,MASASI,MTWARA
58230,NAFASI YA SHULE KWENYE KUNDI LAKE KIMKOA,279 kati ya 284,MITESA PRIMARY SCHOOL - PS1201056,MASASI,MTWARA
58231,NAFASI YA SHULE KWENYE KUNDI LAKE KITAIFA,5475 kati ya 5664,MITESA PRIMARY SCHOOL - PS1201056,MASASI,MTWARA


In [107]:
school_url

'https://onlinesys.necta.go.tz/results/2021/psle/results/shl_ps1201057.htm'

In [111]:
previous_results = pd.read_csv("tanzania_2021_results.csv", encoding='utf-8-sig')

In [112]:
previous_results

Unnamed: 0,CAND_NO,PREM_NO,SEX,CAND_NAME,SUBJECTS,Kiswahili,English,Maarifa,Hisabati,Science,Uraia,AverageGrade,SCHOOL_NAME,DISTRICT_NAME,REGION_NAME
0,PS0101114-0001,2.015035e+10,M,DANIEL JAMES KAGUO,"Kiswahili - B, English - A, Maarifa - C, Hisab...",B,A,C,C,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
1,PS0101114-0002,2.015627e+10,M,DAVIS ROBERT LUCAS,"Kiswahili - A, English - A, Maarifa - C, Hisab...",A,A,C,B,B,B,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
2,PS0101114-0003,2.015035e+10,M,ELISANTE GABRIEL NKYA,"Kiswahili - A, English - A, Maarifa - D, Hisab...",A,A,D,B,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
3,PS0101114-0004,2.015656e+10,M,FESTUS RENASTUS CHIMOLA,"Kiswahili - B, English - A, Maarifa - C, Hisab...",B,A,C,B,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
4,PS0101114-0005,2.015627e+10,M,IAN INNOCENT GEOFREY,"Kiswahili - A, English - A, Maarifa - C, Hisab...",A,A,C,A,C,B,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1132036,PS3105028-0022,2.015655e+10,F,JESCA METHORD MGAYA,"Kiswahili - A, English - A, Maarifa - B, Hisab...",A,A,B,A,B,A,A,WISDOM PRIMARY SCHOOL - PS3105028,TUNDUMA TC,SONGWE
1132037,PS3105028-0023,2.015037e+10,F,KEYLINE ELENEUS MPUNJANI,"Kiswahili - A, English - A, Maarifa - A, Hisab...",A,A,A,B,B,B,A,WISDOM PRIMARY SCHOOL - PS3105028,TUNDUMA TC,SONGWE
1132038,PS3105028-0024,2.015655e+10,F,ROSE CHARLES SIMBEYE,"Kiswahili - A, English - A, Maarifa - B, Hisab...",A,A,B,B,B,B,A,WISDOM PRIMARY SCHOOL - PS3105028,TUNDUMA TC,SONGWE
1132039,PS3105028-0025,2.015037e+10,F,SABRINA EMANUEL SIMBA,"Kiswahili - A, English - A, Maarifa - B, Hisab...",A,A,B,B,B,A,A,WISDOM PRIMARY SCHOOL - PS3105028,TUNDUMA TC,SONGWE


In [None]:
r = requests.get("https://onlinesys.necta.go.tz/results/2021/psle/results/shl_ps1308027.htm", headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
soup

<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<head>
<body bgcolor="LIGHTBLUE" link="#0000ff" text="#000080" vlink="#800080">
<font color="#800080"><h2>NATIONAL EXAMINATIONS COUNCIL OF TANZANIA</h2>
<h1><p align="LEFT"> PSLE 2021 EXAMINATION RESULTS</p></h1>
<h3><p align="LEFT">ITULABUSIGA PRIMARY SCHOOL - PS1308027
</p></h3>
<p align="LEFT">
WALIOFANYA MTIHANI : 67
<br/>
WASTANI WA SHULE   : 151.6567
<br/>
KUNDI LA SHULE : Wanafunzi 40 au zaidi
<br/>
NAFASI YA SHULE KWENYE KUNDI LAKE KIHALMASHAURI: 33 kati ya 83
<br/>
NAFASI YA SHULE KWENYE KUNDI LAKE KIMKOA  : 435 kati ya 843
<br/>
NAFASI YA SHULE KWENYE KUNDI LAKE KITAIFA : 6132 kati ya 11909
<br/>
<br/>
MADARAJA YA UFAULU WA UJUMLA
<br/>
<table align="LEFT" border="" width=" 50%">
<tr><td width="10%">
<p align="CENTER">
<b><font face="Courier" size="2"><p align="CENTER">JINSI</p></font></b></p></td></tr></table></p></font>
<td valign="MIDDLE" width="6%">
<b><font face="Courier" size="2"><p align

In [34]:
regions_df.to_csv("tanzania_2021_results.csv", encoding="utf-8-sig", index=False)

### Debugging

In [None]:
school_url
raw_table = get_raw_table(school_url)
raw_table
subjects = get_subjects(raw_table)
num_subjects = len(subjects)
subjects

['*R']

In [None]:
raw_table['SUBJECTS'].unique()[1]

'Kiswahili - A, English - A, Maarifa - A, Hisabati - A, Science - A, Uraia - B, Average Grade - A'

In [None]:
raw_table['SUBJECTS'][0]

'*R'

In [None]:
students_df['SUBJECTS'].unique()

array(['Kiswahili - B, English - A, Maarifa - C, Hisabati - C, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - C, Hisabati - B, Science - B, Uraia - B, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - D, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - A, Maarifa - C, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - C, Hisabati - A, Science - C, Uraia - B, Average Grade - B',
       'Kiswahili - B, English - B, Maarifa - C, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - B, Maarifa - D, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - B, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - A, Maarifa - C, Hisabati - A, Science - B, Uraia - B, Average Grade - B',
       'Kiswahili - A, English - A, M

In [None]:
schools_dict["ALAILELAI PRIMARY SCHOOL - PS0107001"]

'https://matokeo.necta.go.tz/psle/results/shl_ps0107001.htm'

In [None]:
school_url = schools_dict['ALAILELAI PRIMARY SCHOOL - PS0107001']
raw_table = get_raw_table(school_url)

In [None]:
subjects = get_subjects(raw_table)

In [None]:
raw_table[subjects] = raw_table.apply(get_grades, axis=1, result_type='expand')