# Tanzania - Webscraping 2021

### Paquetes (imports)

In [1]:
import re
import time
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#### Parameters

In [2]:
# Landing page of the 2021 PSLE results; the functions below derive the
# region, district and school URLs from it by replacing "psle.htm".
base_url = 'https://onlinesys.necta.go.tz/results/2021/psle/psle.htm'
# Request headers (a desktop Chrome User-Agent) sent with every request.
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"}

##### Helper Functions

In [3]:
def create_session(base_url):
    """
    Create a requests session and issue a first GET against ``base_url``.

    Parameters
    ----------
    base_url : str
        URL requested once with the module-level ``headers``.

    Returns
    -------
    requests.Session
        An open session. The caller is responsible for calling
        ``session.close()`` when it is done with it.
    """
    # Deliberately not a ``with`` block: leaving a ``with`` closes the
    # session, so the caller would be handed an already-closed object.
    session = requests.Session()
    try:
        session.get(base_url, headers=headers)
    except Exception:
        # Do not leak the session if the initial request fails.
        session.close()
        raise

    return session

#### Funciones para sacar urls

In [4]:
def get_regions_dict(base_url):
    """
    Get the regions dictionary from the base url.

    Parameters
    ----------
    base_url : str
        URL of the results landing page. Region URLs are built by replacing
        "psle.htm" in this URL with each link's ``href``.

    Returns
    -------
    regions_dict : dict
        Maps the text of every link on the page (carriage returns and
        newlines removed) to the URL built from that link's ``href``.
    """

    # Connect to the base url
    r = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Only keep anchors that actually carry an href; named anchors without
    # one would otherwise raise KeyError on a_object['href'].
    a_objects = soup.find_all('a', href=True)
    unwanted_chars = ['\r', '\n']
    regions_dict = {}

    for a_object in a_objects:
        href = a_object['href']
        region_name = a_object.text

        # Remove unwanted characters
        for unwanted_char in unwanted_chars:
            region_name = region_name.replace(unwanted_char, '')

        regions_dict[region_name] = base_url.replace("psle.htm", href)

    return regions_dict

In [5]:
def get_districts_dict(region_url, session):
    """
    Get the districts dictionary for one region page.

    Parameters
    ----------
    region_url : str
        URL of the region page to download.
    session : requests.Session
        Session used to perform the request.

    Returns
    -------
    distr_dict : dict
        Maps the text of every link on the page (carriage returns and
        newlines removed) to a URL built by replacing "psle.htm" in the
        module-level ``base_url`` with ``results/<href>``.
    """
    # Retry until the request succeeds, pausing between attempts.
    while True:
        try:
            r = session.get(region_url, headers=headers, timeout=5)
            break
        except Exception as err:
            time.sleep(1.5)
            print(f"Retrying request: {err}")

    soup = BeautifulSoup(r.content, 'html.parser')
    # Only keep anchors that actually carry an href; anchors without one
    # would otherwise raise KeyError on a_object['href'].
    a_objects = soup.find_all('a', href=True)

    unwanted_chars = ['\r', '\n']
    distr_dict = {}

    # Get district dictionary from the region url
    for a_object in a_objects:
        href = a_object['href']
        distr_name = a_object.text

        for unwanted_char in unwanted_chars:
            distr_name = distr_name.replace(unwanted_char, '')

        # District pages live under "results/" relative to the landing page.
        distr_dict[distr_name] = base_url.replace("psle.htm", f"results/{href}")

    return distr_dict

In [6]:
def get_schools_dict(distr_url, session):
    """
    Get the schools dictionary for one district page.

    Parameters
    ----------
    distr_url : str
        URL of the district page to download.
    session : requests.Session
        Session used to perform the request.

    Returns
    -------
    schools_dict : dict
        Maps the text of every link on the page (carriage returns and
        newlines removed) to a URL built by replacing "psle.htm" in the
        module-level ``base_url`` with ``results/<href>``.
    """
    # Retry until the request succeeds, pausing between attempts.
    while True:
        try:
            r = session.get(distr_url, headers=headers, timeout=5)
            break
        except Exception as err:
            time.sleep(1.5)
            print(f"Retrying request: {err}")

    soup = BeautifulSoup(r.content, 'html.parser')
    # Only keep anchors that actually carry an href; anchors without one
    # would otherwise raise KeyError on a_object['href'].
    a_objects = soup.find_all('a', href=True)

    unwanted_chars = ['\r', '\n']
    schools_dict = {}

    # Get schools dictionary from the district url
    for a_object in a_objects:
        href = a_object['href']
        school_name = a_object.text

        for unwanted_char in unwanted_chars:
            school_name = school_name.replace(unwanted_char, '')

        schools_dict[school_name] = base_url.replace("psle.htm", f"results/{href}")

    return schools_dict

### Funciones - Parsear los datos

In [7]:
def get_raw_table(school_url, session):
    """
    Get the table with the raw candidate data from a school page.

    Parameters
    ----------
    school_url : str
        URL of the school results page.
    session : requests.Session
        Session used to perform the request.

    Returns
    -------
    pandas.DataFrame
        The second table parsed from the page, with columns 0-4 renamed to
        CAND_NO, PREM_NO, SEX, CAND_NAME and SUBJECTS, its first row
        dropped and the index reset. An empty DataFrame is returned when
        the page has no ``<table>`` or fewer than two tables are parsed.
    """
    # Retry until the request succeeds, pausing between attempts.
    while True:
        try:
            r = session.get(school_url, headers=headers, timeout=5)
            break
        except Exception as err:
            time.sleep(1.5)
            print(f"Retrying request: {err}")

    soup = BeautifulSoup(r.content, 'html.parser')
    if len(soup.find_all('table')) == 0:
        return pd.DataFrame([])

    tables = pd.read_html(r.content)
    # The candidate data is read from the second table; without it there is
    # nothing to return (previously this raised IndexError).
    if len(tables) < 2:
        return pd.DataFrame([])

    raw_table = tables[1].rename(columns={0: "CAND_NO", 1: "PREM_NO", 2: "SEX",
                                          3: "CAND_NAME", 4: "SUBJECTS"})
    # Drop the first row (presumably the header row of the HTML table --
    # TODO confirm) and return a fresh frame rather than a slice, so callers
    # can add columns without SettingWithCopy warnings.
    raw_table = raw_table.iloc[1:, :].reset_index(drop=True)

    return raw_table

In [8]:
def get_subjects(raw_table):
    """
    Get the clean subject names from the raw table.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        Table with a 'SUBJECTS' column whose entries look like
        'Kiswahili - B, English - A, ..., Average Grade - B', or a short
        placeholder such as '*R'.

    Returns
    -------
    list of str
        Subject names with spaces removed (e.g. 'AverageGrade'), taken from
        the first entry longer than three characters. When no such entry
        exists, or the only entry contains '*', the hard-coded default list
        of seven subjects is returned.
    """
    default_subjects = ["Kiswahili", "English", "Maarifa", "Hisabati",
                        "Science", "Uraia", "AverageGrade"]

    # Ignore non-string cells (e.g. NaN), which have no length.
    unique_subjects = [value for value in raw_table['SUBJECTS'].unique()
                       if isinstance(value, str)]

    # Make sure the entry we parse is a real subject list and not a short
    # placeholder such as "*R".
    str_subjects = next((value for value in unique_subjects if len(value) > 3),
                        None)

    # No parsable entry at all (previously an UnboundLocalError when there
    # were several short entries), or a single placeholder entry: fall back
    # to the hard-coded subjects.
    only_placeholder = len(unique_subjects) == 1 and "*" in unique_subjects[0]
    if str_subjects is None or only_placeholder:
        return default_subjects

    # Each item looks like "Name - Grade"; keep the name without spaces.
    return [subject.split("-")[0].replace(" ", "")
            for subject in str_subjects.split(",")]

In [9]:
def get_grades(raw_table, num_subjects):
    """
    Get the grades from one row of the raw table.

    Intended to be used with ``DataFrame.apply(..., axis=1)``, so
    ``raw_table`` is a single row here.

    Parameters
    ----------
    raw_table : mapping or pandas.Series
        Row whose 'SUBJECTS' entry looks like
        'Kiswahili - B, English - A, ..., Average Grade - B'.
    num_subjects : int
        Number of "NA" placeholders to return when the row has no grades.

    Returns
    -------
    list of str
        The grade following each " - " separator, in order. A list of
        ``num_subjects`` "NA" strings is returned when the entry is not a
        string, has three characters or fewer, or contains "*".
    """
    str_subjects = raw_table['SUBJECTS']

    # Special cases (missing value, a single code such as "*R"): return a
    # vector of NA's so the row still expands to num_subjects columns.
    if (not isinstance(str_subjects, str)
            or len(str_subjects) <= 3
            or "*" in str_subjects):
        return ["NA"] * num_subjects

    # Capture the word that follows each " - " separator.
    return re.findall(r'\s-\s(\w+)', str_subjects)

#### Probar con el objeto

In [10]:
# Try the TanzaniaScrapper class (defined in the tanzaniaScrapper module,
# outside this notebook), built from the base URL, the year and the headers.
from tanzaniaScrapper import TanzaniaScrapper
scrapper = TanzaniaScrapper(base_url, 2021, headers)
scrapper.create_session()

<requests.sessions.Session at 0x1f28b223850>

In [11]:
# Fetch the regions dictionary through the scrapper object.
regions_dict = scrapper.get_regions_dict()

In [None]:
# region_names = list(regions_dict.keys())

# regions_df = pd.DataFrame([])


# for region_name in region_names[:3]:
#     print(f"Getting {region_name}")
#     session = scrapper.create_session()
#     distr_dict = scrapper.get_districts_dict(regions_dict[region_name])

#     distr_names = list(distr_dict.keys())
#     district_df = pd.DataFrame([])

#     for distr_name in distr_names:
#         schools_dict = scrapper.get_schools_dict(distr_dict[distr_name])
#         schools_names = list(schools_dict.keys())
#         students_df = pd.DataFrame([])

#         for school_name in schools_names:
#             print(f"Getting {school_name}")
#             raw_table = scrapper.get_raw_table(schools_dict[school_name])

#             if len(raw_table) == 0: # Si no hubo tabla, continuar con otra escuela
#                 print("Skipping school:", school_name)
#                 continue

#             else:
#                 subjects = scrapper.get_subjects()
#                 num_subjects = len(subjects)
                
#                 raw_table[subjects] = raw_table.apply(scrapper.get_grades, args=[num_subjects], 
#                                                         axis=1, result_type='expand')

Getting ARUSHA
Getting ALBEHIJE PRIMARY SCHOOL - PS0101114


TypeError: get_grades() takes 2 positional arguments but 3 were given

In [24]:
# Display the current number of subjects.
num_subjects

7

In [13]:
# NOTE(review): the recorded run of this call raised
# "TypeError: cannot use a string pattern on a bytes-like object".
# TanzaniaScrapper.get_grades is defined outside this notebook, so check
# what it hands to `re` -- TODO confirm.
scrapper.get_grades(num_subjects)

TypeError: cannot use a string pattern on a bytes-like object

In [17]:
# Display the SUBJECTS column of the current raw table.
raw_table['SUBJECTS']

0     Kiswahili - B, English - A, Maarifa - C, Hisab...
1     Kiswahili - A, English - A, Maarifa - C, Hisab...
2     Kiswahili - A, English - A, Maarifa - D, Hisab...
3     Kiswahili - B, English - A, Maarifa - C, Hisab...
4     Kiswahili - A, English - A, Maarifa - C, Hisab...
5     Kiswahili - B, English - B, Maarifa - C, Hisab...
6     Kiswahili - B, English - B, Maarifa - D, Hisab...
7     Kiswahili - A, English - A, Maarifa - B, Hisab...
8     Kiswahili - B, English - A, Maarifa - C, Hisab...
9     Kiswahili - A, English - A, Maarifa - B, Hisab...
10    Kiswahili - A, English - A, Maarifa - C, Hisab...
11    Kiswahili - B, English - A, Maarifa - C, Hisab...
12    Kiswahili - A, English - A, Maarifa - B, Hisab...
13    Kiswahili - A, English - A, Maarifa - C, Hisab...
14    Kiswahili - B, English - A, Maarifa - C, Hisab...
15    Kiswahili - B, English - A, Maarifa - D, Hisab...
Name: SUBJECTS, dtype: object

In [17]:
# Display the current district name -> URL dictionary.
distr_dict

{'BAHI': 'https://onlinesys.necta.go.tz/results/2021/psle/results/distr_0301.htm',
 'CHAMWINO': 'https://onlinesys.necta.go.tz/results/2021/psle/results/distr_0306.htm',
 'CHEMBA': 'https://onlinesys.necta.go.tz/results/2021/psle/results/distr_0307.htm',
 'DODOMA CC': 'https://onlinesys.necta.go.tz/results/2021/psle/results/distr_0302.htm',
 'KONDOA': 'https://onlinesys.necta.go.tz/results/2021/psle/results/distr_0303.htm',
 'KONDOA TC': 'https://onlinesys.necta.go.tz/results/2021/psle/results/distr_0308.htm',
 'KONGWA': 'https://onlinesys.necta.go.tz/results/2021/psle/results/distr_0305.htm',
 'MPWAPWA': 'https://onlinesys.necta.go.tz/results/2021/psle/results/distr_0304.htm'}

### Pipeline

In [18]:
def _concat_frames(frames):
    """Concatenate a list of DataFrames, or return an empty one if the list is empty."""
    if not frames:
        return pd.DataFrame([])
    return pd.concat(frames, ignore_index=True)


regions_dict = get_regions_dict(base_url)
region_names = list(regions_dict.keys())

# Frames are collected in lists and concatenated once per level instead of
# growing a DataFrame inside the innermost loop (which copies all the
# accumulated rows on every school).
region_frames = []
regions_df = pd.DataFrame([])


for region_name in region_names:

    session = create_session(base_url)
    try:
        distr_dict = get_districts_dict(regions_dict[region_name], session)
        distr_names = list(distr_dict.keys())

        district_frames = []

        for distr_name in distr_names:

            schools_dict = get_schools_dict(distr_dict[distr_name], session)
            schools_names = list(schools_dict.keys())
            school_frames = []

            for school_name in schools_names:
                school_url = schools_dict[school_name]
                raw_table = get_raw_table(school_url, session)
                if len(raw_table) == 0:  # No table found: move on to the next school
                    print("Skipping school:", school_name)
                    continue

                subjects = get_subjects(raw_table)
                num_subjects = len(subjects)

                # One new column per subject, filled from each row's grades.
                raw_table[subjects] = raw_table.apply(get_grades, args=[num_subjects],
                                                      axis=1, result_type='expand')

                raw_table['SCHOOL_NAME'] = school_name
                raw_table['DISTRICT_NAME'] = distr_name
                raw_table['REGION_NAME'] = region_name

                school_frames.append(raw_table)

            students_df = _concat_frames(school_frames)
            print(f"Done with district {distr_name}")
            time.sleep(2)
            if len(students_df) > 0:
                district_frames.append(students_df)

        district_df = _concat_frames(district_frames)
        if len(district_df) > 0:
            region_frames.append(district_df)

        # Refresh regions_df after every region so the partial result is
        # still available in the notebook if a later request fails.
        regions_df = _concat_frames(region_frames)
    finally:
        # Close this region's session. The previous
        # requests.post(base_url, headers={'Connection': 'close'}) did not
        # close it; it only sent an extra, unrelated POST to the site.
        session.close()

    print(f" =============== Done with region {region_name} ==============")

    time.sleep(15)


ConnectionError: HTTPSConnectionPool(host='onlinesys.necta.go.tz', port=443): Max retries exceeded with url: /results/2021/psle/psle.htm (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001F28D1D3A30>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [159]:
# Display the accumulated results for all regions scraped so far.
regions_df

Unnamed: 0,CAND_NO,PREM_NO,SEX,CAND_NAME,SUBJECTS,Kiswahili,English,Maarifa,Hisabati,Science,Uraia,AverageGrade,SCHOOL_NAME,DISTRICT_NAME,REGION_NAME
0,PS0101114-0001,20150348195,M,DANIEL JAMES KAGUO,"Kiswahili - B, English - A, Maarifa - C, Hisab...",B,A,C,C,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
1,PS0101114-0002,20156273910,M,DAVIS ROBERT LUCAS,"Kiswahili - A, English - A, Maarifa - C, Hisab...",A,A,C,B,B,B,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
2,PS0101114-0003,20150348196,M,ELISANTE GABRIEL NKYA,"Kiswahili - A, English - A, Maarifa - D, Hisab...",A,A,D,B,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
3,PS0101114-0004,20156555361,M,FESTUS RENASTUS CHIMOLA,"Kiswahili - B, English - A, Maarifa - C, Hisab...",B,A,C,B,B,C,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
4,PS0101114-0005,20156272958,M,IAN INNOCENT GEOFREY,"Kiswahili - A, English - A, Maarifa - C, Hisab...",A,A,C,A,C,B,B,ALBEHIJE PRIMARY SCHOOL - PS0101114,ARUSHA,ARUSHA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176345,PS0304115-0074,20150318960,F,SHAMIMU ZADU HELMANI,"Kiswahili - B, English - C, Maarifa - C, Hisab...",B,C,C,D,C,C,C,WOTTA PRIMARY SCHOOL - PS0304115,MPWAPWA,DODOMA
176346,PS0304115-0075,20150318961,F,TARIME KANGOLA SAMILA,"Kiswahili - C, English - D, Maarifa - D, Hisab...",C,D,D,D,C,D,D,WOTTA PRIMARY SCHOOL - PS0304115,MPWAPWA,DODOMA
176347,PS0304115-0076,20150318962,F,VENELANDA JOSEPH MWIKOLA,"Kiswahili - B, English - D, Maarifa - C, Hisab...",B,D,C,C,C,C,C,WOTTA PRIMARY SCHOOL - PS0304115,MPWAPWA,DODOMA
176348,PS0304115-0077,20150318963,F,VIGENIA ELIABI NGILIULE,"Kiswahili - B, English - D, Maarifa - C, Hisab...",B,D,C,C,C,C,C,WOTTA PRIMARY SCHOOL - PS0304115,MPWAPWA,DODOMA


### Debugging

In [None]:
# Re-run the parsing steps for the current school_url to inspect the result.
# get_raw_table requires the session as its second argument.
school_url
raw_table = get_raw_table(school_url, session)
raw_table
subjects = get_subjects(raw_table)
num_subjects = len(subjects)
subjects

['*R']

In [None]:
# Display the second distinct SUBJECTS value of the current raw table.
raw_table['SUBJECTS'].unique()[1]

'Kiswahili - A, English - A, Maarifa - A, Hisabati - A, Science - A, Uraia - B, Average Grade - A'

In [None]:
# Display the SUBJECTS value of the first row of the current raw table.
raw_table['SUBJECTS'][0]

'*R'

In [None]:
# Display every distinct SUBJECTS value in the current students_df.
students_df['SUBJECTS'].unique()

array(['Kiswahili - B, English - A, Maarifa - C, Hisabati - C, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - C, Hisabati - B, Science - B, Uraia - B, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - D, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - A, Maarifa - C, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - C, Hisabati - A, Science - C, Uraia - B, Average Grade - B',
       'Kiswahili - B, English - B, Maarifa - C, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - B, Maarifa - D, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - A, English - A, Maarifa - B, Hisabati - B, Science - B, Uraia - C, Average Grade - B',
       'Kiswahili - B, English - A, Maarifa - C, Hisabati - A, Science - B, Uraia - B, Average Grade - B',
       'Kiswahili - A, English - A, M

In [None]:
# Look up the URL stored for one specific school.
schools_dict["ALAILELAI PRIMARY SCHOOL - PS0107001"]

'https://matokeo.necta.go.tz/psle/results/shl_ps0107001.htm'

In [None]:
# Fetch the raw table for one specific school.
# get_raw_table requires the session as its second argument.
school_url = schools_dict['ALAILELAI PRIMARY SCHOOL - PS0107001']
raw_table = get_raw_table(school_url, session)

In [None]:
# Extract the subject names from the current raw table.
subjects = get_subjects(raw_table)

In [None]:
# get_grades needs the number of subjects as an extra positional argument;
# without it, apply() fails with a missing-argument TypeError.
raw_table[subjects] = raw_table.apply(get_grades, args=[len(subjects)],
                                      axis=1, result_type='expand')