# Import

In [1]:
from func_timeout import func_timeout, FunctionTimedOut, func_set_timeout

# Helper functions

In [2]:
from ipynb.fs.full.functions import RequestSoup

# Functions

In [3]:
# Find header tags, count them and add them to a dictionary
# Input: webpage, output: dictionary of count of H1 til H6 headers on this webpage
@func_set_timeout(5)
def GetAllHeadersFromURL(webpage):
    soup = RequestSoup(webpage)
    
    headers = {}
    for i in range(1,7):
        header_soup = soup.find(f"h{i}")
        if header_soup is not None:                        
            header_soup_count = len(header_soup)
        else:
            header_soup_count = 0
        headers[f"h{i}"] = header_soup_count
        
    return headers
    
# Find header tags, count them and give a sum
# Input: webpage, output: count of headers on this webpage
@func_set_timeout(5)
def GetHeaderCountFromURL(webpage):
    soup = RequestSoup(webpage)
    sumHeaders = 0
    
    for i in range(1,7):
        header_soup = soup.find(f"h{i}")
        if header_soup is not None:                        
            header_soup_count = len(header_soup)
        else:
            header_soup_count = 0
        sumHeaders += header_soup_count
        
    return sumHeaders
    
# Find lists, count them and add them to a dictionary
# Input: webpage, output: dictionary of count of OL, UL and DL lists on this webpage
@func_set_timeout(5)
def GetAllListsFromURL(webpage):
    soup = RequestSoup(webpage)
    lists = {'ol': 0, 'ul': 0, 'dl': 0}
    
    for listtype in lists.keys():
        list_soup = soup.find(listtype)
        if list_soup is not None:
            list_soup_count = len(list_soup)
        else:
            list_soup_count = 0
        lists[listtype] = list_soup_count
    
    return lists

# Find lists, count them and give a sum
# Input: webpage, output: count of lists on this webpage
@func_set_timeout(5)
def GetListsCountFromURL(webpage):
    soup = RequestSoup(webpage)
    lists = ['ol', 'ul', 'dl']
    sumLists = 0
    
    for listtype in lists:
        list_soup = soup.find(listtype)
        if list_soup is not None:
            list_soup_count = len(list_soup)
        else:
            list_soup_count = 0
        sumLists += list_soup_count
    
    return sumLists

# Looping functions

In [1]:
# Loop GetHeaderCountFromURL
# Input: SEMrush keyword dataframe, output: list with header counts from all websites
def LoopGetHeaderCountFromURL(dataframe):
    HeaderCount = []
    for url in dataframe.index:
        try:
            var = GetHeaderCountFromURL(dataframe['Ur'][url])
        except:
            var = None
        HeaderCount.append(var)
    return HeaderCount

# Loop GetListsCountFromURL
# Input: SEMrush keyword dataframe, output: list with header counts from all websites
def LoopGetListsCountFromURL(dataframe):
    ListsCount = []
    for url in dataframe.index:
        try:
            var = GetListsCountFromURL(dataframe['Ur'][url])
        except:
            var = None
        ListsCount.append(var)
    return ListsCount

# Executing

In [4]:
GetHeaderCountFromURL('https://bouwmaat.nl')

6

# Unused code

In [13]:
# ul_soup_all = soup.find_all('ul', {'class':'class-name'})
# li_soup_all = soup.find_all('li', {'class':'class-name'})
# li_ul_soup_count = sum(1 for ul in ul_soup_all for li in ul) # Count list items within unordered lists
# li_ol_soup_count = sum(1 for ol in li_soup_all for li in ol) # Count list items within ordered lists
# print(f"LI in UL: {ul_soup_count}")
# print(f"LI in OL: {ol_soup_count}")