In [1]:
import requests
from bs4 import BeautifulSoup

# Download the scikit-learn API index page and parse it.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page as if it were the API index.
api_page = requests.get("https://scikit-learn.org/stable/modules/classes.html", timeout=30)
api_page.raise_for_status()
soup = BeautifulSoup(api_page.content, 'html.parser')

In [2]:
# Collect every <h2> tag of the parsed page; the following cells treat each one
# as the start of a section and slice the page between consecutive headings.
h2_elements = soup.find_all('h2')

In [3]:
# Build one soup per <h2> section: everything that follows a heading, up to
# (but excluding) the next heading. The last heading has no successor and is
# therefore not turned into a section.
sections = []
for current_h2, next_h2 in zip(h2_elements, h2_elements[1:]):
    section = []
    for sibling in current_h2.next_siblings:
        # Identity, not ``==``: BeautifulSoup's ``==`` compares markup, so a
        # structurally identical element would end the section too early.
        if sibling is next_h2:
            break
        section.append(str(sibling))
    section_text = ''.join(section)
    sections.append(BeautifulSoup(section_text, 'lxml'))

In [4]:
import re
def clean_text(text):
    """Return ``text`` without scraping noise.

    Every non-breaking space, pilcrow sign and newline is deleted, then
    leading and trailing whitespace is trimmed from the result.
    """
    for noise_pattern in ("\xa0", "¶", "\n"):
        text = re.sub(noise_pattern, "", text)
    return text.strip()
def _split_name_and_description(func_text):
    """Split the text of one table row into ``(name, description)``.

    The name ends at the first ")" when the first line carries a call
    signature (an "(" before the first newline); otherwise it ends at the
    first newline. Text with neither separator is returned as the name with an
    empty description.
    """
    func_text = func_text.strip("\n")
    newline_at = func_text.find("\n")
    open_at = func_text.find("(")
    # Only trust ")" as the separator when the signature opens on the first
    # line; a parenthesis inside the description must not extend the name.
    has_signature = open_at != -1 and (newline_at == -1 or open_at < newline_at)
    cut = func_text.find(")", open_at) if has_signature else newline_at
    if cut == -1:
        return func_text, ""
    return func_text[:cut + 1].strip("\n"), func_text[cut + 1:].strip("\n")


def get_links(*,sub_section_elem,class_name,base_func_url,title):
    """Collect the documented entries found in the rows of a given CSS class.

    Parameters
    ----------
    sub_section_elem : object exposing ``find_all(attrs=...)``
        Parsed fragment to search. ``None`` (or anything without ``find_all``)
        yields an empty list.
    class_name : str
        Value of the ``class`` attribute the rows must carry (e.g. "row-odd").
    base_func_url : str
        Prefix prepended to each row's ``href``.
    title : object
        Heading the rows belong to; its ``.text`` is printed when a row has to
        be skipped.

    Returns
    -------
    list of dict
        One ``{"name", "description", "url"}`` dict per usable row, in the
        order the rows were found. Rows without a link are reported and
        skipped.
    """
    find_all = getattr(sub_section_elem, "find_all", None)
    if find_all is None:
        return []

    curr_urls = []
    for row in find_all(attrs={"class": class_name}):
        try:
            func_url = row.find("a")["href"]
            func_name, func_desc = _split_name_and_description(row.text)
            curr_urls.append({"name":func_name,"description":func_desc, "url":base_func_url + func_url})
        except (AttributeError, KeyError, TypeError) as e:
            # Best effort: a row without an anchor/href is reported, not fatal.
            print(e)
            print(getattr(title, "text", title))
    return curr_urls

In [5]:
def get_sub_level_dict(h3_titles_list,base_sklearn_url):
    """Map every <h3> heading to the entries listed beneath it.

    Parameters
    ----------
    h3_titles_list : list
        The <h3> elements of one section, in the order they should be
        processed.
    base_sklearn_url : str
        Prefix handed to ``get_links`` for building absolute URLs.

    Returns
    -------
    dict
        ``{cleaned heading text: {'name': cleaned heading text,
        'functions': [...]}}`` where ``functions`` holds the "row-odd" links
        followed by the "row-even" links. Headings with identical cleaned text
        overwrite each other.
    """
    sub_level_dict = {}
    for idx, h3_title in enumerate(h3_titles_list):
        h3_title_text = clean_text(h3_title.text)
        # The last heading has no successor, so its block runs to the end.
        next_h3 = h3_titles_list[idx + 1] if idx + 1 < len(h3_titles_list) else None

        title_siblings = []
        for sibling in h3_title.next_siblings:
            # Identity, not ``==``: BeautifulSoup's ``==`` compares markup, so
            # an identical-looking element would cut the block short.
            if sibling is next_h3:
                break
            title_siblings.append(str(sibling))
        title_siblings_soup = BeautifulSoup(''.join(title_siblings), 'lxml')

        odd_urls = get_links(sub_section_elem=title_siblings_soup,class_name="row-odd",base_func_url=base_sklearn_url,title=h3_title)
        even_urls = get_links(sub_section_elem=title_siblings_soup,class_name="row-even",base_func_url=base_sklearn_url,title=h3_title)
        sub_level_dict[h3_title_text] = {'name':h3_title_text,'functions':odd_urls + even_urls}
    return sub_level_dict

In [6]:
from tqdm import tqdm

# Scrape every <h2> section into ``parent_dict``, keyed by the section title
# (the heading text after the first colon).
base_sklearn_url = "https://scikit-learn.org/stable/modules/"
base_parent_url = "https://scikit-learn.org/stable/modules/classes.html"


def _collect_row_links(elem, title, base_func_url):
    """Return the "row-odd" links under ``elem`` followed by the "row-even" ones."""
    odd_urls = get_links(sub_section_elem=elem,class_name="row-odd",base_func_url=base_func_url,title=title)
    even_urls = get_links(sub_section_elem=elem,class_name="row-even",base_func_url=base_func_url,title=title)
    return odd_urls + even_urls


parent_dict = {}
# The context manager closes the bar when the loop ends (or fails), so the
# final count is rendered instead of a stale intermediate one.
with tqdm(total=len(h2_elements[:-1]),desc="Scraping Sklearn") as pbar:
    for sub_section_h2,sub_section in zip(h2_elements[:-1],sections):
        # A heading without a link falls back to the API page URL itself.
        heading_link = sub_section_h2.find('a')
        parent_class_href = heading_link.get('href', '') if heading_link is not None else ''

        # "<module>: <title>" -> module part and title part. partition() keeps
        # any further colons in the title and tolerates a heading without one.
        heading_text = clean_text(sub_section_h2.text)
        parent_function, colon, heading_title = heading_text.partition(":")
        parent_name = heading_title.strip() if colon else parent_function.strip()
        parent_text = " ".join([para.text for para in sub_section.find_all('p')])

        # Look for real <h3> tags rather than the substring "h3" in the markup.
        h3_titles_list = sub_section.find_all('h3')
        sub_level_dict = {}
        if h3_titles_list:
            try:
                sub_level_dict = get_sub_level_dict(h3_titles_list,base_sklearn_url)
            except Exception:
                # Best effort kept from the original: retry with the first
                # heading only if processing the full list failed.
                sub_level_dict = get_sub_level_dict(h3_titles_list[:1],base_sklearn_url)

        # No sub-headings, or a sub-heading without entries: list everything
        # found in the section under the section title instead.
        if not sub_level_dict or any(vals['functions'] == [] for vals in sub_level_dict.values()):
            sub_level_dict = {parent_name:{'functions':_collect_row_links(sub_section,sub_section_h2,base_sklearn_url)}}

        parent_dict[parent_name] = {"functions":parent_function,"url":base_parent_url+parent_class_href,"sub_level_dict":sub_level_dict,"parent_text":parent_text}
        pbar.update(1)

Scraping Sklearn:   0%|          | 0/39 [00:00<?, ?it/s]

Scraping Sklearn:  82%|████████▏ | 32/39 [00:00<00:00, 100.48it/s]