In [1]:
import requests
from bs4 import BeautifulSoup

api_page = requests.get("https://scikit-learn.org/1.4/modules/classes.html")
soup = BeautifulSoup(api_page.content, 'html.parser')

In [2]:
h2_elements = soup.find_all('h2')

In [3]:
sections = []
for i in range(len(h2_elements) - 1):
    section = []
    for sibling in h2_elements[i].next_siblings:
        if sibling == h2_elements[i + 1]:
            break
        section.append(str(sibling))
    section_text = ''.join(section)
    sections.append(BeautifulSoup(section_text, 'lxml'))

In [4]:
import re
def clean_text(text):
    text = re.sub("\xa0","",text)
    text = re.sub("¶","",text)
    text = re.sub("\n","",text)
    return text.strip()
def get_links(*,sub_section_elem,class_name,base_func_url,title):
    curr_urls = []
    try:
        func_urls = sub_section_elem.find_all(attrs={"class": class_name})
        for curr_url in func_urls:
            try:
                func_url = curr_url.find("a")["href"]
                func_text = curr_url.text
                if func_text.find(")") == -1:
                    separator = "\n"
                else:
                    separator = ")"
                func_name,func_desc = func_text[:func_text.find(separator)+1].strip("\n"),func_text[func_text.find(separator)+1:].strip("\n")
                # curr_urls.append(base_func_url + func_url)
                curr_urls.append({"name":func_name,"description":func_desc, "url":base_func_url + func_url})
            except Exception as e:
                print(e)
                print(title.text)
        return curr_urls
    except Exception as e:
        curr_url = sub_section_elem.find(attrs={"class": class_name})
        func_url = curr_url.find('a')["href"]
        func_text = curr_url.text
        if func_text.find(")") == -1:
            separator = "\n"
        else:
            separator = ")"
        func_name,func_desc = func_text[:func_text.find(separator)+1].strip("\n"),func_text[func_text.find(separator)+1:].strip("\n")
        curr_urls.append({"name":clean_text(func_name),"description":func_desc, "url":base_func_url + func_url})
        return curr_urls
    finally:
        return curr_urls

In [5]:
def get_sub_level_dict(h3_titles_list,base_sklearn_url):
    sub_level_dict = {}
    for idx,h3_title in enumerate(h3_titles_list):
        h3_title_text = clean_text(h3_title.text)
        title_siblings = []
        # If it is not the last element
        if idx != len(h3_titles_list)-1:
            for title_functions_siblings in h3_title.next_siblings:
                if title_functions_siblings == h3_titles_list[idx+1]: break
                title_siblings.append(str(title_functions_siblings))
            title_siblings_text = ''.join(title_siblings)
            title_siblings_soup = BeautifulSoup(title_siblings_text, 'lxml')
        else:
            for title_functions_siblings in h3_title.next_siblings:
                title_siblings.append(str(title_functions_siblings))
            title_siblings_text = ''.join(title_siblings)
            title_siblings_soup = BeautifulSoup(title_siblings_text, 'lxml')
        odd_urls = get_links(sub_section_elem=title_siblings_soup,class_name="row-odd",base_func_url=base_sklearn_url,title=h3_title)
        even_urls = get_links(sub_section_elem=title_siblings_soup,class_name="row-even",base_func_url=base_sklearn_url,title=h3_title)
        all_urls = odd_urls + even_urls
        sub_level_dict.update({h3_title_text:{'functions':all_urls}})
    return sub_level_dict

In [6]:
# from tqdm import tqdm

# parent_dict = {}
# pbar = tqdm(total=len(h2_elements[:-1]),desc="Scraping Sklearn")
# for sub_section_h2,sub_section in zip(h2_elements[:-1],sections):
#     base_sklearn_url = "https://scikit-learn.org/stable/modules/"
#     base_parent_url = "https://scikit-learn.org/stable/modules/classes.html"
#     # sub_section = sections[1]
#     # sub_section_h2 = h2_elements[1]
#     parent_class_href = sub_section_h2.find('a')['href']
#     parent_name = clean_text(sub_section_h2.text)
#     parent_function = parent_name.split(":")[0]
#     parent_name = parent_name.split(":")[1].strip()
#     parent_text = " ".join([para.text for para in sub_section.find_all('p')])
#     if "h3" in str(sub_section):
#         try:
#             h3_titles_list = sub_section.find_all('h3')
#             sub_level_dict = get_sub_level_dict(h3_titles_list,base_sklearn_url)
#         except:
#             h3_titles_list = [sub_section.find('h3')]
#         sub_level_dict = get_sub_level_dict(h3_titles_list,base_sklearn_url)
#         for sub_level,vals in sub_level_dict.items():
#             if vals['functions'] == []:
#                 odd_urls = get_links(sub_section_elem=sub_section,class_name="row-odd",base_func_url=base_sklearn_url,title=sub_section_h2)
#                 even_urls = get_links(sub_section_elem=sub_section,class_name="row-even",base_func_url=base_sklearn_url,title=sub_section_h2)
#                 all_urls = odd_urls + even_urls
#                 sub_level_dict = {parent_name:{'functions':all_urls}}
#     elif "h3" not in str(sub_section):
#         odd_urls = get_links(sub_section_elem=sub_section,class_name="row-odd",base_func_url=base_sklearn_url,title=sub_section_h2)
#         even_urls = get_links(sub_section_elem=sub_section,class_name="row-even",base_func_url=base_sklearn_url,title=sub_section_h2)
#         all_urls = odd_urls + even_urls
#         sub_level_dict = {parent_name:{'functions':all_urls}}
#     parent_dict.update({parent_name:{"functions":parent_function,"url":base_parent_url+parent_class_href,"sub_level_dict":sub_level_dict,"parent_text":parent_text}})
#     pbar.update(1)

In [7]:
from tqdm import tqdm

parent_dict = {}
pbar = tqdm(total=len(h2_elements[:-1]),desc="Scraping Sklearn")
for sub_section_h2,sub_section in zip(h2_elements[:-1],sections):
    base_sklearn_url = "https://scikit-learn.org/stable/modules/"
    base_parent_url = "https://scikit-learn.org/stable/modules/classes.html"
    # sub_section = sections[1]
    # sub_section_h2 = h2_elements[1]
    parent_class_href = sub_section_h2.find('a')['href']
    parent_name = clean_text(sub_section_h2.text)
    parent_function = clean_text(parent_name.split(":")[0])
    parent_name = parent_name.split(":")[1].strip()
    parent_text = " ".join([para.text for para in sub_section.find_all('p')])
    try:
        default_funcs = []
        default_vals_list = []
        defaul_vals = sub_section.find_all(class_="autosummary longtable docutils align-default")
        for df in defaul_vals:
            ourl = get_links(sub_section_elem=sub_section,class_name="row-odd",base_func_url=base_sklearn_url,title=sub_section_h2)
            eurl = get_links(sub_section_elem=sub_section,class_name="row-even",base_func_url=base_sklearn_url,title=sub_section_h2)
            default_funcs.extend(ourl + eurl)
        parent_dict.update({parent_name:{"base_function":parent_function,"url":base_parent_url+parent_class_href,"functions":default_funcs,"parent_text":parent_text}})
    except:
        pass

    if "h3" in str(sub_section):
        try:
            h3_titles_list = sub_section.find_all('h3')
        except:
            h3_titles_list = [sub_section.find('h3')]
        sub_level_dict = get_sub_level_dict(h3_titles_list,base_sklearn_url)
        parent_dict[parent_name].update({"sub_level_dict":sub_level_dict})
    pbar.update(1)

Scraping Sklearn:   0%|          | 0/39 [00:00<?, ?it/s]

Scraping Sklearn:  59%|█████▉    | 23/39 [00:00<00:00, 179.54it/s]

In [8]:
from markdownify import MarkdownConverter

# Create shorthand method for conversion
def md(soup, **options):
    return MarkdownConverter(**options).convert_soup(soup)

In [9]:
import re
import sys

def normalize_newlines(paragraph):
    normalized_paragraph = re.sub(r'\n+', '\n\n', paragraph)
    return normalized_paragraph
def remove_links(soup):
        links = soup.find_all('a')
        for link in links:
            link.decompose()
        
        return soup
def get_py_obj(base_func_url,parent_name):
    func_url = requests.get(base_func_url)
    func_soup = BeautifulSoup(func_url.content, 'lxml')
    func_name = clean_text(func_soup.find('h1').text)
    func_signature_elem = func_soup.find(class_="sig sig-object py")
    if func_signature_elem is not None:
        func_signature = clean_text(func_signature_elem.text)
    else:
        func_signature = ""
    all_urls = []
    page_dict = {"func_name":func_name,"func_signature":func_signature}
    try:
        if func_soup.find(class_="py class"):
            class_or_fn = "py class"
            type = "class"
        elif func_soup.find(class_="py function"):
            class_or_fn = "py function"
            type = "function"
        py_soup = func_soup.find(class_=class_or_fn)
        func_text_list = []
        dd = py_soup.find('dd')
        field_list = dd.find(class_="field-list")

        for i in field_list.previous_siblings:
            func_text_list.append(i.text)
        func_text = "".join(func_text_list[::-1]).replace("\n"," ").strip()
        page_dict.update({"func_text":func_text})
        for url in py_soup.find_all('a'):
            url = url['href']
            if url is None: continue
            elif url.startswith("#"):
                all_urls.append(base_func_url + url)
            elif url.startswith(".."):
                continue
            elif url.startswith("http"):
                all_urls.append(url)
        py_soup = remove_links(py_soup)
        py_md = normalize_newlines(md(py_soup))
        page_dict.update({"func_md":py_md,"type":type})
        fodd = field_list.find_all(class_="field-odd")
        if fodd[0].text == "Parameters":
            dts = fodd[1].find_all('dt')
            paremter_names_desc = {}

            for idx,dt in enumerate(dts):
                param_name = dt.find('strong').text
                param_type = dt.find(class_="classifier").text
                param_desc = ""
                for next_sib in dt.next_siblings:
                    if idx == len(dts)-1:
                        pass
                    else:
                        if next_sib == dts[idx+1]:
                            break
                    next_sib = str(next_sib)
                    param_desc += next_sib[next_sib.find("<p>"):next_sib.find("</p>")]
                paremter_names_desc.update({param_name:{"param_type":param_type,"params_desc":param_desc}})
            page_dict.update({"paremter_names_desc":paremter_names_desc})
    except Exception as e:
        print(e,parent_name)
        return page_dict

pbar_ = tqdm(total=len(list(parent_dict.keys())))
for parent_name,parent_vals in parent_dict.items():
    if 'functions' in parent_vals:
        for func in parent_vals['functions']:
            func_url = func['url']
            web_page_res = get_py_obj(func_url,parent_name)
            if web_page_res is not None:
                for k,v in web_page_res.items():
                    func.update({k:v})
    if 'sub_level_dict' in parent_vals:
        for sub_level_name,sub_level_vals in parent_vals['sub_level_dict'].items():
            if 'functions' in sub_level_vals:
                for func in sub_level_vals['functions']:
                    func_url = func['url']
                    web_page_res = get_py_obj(func_url,parent_name)
                    if web_page_res is not None:
                        for k,v in web_page_res.items():
                            func.update({k:v})
    pbar_.update(1)



'NoneType' object has no attribute 'previous_siblings' Settings and information tools


KeyboardInterrupt: 

In [29]:
import json
with open('sklearn.json', 'w') as f:
    json.dump(parent_dict, f)

In [30]:
# import concurrent.futures
# from copy import deepcopy

# parent_dict_copy = deepcopy(parent_dict)
# def main_scraper(parent_name):
#     parent_vals = parent_dict_copy[parent_name]
#     if 'functions' in parent_vals:
#         for func in parent_vals['functions']:
#             func_url = func['url']
#             web_page_res = get_py_obj(func_url,parent_name)
#             if web_page_res is not None:
#                 for k,v in web_page_res.items():
#                     func.update({k:v})
#     if 'sub_level_dict' in parent_vals:
#         for sub_level_name,sub_level_vals in parent_vals['sub_level_dict'].items():
#             if 'functions' in sub_level_vals:
#                 for func in sub_level_vals['functions']:
#                     func_url = func['url']
#                     web_page_res = get_py_obj(func_url)
#                     if web_page_res is not None:
#                         for k,v in web_page_res.items():
#                             func.update({k:v})
#     print(f"Done for {parent_name}")
# if __name__ == "__main__":
#     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
#         parent_names_list = list(parent_dict.keys())
#         results = executor.map(main_scraper, parent_names_list)


In [14]:
# base_func_url = "https://scikit-learn.org/stable/modules/generated/sklearn.utils.as_float_array.html#sklearn.utils.as_float_array"
# func_url = requests.get(base_func_url)
# func_soup = BeautifulSoup(func_url.content, 'lxml')
# func_name = clean_text(func_soup.find('h1').text)
# func_signature_elem = func_soup.find(class_="sig sig-object py")
# if func_signature_elem is not None:
#     func_signature = clean_text(func_signature_elem.text)
# else:
#     func_signature = ""
# all_urls = []

# if func_soup.find(class_="py class"):
#     class_or_fn = "py class"
#     type = "class"
# elif func_soup.find(class_="py function"):
#     class_or_fn = "py function"
#     type = "function"
# py_soup = func_soup.find(class_=class_or_fn)
# func_text_list = []
# dd = py_soup.find('dd')
# field_list = dd.find(class_="field-list")

# for i in field_list.previous_siblings:
#     func_text_list.append(i.text)
# func_text = "".join(func_text_list[::-1]).replace("\n"," ").strip()
# for url in py_soup.find_all('a'):
#     url = url['href']
#     if url is None: continue
#     elif url.startswith("#"):
#         all_urls.append(base_func_url + url)
#     elif url.startswith(".."):
#         continue
#     elif url.startswith("http"):
#         all_urls.append(url)
# fodd = field_list.find_all(class_="field-odd")
# dts = fodd[1].find_all('dt')
# paremter_names_desc = {}

# for idx,dt in enumerate(dts):
#     param_name = dt.find('strong').text
#     param_type = dt.find(class_="classifier").text
#     param_desc = ""
#     for next_sib in dt.next_siblings:
#         if idx == len(dts)-1:
#             pass
#         else:
#             if next_sib == dts[idx+1]:
#                 break
#         next_sib = str(next_sib)
#         param_desc += next_sib[next_sib.find("<p>"):next_sib.find("</p>")]
#     paremter_names_desc.update({param_name:{"param_type":param_type,"params_desc":param_desc}})

## FROM PANDAS

In [23]:
base_url = "https://scikit-learn.org/stable/api/index.html"
response = requests.get(base_url)
soup = BeautifulSoup(response.text, "lxml")
l1_elems = soup.find_all(class_="toctree-l1")

In [24]:
l1_elems[0].find_all('a')[1]['href']

'../modules/generated/sklearn.config_context.html'

In [28]:
base_url = "https://scikit-learn.org/stable/api/index.html"
base_parent_url = "https://scikit-learn.org/stable/"
response = requests.get(base_url)
soup = BeautifulSoup(response.text, "lxml")
base_func_url = "https://pandas.pydata.org/docs/reference/"
l1_elems = soup.find_all(class_="toctree-l1")

first_level = {}
curr_parent = ""
for parent_functions in l1_elems:
    for func in parent_functions.find_all("a"):
        href = func["href"]
        if ".." not in href:
            first_level.update(
                {
                    href: {
                        "functions": [],
                        "name": func.text,
                        "url": base_parent_url +"api/" + href,
                    }
                }
            )

In [71]:
first_level["sklearn.feature_extraction.html"]

{'functions': [],
 'name': 'sklearn.feature_extraction',
 'url': 'https://scikit-learn.org/stable/api/sklearn.feature_extraction.html'}

In [72]:
parent_page = requests.get(first_level["sklearn.feature_extraction.html"]['url'])
parent_soup = BeautifulSoup(parent_page.content, 'lxml',from_encoding="utf-8")

In [73]:
func_text = parent_soup.find('h1').text.replace("#","")
if "h2" in str(parent_soup):
    h2_elements = parent_soup.find_all("h2")
    base_section = []
    for next_sib in paren
    # for idx,h2_elem in enumerate(h2_elements):


SyntaxError: expected ':' (535049815.py, line 5)

In [81]:
tables = parent_soup.find_all(class_="autosummary longtable table autosummary")[0]

In [87]:
h2_elements = parent_soup.find_all('h2')

In [88]:
for i in tables.next_siblings:
    if i == h2_elements[0]:
        break
    print(i.text)




From images#
Utilities to extract features from images.


image.PatchExtractor
Extracts patches from a collection of images.

image.extract_patches_2d
Reshape a 2D image into a collection of patches.

image.grid_to_graph
Graph of the pixel-to-pixel connections.

image.img_to_graph
Graph of the pixel-to-pixel gradient connections.

image.reconstruct_from_patches_2d
Reconstruct the image from all of its patches.







From text#
Utilities to build feature vectors from text documents.


text.CountVectorizer
Convert a collection of text documents to a matrix of token counts.

text.HashingVectorizer
Convert a collection of text documents to a matrix of token occurrences.

text.TfidfTransformer
Transform a count matrix to a normalized tf or tf-idf representation.

text.TfidfVectorizer
Convert a collection of raw documents to a matrix of TF-IDF features.








In [80]:
h2_elements = parent_soup.find_all("h2")
for i in parent_soup.find('h1').next_siblings:
    # print(i)
    if i.find(' id')!=-1:
        print(i)
        break
    print(i)
    print("-"*100)



----------------------------------------------------------------------------------------------------
<p>Feature extraction from raw data.</p>


In [None]:
parent_soup.find_all("")