In [7]:
import requests
from bs4 import BeautifulSoup
import pprint

DOC_URL = "https://plotly.com/javascript"

In [4]:
# Build the URL once so both the request and the failure message use it
# (the message previously referenced an undefined name and raised NameError).
bar_charts_url = '/'.join([DOC_URL, 'bar-charts'])
response = requests.get(bar_charts_url)
if response.status_code != 200:
    print(f"Failed to retrieve {bar_charts_url}")

# Parsed page; the following cells pull <h3> headers and <code> blocks from it.
soup = BeautifulSoup(response.content, "html.parser")
# Accumulator for everything scraped in this notebook, keyed by article name.
SCRAPED_DATA = {}
                    

### Section Headers
The `<h3>` section headers describe what each code example does.

In [10]:
# Collect the visible text of every <h3> tag, trimmed of surrounding whitespace.
section_headers_html = soup.find_all("h3")
section_headers = [h3_tag.text.strip() for h3_tag in section_headers_html]

pprint.pp(section_headers)

['Basic Bar Chart',
 'Grouped Bar Chart',
 'Stacked Bar Chart',
 'Bar Chart with Hover Text',
 'Bar Chart with Direct Labels',
 'Grouped Bar Chart with Direct Labels',
 'Bar Chart with Rotated Labels',
 'Customizing Individual Bar Colors',
 'Customizing Individual Bar Widths',
 'Customizing Individual Bar Base',
 'Rounded Corners on Bars',
 'Colored and Styled Bar Chart',
 'Waterfall Bar Chart',
 'Bar Chart with Relative Barmode']


### Code examples
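The `<code>` tags hold the code examples that correspond to the section headers. A few inline `<code>` snippets are not examples and are removed below.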

In [12]:
# Collect the raw text of every <code> tag (untrimmed, so formatting is kept).
code_html = soup.find_all('code')
code_blocks = [code_tag.text for code_tag in code_html]

print('Number of code blocks:', len(code_blocks), '\n')
pprint.pp(code_blocks)

Number of code blocks: 16 

['var data = [\n'
 '  {\n'
 "    x: ['giraffes', 'orangutans', 'monkeys'],\n"
 '    y: [20, 14, 23],\n'
 "    type: 'bar'\n"
 '  }\n'
 '];\n'
 '\n'
 "Plotly.newPlot('myDiv', data);\n",
 'var trace1 = {\n'
 "  x: ['giraffes', 'orangutans', 'monkeys'],\n"
 '  y: [20, 14, 23],\n'
 "  name: 'SF Zoo',\n"
 "  type: 'bar'\n"
 '};\n'
 '\n'
 'var trace2 = {\n'
 "  x: ['giraffes', 'orangutans', 'monkeys'],\n"
 '  y: [12, 18, 29],\n'
 "  name: 'LA Zoo',\n"
 "  type: 'bar'\n"
 '};\n'
 '\n'
 'var data = [trace1, trace2];\n'
 '\n'
 "var layout = {barmode: 'group'};\n"
 '\n'
 "Plotly.newPlot('myDiv', data, layout);\n",
 'var trace1 = {\n'
 "  x: ['giraffes', 'orangutans', 'monkeys'],\n"
 '  y: [20, 14, 23],\n'
 "  name: 'SF Zoo',\n"
 "  type: 'bar'\n"
 '};\n'
 '\n'
 'var trace2 = {\n'
 "  x: ['giraffes', 'orangutans', 'monkeys'],\n"
 '  y: [12, 18, 29],\n'
 "  name: 'LA Zoo',\n"
 "  type: 'bar'\n"
 '};\n'
 '\n'
 'var data = [trace1, trace2];\n'
 '\n'
 "var layout = {barmod

In [31]:
# Entries 10 and 11 are not code examples and must be dropped; show them first.
first_bad, second_bad = code_blocks[10], code_blocks[11]
print(first_bad, second_bad)

barcornerradius marker.cornerradius


In [32]:
# Positions of the entries inspected above that are not code examples.
bad_positions = (10, 11)
corrected_code = [
    block
    for position, block in enumerate(code_blocks)
    if position not in bad_positions
]
print(len(corrected_code))

14


In [33]:
# From here on `code_blocks` holds only the genuine examples.
code_blocks = corrected_code

# Record the bar-charts page in SCRAPED_DATA so the statistics and the saved
# pickle include it without relying on state left over from an earlier run.
assert len(section_headers) == len(code_blocks), (
    'bar-charts headers and code examples must pair up one-to-one'
)
SCRAPED_DATA['bar-charts'] = {'header': section_headers, 'code': code_blocks}

In [42]:
def get_code_and_headers(url, timeout=30):
    """Scrape the text of all `<h3>` and `<code>` tags from `url`.

    Headers and code are collected independently; when their counts differ a
    warning is printed and both lists are still returned so the caller can
    decide what to do with them.

    Args:
        url(str): the url to scrape. Must be a valid HTTP address
        that accepts GET requests
        timeout(float): seconds to wait for the server before giving up.
        Without a timeout `requests.get` may block indefinitely.

    Returns:
        (dict | None): a dictionary with `header` and `code` scraped from
        the url, or `None` when the response status is not 200.
            {
                "header": [],
                "code": [],
            }
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Could not retrieve url: {url} Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    section_headers_html = soup.find_all('h3')
    code_html = soup.find_all('code')

    if len(code_html) != len(section_headers_html):
        print('Mismatch in lengths of code and headers! You should investigate further')
        print(f'Length of headers: {len(section_headers_html)}, Length of code: {len(code_html)}')

    # The two lists are built the same way whether or not the counts match,
    # so a single construction covers both cases.
    parsed_content = {
        'header': [sec.text.strip() for sec in section_headers_html],
        'code': [code.text for code in code_html],
    }

    print('Finished scraping!')
    return parsed_content
    
    

In [52]:
def get_scrape_stats():
    """Print the keys of SCRAPED_DATA, the number of code examples stored
    under each key, and the overall total."""
    print(SCRAPED_DATA.keys())
    print('\n', 'Scraped data statistics...')

    per_article_counts = []
    for article, content in SCRAPED_DATA.items():
        n_examples = len(content['code'])
        print(f"{article} -> {n_examples}")
        per_article_counts.append(n_examples)

    print(f'Total code examples = {sum(per_article_counts)}')

### Use automated functions to scrape required Plotly documentation

In [59]:
# SCRAPED
# Pages whose sections consist only of a header plus a code example.
# Both batches are listed so this cell rebuilds SCRAPED_DATA from a fresh
# kernel; the original note says line and bar charts were handled above.
documentation_articles = [
    'bubble-charts', 'dot-plots', 'subplots', 'line-and-scatter', 'histograms',
    'filled-area-plots', 'horizontal-bar-charts', 'graphing-multiple-chart-types',
    'waterfall-charts', 'time-series',
]  # only headers

# NOT SCRAPED
documentation_articles_with_desc = ['pie-charts', 'sankey-diagram', 'treemaps'] # may contain additional <p> tags describing section

for doc in documentation_articles:
    print(f'\nGetting content for {doc}...')
    url = '/'.join([DOC_URL, doc])
    scraped_data = get_code_and_headers(url)
    if scraped_data is None:
        # get_code_and_headers returns None for a non-200 response; skip the
        # page instead of failing on the subscript below.
        print(f'Skipping {doc}: page could not be retrieved\n')
        continue
    # Only keep pages where every header pairs with exactly one code example.
    if len(scraped_data['header']) == len(scraped_data['code']):
        SCRAPED_DATA[doc] = scraped_data.copy()
    else:
        print(f'Length mismatch for {doc}\n')

get_scrape_stats()

dict_keys(['line-charts', 'bar-charts', 'bubble-charts', 'dot-plots', 'subplots', 'line-and-scatter', 'histograms', 'filled-area-plots', 'horizontal-bar-charts', 'graphing-multiple-chart-types', 'waterfall-charts', 'time-series'])

 Scraped data statistics...
line-charts -> 11
bar-charts -> 14
bubble-charts -> 5
dot-plots -> 1
subplots -> 7
line-and-scatter -> 6
histograms -> 8
filled-area-plots -> 5
horizontal-bar-charts -> 3
graphing-multiple-chart-types -> 2
waterfall-charts -> 4
time-series -> 4
Total code examples = 70


## Save scraped data

In [56]:
!ls ../

config.py  dataset.py	pyproject.toml	scrape
data	   poetry.lock	queryengine.py


In [57]:
import pickle

def save_scraped(save_loc='../data/plotly-scraped-data.pkl'):
    """Pickle the module-level SCRAPED_DATA dict to `save_loc`.

    Args:
        save_loc(str): path of the file to write; it is opened in binary
        write mode, so an existing file is overwritten.
    """
    with open(save_loc, 'wb') as pickle_file:
        pickle.dump(SCRAPED_DATA, pickle_file)

    confirmation = f'Saved SCRAPED_DATA to {save_loc}'
    print(confirmation)
    

In [58]:
# Write SCRAPED_DATA to the default location (../data/plotly-scraped-data.pkl).
save_scraped()

Saved SCRAPED_DATA to ../data/plotly-scraped-data.pkl
