><b>Important note to make this work</b><br>
>You need to run the following command<br>
>    `pip install lxml bs4 pandas requests ipywidgets qgrid pygments pathlib networkx`

In [5]:
# importing libs
from lxml import html
import lxml.html.clean
from bs4 import BeautifulSoup
import pandas as pd
import requests

def cleantext(text):
    ''' Removes Scripts and CSS ect. '''
    return lxml.html.clean.clean_html(text)

def html2text(text):
    ''' Converts a html formatted text to text '''
    return BeautifulSoup(text).get_text()

def gethtmltree(docpath):
    ''' Extract the htmltree from a htmldoc at docpath '''
    with open(docpath, 'r') as f:
        text = f.read()
    text = cleantext(text)
    return html.fromstring(text)

def extracttext(docpath):
    ''' Extracts text from html document at given path'''
    with open(docpath, 'r') as f:
        text = f.read()
    text = cleantext(text)
    return BeautifulSoup(text).get_text()

def extractlinks(docpath):
    ''' Get linklist from htmldoc at docpath '''
    htmltree = gethtmltree(docpath)
    return list(htmltree.iterlinks())

def extractlinkurls(docpath):
    ''' Get link urls from htmldoc at docpath '''
    links = extractlinks(docpath)
    return [ link[2] for link in links ]


def conv2abs_links(htmltree, basepath):
    ''' Get absolut linklist from Htmltree ''' # Maybe it would be simpler to keep a
    htmltree.make_links_absolute(basepath)     # corrected htmltree??
    return list(html.iterlinks(htmltree))

def geturllist(htmltree):
    ''' Get urllist from Htmltree '''
    return htmltree.xpath('//a/@href')

def gettxtlist(htmltree):
    ''' Get all parts of text from HtmlTree '''
    return [ text for text in htmltree.itertext() ]

def getelemlist(htmltree):
    ''' Returns a list of all elements in HtmlTree '''
    return list(htmltree.getiterator())

def showelem(element):
    ''' Prints element in human readable form '''
    print('Tag: ' + element.tag)
    for i in element.attrib.iteritems():
        print('Attribute: ' + i[0] + '=' + i[1])
    print('Text: ' + element.text)

def gettablelist(url):
    ''' Returns a list of dataframes with all html tables
    found on the webpage'''
    return pd.read_html(url)

def dlwebpage(url):
    req = requests.get(url)
    return req.text

from urllib.parse import urlparse

In [6]:
# Set style and import libraries
from IPython.core.display import display, HTML
style = """
<style>
  .container { 
    width:100% !important;
  }
  div#notebook {
    font-size: 14px;
    line-height: 20px;
    overflow-y: hidden;
    overflow-x: auto;
    width: 100%;
    padding-top: 0px;
    margin: 0px;
    outline: none;
    box-sizing: border-box;
    -moz-box-sizing: border-box;
    -webkit-box-sizing: border-box;
    min-height: 100%;
  }
  .prompt_container {
    width: 0px;
    padding-left: 0px;
    margin-left: 0px;
    visibility: visible;
  }
  .prompt {
    visibility: visible;
    width: 0px;
  }
  .input_prompt {
    width: 0px;
  }
  .run_this_cell {
    margin: 0px;
    padding: 0px;
  }
</style>"""
display(HTML(style))

from collections import Counter
import numpy as np
from ipywidgets import interact, interactive, interactive_output, interact_manual, Layout
from IPython.display import display
import ipywidgets as widgets

# qgrid Tabellen
import qgrid

def dict2df(d=dict()):
    df = pd.DataFrame()
    return df.from_dict(d, orient='index')

# Syntax Highlighting
import pygments
from pygments import styles, formatters, lexers, highlight
from pygments.lexers import PythonLexer, HtmlLexer
from pygments.formatters import HtmlFormatter

def colorize(string='', style='native'):
    display(HTML("""
    <style>
    {pygments_css}
    </style>
    """.format(pygments_css=formatters.HtmlFormatter(style=style).get_style_defs('.highlight'))))
    return display(HTML(data=highlight(string, HtmlLexer(), formatters.HtmlFormatter())))

styles = list(pygments.styles.get_all_styles())

<h1>Extracting data from the web</h1>

<h2>Getting Data</h2>
<h3>Download</h3>

In [7]:
# Download a webpage
import requests

url = 'https://de.wikipedia.org/wiki/Python_(Programmiersprache)'

response = requests.get(url=url)
response

<Response [200]>

<h3>Saving locally</h3>

In [8]:
# Saving a downloaded web page local
localfile = 'localpage.html'
content = response.content.decode('utf-8')

with open(localfile, 'w', encoding='utf-8') as f:
    f.write(content)

<h2>Accessing the information</h2>
<h3>Header Information and Raw HTML</h3>

In [9]:
# Access to header information
df = dict2df(response.headers)

qgrid.show_grid(data_frame=df)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [10]:
# Access to the raw html
raw_html = response.text
raw_html

'<!DOCTYPE html>\n<html class="client-nojs" lang="de" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Python (Programmiersprache) – Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Python_(Programmiersprache)","wgTitle":"Python (Programmiersprache)","wgCurRevisionId":185346534,"wgRevisionId":185346534,"wgArticleId":13638,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Wikipedia:Defekte Weblinks/Ungeprüfte Archivlinks 2018-03","Wikipedia:Vorlagenfehler/Vorlage:Webarchiv/Linktext fehlt","Wikipedia:Weblink offline IABot","Wikipedia:Defekte Weblinks/Ungeprüfte Botmarkierungen 2018-03","Wikipedia:Gesprochener Artikel","Wikipedia:Lesenswert","Pyth

In [11]:
# Just in case the style fails somehow...
# style = """
# <style>
#   .output_area { 
#     width:100% !important;
#     background-color: white; //black;
#     color: black; //white;
#   }
# </style>"""
# display(HTML(style))

colorize(raw_html)
# interactive(colorize, string=raw_html, style=styles) # Interactive Styles
# interactive(colorize, string=raw_html[:10000], style=styles) # Same as above but acceptable speed...

# Fastest, easiest and most error resistent solution but needs a system shell with a few basic libraries 
#!cat localpage.html | hxclean | hxnormalize -xe | highlight -S html -s navy -O xterm256

<h3>Textual Information</h3>

In [12]:
# Example how the webpage would look like after removing inline scripts with cleantext()
display(HTML(cleantext(raw_html)))

Python,Python.1
,
Basisdaten,Basisdaten
Paradigmen:,multiparadigmatisch
Erscheinungsjahr:,1991
Designer:,Guido van Rossum
Entwickler:,Python Software Foundation
Aktuelle Version:,"3.7.2 (24. Dezember 2018), 2.7.15 (1. Mai 2018)"
Typisierung:,"stark, dynamisch („Duck-Typing“)"
Wichtige Implementierungen:,"CPython, Jython, IronPython, PyPy"
Beeinflusst von:,"ABC, Algol 68, Modula-3, Icon, C, C++, Perl, Java, Lisp, Haskell"

0,1
,Dieser Artikel ist als Audiodatei verfügbar:
,"Speichern | Informationen | 20.30 min (13,0 MB) Text der gesprochenen Version (5. Januar 2010)"
,Mehr Informationen zur gesprochenen Wikipedia

0,1
,Dieser Artikel wurde am 23. Oktober 2005 in dieser Version in die Liste der lesenswerten Artikel aufgenommen.


In [13]:
# Accessing the written text in raw html
from lxml import html
import lxml.html.clean
from bs4 import BeautifulSoup

def cleantext(text):
    ''' Removes Scripts and CSS ect. '''
    return lxml.html.clean.clean_html(text)

def html2text(text):
    ''' Converts a html formatted text to text '''
    return BeautifulSoup(text).get_text()

text = html2text(cleantext(raw_html))
print('\n\n'.join([l for l in text.splitlines() if l]))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Python (Programmiersprache) – Wikipedia

		

Python (Programmiersprache) 

aus Wikipedia, der freien Enzyklopädie 

 Zur Navigation springen

Zur Suche springen

Python

Basisdaten

Paradigmen:

multiparadigmatisch

Erscheinungsjahr:

1991

Designer:

Guido van Rossum

Entwickler:

Python Software Foundation

Aktuelle Version:

3.7.2 (24. Dezember 2018),2.7.15  (1. Mai 2018)

Typisierung:

stark, dynamisch („Duck-Typing“)

Wichtige Implementierungen:

CPython, Jython, IronPython, PyPy

Beeinflusst von:

ABC, Algol 68, Modula-3, Icon, C, C++, Perl, Java, Lisp, Haskell

Beeinflusste:

Ruby, Boo, Groovy, Cython, JavaScript, Swift

Betriebssystem:

Plattformunabhängigkeit[1]

Lizenz:

Python-Software-Foundation-Lizenz[2]

www.python.org

Python ([.mw-parser-output .IPA a{text-decoration:none}ˈpaɪθn̩], [ˈpaɪθɑn], auf Deutsch auch [ˈpyːtɔn]) ist eine universelle, üblicherweise interpretierte höhere Programmiersprache.[3] Sie hat den Anspruch, einen gut lesbaren, knappen Programmierstil zu fö

<h3>HTML Elements</h3>
<h3>Links</h3>

In [14]:
def gethtmltree(docpath):
    ''' Extract the htmltree from a htmldoc at docpath '''
    with open(docpath, 'r', encoding='utf-8') as f:
        text = f.read()
#     text = cleantext(raw_html)
    return html.fromstring(text)

def extractlinks(docpath):
    ''' Get linklist from htmldoc at docpath '''
    htmltree = gethtmltree(docpath)
    return list(htmltree.iterlinks())

def extractlinkurls(docpath):
    ''' Get link urls from htmldoc at docpath '''
    links = extractlinks(docpath)
    return [link[2] for link in links]

urllist = extractlinkurls(docpath=localfile)
for url in urllist:
    print(url)

/w/load.php?debug=false&lang=de&modules=ext.3d.styles%7Cext.cite.styles%7Cext.flaggedRevs.basic%7Cext.pygments%2CwikimediaBadges%7Cext.tmh.thumbnail.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cmediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.page.gallery.styles%7Cmediawiki.skinning.interface%7Cmediawiki.toc.styles%7Cskins.vector.styles%7Cwikibase.client.init&only=styles&skin=vector
/w/load.php?debug=false&lang=de&modules=startup&only=scripts&skin=vector
/w/load.php?debug=false&lang=de&modules=site.styles&only=styles&skin=vector
android-app://org.wikipedia/http/de.m.wikipedia.org/wiki/Python_(Programmiersprache)
/w/index.php?title=Python_(Programmiersprache)&action=edit
/w/index.php?title=Python_(Programmiersprache)&action=edit
/static/apple-touch/wikipedia.png
/static/favicon/wikipedia.ico
/w/opensearch_desc.php
//de.wikipedia.org/w/api.php?action=rsd
//creativecommons.org/licenses/by-sa/3.0/
https://de.wikipedia.org/wiki/Python_(Programmiersprach

In [15]:
[e[0].text_content() for e in extractlinks(localfile) if e[0].text_content()]

['Zur Navigation springen',
 'Zur Suche springen',
 'Paradigmen',
 'Guido van Rossum',
 'Version',
 'Typisierung',
 'stark',
 'dynamisch',
 'Duck-Typing',
 'Implementierungen',
 'CPython',
 'Jython',
 'IronPython',
 'PyPy',
 'ABC',
 'Algol 68',
 'Modula-3',
 'Icon',
 'C',
 'C++',
 'Perl',
 'Java',
 'Lisp',
 'Haskell',
 'Ruby',
 'Boo',
 'Groovy',
 'Cython',
 'JavaScript',
 'Swift',
 'Betriebssystem',
 'Plattformunabhängigkeit',
 '[1]',
 'Lizenz',
 'Python-Software-Foundation-Lizenz',
 '[2]',
 'www.python.org',
 'ˈpaɪθn̩',
 'ˈpaɪθɑn',
 'ˈpyːtɔn',
 'interpretierte',
 'höhere Programmiersprache',
 '[3]',
 '[4]',
 'Blöcke',
 'Programmierparadigmen',
 'objektorientierte',
 'aspektorientierte',
 'funktionale',
 'dynamische Typisierung',
 'dynamische Sprachen',
 'Skriptsprache',
 'Python Software Foundation',
 'de facto',
 'CPython',
 '1 Entwicklungsgeschichte',
 '2 Ziele',
 '3 Datentypen und Strukturen',
 '3.1 Sammeltypen',
 '3.2 Objektsystem',
 '4 Syntax',
 '4.1 Strukturierung durch Einrücke

In [16]:
# sqlite-doc-3260000 is availible at:
#       https://sqlite.org/2018/sqlite-doc-3260000.zip
path = 'sqlite-doc-3260000/sqlite-doc-3260000/index.html'
extractlinkurls(path)

FileNotFoundError: [Errno 2] No such file or directory: 'sqlite-doc-3260000/sqlite-doc-3260000/index.html'

In [None]:
from pathlib import Path
basedir = Path('sqlite-doc-3260000')
files = list(basedir.glob('**/*.html'))
files = [Path(file.as_posix()) for file in files][:20]
# basedir.as_posix()
linktuples = []
linkdict = {file.as_posix():extractlinkurls(file) for file in files}

for source, destinations in linkdict.items():
    source = source
    for destination in destinations:
        destination = destination[:destination.find('#')]
        destination = destination[:destination.find('?')]
        if destination and not source == destination:
            linktuple = tuple([source.replace('sqlite-doc-3260000/sqlite-doc-3260000/', ''), destination])
            linktuples.append(linktuple)
            print("%-25s⮕\t\t%-100s" % linktuple)

In [None]:
import random
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

g = nx.DiGraph()
for linktuple in random.choices(linktuples, k=300):
    g.add_edge(linktuple[0], linktuple[1])

fig, ax = plt.subplots(1, 1, figsize=(40, 25));
nx.draw_networkx(g, ax=ax)

<h3>Getting Elements by tag and/or Properties</h3>

In [None]:
response = requests.get(url="https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html")
tree = html.fromstring(response.content)

In [None]:
def iterelements(htmltree):
    for element in tree.iter():
        yield element

def get_classes(htmltree):
    classes = []
    [classes.extend(c) for c in [e.classes for e in i]]
    classcount = Counter(classes)
    return classcount

def get_tags(htmltree):
    tagcount = Counter([e.tag for e in i])
    return tagcount

def colorize(string='', style='native'):
    display(HTML("""
    <style>
    {pygments_css}
    </style>
    """.format(pygments_css=HtmlFormatter(style=style).get_style_defs('.highlight'))))
    return display(HTML(data=highlight(string, PythonLexer(), HtmlFormatter())))

i = list(iterelements(tree))
[e.text_content() for e in i if 'pre' in e.classes]
select_elements_by_tag = [e for e in i if  e.tag in {'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5'}]

selected_elements = []
for e in i:
    cls = set(list(e.classes))
    if 'input_area' in cls and not 'prompt' in cls:
        selected_elements.append(tuple([e.tag, e.text_content()]))
#     elif e.tag in {'p', 'h1', 'h2', 'h3', 'h4', 'h5'}:
#         selected_elements.append(tuple([e.tag, e.text_content()]))
    else:
        continue

selected_elements = [(e[0], e[1].replace('¶', '')) for e in selected_elements]


interactive(colorize, string=[e[1] for e in selected_elements], style=styles)

In [None]:
for code in selected_elements:
    if code[0] == 'div':
        colorize(code[1])

<h2>Storing the Information</h2>

In [None]:
# url="https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html"

import sqlite3


with sqlite3.connect('meetup.sqlite3') as con:
    con.execute("""CREATE TABLE IF NOT EXISTS codesnipps(
    no INT,
    url TEXT,
    links_from TEXT,
    link_to TEXT
    )""")
    cur = con.cursor()
    no = 0
    for t, c in selected_elements:
        cur = con.cursor()
        cur.execute("INSERT INTO codesnipps values(?, ?, ?, ?);", [no, url, t, c])
        no = no + 1
#     tabletuples = cur.execute("SELECT * FROM codesnipps").fetchall()
#     for tabletuple in tabletuples:
#         print("%-5s%-30s%-10s%-100s" % tuple([str(tte)[:100].replace('\n', '') for tte in tabletuple]))

with sqlite3.connect('meetup.sqlite3') as con:
    df = pd.read_sql_query("""select * from codesnipps;""", con=con)
qgrid.show_grid(df)

In [None]:
import sqlite3

with sqlite3.connect('meetup.sqlite3') as con:
    df = pd.read_sql_query("""select * from sqlite_master;""", con=con)
qgrid.show_grid(df)

In [None]:
# url="https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html"

# import sqlite3

# Backup solution using in memory database if something fails using the default db
# with sqlite3.connect(':memory:') as con:
#     con.execute("""CREATE TABLE codesnipps(
#     no INT,
#     url TEXT,
#     links_from TEXT,
#     link_to TEXT
#     )""")
#     cur = con.cursor()
#     no = 0
#     for t, c in selected_elements:
#         cur = con.cursor()
#         cur.execute("INSERT INTO codesnipps values(?, ?, ?, ?);", [no, url, t, c])
#         no = no + 1
#     tabletuples = cur.execute("SELECT * FROM codesnipps").fetchall()
#     for tabletuple in tabletuples:
#         print("%-2d%-30s%-10s%-100s" % tabletuple)


<h2>Tabular Data</h2>

In [None]:
# Getting tabular data
import sqlite3
import pandas as pd

# url = 'https://de.wikipedia.org/wiki/Liste_der_Millionenstdäte'
url = 'https://www.w3schools.com/colors/colors_names.asp'
# url = 'https://sqlite.org/src/mimetype_list'

def gettablelist(url):
    ''' Returns a list of dataframes with all html tables
    found on the webpage'''
    try:
        return pd.read_html(url, header=0, index_col=0)
    except:
        return pd.read_html(url)

tablelist = gettablelist(url)
# tablelist
qgrid.show_grid(tablelist[0])