# ④（リンク先のダウンロード）

## 相対パスを展開する方法

### 実行結果

In [1]:
from urllib.parse import urljoin

base = "http://example.com/html/a.html"

# Resolve each relative reference against the base URL and print the
# resulting absolute URLs, one per line.
print(
    *(
        urljoin(base, relative)
        for relative in (
            "b.html",
            "sub/c.html",
            "../index.html",
            "../img/hoge.png",
            "../css/hoge.css",
        )
    ),
    sep="\n",
)

http://example.com/html/b.html
http://example.com/html/sub/c.html
http://example.com/index.html
http://example.com/img/hoge.png
http://example.com/css/hoge.css


In [2]:
from urllib.parse import urljoin

base = "http://example.com/html/a.html"

# Non-relative references, in order: root-relative path, full absolute URL,
# and scheme-relative ("//host/path") URL. One result is printed per line.
print(
    *(
        urljoin(base, reference)
        for reference in (
            "/hoge.html",
            "http://kujirahand.com/wiki",
            "//uta.pw/shodou",
        )
    ),
    sep="\n",
)

http://example.com/hoge.html
http://kujirahand.com/wiki
http://uta.pw/shodou


## 丸ごとダウンロードするプログラム

### 実行結果

In [3]:
# Pythonのマニュアルを再帰的にダウンロード 
# モジュールの取り込み --- (※1)
from bs4 import BeautifulSoup
from urllib.request import *
from urllib.parse import *
from os import makedirs
import os.path, time, re

# Registry used to tell whether a file has already been processed --- (※2)
# Keys are local save paths; analize_html() stores True under a path before
# parsing that file and returns early for any path already present.
proc_files = {}

# Extract the links contained in an HTML document --- (※3)
def enum_links(html, base):
    """Return absolute URLs for the stylesheets and anchors found in *html*.

    Args:
        html: HTML markup to parse; it is handed to BeautifulSoup unchanged.
        base: URL of the document itself. Relative hrefs are resolved
            against it with urljoin().

    Returns:
        A list of absolute URLs: stylesheet <link> targets first, followed by
        <a href> targets. Duplicates are not removed.
    """
    soup = BeautifulSoup(html, "html.parser")
    links = soup.select("link[rel='stylesheet']")  # CSS
    links += soup.select("a[href]")  # hyperlinks
    result = []
    # Pull out each href and convert it to an absolute URL --- (※4)
    for a in links:
        # The stylesheet selector does not require an href attribute, so
        # indexing attrs['href'] would raise KeyError on such a tag.
        href = a.get("href")
        if href is None:
            continue
        result.append(urljoin(base, href))
    return result

# Download a file and save it locally --- (※5)
def download_file(url):
    """Download *url* into a local tree mirroring the URL and return its path.

    The save path is "./<host><path>"; when it ends in "/" the file is stored
    as "index.html" inside that directory. The query string and fragment of
    the URL do not take part in the save path. A file that already exists
    locally is not fetched again.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The local save path on success (or when the file already existed),
        or None when the download failed.
    """
    o = urlparse(url)
    savepath = "./" + o.netloc + o.path
    if savepath.endswith("/"):  # a directory URL is saved as index.html
        savepath += "index.html"
    savedir = os.path.dirname(savepath)
    # Already downloaded?
    if os.path.exists(savepath):
        return savepath
    # Create the destination directory
    if not os.path.exists(savedir):
        print("mkdir=", savedir)
        makedirs(savedir, exist_ok=True)
    # Download the file --- (※6)
    print("download=", url)
    try:
        urlretrieve(url, savepath)
    except Exception:
        # Best-effort crawl: report the failure and carry on. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit stop the program.
        print("ダウンロード失敗:", url)
        # savepath did not exist before this call, so anything there now is
        # a partial download; remove it so a later run retries instead of
        # treating the truncated file as already downloaded.
        if os.path.isfile(savepath):
            try:
                os.remove(savepath)
            except OSError:
                pass
        return None
    time.sleep(1)  # sleep one second as a courtesy to the server --- (※7)
    return savepath

# Analyze an HTML file and download what it links to --- (※8)
def analize_html(url, root_url):
    """Download *url*, then fetch the pages and assets it links to.

    Links ending in ".html"/".htm" are analyzed recursively; every other
    accepted link is only downloaded. A link is accepted when it starts with
    *root_url*, or when it is a ".css" file regardless of location.

    Side effects: records the local save path in the module-level
    ``proc_files`` dict and prints progress to stdout.

    Args:
        url: URL of the HTML document to download and analyze.
        root_url: URL prefix that bounds the crawl.

    Returns:
        None. Returns early when the download fails or the file was
        already analyzed.
    """
    savepath = download_file(url)
    if savepath is None:
        return
    if savepath in proc_files:
        return  # already analyzed, nothing to do --- (※9)
    proc_files[savepath] = True
    print("analize_html=", url)
    # Extract the links --- (※10)
    # The with-block closes the file promptly. errors="replace" keeps a page
    # that is not valid UTF-8 from aborting the whole crawl.
    with open(savepath, "r", encoding="utf-8", errors="replace") as f:
        html = f.read()
    links = enum_links(html, url)
    for link_url in links:
        # Drop any "#fragment": it names a position inside a page, and left
        # in place it hides the ".html" suffix so the page is never analyzed.
        link_url = urldefrag(link_url)[0]
        # Ignore links that point outside the root path --- (※11)
        # (stylesheets are fetched even when they live elsewhere)
        if not link_url.startswith(root_url):
            if not link_url.endswith(".css"):
                continue
        # Is it HTML?
        if link_url.endswith((".html", ".htm")):
            # Analyze the HTML file recursively
            analize_html(link_url, root_url)
            continue
        # Any other kind of file
        download_file(link_url)

if __name__ == "__main__":
    # Mirror everything under this URL; the start page doubles as the
    # root prefix that bounds the crawl --- (※13)
    url = "https://docs.python.jp/3.6/library/"
    analize_html(url=url, root_url=url)

mkdir= ./docs.python.jp/3.6/library
download= https://docs.python.jp/3.6/library/
analize_html= https://docs.python.jp/3.6/library/
mkdir= ./docs.python.jp/3.6/_static
download= https://docs.python.jp/3.6/_static/pydoctheme.css
download= https://docs.python.jp/3.6/_static/pygments.css
download= https://docs.python.jp/3.6/library/intro.html
analize_html= https://docs.python.jp/3.6/library/intro.html
download= https://docs.python.jp/3.6/library/functions.html
analize_html= https://docs.python.jp/3.6/library/functions.html
download= https://docs.python.jp/3.6/library/constants.html
analize_html= https://docs.python.jp/3.6/library/constants.html
download= https://docs.python.jp/3.6/library/stdtypes.html
analize_html= https://docs.python.jp/3.6/library/stdtypes.html
download= https://docs.python.jp/3.6/library/exceptions.html
analize_html= https://docs.python.jp/3.6/library/exceptions.html
download= https://docs.python.jp/3.6/library/text.html
analize_html= https://docs.python.jp/3.6/librar

analize_html= https://docs.python.jp/3.6/library/lzma.html
download= https://docs.python.jp/3.6/library/zipfile.html
analize_html= https://docs.python.jp/3.6/library/zipfile.html
download= https://docs.python.jp/3.6/library/tarfile.html
analize_html= https://docs.python.jp/3.6/library/tarfile.html
download= https://docs.python.jp/3.6/library/fileformats.html
analize_html= https://docs.python.jp/3.6/library/fileformats.html
download= https://docs.python.jp/3.6/library/csv.html
analize_html= https://docs.python.jp/3.6/library/csv.html
download= https://docs.python.jp/3.6/library/configparser.html
analize_html= https://docs.python.jp/3.6/library/configparser.html
download= https://docs.python.jp/3.6/library/netrc.html
analize_html= https://docs.python.jp/3.6/library/netrc.html
download= https://docs.python.jp/3.6/library/xdrlib.html
analize_html= https://docs.python.jp/3.6/library/xdrlib.html
download= https://docs.python.jp/3.6/library/plistlib.html
analize_html= https://docs.python.jp/3

analize_html= https://docs.python.jp/3.6/library/email.headerregistry.html
download= https://docs.python.jp/3.6/library/email.contentmanager.html
analize_html= https://docs.python.jp/3.6/library/email.contentmanager.html
download= https://docs.python.jp/3.6/library/email.examples.html
analize_html= https://docs.python.jp/3.6/library/email.examples.html
download= https://docs.python.jp/3.6/library/email.compat32-message.html
analize_html= https://docs.python.jp/3.6/library/email.compat32-message.html
download= https://docs.python.jp/3.6/library/email.mime.html
analize_html= https://docs.python.jp/3.6/library/email.mime.html
download= https://docs.python.jp/3.6/library/email.header.html
analize_html= https://docs.python.jp/3.6/library/email.header.html
download= https://docs.python.jp/3.6/library/email.charset.html
analize_html= https://docs.python.jp/3.6/library/email.charset.html
download= https://docs.python.jp/3.6/library/email.encoders.html
analize_html= https://docs.python.jp/3.6/l

analize_html= https://docs.python.jp/3.6/library/chunk.html
download= https://docs.python.jp/3.6/library/colorsys.html
analize_html= https://docs.python.jp/3.6/library/colorsys.html
download= https://docs.python.jp/3.6/library/imghdr.html
analize_html= https://docs.python.jp/3.6/library/imghdr.html
download= https://docs.python.jp/3.6/library/sndhdr.html
analize_html= https://docs.python.jp/3.6/library/sndhdr.html
download= https://docs.python.jp/3.6/library/ossaudiodev.html
analize_html= https://docs.python.jp/3.6/library/ossaudiodev.html
download= https://docs.python.jp/3.6/library/i18n.html
analize_html= https://docs.python.jp/3.6/library/i18n.html
download= https://docs.python.jp/3.6/library/gettext.html
analize_html= https://docs.python.jp/3.6/library/gettext.html
download= https://docs.python.jp/3.6/library/locale.html
analize_html= https://docs.python.jp/3.6/library/locale.html
download= https://docs.python.jp/3.6/library/frameworks.html
analize_html= https://docs.python.jp/3.6/

analize_html= https://docs.python.jp/3.6/library/symtable.html
download= https://docs.python.jp/3.6/library/symbol.html
analize_html= https://docs.python.jp/3.6/library/symbol.html
download= https://docs.python.jp/3.6/library/token.html
analize_html= https://docs.python.jp/3.6/library/token.html
download= https://docs.python.jp/3.6/library/keyword.html
analize_html= https://docs.python.jp/3.6/library/keyword.html
download= https://docs.python.jp/3.6/library/tokenize.html
analize_html= https://docs.python.jp/3.6/library/tokenize.html
download= https://docs.python.jp/3.6/library/tabnanny.html
analize_html= https://docs.python.jp/3.6/library/tabnanny.html
download= https://docs.python.jp/3.6/library/pyclbr.html
analize_html= https://docs.python.jp/3.6/library/pyclbr.html
download= https://docs.python.jp/3.6/library/py_compile.html
analize_html= https://docs.python.jp/3.6/library/py_compile.html
download= https://docs.python.jp/3.6/library/compileall.html
analize_html= https://docs.python.