# BeautifulSoup 网页解析: 基础
##### https://morvanzhou.github.io/static/scraping/basic-structure.html

In [46]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the demo page. The `with` block closes the HTTP response as soon as
# the body has been read instead of leaving the socket to garbage collection.
with urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html") as response:
    # read() returns bytes; decode them as UTF-8 to get a str.
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="https://morvanzhou.github.io/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


In [47]:
# Parse the downloaded page using the lxml parser.
# Beautiful Soup documentation (Chinese edition):
# https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id9
soup = BeautifulSoup(html, 'lxml')

# find(name) returns the first tag with that name, the same element that
# attribute-style access (soup.h1 / soup.p) gives.
first_heading = soup.find('h1')
first_paragraph = soup.find('p')
print(first_heading)
print('\n\n', first_paragraph)

<h1>爬虫测试1</h1>


 <p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>


In [48]:
# Collect the link target of every <a> tag on the page.
# href=True keeps only anchors that actually carry an href attribute; indexing
# link['href'] on an anchor without one would raise KeyError.
all_href = [link['href'] for link in soup.find_all('a', href=True)]

# Iterate over the values directly instead of indexing through range(len(...)).
for href in all_href:
    print(href)

https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/


# BeautifulSoup 网页解析: CSS
##### https://morvanzhou.github.io/static/scraping/list.html

In [49]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# urlopen(...).read() returns raw bytes, so the body always has to be decoded;
# UTF-8 is used here, which keeps the Chinese text on the page intact.
# The `with` block closes the HTTP response once the body has been read.
with urlopen("https://morvanzhou.github.io/static/scraping/list.html") as response:
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>爬虫练习 列表 class | 莫烦 Python</title>
	<style>
	.jan {
		background-color: yellow;
	}
	.feb {
		font-size: 25px;
	}
	.month {
		color: red;
	}
	</style>
</head>

<body>

<h1>列表 爬虫练习</h1>

<p>这是一个在 <a href="https://morvanzhou.github.io/" >莫烦 Python</a> 的 <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/" >爬虫教程</a>
	里无敌简单的网页, 所有的 code 让你一目了然, 清晰无比.</p>

<ul>
	<li class="month">一月</li>
	<ul class="jan">
		<li>一月一号</li>
		<li>一月二号</li>
		<li>一月三号</li>
	</ul>
	<li class="feb month">二月</li>
	<li class="month">三月</li>
	<li class="month">四月</li>
	<li class="month">五月</li>
</ul>

</body>
</html>


In [52]:
soup = BeautifulSoup(html, 'lxml')

# Narrow the search by CSS class: only <li> tags whose class list contains
# "month" are returned.
month = soup.find_all('li', class_='month')
for month_tag in month:
    print(month_tag.get_text())

一月
二月
三月
四月
五月


In [54]:
# First locate the <ul class="jan"> element, then search only inside it so
# that <li> tags elsewhere on the page are not picked up.
jan = soup.find('ul', class_='jan')
d_jan = jan.find_all('li')  # jan acts as the parent of the search
for day_tag in d_jan:
    print(day_tag.get_text())

一月一号
一月二号
一月三号


# BeautifulSoup 网页解析: RegEx
##### https://morvanzhou.github.io/static/scraping/table.html

In [71]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the table demo page. The `with` block closes the HTTP response as
# soon as the body has been read.
with urlopen("https://morvanzhou.github.io/static/scraping/table.html") as response:
    # read() returns bytes; decode them as UTF-8 to get a str.
    html = response.read().decode('utf-8')
# print(html)  # left disabled; uncomment to inspect the raw page source

In [75]:
import re
soup = BeautifulSoup(html, features='lxml')

# Select <img> tags whose src matches the ".jpg" pattern.
# The pattern is written as a raw string so that "\." reaches the regex engine
# as an escaped literal dot; in a plain string literal "\." is an invalid
# escape sequence that newer Python versions warn about.
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
print('Image Link：')
for link in img_links:
    print(link['src'])

# Select <a> tags whose href matches the "https://morvan" prefix pattern.
course_links = soup.find_all('a', {'href': re.compile('https://morvan.*')})
print('\n\nCourse Link：')
for link in course_links:
    print(link['href'])

Image Link：
https://morvanzhou.github.io/static/img/course_cover/tf.jpg
https://morvanzhou.github.io/static/img/course_cover/rl.jpg
https://morvanzhou.github.io/static/img/course_cover/scraping.jpg


Course Link：
https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/
https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/


# 练习： 百度百科
##### https://baike.baidu.com

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random


# Site root; the relative "/item/..." paths kept in `his` are appended to it.
base_url = 'https://baike.baidu.com'

# Crawl history: a list of relative page paths whose last element is the page
# to visit next. It starts with a single percent-encoded entry path.
his = [
    '/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711',
]

ImportError: No module named request

In [77]:
# Visit the most recent entry in the crawl history.
url = base_url + his[-1]

# The `with` block closes the HTTP response once the body has been read.
with urlopen(url) as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')

# Show the page's first <h1> text next to the relative path it came from.
print(soup.find('h1').get_text(), '    url: ', his[-1])

网络爬虫     url:  /item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711


In [78]:
# Candidate links: <a target="_blank"> tags whose href ends with "/item/"
# followed only by groups of "%" plus two characters (percent-encoded names).
item_link_pattern = re.compile("/item/(%.{2})+$")
sub_urls = soup.find_all("a", {"target": "_blank", "href": item_link_pattern})

if not sub_urls:
    # Dead end: drop the current page so the next step resumes from the
    # previous entry in the history.
    his.pop()
else:
    # Pick one candidate at random and make it the next page to visit.
    picked = random.sample(sub_urls, 1)[0]
    his.append(picked['href'])
print(his)

['/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711', '/item/%E7%BD%91%E7%BB%9C%E6%95%B0%E6%8D%AE']


In [80]:
# Random walk over Baidu Baike: start from one entry and follow a randomly
# chosen in-page link for up to 20 steps, backtracking on dead ends.
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"]

# Compiled once outside the loop: hrefs ending with "/item/" followed only by
# groups of "%" plus two characters.
item_link_pattern = re.compile("/item/(%.{2})+$")

for i in range(20):
    if not his:
        # Every page, including the start page, turned out to be a dead end and
        # was popped; his[-1] would raise IndexError, so stop the walk here.
        print(i, 'history is empty, stopping')
        break

    url = base_url + his[-1]

    # The `with` block closes the HTTP response once the body has been read.
    with urlopen(url) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')

    # find() returns None when the page has no <h1>; fall back to a placeholder
    # instead of failing on None.get_text().
    heading = soup.find('h1')
    title = heading.get_text() if heading is not None else '(no title)'
    print(i, title, '    url: ', his[-1])

    # find valid urls
    sub_urls = soup.find_all("a", {"target": "_blank", "href": item_link_pattern})

    if sub_urls:
        # Move forward to one randomly chosen linked entry.
        his.append(random.choice(sub_urls)['href'])
    else:
        # no valid sub link found: step back to the previous page
        his.pop()

0 网络爬虫     url:  /item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711
1 蠕虫     url:  /item/%E8%A0%95%E8%99%AB
2 蜥蜴     url:  /item/%E8%9C%A5%E8%9C%B4
3 海藻     url:  /item/%E6%B5%B7%E8%97%BB
4 海藻酒     url:  /item/%E6%B5%B7%E8%97%BB%E9%85%92
5 海藻     url:  /item/%E6%B5%B7%E8%97%BB
6 多糖类     url:  /item/%E5%A4%9A%E7%B3%96%E7%B1%BB
7 糖肽     url:  /item/%E7%B3%96%E8%82%BD
8 多糖类     url:  /item/%E5%A4%9A%E7%B3%96%E7%B1%BB
9 糖肽     url:  /item/%E7%B3%96%E8%82%BD
10 多糖类     url:  /item/%E5%A4%9A%E7%B3%96%E7%B1%BB
11 相对分子质量     url:  /item/%E5%88%86%E5%AD%90%E9%87%8F
12 高分子化合物     url:  /item/%E8%81%9A%E5%90%88%E7%89%A9
13 线型高分子     url:  /item/%E7%BA%BF%E5%9E%8B%E9%AB%98%E5%88%86%E5%AD%90
14 高分子化合物     url:  /item/%E9%AB%98%E5%88%86%E5%AD%90
15 酚醛树脂     url:  /item/%E9%85%9A%E9%86%9B%E6%A0%91%E8%84%82
16 硬度     url:  /item/%E7%A1%AC%E5%BA%A6
17 压入硬度     url:  /item/%E5%8E%8B%E5%85%A5%E7%A1%AC%E5%BA%A6
18 硬度计     url:  /item/%E7%A1%AC%E5%BA%A6%E8%AE%A1
19 状态     url:  /item/%E7%8A%B6%E6%80%81
