### 1.2 Python的网页操作
#### 1.2.1 用Python打开网页并打印信息

In [1]:
from urllib.request import urlopen

# Download the demo page and keep its source in `html`.
# The response is used as a context manager so the HTTP connection is closed
# as soon as the body has been read (the original left it open).
# The page contains Chinese text, so the raw bytes are decoded as UTF-8.
with urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html") as response:
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="https://morvanzhou.github.io/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


#### 1.2.2 用Python的正则表达式匹配网页信息

In [4]:
import re
# Capture everything between <p> and </p>. re.DOTALL lets '.' match newlines,
# which is required here because the paragraph spans several lines.
paragraph_pattern = re.compile(r'<p>(.*?)</p>', flags=re.DOTALL)
res = paragraph_pattern.findall(html)
print('\nPage paragraph is ', res[0])


Page paragraph is  
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	


In [6]:
# Collect the quoted value of every href="..." attribute in the page source
# (non-greedy, so each match stops at the first closing quote).
href_pattern = re.compile(r'href="(.*?)"')
res = href_pattern.findall(html)
print('\nAll links: ')
for href in res:
    print(href)


All links: 
https://morvanzhou.github.io/static/img/description/tab_icon.png
https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/


### 2.2 BeautifulSoup应用实例

In [7]:
from bs4 import BeautifulSoup
from urllib.request import urlopen


In [8]:
from urllib.request import urlopen

# Download the demo page and keep its source in `html`.
# The response is used as a context manager so the HTTP connection is closed
# as soon as the body has been read (the original left it open).
# The page contains Chinese text, so the raw bytes are decoded as UTF-8.
with urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html") as response:
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="https://morvanzhou.github.io/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
		<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


In [21]:
# Parse the downloaded source with the lxml parser.
soup = BeautifulSoup(html, features='lxml')

# Tags can be read directly as attributes of the soup object.
print(soup.h1)
print(soup.p)

# All <a> tags, shown first as tag objects ...
anchor_tags = soup.find_all('a')
print(anchor_tags)

# ... and then reduced to just the value of each tag's href attribute.
all_href = [anchor['href'] for anchor in anchor_tags]
print(all_href)

<h1>爬虫测试1</h1>
<p>
		这是一个在 <a href="https://morvanzhou.github.io/">莫烦Python</a>
<a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>
[<a href="https://morvanzhou.github.io/">莫烦Python</a>, <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/">爬虫教程</a>]
['https://morvanzhou.github.io/', 'https://morvanzhou.github.io/tutorials/data-manipulation/scraping/']


### 3.1 CSS Class

In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Download the list/class demo page. Using the response as a context manager
# closes the HTTP connection once the body has been read; the bytes are
# decoded as UTF-8 before printing.
with urlopen('https://morvanzhou.github.io/static/scraping/list.html') as response:
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>爬虫练习 列表 class | 莫烦 Python</title>
	<style>
	.jan {
		background-color: yellow;
	}
	.feb {
		font-size: 25px;
	}
	.month {
		color: red;
	}
	</style>
</head>

<body>

<h1>列表 爬虫练习</h1>

<p>这是一个在 <a href="https://morvanzhou.github.io/" >莫烦 Python</a> 的 <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/" >爬虫教程</a>
	里无敌简单的网页, 所有的 code 让你一目了然, 清晰无比.</p>

<ul>
	<li class="month">一月</li>
	<ul class="jan">
		<li>一月一号</li>
		<li>一月二号</li>
		<li>一月三号</li>
	</ul>
	<li class="feb month">二月</li>
	<li class="month">三月</li>
	<li class="month">四月</li>
	<li class="month">五月</li>
</ul>

</body>
</html>


In [5]:
# Parse the list page, then select <li> tags by CSS class and print their text.
soup = BeautifulSoup(html, features='lxml')
month = soup.find_all('li', attrs={'class': 'month'})
for month_item in month:
    print(month_item.get_text())

一月
二月
三月
四月
五月


In [6]:
# Narrow the search to the <ul class="jan"> list, then print the text of the
# <li> tags found inside it.
jan = soup.find('ul', attrs={"class": "jan"})
d_jan = jan.find_all('li')
for day_item in d_jan:
    print(day_item.get_text())

一月一号
一月二号
一月三号


### 3.2 正则表达式匹配网页

In [8]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Download the table demo page. Using the response as a context manager
# closes the HTTP connection once the body has been read; the bytes are
# decoded as UTF-8 before printing.
with urlopen("https://morvanzhou.github.io/static/scraping/table.html") as response:
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>爬虫练习 表格 table | 莫烦 Python</title>

	<style>
	img {
		width: 250px;
	}
	table{
		width:50%;
	}
	td{
		margin:10px;
		padding:15px;
	}
	</style>
</head>
<body>

<h1>表格 爬虫练习</h1>

<p>这是一个在 <a href="https://morvanzhou.github.io/" >莫烦 Python</a> 的 <a href="https://morvanzhou.github.io/tutorials/data-manipulation/scraping/" >爬虫教程</a>
	里无敌简单的网页, 所有的 code 让你一目了然, 清晰无比.</p>

<br>
<table id="course-list">
	<tr>
		<th>
			分类
		</th><th>
			名字
		</th><th>
			时长
		</th><th>
			预览
		</th>
	</tr>

	<tr id="course1" class="ml">
		<td>
			机器学习
		</td><td>
			<a href="https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/">
				Tensorflow 神经网络</a>
		</td><td>
			2:00
		</td><td>
			<img src="https://morvanzhou.github.io/static/img/course_cover/tf.jpg">
		</td>
	</tr>

	<tr id="course2" class="ml">
		<td>
			机器学习
		</td><td>
			<a href="https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/">
			

In [12]:
# Find the links of all images: <img> tags whose src matches the .jpg pattern.
import re
soup = BeautifulSoup(html, features='lxml')
# The pattern is a raw string: in a normal string literal '\.' is an invalid
# escape sequence (reported as a warning by recent Python versions), while
# r'...' hands the backslash to the regex engine unchanged.
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
for link in img_links:
    print(link['src'])

https://morvanzhou.github.io/static/img/course_cover/tf.jpg
https://morvanzhou.github.io/static/img/course_cover/rl.jpg
https://morvanzhou.github.io/static/img/course_cover/scraping.jpg


### iv. 用爬虫爬取百度百科

In [15]:
# NOTE: this cell originally had two snippets fused together. The Baidu Baike
# crawler had been pasted into the middle of the course-link loop, so that
# loop's body was only an import statement and print(link['href']) ended up
# inside the crawler loop. The two parts are separated below.
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random

# --- Part 1: course links ---------------------------------------------------
# Find all course links: <a> tags whose href matches the pattern
# https://morvan.* (uses the `soup` parsed from the table page above).
course_links = soup.find_all('a', {'href':re.compile('https://morvan.*')})
for link in course_links:
    print(link['href'])

# --- Part 2: random walk over Baidu Baike -------------------------------------
# `his` is the history of visited paths; each step loads the most recent one,
# prints the page's <h1> text and URL, then follows one randomly chosen
# sub-link. Note that this loop overwrites the notebook-level `html` and `soup`.
base_url = "https://baike.baidu.com"
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"]
for i in range(20):
    url = base_url + his[-1]
    # Context manager closes the HTTP connection once the body has been read.
    with urlopen(url) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')
    print(i, soup.find('h1').get_text(), ' url: ' + url)
    # Candidate links: <a target="_blank"> whose href is /item/ followed only
    # by percent-encoded characters.
    sub_urls = soup.find_all("a", {"target":"_blank", "href":re.compile("^/item/(%.{2})+$")})
    if len(sub_urls)!=0:
        his.append(random.choice(sub_urls)['href'])
    else:
        # Dead end: step back to the previous page.
        his.pop()
        if not his:
            # Nothing left to go back to; stop instead of failing on his[-1].
            break

https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/
https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
