本章节主要介绍获取网上数据相关的模块和内容

In [6]:
# -*- coding: utf-8 -*-
import urllib.request
from bs4 import BeautifulSoup
import sys
import re
from tqdm import tqdm
import pandas as pd

# BeautifulSoup

https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

In [2]:
# Sample HTML document that the BeautifulSoup examples below parse.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
from bs4 import BeautifulSoup

# Name the parser explicitly: when it is omitted, bs4 picks whichever parser
# happens to be installed (and warns), so the tree can differ between machines.
soup = BeautifulSoup(html_doc, 'html.parser')
# Print the parse tree with standard indentation.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [4]:
# Tag
tag = soup.body
print (type(tag))
# Name: every tag has a name, available through .name
print (tag.name)

<class 'bs4.element.Tag'>
body


### Attributes:一个tag可能有很多个属性

In [5]:
# Tag attributes are read the same way as dictionary keys; the first <p>
# inside <body> carries class "title".
tag_p = tag.p
print(tag_p['class'])

# The whole attribute dictionary is also available as .attrs
print(tag_p.attrs)

# Multi-valued attributes: the most common one is class (a tag may have
# several CSS classes). Beautiful Soup returns such attributes as a list.
# The parser is named explicitly so the result does not depend on which
# parsers are installed.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
print(css_soup.p['class'])

['title']
{'class': ['title']}
['body', 'strikeout']


In [6]:
# class is a multi-valued attribute and is split into a list, whereas id is
# single-valued and stays one string even when it contains a space.
# The parser is named explicitly so the result does not depend on which
# parsers are installed.
id_soup = BeautifulSoup('<p  class="my id">ss</p>', 'html.parser')
id_soup_1 = BeautifulSoup('<p  id="my id">ss</p>', 'html.parser')
print (id_soup.p.name,'\n',
       id_soup.p.attrs,'\n',
       id_soup.p['class'],'\n',
       id_soup.p.string,'\n',
       id_soup_1.p.attrs)

p 
 {'class': ['my', 'id']} 
 ['my', 'id'] 
 ss 
 {'id': 'my id'}


In [7]:
# NavigableString: the text held inside a tag.
# The parser is named explicitly so the result does not depend on which
# parsers are installed.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag_e = soup.b
print(tag_e.string)
print(type(tag_e.string))

Extremely bold
<class 'bs4.element.NavigableString'>


一个 NavigableString 字符串与Python中的Unicode字符串相同,并且还支持包含在 遍历文档树 和 搜索文档树 中的一些特性. 

In [8]:
# A tag's string cannot be edited in place, but it can be swapped for another string with replace_with()
tag_e.string.replace_with('today is not a good day')
tag_e

<b class="boldest">today is not a good day</b>

**注释及特殊字符串**

Tag , NavigableString , BeautifulSoup 几乎覆盖了html和xml中的所有内容,但是还有一些特殊对象.容易让人担心的内容是文档的注释部分

In [9]:
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
# The parser is named explicitly so the result does not depend on which
# parsers are installed.
soup = BeautifulSoup(markup, 'html.parser')
# The only child of <b> is an HTML comment, so .string yields a Comment object.
comment = soup.b.string
print(type(comment))
print(comment)
print(soup.b.prettify())

<class 'bs4.element.Comment'>
Hey, buddy. Want to buy a used parser?
<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


In [10]:
# The parser is named explicitly so the result does not depend on which
# parsers are installed.
soup = BeautifulSoup(html_doc, 'html.parser')
# Tag names: the simplest way to navigate the tree is to ask for a tag by
# name, e.g. soup.head for the <head> tag.
print(soup.head)

print(soup.title)

# The same attribute access can be chained from tag to tag.
print(soup.body.b)
print(soup.b)

<head><title>The Dormouse's story</title></head>
<title>The Dormouse's story</title>
<b>The Dormouse's story</b>
<b>The Dormouse's story</b>


从结果来看，可以直接从根节点用点号访问嵌套的 tag（返回的是文档中第一个同名 tag）

In [11]:
# Attribute-style access only returns the first tag with that name
print (soup.a)

# To get every <a> tag, or more than the first match for a name, use the "Searching the tree" methods such as find_all()
print (soup.find_all('a'))

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


### contents 和 children

* tag的 `.contents` 属性可以将tag的子节点以列表的方式输出;
* 字符串没有 `.contents` 属性,因为字符串没有子节点;

In [12]:
head_tag = soup.head
print (head_tag)

# .contents lists a tag's direct children; here <head> holds just the <title> tag
title_tag = head_tag.contents[0]
print (title_tag)
# <title> in turn contains a single string child
print( title_tag.contents)
print ('--------------------')
# <body>'s children include the whitespace strings between the <p> tags
for i in soup.body.contents:
    print(i)

<head><title>The Dormouse's story</title></head>
<title>The Dormouse's story</title>
["The Dormouse's story"]
--------------------


<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




* 通过tag的 `.children` 生成器,可以对tag的子节点进行循环

In [25]:
# Loop over the direct children of <body> through the .children iterator
for child in soup.body.children:
    print (child)



<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




# 获取网络数据的自定义类

In [3]:
class web_data(object):
    """Request a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
    """

    def __init__(self,url):
        self.url = url

    def get_soup(self):
        """Download ``self.url`` and return it parsed with ``html.parser``.

        The raw bytes are handed to BeautifulSoup with
        ``from_encoding='utf-8'``.

        Returns:
            The parsed ``BeautifulSoup`` document.
        """
        # Close the HTTP response as soon as the body is read instead of
        # leaving the connection open until garbage collection.
        with urllib.request.urlopen(self.url) as response:
            html = response.read()
        return BeautifulSoup(html, "html.parser", from_encoding='utf-8')

# 获取百度地图相关API的类

In [3]:
import requests
import json

class baiduLbsAPI(object):
    """Bundle the Baidu Maps web-service calls used in this notebook.

    Args:
        basic_dict: Settings dictionary. Recognised keys are ``'ak'`` (API
            key) and ``'output_type'`` (response format, default ``'json'``).
    """

    def __init__(self,basic_dict):
        # NOTE(review): an API key is hard-coded as the fallback value. It is
        # kept so existing callers keep working, but it should come from the
        # caller or from configuration rather than from source code.
        self.ak = basic_dict.get('ak','joEZrGxLI7hH0Bggf7rVBUPwB9hgooyx')
        self.outPut_type = basic_dict.get('output_type','json')
        self.getAdress_api = 'http://api.map.baidu.com/geocoder/v2/?output={0}&ak={1}'.format(self.outPut_type,self.ak)

    def getGps(self,address, timeout=10):
        """Geocode a structured address into coordinates.

        Service documentation:
        http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-geocoding

        Args:
            address: Structured address (province/city/district/street/number).
            timeout: Seconds to wait for the HTTP response before giving up.

        Returns:
            ``(lat, lng)`` on success, or ``None`` when the response carries
            no usable location (non-JSON output type, non-zero ``status``,
            or a missing ``result``/``location`` entry).
        """
        api = '{0}&address={1}'.format(self.getAdress_api,address)
        if self.outPut_type == 'json':
            # A timeout keeps a stalled connection from hanging the caller.
            info_dict = requests.get(api, timeout=timeout).json()
        else:
            info_dict = {}
            print('please add this part by yourself.')

        print(api,'\n',info_dict)

        # The previous ``is not None`` check never triggered, so an empty or
        # error response raised KeyError; validate the payload explicitly.
        if not isinstance(info_dict, dict) or info_dict.get('status') != 0:
            return None
        result = info_dict.get('result')
        if not isinstance(result, dict) or 'location' not in result:
            return None
        location = result['location']
        return location['lat'], location['lng']

In [4]:
# Example: geocode a street address with the class above
# NOTE(review): the API key is written in plain text here — consider loading it from configuration instead
basic_dict = {'ak':'joEZrGxLI7hH0Bggf7rVBUPwB9hgooyx'}
bd = baiduLbsAPI(basic_dict=basic_dict)
bd.getGps(address='上海市杨浦区吉浦路3号')

http://api.map.baidu.com/geocoder/v2/?output=json&ak=joEZrGxLI7hH0Bggf7rVBUPwB9hgooyx&address=上海市杨浦区吉浦路3号 
 {'status': 0, 'result': {'location': {'lng': 121.49789529042646, 'lat': 31.30492357706552}, 'precise': 1, 'confidence': 80, 'comprehension': 100, 'level': '门址'}}


(31.30492357706552, 121.49789529042646)

# 案例

## 爬取 Google 全球办公地点

https://about.google/intl/en/locations/?region=north-america

In [4]:
from collections import defaultdict

def get_officeAddress_of_region(region):
    """Scrape Google's office listing for one region into a DataFrame.

    Args:
        region: Region slug used both in the page URL and in each list
            item's ``data-ng-show`` attribute, e.g. ``'north-america'``.

    Returns:
        pandas.DataFrame with the columns ``office``, ``address``, ``phone``,
        ``lon``, ``lat`` and ``region``, one row per office. When no office
        matches, the frame is empty but still has those columns.
    """
    columns = ['office','address','phone','lon','lat','region']
    records = {name: [] for name in columns}

    # Send the whole slug; indexing region[0] passed only its first character.
    web = web_data(url='https://about.google/intl/en/locations/?region={}'.format(region))
    soup = web.get_soup()

    container = soup.body.main.section.div.find('div',attrs={'class':"offices-list-container" })
    # Offices of the requested region are the <li> items carrying this expression.
    show_expr = "'{}' === locationsCtrl.selectedRegion".format(region)

    for tag in container.find_all('ul'):
        for sub_tag in tag.find_all('li',attrs={'data-ng-show': show_expr}):
            office = sub_tag.h2.string.strip()
            address = sub_tag.find('div',attrs={'class':"office-address", 'itemprop':"address"}).string.strip().replace('\n',' ')

            # The phone number is optional on the page.
            phone_tag = sub_tag.find('div',attrs={'class':"office-phone-number"})
            phone = phone_tag.span.string.strip() if phone_tag else None

            # The last path segment of the directions link is read as
            # "lat,lon", optionally followed by a query string.
            directions_href = sub_tag.find('div',attrs={'class':"directions"}).a.attrs['href']
            lat,lon = (float(part) for part in directions_href.split('/')[-1].split('?')[0].split(','))

            # Explicit mapping instead of eval() on local variable names.
            row = {'office': office, 'address': address, 'phone': phone,
                   'lon': lon, 'lat': lat, 'region': region}
            for name in columns:
                records[name].append(row[name])

    # Build the frame once, after the loops: this avoids rebuilding it for
    # every office and the UnboundLocalError when nothing matched.
    return pd.DataFrame(records)

def get_addressOfGoogle():
    """Gather the office tables of every region into one DataFrame.

    Returns:
        The per-region frames from ``get_officeAddress_of_region`` stacked
        row-wise, with a fresh 0..n-1 index.
    """
    region_names = [
        'north-america',
        'latin-america',
        'europe',
        'asia-pacific',
        'africa-middle-east',
    ]
    frames = [get_officeAddress_of_region(region=name) for name in region_names]
    stacked = pd.concat(frames,axis=0)
    return stacked.reset_index(drop=True)

In [7]:
# Scrape every region and write the combined table to addressOfGoogle.csv without the index column
get_addressOfGoogle().to_csv('addressOfGoogle.csv',index=False)