# 做出上海疫情热点图

## 数据来源 丁香园每日数据

In [3]:
import requests

*requests包简单演示*

In [None]:
response = requests.get('https://www.bilibili.com/')

## 显示中文的html
## 方法一
## 更改.enconding，后用.text
## print(response.text) 获取响应字符串
## print(response.encoding)
## 若非utf-8，则需改为utf-8
response.encoding = 'utf-8'
## print(response.encoding)

## 方法二
## 通过.content，获取响应的二进制数据，指定decode解码参数中的编码种类
## print(response.content.decode(encoding='utf8'))

print(response.text == response.content.decode())

In [None]:
# 案例：获取丁香园网站中防疫数据
response = requests.get('https://ncov.dxy.cn/ncovh5/view/pneumonia')
print(response.encoding)
print(response.content.decode(encoding = 'utf8'))

*通过 Beautiful Soup 提取数据*

In [34]:
from bs4 import BeautifulSoup

In [38]:
## 创建BeautifulSoup对象
soup = BeautifulSoup('<html>data</html>','lxml')
# bs会自动补全html格式所需的各项标签
print(soup)

<html><body><p>data</p></body></html>


![jupyter](./正则表达式常见表达方法.png)

*正则表达式常见语法*

In [45]:
import re

 <img src = './正则表达式常见表达方法.png' width = 640 height = 1280>

![jupyter](./正则表达式常见表达方法.png)

In [59]:
## 普通字符匹配
rs = re.findall('abc','abc')
print(rs)
## .号匹配
rs = re.findall('a.c','a!c')
print(rs)
## .号转义后匹配
rs = re.findall('a\.c','abc')
print(rs)
## []应用
rs = re.findall('a[bc]d','abd')
print(rs)

['abc']
['a!c']
[]
['abd']


*正则表达式常见语法*

![jupyter](./正则表达式常见语法.png)

In [73]:
## 预定义的字符集
rs = re.findall('\d\d\d','123')
print(rs)
rs = re.findall('\w','1abc你好')
print(rs)
## 数量词应用
rs = re.findall('a\d*','a123')
print(rs)
rs = re.findall('\d+','123')
print(rs)
rs = re.findall('a\d?','a123')
print(rs)
rs = re.findall('a\d{2}','a123')
print(rs)

['123']
['1', 'a', 'b', 'c', '你', '好']
['a123']
['123']
['a1']
['a12']


*re.findall()的应用*

In [80]:
## 1. findall方法，返回匹配的结果列表
rs = re.findall('\d+','tangsuqi123sqsq234')
print(rs)

['123', '234']


In [84]:
## 2.findall方法中，flag参数的作用
rs = re.findall('a.bc','a\nbc', re.DOTALL)
print(rs)
rs = re.findall('a.bc','a\nbc',re.S)
print(rs)

['a\nbc']
['a\nbc']


In [92]:
## 3.findall方法中分组的使用
rs = re.findall('a.+bc','a\nbc',re.DOTALL)
print(rs)
rs = re.findall('a(.+)bc','a\nbc',re.DOTALL)
print(rs)

['a\nbc']
['\n']


*正则表达式中的r原串的使用*

In [96]:
## 1.在不使用r原串的时候，遇到转义符怎么做
rs = re.findall('a\nbc','a\nbc')
print(rs)
rs = re.findall('a\\\\nbc','a\\nbc')
print(rs)

['a\nbc']
['a\\nbc']


In [102]:
## 2.r原串在正则中可以消除转义符带来的影响
rs = re.findall(r'a\nbc','a\nbc')
print(rs)
## 扩展：可以解决正则的时候，不符合PEP8规范的问题
rs = re.findall(r'\d','a123')
print(rs)

['a\nbc']
['1', '2', '3']


*Json模块*

* JSON与python的对应关系

![jupyter](./JSON与Python的转化关系图.png)

* 将JSON转化为Python

In [104]:
import json

In [112]:
## 1.把JSON字符串，转换为Python的数据
json_str = '''[{"ProvinceName":"America","currentConfirmedCount":1179841,"confirmedCount":1643499},
                {"ProvinceName":"Britain","currentConfirmedCount":222227,"confirmedCount":259559}]'''
## 转换为Python数据
rs = json.loads(json_str)
print(rs)
print(type(rs))
print(type(rs[0]))
## 2.把JSON格式文件，转化为Python类型的数据
## 构建指向该文件的文件对象
with open('./test.json') as fp:
    ##加载该文件对象，转换为Python类型的数据
    python_list = json.load(fp)
    print(python_list)
    print(type(python_list))
    print(type(python_list[0]))

[{'ProvinceName': 'America', 'currentConfirmedCount': 1179841, 'confirmedCount': 1643499}, {'ProvinceName': 'Britain', 'currentConfirmedCount': 222227, 'confirmedCount': 259559}]
<class 'list'>
<class 'dict'>
[{'ProvinceName': 'America', 'currentConfirmedCount': 1179841, 'confirmedCount': 1643499}, {'ProvinceName': 'Britain', 'currentConfirmedCount': 222227, 'confirmedCount': 259559}]
<class 'list'>
<class 'dict'>


* 将Python转换为JSON

In [118]:
## 1.把Python转换为JSON字符串，ascii码指定为False，则可以显示中文
json_str = json.dumps(rs,ensure_ascii=False) 
print(json_str)
## 2.把Python以JSON格式存储到文件中
## 构建要写入的文件对象
with open('./test1.json', 'w') as fp:
    ##把Python以JSON格式存储到文件中,ascii码指定为False，则可以显示中文
    json.dump(rs, fp)

[{"ProvinceName": "America", "currentConfirmedCount": 1179841, "confirmedCount": 1643499}, {"ProvinceName": "Britain", "currentConfirmedCount": 222227, "confirmedCount": 259559}]


#### 案例解析疫情 json字符串

In [1]:
import requests
import re
from bs4 import BeautifulSoup

In [5]:
## 发送请求，获取疫情首页内容
response = requests.get('https://ncov.dxy.cn/ncovh5/view/pneumonia')
home_page = response.content.decode()
## print(home_page)

In [31]:
## 使用BeautifulSoup获取疫情数据
soup = BeautifulSoup(home_page)
## print(soup)
## print(type(soup))
script = soup.find('script', id = 'getListByCountryTypeService2true')
print(type(script))
text = script.string
print(text)

<class 'bs4.element.Tag'>
try { window.getListByCountryTypeService2true = [{"id":14860442,"createTime":1648613310000,"modifyTime":1648613310000,"tags":"","countryType":2,"continents":"欧洲","provinceId":"5","provinceName":"法国","provinceShortName":"","cityName":"","currentConfirmedCount":24602062,"confirmedCount":25111975,"confirmedCountRank":4,"suspectedCount":0,"curedCount":368023,"deadCount":141890,"deadCountRank":10,"deadRate":"0.56","deadRateRank":162,"comment":"","sort":0,"operator":"pangzhicheng","locationId":961002,"countryShortCode":"FRA","countryFullName":"France","statisticsData":"https://file1.dxycdn.com/2020/0315/929/3402160538577857318-135.json","incrVo":{"currentConfirmedIncr":29433,"confirmedIncr":29582,"curedIncr":0,"deadIncr":149},"showRank":true,"yesterdayConfirmedCount":2147383647,"yesterdayLocalConfirmedCount":2147383647,"yesterdayOtherConfirmedCount":2147383647,"highDanger":"","midDanger":"","highInDesc":"","lowInDesc":"","outDesc":""},{"id":14860451,"createTime":164

In [32]:
## 使用正则表达式，提取json字符串
json_str = re.findall(r'\[.+\]', text)[0]
print(json_str)

[{"id":14860442,"createTime":1648613310000,"modifyTime":1648613310000,"tags":"","countryType":2,"continents":"欧洲","provinceId":"5","provinceName":"法国","provinceShortName":"","cityName":"","currentConfirmedCount":24602062,"confirmedCount":25111975,"confirmedCountRank":4,"suspectedCount":0,"curedCount":368023,"deadCount":141890,"deadCountRank":10,"deadRate":"0.56","deadRateRank":162,"comment":"","sort":0,"operator":"pangzhicheng","locationId":961002,"countryShortCode":"FRA","countryFullName":"France","statisticsData":"https://file1.dxycdn.com/2020/0315/929/3402160538577857318-135.json","incrVo":{"currentConfirmedIncr":29433,"confirmedIncr":29582,"curedIncr":0,"deadIncr":149},"showRank":true,"yesterdayConfirmedCount":2147383647,"yesterdayLocalConfirmedCount":2147383647,"yesterdayOtherConfirmedCount":2147383647,"highDanger":"","midDanger":"","highInDesc":"","lowInDesc":"","outDesc":""},{"id":14860451,"createTime":1648613310000,"modifyTime":1648613310000,"tags":"","countryType":2,"continent

In [34]:
import json
## 把json字符串转换为python类型的数据、
last_day_covid = json.loads(json_str)
print(last_day_covid)

[{'id': 14860442, 'createTime': 1648613310000, 'modifyTime': 1648613310000, 'tags': '', 'countryType': 2, 'continents': '欧洲', 'provinceId': '5', 'provinceName': '法国', 'provinceShortName': '', 'cityName': '', 'currentConfirmedCount': 24602062, 'confirmedCount': 25111975, 'confirmedCountRank': 4, 'suspectedCount': 0, 'curedCount': 368023, 'deadCount': 141890, 'deadCountRank': 10, 'deadRate': '0.56', 'deadRateRank': 162, 'comment': '', 'sort': 0, 'operator': 'pangzhicheng', 'locationId': 961002, 'countryShortCode': 'FRA', 'countryFullName': 'France', 'statisticsData': 'https://file1.dxycdn.com/2020/0315/929/3402160538577857318-135.json', 'incrVo': {'currentConfirmedIncr': 29433, 'confirmedIncr': 29582, 'curedIncr': 0, 'deadIncr': 149}, 'showRank': True, 'yesterdayConfirmedCount': 2147383647, 'yesterdayLocalConfirmedCount': 2147383647, 'yesterdayOtherConfirmedCount': 2147383647, 'highDanger': '', 'midDanger': '', 'highInDesc': '', 'lowInDesc': '', 'outDesc': ''}, {'id': 14860451, 'createTi

### 采集最近一日世界各国疫情数据

In [36]:
respond = requests.get('https://ncov.dxy.cn/ncovh5/view/pneumonia')
home_page =respond.content.decode()

soup = BeautifulSoup(home_page, 'lxml')
script  = soup.find(id = 'getListByCountryTypeService2true')
text = script.string
print(text)

try { window.getListByCountryTypeService2true = [{"id":14860442,"createTime":1648613310000,"modifyTime":1648613310000,"tags":"","countryType":2,"continents":"欧洲","provinceId":"5","provinceName":"法国","provinceShortName":"","cityName":"","currentConfirmedCount":24602062,"confirmedCount":25111975,"confirmedCountRank":4,"suspectedCount":0,"curedCount":368023,"deadCount":141890,"deadCountRank":10,"deadRate":"0.56","deadRateRank":162,"comment":"","sort":0,"operator":"pangzhicheng","locationId":961002,"countryShortCode":"FRA","countryFullName":"France","statisticsData":"https://file1.dxycdn.com/2020/0315/929/3402160538577857318-135.json","incrVo":{"currentConfirmedIncr":29433,"confirmedIncr":29582,"curedIncr":0,"deadIncr":149},"showRank":true,"yesterdayConfirmedCount":2147383647,"yesterdayLocalConfirmedCount":2147383647,"yesterdayOtherConfirmedCount":2147383647,"highDanger":"","midDanger":"","highInDesc":"","lowInDesc":"","outDesc":""},{"id":14860451,"createTime":1648613310000,"modifyTime":16

In [49]:
json_str = re.findall(r'\[.+\]',text)
print(json_str)
last_day_covid = json.loads(json_str)
print(last_day_covid)

['[{"id":14860442,"createTime":1648613310000,"modifyTime":1648613310000,"tags":"","countryType":2,"continents":"欧洲","provinceId":"5","provinceName":"法国","provinceShortName":"","cityName":"","currentConfirmedCount":24602062,"confirmedCount":25111975,"confirmedCountRank":4,"suspectedCount":0,"curedCount":368023,"deadCount":141890,"deadCountRank":10,"deadRate":"0.56","deadRateRank":162,"comment":"","sort":0,"operator":"pangzhicheng","locationId":961002,"countryShortCode":"FRA","countryFullName":"France","statisticsData":"https://file1.dxycdn.com/2020/0315/929/3402160538577857318-135.json","incrVo":{"currentConfirmedIncr":29433,"confirmedIncr":29582,"curedIncr":0,"deadIncr":149},"showRank":true,"yesterdayConfirmedCount":2147383647,"yesterdayLocalConfirmedCount":2147383647,"yesterdayOtherConfirmedCount":2147383647,"highDanger":"","midDanger":"","highInDesc":"","lowInDesc":"","outDesc":""},{"id":14860451,"createTime":1648613310000,"modifyTime":1648613310000,"tags":"","countryType":2,"contine

TypeError: the JSON object must be str, bytes or bytearray, not list

In [56]:
## 将python类型数据转换为json文件保存
with open('./test2.json','w',encoding = 'utf8') as fp:
    json.dump(last_day_covid, fp , ensure_ascii=False)

### 采集各国昨日数据

In [2]:
import re
from bs4 import BeautifulSoup
import json

In [84]:
class CovidSpider(object):
    def __init__(self):
        ## 定义home_url
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url):
        ## 通过url得到网站响应并解码
        response = requests.get(url)
        return response.content.decode()

    def parse_home_page(self, home_page):
        """
        从疫情首页提取数据
        """
        soup = BeautifulSoup(home_page, 'lxml')
        ## Soup找到各国昨日数据
        script = soup.find(id = 'getListByCountryTypeService2true')
        text = script.string
        json_str = re.findall(r'\[.+\]', text)[0]
        ## 将json格式转换为python格式
        data = json.loads(json_str)
        return data

    def save(self,data,path):
        ## 将Python格式转换为json文件
        with open(path, 'w', encoding='utf8') as fp:
            json.dump(data, fp, ensure_ascii=False)

    def crawl_last_day_covid(self):
        ## 读取该网站主页数据并解码
        home_page = self.get_content_from_url(self.home_url)
        ## 主页所需各国昨日数据转为python形式并以data变量输出
        last_day_covid = self.parse_home_page(home_page)
        ##将data变量转换为json文件并保存
        self.save(last_day_covid, './last_day_covid.json')
    def run(self):
        ## 定义函数运行方式
        self.crawl_last_day_covid()

if __name__ == '__main__':
    spider = CovidSpider()
    spider.run()

### 采集各国1月23日以来的疫情数据

In [3]:
from tqdm import tqdm

In [4]:
class CovidSpider(object):
    def __init__(self):
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url):
        response = requests.get(url)
        return response.content.decode()

    def parse_home_page(self, home_page):
        """
        从疫情首页提取数据
        """
        soup = BeautifulSoup(home_page, 'lxml')
        script = soup.find(id = 'getListByCountryTypeService2true')
        text = script.string
        json_str = re.findall(r'\[.+\]', text)[0]
        data = json.loads(json_str)
        return data

    def save(self,data,path):
        with open(path, 'w', encoding='utf8') as fp:
            json.dump(data, fp, ensure_ascii=False)

    def crawl_last_day_covid(self):
        home_page = self.get_content_from_url(self.home_url)
        last_day_covid = self.parse_home_page(home_page)
        self.save(last_day_covid, './last_day_covid.json')
        
    def crawl_covid_13(self):
        """
        采集从1月23号以来的各国疫情数据
        """
        with open('./last_day_covid.json',encoding='utf8') as fp:
            last_day_covid = json.load(fp)
        covid = []
        for country in tqdm(last_day_covid, '采集1月23号以来各国疫情信息'):
            statistics_data_url = country['statisticsData']
            statistics_data_json_str = self.get_content_from_url(statistics_data_url)
            statistics_data = json.loads(statistics_data_json_str)['data']
            for one_day in statistics_data:
                one_day['proviceName'] = country['provinceName']
                one_day['countryShortCode'] = country['countryShortCode']
                
            covid.extend(statistics_data)
    def run(self):
    ## 定义函数运行方式
       # self.crawl_last_day_covid()
        self.crawl_covid_13()
if __name__ == '__main__':
    spider = CovidSpider()
    spider.run()

采集1月23号以来各国疫情信息: 100%|███████████████████████████████████████████████████| 215/215 [00:25<00:00,  8.41it/s]


### 采集最近一日全国各省疫情数据

In [95]:
class CovidSpider(object):
    def __init__(self):
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url):
        response = requests.get(url)
        return response.content.decode()

    def parse_home_page(self, home_page, tag_id):
        """
        从疫情首页提取数据
        """
        soup = BeautifulSoup(home_page, 'lxml')
        script = soup.find(id = tag_id)
        text = script.string
        json_str = re.findall(r'\[.+\]', text)[0]
        data = json.loads(json_str)
        return data

    def save(self,data,path):
        with open(path, 'w', encoding='utf8') as fp:
            json.dump(data, fp, ensure_ascii=False)
            
    def crawl_last_day_covid(self):
        ## 读取该网站主页数据并解码
        home_page = self.get_content_from_url(self.home_url)
        ## 主页所需各国昨日数据转为python形式并以data变量输出
        last_day_covid = self.parse_home_page(home_page, tag_id = 'getListByCountryTypeService2true')
        ##将data变量转换为json文件并保存
        self.save(last_day_covid, './last_day_covid.json')

    def crawl_last_day_covid_china(self):
        home_page = self.get_content_from_url(self.home_url)
        data = self.parse_home_page(home_page, tag_id = 'getAreaStat')
        ### 重构代码 提高复用性
        ### soup = BeautifulSoup(home_page, 'lxml')
        ### script = soup.find(id = 'getAreaStat')
        ### text = script.string
        ### json_str = re.findall(r'\[.+\]', text)[0]
        ### data = json.loads(json_str)
        self.save(data, './crawl_last_day_covid_china.json')
    def run(self):
        self.crawl_last_day_covid_china()
    
if __name__ == '__main__':
    spider = CovidSpider()
    spider.run()

### 采集1月22日以来的全国各省疫情数据

In [15]:
class CovidSpider(object):
    def __init__(self):
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url):
        response = requests.get(url)
        return response.content.decode()

    def parse_home_page(self, home_page, tag_id):
        """
        从疫情首页提取数据
        """
        soup = BeautifulSoup(home_page, 'lxml')
        script = soup.find(id = tag_id)
        text = script.string
        json_str = re.findall(r'\[.+\]', text)[0]
        data = json.loads(json_str)
        return data
    
    def load(self, path):
        """
        根据路径加载数据
        """
        with open(path, encoding='utf8') as fp:
            data = json.load(fp)
        return data

    def save(self,data,path):
        with open(path, 'w', encoding='utf8') as fp:
            json.dump(data, fp, ensure_ascii=False)
            
    def crawl_last_day_covid(self):
        ## 读取该网站主页数据并解码
        home_page = self.get_content_from_url(self.home_url)
        ## 主页所需各国昨日数据转为python形式并以data变量输出
        last_day_covid = self.parse_home_page(home_page, tag_id = 'getListByCountryTypeService2true')
        ##将data变量转换为json文件并保存
        self.save(last_day_covid, './last_day_covid.json')

    def crawl_last_day_covid_china(self):
        home_page = self.get_content_from_url(self.home_url)
        data = self.parse_home_page(home_page, tag_id = 'getAreaStat')
        ### 重构代码 提高复用性
        ### soup = BeautifulSoup(home_page, 'lxml')
        ### script = soup.find(id = 'getAreaStat')
        ### text = script.string
        ### json_str = re.findall(r'\[.+\]', text)[0]
        ### data = json.loads(json_str)
        self.save(data, './crawl_last_day_covid_china.json')
        
    def crawl_covid_china_22(self):
        """
        采集从1月22日以来的全国各省疫情数据
        """
        ### 加载最近一日全国疫情信息
        last_day_covid_22 = self.load('./crawl_last_day_covid_china.json')

        ### 遍历最近一日全国疫情信息，获取各省疫情URL
        #定义列表， 用于存储各省从1月22日以来的疫情数据
        covid = []
        #遍历各省疫情数据，获取统计的URL
        for country in tqdm(last_day_covid_22, '采集1月23号以来各省疫情信息'):
            statistics_data_url = country['statisticsData']
            statistics_data_json_str = self.get_content_from_url(statistics_data_url)
            ### 解析各省疫情json字符串，并添加列表中
            statistics_data = json.loads(statistics_data_json_str)['data']
            for one_day in statistics_data:
                one_day['proviceName'] = country['provinceName']
            covid.extend(statistics_data)
            # print(covid)
        ### 以json格式保存疫情数据信息
        self.save(covid, './covid_china_22.json')
    
    def run(self):
        #self.crawl_last_day_covid_china()
        self.crawl_covid_china_22()
if __name__ == '__main__':
    spider = CovidSpider()
    spider.run()

采集1月23号以来各省疫情信息: 100%|█████████████████████████████████████████████████████| 34/34 [00:04<00:00,  7.29it/s]


### 疫情数据可视化

In [None]:
! conda 

In [17]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.figure
from matplotlib.patches import Polygon
from mpl_toolkits.basemap import Basemap

ModuleNotFoundError: No module named 'mpl_toolkits.basemap'

## 根据上海地图标注

## Heatmap作图

## 做出预测