### 国家统计局爬虫测试文件

In [88]:
from pprint import pprint
# import os
import requests
import pandas as pd
import numpy as np
# import certifi
import time
import warnings
from tqdm import tqdm

#### 高级查询 + 分省年度, 查询所有地区的地区代码

In [89]:
from urllib3.exceptions import InsecureRequestWarning

# Requests sent with certificate verification disabled make urllib3 emit an
# InsecureRequestWarning each time; ignore that warning category.
warnings.filterwarnings(action="ignore", category=InsecureRequestWarning)

In [90]:
# Query the region list (wd=reg) under treeId=00 from the fsnd database.
base_url = 'https://data.stats.gov.cn/adv.htm?m=findZbXl&db=fsnd&wd=reg&treeId=00'
# Certificate verification is turned off because the request otherwise fails
# with an SSLError; this is what triggers the InsecureRequestWarning.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(base_url, verify=False, timeout=10)
# Stop here with a clear HTTP error instead of failing later in r.json().
r.raise_for_status()
print(r.status_code)
print(r.encoding)

200
utf-8


[InsecureRequestWarning报错解决方案](https://blog.csdn.net/weixin_40773848/article/details/126719313?spm=1001.2101.3001.6650.3&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EYuanLiJiHua%7EPosition-3-126719313-blog-82011282.pc_relevant_recovery_v2&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EYuanLiJiHua%7EPosition-3-126719313-blog-82011282.pc_relevant_recovery_v2&utm_relevant_index=4)
> 安装certifi包, pip install certifi
> 通过 certifi.where查询证书路径
> 在requests中通过verify参数指定证书路径

In [91]:
# Replace the Response object with its decoded JSON body, then inspect the
# container type, its length, and the type of its first element.
r = r.json()
print(type(r))
print(len(r))
print(type(r[0]))

<class 'list'>
31
<class 'dict'>


In [92]:
# Turn the decoded JSON list into a table (one row per list element) and
# display its first rows.
prov_id = pd.DataFrame(r)
prov_id.head()

Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd
0,fsnd,,110000,True,北京市,False,0,reg
1,fsnd,,120000,True,天津市,False,0,reg
2,fsnd,,130000,True,河北省,False,0,reg
3,fsnd,,140000,True,山西省,False,0,reg
4,fsnd,,150000,True,内蒙古自治区,False,0,reg


### 通过不同参数构造访问url

### 常规分类方式

In [93]:
# treeId codes 100001..100006 as strings, paired in order with the names of
# the conventional region groups.
list_pid = list(map(str, np.arange(100001, 100007)))
pid_dict = {
    code: label
    for code, label in zip(list_pid, ('华北', '东北', '华东', '中南', '西南', '西北'))
}
pprint(pid_dict)

{'100001': '华北',
 '100002': '东北',
 '100003': '华东',
 '100004': '中南',
 '100005': '西南',
 '100006': '西北'}


In [94]:
def set_params(tree_id:str, db='fsnd'):
    """Build the query parameters for a region-tree request.

    Args:
        tree_id: Value sent as ``treeId``. The author's notes list
            100001 - 100006 as selectable values.
        db: Value sent as ``db`` (the dbcode). Per the author's notes,
            'fsnd' and 'fsyd' both exist and did not change the result.

    Returns:
        A dict with the keys ``m``, ``db``, ``wd`` and ``treeId``, suitable
        for the ``params`` argument of ``requests.get``.
    """
    return dict(m='findZbXl', db=db, wd='reg', treeId=tree_id)

In [95]:
# Single request: fetch one region group and tag its rows with the group name.
base_url = 'https://data.stats.gov.cn/adv.htm?'
params = set_params('100001')
# Passing verify=certifi.where() still produced an SSL error, so certificate
# verification is disabled outright.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(base_url, params=params, verify=False, timeout=10)
# Stop here with a clear HTTP error instead of failing later in r.json().
r.raise_for_status()
print(r.url)
print(r.encoding)
r = pd.DataFrame(r.json())
r = r.assign(pid_name=pid_dict.get('100001'))  # attach the region-group label
r.head()

https://data.stats.gov.cn/adv.htm?m=findZbXl&db=fsnd&wd=reg&treeId=100001
utf-8


Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
0,fsnd,,110000,True,北京市,False,100001,reg,华北
1,fsnd,,120000,True,天津市,False,100001,reg,华北
2,fsnd,,130000,True,河北省,False,100001,reg,华北
3,fsnd,,140000,True,山西省,False,100001,reg,华北
4,fsnd,,150000,True,内蒙古自治区,False,100001,reg,华北


In [96]:
# Multi-request collection: fetch every region group in list_pid, label each
# result with its group name, and stack the tables.
base_url = 'https://data.stats.gov.cn/adv.htm?'
df = []
for treeId in tqdm(list_pid):
    params = set_params(treeId)
    # The timeout bounds each request so one stalled connection cannot hang the loop.
    r = requests.get(base_url, params=params, verify=False, timeout=10)
    # Surface HTTP errors directly instead of as a JSON decoding failure.
    r.raise_for_status()
    r = pd.DataFrame(r.json()).assign(pid_name=pid_dict.get(treeId))
    df.append(r)
    # Short random pause (under 0.1 s) between requests.
    time.sleep(np.random.rand() * 0.1)
# Row labels are kept as returned per group, so they repeat across groups.
pid = pd.concat(df)

100%|██████████| 6/6 [00:02<00:00,  2.50it/s]


In [97]:
# type(np.random.rand()) # rand函数不提供形状返回float,但是IDE还是会警告array

In [98]:
# Show the first three rows of the stacked region-group table.
pid.head(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
0,fsnd,,110000,True,北京市,False,100001,reg,华北
1,fsnd,,120000,True,天津市,False,100001,reg,华北
2,fsnd,,130000,True,河北省,False,100001,reg,华北


In [99]:
# Show the last three rows of the province table.
prov_id.tail(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd
28,fsnd,,630000,True,青海省,False,0,reg
29,fsnd,,640000,True,宁夏回族自治区,False,0,reg
30,fsnd,,650000,True,新疆维吾尔自治区,False,0,reg


### 数据合并,去除不必要的列

In [100]:
# Keep the join keys plus the group columns, renamed with a "general_" prefix.
pid = pid[['id', 'name', 'pid', 'pid_name']].rename(
    columns={'pid': 'general_pid', 'pid_name': 'general_name'}
)
# prov_id's own 'pid' column is removed before attaching the group columns,
# matching rows on province id and name.
prov_id = pd.merge(
    prov_id.drop(columns=['pid']),
    pid,
    how='inner',
    on=['id', 'name'],
)
prov_id.head(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,wd,general_pid,general_name
0,fsnd,,110000,True,北京市,False,reg,100001,华北
1,fsnd,,120000,True,天津市,False,reg,100001,华北
2,fsnd,,130000,True,河北省,False,reg,100001,华北


### 热点地区分类方式

In [101]:
base_url = 'https://data.stats.gov.cn/adv.htm'
# treeId -> group name for the hot-spot region classification.
pid_dict = {
    '200001': '长江三角洲',
    '200002': '环渤海地区',
    '200003': '泛珠三角',
    '200004': '东部地区',
    '200005': '西部地区',
}

In [102]:
def get_prov_info(pid_dict: dict, base_url: str, db='fsnd', timeout=10):
    """Fetch the member rows of every region group and stack them.

    One GET request is sent per key of ``pid_dict``. Each JSON response is
    turned into a DataFrame and given a ``pid_name`` column holding the
    group's name.

    Args:
        pid_dict: Mapping of treeId (str) to group name.
        base_url: Endpoint URL; the query string is built by ``set_params``.
        db: Value forwarded to ``set_params`` as the ``db`` parameter.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        The per-group DataFrames concatenated in ``pid_dict`` order. Row
        labels are not reset, so they repeat across groups.

    Raises:
        ValueError: If ``pid_dict`` is empty.
        requests.HTTPError: If any response has an error status code.
    """
    if not pid_dict:
        # pd.concat([]) would also raise ValueError, but with an opaque message.
        raise ValueError('pid_dict must contain at least one treeId')

    frames = []
    for tree_id, pid_name in tqdm(pid_dict.items()):
        # Certificate verification stays disabled, as in the rest of this file.
        response = requests.get(
            base_url,
            params=set_params(tree_id, db=db),
            verify=False,
            timeout=timeout,
        )
        # Surface HTTP errors directly instead of as a JSON decoding failure.
        response.raise_for_status()
        frames.append(pd.DataFrame(response.json()).assign(pid_name=pid_name))
        # Short random pause (under 0.1 s) between requests.
        time.sleep(np.random.rand() * 0.1)
    return pd.concat(frames)

In [103]:
# Fetch the rows for every group in pid_dict (default db='fsnd') and display
# three randomly sampled rows.
pid = get_prov_info(pid_dict, base_url)
pid.sample(3)

100%|██████████| 5/5 [00:02<00:00,  2.36it/s]


Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
6,fsnd,,540000,True,西藏自治区,False,200005,reg,西部地区
10,fsnd,,460000,True,海南省,False,200004,reg,东部地区
7,fsnd,,610000,True,陕西省,False,200005,reg,西部地区


In [104]:
# Keep the join keys plus the group columns, renamed with a "hot_spot_" prefix.
pid = pid[['id', 'name', 'pid', 'pid_name']].rename(
    columns={'pid': 'hot_spot_pid', 'pid_name': 'hot_spot_name'}
)
# NOTE(review): with an inner join, a province listed in no hot-spot group is
# dropped, and a province listed in several groups yields one row per group.
# Confirm both effects are intended.
prov_id = pd.merge(prov_id, pid, how='inner', on=['id', 'name'])
prov_id.sample(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,wd,general_pid,general_name,hot_spot_pid,hot_spot_name
8,fsnd,,210000,True,辽宁省,False,reg,100002,东北,200004,东部地区
4,fsnd,,130000,True,河北省,False,reg,100001,华北,200002,环渤海地区
30,fsnd,,520000,True,贵州省,False,reg,100005,西南,200003,泛珠三角


### 八大经济地区-economic

In [105]:
base_url = 'https://data.stats.gov.cn/adv.htm'
# treeId -> group name for the eight economic regions.
pid_dict = {
    '800001': '东北地区',
    '800002': '北部沿海',
    '800003': '东部沿海',
    '800004': '南部沿海',
    '800005': '黄河中游',
    '800006': '长江中游',
    '800007': '西南地区',
    '800008': '大西北地区',
}

In [106]:
# Fetch the rows for every group in pid_dict, this time passing db='fsyd',
# and display the first rows.
pid = get_prov_info(pid_dict, base_url,db='fsyd')
pid.head()

100%|██████████| 8/8 [00:03<00:00,  2.52it/s]


Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
0,fsyd,,210000,True,辽宁省,False,800001,reg,东北地区
1,fsyd,,220000,True,吉林省,False,800001,reg,东北地区
2,fsyd,,230000,True,黑龙江省,False,800001,reg,东北地区
0,fsyd,,110000,True,北京市,False,800002,reg,北部沿海
1,fsyd,,120000,True,天津市,False,800002,reg,北部沿海


In [107]:
# Keep the join keys plus the group columns, renamed with an "economic_" prefix.
pid = pid[['id', 'name', 'pid', 'pid_name']].rename(
    columns={'pid': 'economic_pid', 'pid_name': 'economic_name'}
)
# Attach the economic-region columns, matching rows on province id and name.
prov_id = pd.merge(prov_id, pid, how='inner', on=['id', 'name'])
prov_id.sample(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,wd,general_pid,general_name,hot_spot_pid,hot_spot_name,economic_pid,economic_name
2,fsnd,,120000,True,天津市,False,reg,100001,华北,200002,环渤海地区,800002,北部沿海
11,fsnd,,320000,True,江苏省,False,reg,100003,华东,200001,长江三角洲,800003,东部沿海
0,fsnd,,110000,True,北京市,False,reg,100001,华北,200002,环渤海地区,800002,北部沿海


### 三大地带——region3

In [108]:
base_url = 'https://data.stats.gov.cn/adv.htm'
# treeId -> group name for the three-zone classification.
pid_dict = {
    '900001': '东部地带',
    '900002': '中部地带',
    '900003': '西部地带',
}

In [109]:
# Fetch the rows for every group in pid_dict with db='fsyd' and display the
# first rows.
pid = get_prov_info(pid_dict, base_url,db='fsyd')
pid.head()

100%|██████████| 3/3 [00:01<00:00,  2.54it/s]


Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
0,fsyd,,110000,True,北京市,False,900001,reg,东部地带
1,fsyd,,120000,True,天津市,False,900001,reg,东部地带
2,fsyd,,130000,True,河北省,False,900001,reg,东部地带
3,fsyd,,210000,True,辽宁省,False,900001,reg,东部地带
4,fsyd,,310000,True,上海市,False,900001,reg,东部地带


In [110]:
# Keep the join keys plus the group columns, renamed with a "region3_" prefix.
pid = pid[['id', 'name', 'pid', 'pid_name']].rename(
    columns={'pid': 'region3_pid', 'pid_name': 'region3_name'}
)
# Attach the three-zone columns, matching rows on province id and name.
prov_id = pd.merge(prov_id, pid, how='inner', on=['id', 'name'])
prov_id.sample(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,wd,general_pid,general_name,hot_spot_pid,hot_spot_name,economic_pid,economic_name,region3_pid,region3_name
30,fsnd,,520000,True,贵州省,False,reg,100005,西南,200003,泛珠三角,800007,西南地区,900003,西部地带
36,fsnd,,620000,True,甘肃省,False,reg,100006,西北,200005,西部地区,800008,大西北地区,900003,西部地带
17,fsnd,,360000,True,江西省,False,reg,100003,华东,200003,泛珠三角,800006,长江中游,900002,中部地带


In [111]:
# Remove the columns that came straight from the API response and are not
# part of the classification, then renumber the rows from 0.
prov_id = prov_id.drop(['dbcode', 'exp', 'isParent', 'open', 'wd'], axis=1)
prov_id = prov_id.reset_index(drop=True)
prov_id.head()

Unnamed: 0,id,name,general_pid,general_name,hot_spot_pid,hot_spot_name,economic_pid,economic_name,region3_pid,region3_name
0,110000,北京市,100001,华北,200002,环渤海地区,800002,北部沿海,900001,东部地带
1,110000,北京市,100001,华北,200004,东部地区,800002,北部沿海,900001,东部地带
2,120000,天津市,100001,华北,200002,环渤海地区,800002,北部沿海,900001,东部地带
3,120000,天津市,100001,华北,200004,东部地区,800002,北部沿海,900001,东部地带
4,130000,河北省,100001,华北,200002,环渤海地区,800002,北部沿海,900001,东部地带


In [120]:
# Write the table as UTF-8 CSV without the row index.
prov_id.to_csv('../data/prov_id.csv', encoding='utf-8', index=False)