### 国家统计局爬虫测试文件

In [54]:
from pprint import pprint
# import os
import requests
import pandas as pd
import numpy as np
# import certifi
import time
import warnings
from tqdm import tqdm

#### 高级查询 + 分省年度, 查询所有地区的地区代码

In [55]:
# InsecureRequestWarning is the warning category tied to requests made without SSL verification.
from urllib3.exceptions import InsecureRequestWarning
# Silence that category: the requests in this notebook pass verify=False on purpose.
warnings.filterwarnings("ignore", category=InsecureRequestWarning)

In [56]:
# Advanced-query endpoint with treeId=00; in the recorded run it returned 31 region records.
base_url = 'https://data.stats.gov.cn/adv.htm?m=findZbXl&db=fsnd&wd=reg&treeId=00'
# The request raised SSLError with verification on, so certificate verification is disabled.
r = requests.get(base_url, verify=False) # this call triggers InsecureRequestWarning
print(r.status_code)
print(r.encoding)

200
utf-8


[InsecureRequestWarning报错解决方案](https://blog.csdn.net/weixin_40773848/article/details/126719313?spm=1001.2101.3001.6650.3&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EYuanLiJiHua%7EPosition-3-126719313-blog-82011282.pc_relevant_recovery_v2&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EYuanLiJiHua%7EPosition-3-126719313-blog-82011282.pc_relevant_recovery_v2&utm_relevant_index=4)
> 1. 安装 certifi 包: `pip install certifi`
> 2. 通过 `certifi.where()` 查询证书路径
> 3. 在 requests 中通过 `verify` 参数指定证书路径

In [57]:
# Rebind r from the Response object to its decoded JSON payload.
# NOTE: because r is overwritten, re-running this cell alone fails (r no longer has .json()).
r = r.json()
print(type(r))
print(len(r))
print(type(r[0]))

<class 'list'>
31
<class 'dict'>


In [58]:
# Turn the decoded records into a table (one row per record); later cells merge group labels onto it.
prov_id = pd.DataFrame(r)
prov_id.head()

Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd
0,fsnd,,110000,True,北京市,False,0,reg
1,fsnd,,120000,True,天津市,False,0,reg
2,fsnd,,130000,True,河北省,False,0,reg
3,fsnd,,140000,True,山西省,False,0,reg
4,fsnd,,150000,True,内蒙古自治区,False,0,reg


### 通过不同参数构造访问url

### 常规分类方式

In [59]:
# treeId codes 100001..100006 as strings, in ascending order.
list_pid = [f"{code}" for code in range(100001, 100007)]
# Map each treeId code to the Chinese name of its region group (same order as list_pid).
pid_dict = {
    code: label
    for code, label in zip(list_pid, ('华北', '东北', '华东', '中南', '西南', '西北'))
}
pprint(pid_dict)

{'100001': '华北',
 '100002': '东北',
 '100003': '华东',
 '100004': '中南',
 '100005': '西南',
 '100006': '西北'}


In [60]:
# Build the `params` argument for the adv.htm request.
# Author's notes: treeId takes values 100001 - 100006 for the conventional grouping;
# analysis showed treeId is one of a list of selectable values, and db is the dbcode --
# 'fsnd' and 'fsyd' were both tried and did not affect the request result.
def set_params(tree_id:str, db='fsnd'):
    """Return the query parameters for one region-tree lookup.

    Args:
        tree_id: Value sent as the ``treeId`` query parameter.
        db: Value sent as the ``db`` query parameter (defaults to ``'fsnd'``).

    Returns:
        A new dict with the keys ``m``, ``db``, ``wd`` and ``treeId``, in that
        order; ``m`` and ``wd`` are fixed to ``'findZbXl'`` and ``'reg'``.
    """
    pairs = [
        ('m', 'findZbXl'),
        ('db', db),
        ('wd', 'reg'),
        ('treeId', tree_id),
    ]
    return dict(pairs)

In [61]:
# Single page: trial request for one region group (treeId 100001).
base_url = 'https://data.stats.gov.cn/adv.htm?'
params = set_params('100001')
# verify = certifi.where() # still raised an SSL error, so verify is simply set to False
r = requests.get(base_url,params=params,verify=False)
print(r.url)
print(r.encoding)
# Rebind r from the Response to a DataFrame built from its JSON payload.
r = pd.DataFrame(r.json())
r = r.assign(pid_name=pid_dict.get('100001')) # attach the region-group name
r.head()

https://data.stats.gov.cn/adv.htm?m=findZbXl&db=fsnd&wd=reg&treeId=100001
utf-8


Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
0,fsnd,,110000,True,北京市,False,100001,reg,华北
1,fsnd,,120000,True,天津市,False,100001,reg,华北
2,fsnd,,130000,True,河北省,False,100001,reg,华北
3,fsnd,,140000,True,山西省,False,100001,reg,华北
4,fsnd,,150000,True,内蒙古自治区,False,100001,reg,华北


In [62]:
# Multi-page collection: one request per treeId in list_pid.
base_url = 'https://data.stats.gov.cn/adv.htm?'
df = []  # despite the name, this is a list of per-group DataFrames
for treeId in tqdm(list_pid):
    params = set_params(treeId)
    r = requests.get(base_url,params=params,verify=False).json()
    # Tag every row of this group with the group's name.
    r = pd.DataFrame(r).assign(pid_name = pid_dict.get(treeId))
    df.append(r)
    time.sleep(np.random.rand() * 0.1)  # random pause of under 0.1 s between requests
# Stack the per-group tables; each keeps its own index, so index values repeat.
pid = pd.concat(df)

100%|██████████| 6/6 [00:02<00:00,  2.71it/s]


In [63]:
# type(np.random.rand()) # rand函数不提供形状返回float,但是IDE还是会警告array

In [64]:
pid.head(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
0,fsnd,,110000,True,北京市,False,100001,reg,华北
1,fsnd,,120000,True,天津市,False,100001,reg,华北
2,fsnd,,130000,True,河北省,False,100001,reg,华北


In [65]:
prov_id.tail(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd
28,fsnd,,630000,True,青海省,False,0,reg
29,fsnd,,640000,True,宁夏回族自治区,False,0,reg
30,fsnd,,650000,True,新疆维吾尔自治区,False,0,reg


### 数据合并,去除不必要的列

In [66]:
# Keep only the join keys plus the group label, renamed to general_name.
pid = pid.loc[:,['id', 'name', 'pid_name']].rename(columns={'pid_name':'general_name'})
# Drop prov_id's own pid column, then left-join the label on id and name.
# NOTE: re-running this cell fails, because pid_name and prov_id's pid column are gone after the first run.
prov_id = prov_id.drop(columns=['pid']).merge(pid, how='left', on=['id', 'name'])
prov_id.head(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,wd,general_name
0,fsnd,,110000,True,北京市,False,reg,华北
1,fsnd,,120000,True,天津市,False,reg,华北
2,fsnd,,130000,True,河北省,False,reg,华北


### 热点地区分类方式 - 放弃使用

In [67]:
# base_url = 'https://data.stats.gov.cn/adv.htm'
# pid_dict = {
#     '200001':'长江三角洲', '200002':'环渤海地区',
#     '200003':'泛珠三角', '200004':'东部地区', '200005':'西部地区'
# }

In [68]:
def get_prov_info( pid_dict:dict, base_url:str, db='fsnd', timeout=10 ):
    """Fetch the region list of every group in ``pid_dict`` and stack the results.

    One GET request is sent per key of ``pid_dict``, using the query
    parameters built by ``set_params``. Certificate verification is disabled
    (``verify=False``), as it is for every other request in this notebook.

    Args:
        pid_dict: Mapping of treeId code to group name. The name is written
            into a ``pid_name`` column on every row returned for that code.
        base_url: Endpoint the query parameters are appended to.
        db: Value forwarded to ``set_params`` as the ``db`` parameter.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so that a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The per-group DataFrames concatenated in the iteration order of
        ``pid_dict``. Each group keeps its own index, so index values repeat.
        An empty DataFrame is returned when ``pid_dict`` is empty.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    # pd.concat([]) raises ValueError, so short-circuit when nothing is requested.
    if not pid_dict:
        return pd.DataFrame()

    frames = []
    for tree_id, group_name in tqdm(pid_dict.items()):
        response = requests.get(
            base_url,
            params=set_params(tree_id, db=db),
            verify=False,
            timeout=timeout,
        )
        # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
        response.raise_for_status()
        frames.append(pd.DataFrame(response.json()).assign(pid_name=group_name))
        # Random pause of under 0.1 s between requests.
        time.sleep(np.random.rand() * 0.1)
    return pd.concat(frames)

In [69]:
# pid = get_prov_info(pid_dict, base_url)
# pid.sample(3)

In [70]:
# pid = pid.loc[:,['id', 'name', 'pid_name']].rename(columns={'pid_name':'hot_spot_name'})
# prov_id = prov_id.merge(pid, how='left', on=['id', 'name'])
# prov_id.sample(3)

### 八大经济地区-economic

In [71]:
# Endpoint plus the treeId -> name mapping for the eight economic regions.
base_url = 'https://data.stats.gov.cn/adv.htm'
pid_dict = dict([
    ('800001', '东北地区'),
    ('800002', '北部沿海'),
    ('800003', '东部沿海'),
    ('800004', '南部沿海'),
    ('800005', '黄河中游'),
    ('800006', '长江中游'),
    ('800007', '西南地区'),
    ('800008', '大西北地区'),
])

In [72]:
# Fetch every economic-region group with db='fsyd'; pid is rebound to the stacked result.
pid = get_prov_info(pid_dict, base_url,db='fsyd')
pid.head()

100%|██████████| 8/8 [00:03<00:00,  2.29it/s]


Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
0,fsyd,,210000,True,辽宁省,False,800001,reg,东北地区
1,fsyd,,220000,True,吉林省,False,800001,reg,东北地区
2,fsyd,,230000,True,黑龙江省,False,800001,reg,东北地区
0,fsyd,,110000,True,北京市,False,800002,reg,北部沿海
1,fsyd,,120000,True,天津市,False,800002,reg,北部沿海


In [73]:
# Keep only the join keys plus the group label, renamed to economic_name.
pid = pid.loc[:,['id', 'name', 'pid_name']].rename(columns={'pid_name':'economic_name'})
# Left-join the economic-region label onto the province table by id and name.
prov_id = prov_id.merge(pid, how='left', on=['id', 'name'])
prov_id.sample(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,wd,general_name,economic_name
4,fsnd,,150000,True,内蒙古自治区,False,reg,华北,黄河中游
20,fsnd,,460000,True,海南省,False,reg,中南,南部沿海
12,fsnd,,350000,True,福建省,False,reg,华东,南部沿海


### 三大地带——region3

In [74]:
# Endpoint plus the treeId -> name mapping for the three belts.
base_url = 'https://data.stats.gov.cn/adv.htm'
pid_dict = dict([
    ('900001', '东部地带'),
    ('900002', '中部地带'),
    ('900003', '西部地带'),
])

In [75]:
# Fetch every three-belt group with db='fsyd'; pid is rebound to the stacked result.
pid = get_prov_info(pid_dict, base_url,db='fsyd')
pid.head()

100%|██████████| 3/3 [00:01<00:00,  2.50it/s]


Unnamed: 0,dbcode,exp,id,isParent,name,open,pid,wd,pid_name
0,fsyd,,110000,True,北京市,False,900001,reg,东部地带
1,fsyd,,120000,True,天津市,False,900001,reg,东部地带
2,fsyd,,130000,True,河北省,False,900001,reg,东部地带
3,fsyd,,210000,True,辽宁省,False,900001,reg,东部地带
4,fsyd,,310000,True,上海市,False,900001,reg,东部地带


In [76]:
# Keep only the join keys plus the group label, renamed to region3_name.
pid = pid.loc[:,['id', 'name', 'pid_name']].rename(columns={'pid_name':'region3_name'})
# Left-join the three-belt label onto the province table by id and name.
prov_id = prov_id.merge(pid, how='left', on=['id', 'name'])
prov_id.sample(3)

Unnamed: 0,dbcode,exp,id,isParent,name,open,wd,general_name,economic_name,region3_name
27,fsnd,,620000,True,甘肃省,False,reg,西北,大西北地区,西部地带
12,fsnd,,350000,True,福建省,False,reg,华东,南部沿海,东部地带
5,fsnd,,210000,True,辽宁省,False,reg,东北,东北地区,东部地带


In [77]:
# Drop the response metadata columns, leaving id, name and the three group-label columns,
# then renumber the rows from 0.
prov_id = prov_id.drop(columns=['dbcode', 'exp','isParent', 'open', 'wd']).reset_index(drop=True)
prov_id

Unnamed: 0,id,name,general_name,economic_name,region3_name
0,110000,北京市,华北,北部沿海,东部地带
1,120000,天津市,华北,北部沿海,东部地带
2,130000,河北省,华北,北部沿海,东部地带
3,140000,山西省,华北,黄河中游,中部地带
4,150000,内蒙古自治区,华北,黄河中游,西部地带


In [78]:
prov_id

Unnamed: 0,id,name,general_name,economic_name,region3_name
0,110000,北京市,华北,北部沿海,东部地带
1,120000,天津市,华北,北部沿海,东部地带
2,130000,河北省,华北,北部沿海,东部地带
3,140000,山西省,华北,黄河中游,中部地带
4,150000,内蒙古自治区,华北,黄河中游,西部地带
5,210000,辽宁省,东北,东北地区,东部地带
6,220000,吉林省,东北,东北地区,中部地带
7,230000,黑龙江省,东北,东北地区,中部地带
8,310000,上海市,华东,东部沿海,东部地带
9,320000,江苏省,华东,东部沿海,东部地带


In [79]:
# Write the final lookup table to ./data/prov_id.csv.
# to_csv does not create missing directories, so make sure ./data exists first.
from pathlib import Path

out_path = Path('./data/prov_id.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
prov_id.to_csv(out_path, index=False, encoding='utf-8')