In [1]:
import pandas as pd 
from pathlib import Path
import os

# Resolve the input/output files relative to the parent of the current
# working directory.
script_dir = Path(os.getcwd())
input_file = script_dir.parent / "data_origin/origin_OECD_CAPMF_all.csv"
output_file = script_dir.parent / "data/1-1-all_policy_selected.csv"

print(f"当前工作目录: {script_dir}")
print(f"输入文件: {input_file.resolve()}")
print(f"输出文件: {output_file.resolve()}")

df = pd.read_csv(input_file)

# Keep only the columns that are needed.
columns_to_keep = [
   'REF_AREA',
   'MEASURE',
   'CLIM_ACT_POL',
   'UNIT_MEASURE',
   'TIME_PERIOD',
   'OBS_VALUE',
   'OBS_STATUS'
]

df_selected = df[columns_to_keep]

# Filter on MEASURE and keep only POL_STRINGENCY; the count-type rows are not
# needed and are excluded to avoid duplicated observations.
df_selected = df_selected[df_selected['MEASURE'] == 'POL_STRINGENCY']

# Filter on OBS_STATUS and keep only rows flagged 'A', 'E' or 'N'.
# Status codes:
#   'E' - Estimated value
#   'A' - Actual/Valid value
#   'K' - Data included in another category
#   'N' - Not significant
#   'M' - Missing, data cannot exist
#   'Q' - Missing, suppressed
# NOTE(review): a later cell in this notebook writes to the same output file
# but keeps only 'A' and 'E' - confirm which status set is intended.
df_filtered = df_selected[df_selected['OBS_STATUS'].isin(['A', 'E', 'N'])]

# Drop MEASURE (constant after the filter above) together with UNIT_MEASURE.
df_filtered = df_filtered.drop(columns=['MEASURE', 'UNIT_MEASURE'])

# Replace NA values with 0. This applies to every remaining column, not only
# OBS_VALUE.
df_filtered = df_filtered.fillna(0)

# Save the processed data. Create the target directory first so the write
# does not fail when the output folder has not been created yet.
output_file.parent.mkdir(parents=True, exist_ok=True)
df_filtered.to_csv(output_file, index=False)

当前工作目录: f:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary\code
输入文件: F:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary\data_origin\origin_OECD_CAPMF_all.csv
输出文件: F:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary\data\1-1-all_policy_selected.csv


In [2]:
import pandas as pd 
from pathlib import Path
import os

# Locate the project root (parent of the working directory) and derive the
# input/output file locations from it.
script_dir = Path(os.getcwd())
project_root = script_dir.parent
input_file = project_root / "data_origin/origin_OECD_CAPMF_all.csv"
output_file = project_root / "data/1-1-all_policy_selected.csv"

print(f"当前工作目录: {script_dir}")
print(f"输入文件: {input_file.resolve()}")
print(f"输出文件: {output_file.resolve()}")

df = pd.read_csv(input_file)

# Columns carried forward into the selection.
columns_to_keep = [
    'REF_AREA', 'MEASURE', 'CLIM_ACT_POL', 'UNIT_MEASURE',
    'TIME_PERIOD', 'OBS_VALUE', 'OBS_STATUS',
]

# Restrict to POL_STRINGENCY rows and the wanted columns in a single step;
# the count-type MEASURE rows are not needed and are left out to avoid
# duplicated observations.
is_stringency = df['MEASURE'].eq('POL_STRINGENCY')
df_selected = df.loc[is_stringency, columns_to_keep]

# ========== OBS_STATUS statistics ==========
# Prints, for the rows remaining after the MEASURE filter, how many rows carry
# each OBS_STATUS code and how many rows fall into the kept / dropped groups.
print("\n" + "="*60)
print("OBS_STATUS 各状态统计")
print("="*60)

status_descriptions = {
    'A': 'Actual/Valid value (实际/有效值)',
    'E': 'Estimated value (估算值)',
    'K': 'Data included in another category (数据已归入其他类别)',
    'N': 'Not significant (数据无显著性)',
    'M': 'Missing, data cannot exist (缺失，数据逻辑上不可能存在)',
    'Q': 'Missing, suppressed (缺失，数据被屏蔽)'
}

total_count = len(df_selected)
status_counts = df_selected['OBS_STATUS'].value_counts()


def pct_of_total(count, total):
    """Return `count` as a percentage of `total`, or 0 when `total` is 0.

    Every percentage printed below goes through this helper so that an empty
    selection reports 0.00% instead of raising ZeroDivisionError.
    """
    return (count / total * 100) if total > 0 else 0


print(f"\n筛选MEASURE后的总行数: {total_count}\n")

for status, description in status_descriptions.items():
    count = status_counts.get(status, 0)
    percentage = pct_of_total(count, total_count)
    print(f"'{status}' - {description}")
    print(f"    数量: {count:,} 行  占比: {percentage:.2f}%")
    print()

# Report any status codes present in the data but absent from the table above.
other_statuses = set(status_counts.index) - set(status_descriptions.keys())
if other_statuses:
    print("其他未定义的状态:")
    # Sorted so the report order is the same on every run (set order is not).
    for status in sorted(other_statuses, key=str):
        count = status_counts[status]
        percentage = pct_of_total(count, total_count)
        print(f"'{status}' - 数量: {count:,} 行  占比: {percentage:.2f}%")
    print()

print("-" * 60)
kept_statuses = ['A', 'E']
kept_count = sum(status_counts.get(s, 0) for s in kept_statuses)
print(f"保留的状态 ('A', 'E'): {kept_count:,} 行 "
      f"({pct_of_total(kept_count, total_count):.2f}%)")

deleted_statuses = ['K', 'N', 'M', 'Q']
deleted_count = sum(status_counts.get(s, 0) for s in deleted_statuses)
print(f"删除的状态 ('K', 'N', 'M', 'Q'): {deleted_count:,} 行 "
      f"({pct_of_total(deleted_count, total_count):.2f}%)")
print("="*60 + "\n")

# Filter on OBS_STATUS and keep only rows flagged 'A' or 'E'.
# Status codes:
#   'E' - Estimated value
#   'A' - Actual/Valid value
#   'K' - Data included in another category
#   'N' - Not significant
#   'M' - Missing, data cannot exist
#   'Q' - Missing, suppressed
df_filtered = df_selected[df_selected['OBS_STATUS'].isin(['A', 'E'])]

# Drop MEASURE (constant after the POL_STRINGENCY filter) together with
# UNIT_MEASURE.
df_filtered = df_filtered.drop(columns=['MEASURE', 'UNIT_MEASURE'])

# Replace NA values with 0. This applies to every remaining column, not only
# OBS_VALUE.
# NOTE(review): confirm that 0 is the intended stand-in for a missing value.
df_filtered = df_filtered.fillna(0)

# Save the processed data. Create the target directory first so the write
# does not fail when the output folder has not been created yet.
output_file.parent.mkdir(parents=True, exist_ok=True)
df_filtered.to_csv(output_file, index=False)

print(f"最终保存数据: {len(df_filtered)} 行")


当前工作目录: f:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary\code
输入文件: F:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary\data_origin\origin_OECD_CAPMF_all.csv
输出文件: F:\Desktop\科研项目\1.负责科研项目\Climate Policy\CAMPF_Supplementary\data\1-1-all_policy_selected.csv

OBS_STATUS 各状态统计

筛选MEASURE后的总行数: 360672

'A' - Actual/Valid value (实际/有效值)
    数量: 295,448 行  占比: 81.92%

'E' - Estimated value (估算值)
    数量: 31,497 行  占比: 8.73%

'K' - Data included in another category (数据已归入其他类别)
    数量: 189 行  占比: 0.05%

'N' - Not significant (数据无显著性)
    数量: 40 行  占比: 0.01%

'M' - Missing, data cannot exist (缺失，数据逻辑上不可能存在)
    数量: 19,626 行  占比: 5.44%

'Q' - Missing, suppressed (缺失，数据被屏蔽)
    数量: 13,872 行  占比: 3.85%

------------------------------------------------------------
保留的状态 ('A', 'E'): 326,945 行 (90.65%)
删除的状态 ('K', 'N', 'M', 'Q'): 33,727 行 (9.35%)

最终保存数据: 326945 行
