In [3]:
# save_gdp.py
import pandas as pd
import re

# 1. Fetch every HTML table found on the page.
url = "https://www.kylc.com/stats/global/yearly_per_country/g_gdp/chn-jpn-usa.html"
tables = pd.read_html(url, encoding='utf-8', header=0)   # header=0: use the first row as column names
print(f"共检测到 {len(tables)} 张表")                     # report how many tables were found

# 2. The first table is normally the target one; change the index if it is not.
raw = tables[0]

# 3. Replace the column labels with seven fixed names.
#    The assignment raises if the table does not have exactly seven columns.
raw.columns = [
    'Year',
    'China_GDP', 'China_Pct',
    'Japan_GDP', 'Japan_Pct',
    'USA_GDP', 'USA_Pct',
]

# 4. 解析 GDP 数字，容错版
def parse_parentheses(s):
    """Extract the number written inside the first pair of parentheses.

    Thousands separators (commas) are removed and the number is divided
    by 1e9 before being returned.

    Args:
        s: A cell value; it is converted with ``str()`` before matching.

    Returns:
        The parsed value divided by 1e9 as a float, or ``pd.NA`` when the
        input is missing or no parenthesised number can be parsed from it.
    """
    if pd.isna(s):
        return pd.NA

    # Deliberately tolerant: any failure (no parentheses, non-numeric
    # content, ...) is reported as a missing value instead of raising.
    try:
        match = re.search(r'\((.*?)\)', str(s))
        inner = match.group(1)
        value = float(inner.replace(',', ''))
    except Exception:
        return pd.NA

    return value / 1e9          # scale to units of 10^9

# Add one parsed column per country, taken from the matching "<country>_GDP" column.
for country in ('China', 'Japan', 'USA'):
    raw[country] = raw[f'{country}_GDP'].apply(parse_parentheses)

# 5. Drop rows where any of the three values failed to parse (optional safety step).
raw = raw.dropna(subset=['China', 'Japan', 'USA'])

# 6. Reshape to long format (one row per Year/Country pair) and export.
long = pd.melt(
    raw[['Year', 'China', 'Japan', 'USA']],
    id_vars='Year',
    var_name='Country',
    value_name='GDP_USD_Bil',
)
long.to_csv('usa_china_jpn_gdp_1960_2022.csv', index=False, encoding='utf-8-sig')
print('✅ 文件已保存：usa_china_jpn_gdp_1960_2022.csv')

共检测到 1 张表
✅ 文件已保存：usa_china_jpn_gdp_1960_2022.csv


In [4]:
# split_3_files.py
import pandas as pd

# 1. Load the long-format table.
df = pd.read_csv('usa_china_jpn_gdp_1960_2022.csv')

# 2. Write one CSV per country, named after the lower-cased country label.
for country in ('China', 'USA', 'Japan'):
    fname = f"{country.lower()}_gdp_1960_2022.csv"
    is_country = df['Country'] == country      # boolean mask selecting this country's rows
    tmp = df.loc[is_country].copy()
    tmp.to_csv(fname, index=False, encoding='utf-8-sig')
    print(f'✅ 已写入 {fname}')

✅ 已写入 china_gdp_1960_2022.csv
✅ 已写入 usa_gdp_1960_2022.csv
✅ 已写入 japan_gdp_1960_2022.csv
