这是用来测试计算Market Access部分的代码

In [7]:
import pandas as pd

# 读取主数据文件（如 WDICSV.csv 或 WDICountry.csv，具体看你的数据结构）
df = pd.read_csv('../data/raw/WDI_CSV_2025_07_02/WDICSV.csv')

# 只保留NY.GDP.MKTP.CD相关数据
gdp_df = df[df['Indicator Code'] == 'NY.GDP.MKTP.CD']

# 可选：筛选年份范围（如2019-2022）
years = [str(y) for y in range(2019, 2023)]
cols = ['Country Name', 'Country Code', 'Indicator Code'] + years
gdp_panel = gdp_df[cols]

# 输出结果
print(gdp_panel)

# 保存为csv
gdp_panel.to_csv('../data/processed/gdp_country_year_2019_2022.csv', index=False)

                          Country Name Country Code  Indicator Code  \
515        Africa Eastern and Southern          AFE  NY.GDP.MKTP.CD   
2031        Africa Western and Central          AFW  NY.GDP.MKTP.CD   
3547                        Arab World          ARB  NY.GDP.MKTP.CD   
5063            Caribbean small states          CSS  NY.GDP.MKTP.CD   
6579    Central Europe and the Baltics          CEB  NY.GDP.MKTP.CD   
...                                ...          ...             ...   
396191           Virgin Islands (U.S.)          VIR  NY.GDP.MKTP.CD   
397707              West Bank and Gaza          PSE  NY.GDP.MKTP.CD   
399223                     Yemen, Rep.          YEM  NY.GDP.MKTP.CD   
400739                          Zambia          ZMB  NY.GDP.MKTP.CD   
402255                        Zimbabwe          ZWE  NY.GDP.MKTP.CD   

                2019          2020          2021          2022  
515     1.009747e+12  9.334072e+11  1.085605e+12  1.191639e+12  
2031    8.332889e

现在需要把表格改成我们希望的结构

In [8]:
import pandas as pd

# 读取数据
df = pd.read_csv('../data/processed/gdp_country_year_2019_2022.csv')

# 只保留NY.GDP.MKTP.CD指标
df = df[df['Indicator Code'] == 'NY.GDP.MKTP.CD']

# 选择需要的年份
years = ['2019', '2020', '2021', '2022']  # 可自定义年份
df_long = df.melt(
    id_vars=['Country Code'],
    value_vars=years,
    var_name='year',
    value_name='gdp'
)

# 重命名列
df_long = df_long.rename(columns={'Country Code': 'iso3'})

# 输出结果
print(df_long.head())
df_long.to_csv('../data/processed/gdp_country_year_2019_2022_cleaned.csv', index=False)

  iso3  year           gdp
0  AFE  2019  1.009747e+12
1  AFW  2019  8.332889e+11
2  ARB  2019  2.949355e+12
3  CSS  2019  6.192930e+10
4  CEB  2019  1.686042e+12


现在来检查一下数据质量

In [19]:
# 读取长表格式的GDP数据
df = pd.read_csv('../data/processed/gdp_country_year_2019_2022_cleaned.csv')

# 查看基本信息
print("数据行数和列数:", df.shape)
print("列名:", df.columns.tolist())
print("前5行:\n", df.head())

# 检查数据类型和基本统计信息
print("\n数据类型和统计信息:")
df.info()
display(df['gdp'].describe())

# 检查缺失值
print("\n每列缺失值数量:")
display(df.isnull().sum())

# 检查iso3是否有重复（同一年同国家是否唯一）
dup = df.duplicated(subset=['iso3', 'year'])
display(f"\n重复行数: {dup.sum()}")

# 检查gdp是否有负值或异常
print("\nGDP为负或者极大的行:")
display(df[df['gdp'] <= 0])
display(df.sort_values('gdp', ascending=False).head(5))

数据行数和列数: (1064, 3)
列名: ['iso3', 'year', 'gdp']
前5行:
   iso3  year           gdp
0  AFE  2019  1.009747e+12
1  AFW  2019  8.332889e+11
2  ARB  2019  2.949355e+12
3  CSS  2019  6.192930e+10
4  CEB  2019  1.686042e+12

数据类型和统计信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1064 entries, 0 to 1063
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   iso3    1064 non-null   object 
 1   year    1064 non-null   int64  
 2   gdp     1028 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 25.1+ KB


count    1.028000e+03
mean     3.008484e+12
std      1.025125e+13
min      5.174659e+07
25%      1.029698e+10
50%      6.119920e+10
75%      6.034914e+11
max      1.024349e+14
Name: gdp, dtype: float64


每列缺失值数量:


iso3     0
year     0
gdp     36
dtype: int64

'\n重复行数: 0'


GDP为负或者极大的行:


Unnamed: 0,iso3,year,gdp


Unnamed: 0,iso3,year,gdp
846,WLD,2022,102434900000000.0
580,WLD,2021,98365750000000.0
48,WLD,2019,88492390000000.0
314,WLD,2020,86116570000000.0
814,HIC,2022,65248660000000.0


In [20]:
missing_gdp = df[df['gdp'].isnull()]
print("🔍 缺失 GDP 的记录数:", len(missing_gdp))
# 看看缺失值集中在哪些国家和年份
display(missing_gdp.groupby('iso3')['year'].apply(list).reset_index(name='缺失年份'))


🔍 缺失 GDP 的记录数: 36


Unnamed: 0,iso3,缺失年份
0,CUB,"[2021, 2022]"
1,ERI,"[2019, 2020, 2021, 2022]"
2,GIB,"[2019, 2020, 2021, 2022]"
3,INX,"[2019, 2020, 2021, 2022]"
4,MAF,"[2020, 2022]"
5,PRK,"[2019, 2020, 2021, 2022]"
6,SSD,"[2019, 2020, 2021, 2022]"
7,VEN,"[2019, 2020, 2021, 2022]"
8,VGB,"[2019, 2020, 2021, 2022]"
9,YEM,"[2019, 2020, 2021, 2022]"


In [21]:
import pandas as pd

# 读取GDP缺失国家列表
missing_gdp = pd.read_csv('../data/processed/gdp_country_year_2019_2022_cleaned.csv')
missing_iso3 = missing_gdp[missing_gdp['gdp'].isnull()]['iso3'].unique()

# 读取migration数据（假设有iso3字段，或origin/destination字段）
migration = pd.read_csv('../data/processed/migration_flow_cleaned.csv')

# 检查这些国家是否在迁移流数据中出现
in_origin = migration['origin'].isin(missing_iso3)
in_dest = migration['destination'].isin(missing_iso3)

# 输出结果
print("GDP缺失国家在migration数据中作为origin出现的有：", migration.loc[in_origin, 'origin'].unique())
print("GDP缺失国家在migration数据中作为destination出现的有：", migration.loc[in_dest, 'destination'].unique())

GDP缺失国家在migration数据中作为origin出现的有： []
GDP缺失国家在migration数据中作为destination出现的有： []


然后开始整理CEPII数据

In [27]:
import pandas as pd

# 读取CEPII距离数据（根据实际文件格式选择read_excel或read_csv）
df = pd.read_excel('../data/raw/dist_cepii/dist_cepii.xls')  # 或 .csv

# 只保留需要的列
# 一般CEPII数据的国家代码是 iso_o（origin），iso_d（destination），距离是 distw（加权距离）
cols = ['iso_o', 'iso_d', 'distw']
df_dist = df[cols]

# 强制转换distw为数值型，异常值变为NaN
df_dist['distw'] = pd.to_numeric(df_dist['distw'], errors='coerce')

# 输出结果
display(df_dist.dtypes)
display(df_dist.head())

# 保存为CSV文件
df_dist.to_csv('../data/processed/dist_cepii_cleaned.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dist['distw'] = pd.to_numeric(df_dist['distw'], errors='coerce')


iso_o     object
iso_d     object
distw    float64
dtype: object

Unnamed: 0,iso_o,iso_d,distw
0,ABW,ABW,25.09354
1,ABW,AFG,13168.22
2,ABW,AGO,9587.316
3,ABW,AIA,976.8974
4,ABW,ALB,9091.576


In [None]:
# 读取整理后的CEPII距离数据
df_dist = pd.read_csv('../data/processed/dist_cepii_cleaned.csv')

# 查看基本信息
print("数据行数和列数:", df_dist.shape)
print("列名:", df_dist.columns.tolist())
print("前5行:\n", df_dist.head())

# 检查缺失值
print("\n每列缺失值数量:")
print(df_dist.isnull().sum())

# 检查是否有重复的国家对
dup = df_dist.duplicated(subset=['iso_o', 'iso_d'])
print(f"\n重复的国家对数量: {dup.sum()}")

# 检查距离的最小值、最大值、分布
print("\n距离统计描述:")
print(df_dist['distw'].describe())

数据行数和列数: (50176, 3)
列名: ['iso_o', 'iso_d', 'distw']
前5行:
   iso_o iso_d        distw
0   ABW   ABW     25.09354
1   ABW   AFG  13168.22000
2   ABW   AGO   9587.31600
3   ABW   AIA    976.89740
4   ABW   ALB   9091.57600

每列缺失值数量:
iso_o       0
iso_d       0
distw    2215
dtype: int64

重复的国家对数量: 0

距离统计描述:
count    47961.000000
mean      8392.728338
std       4670.531315
min          0.995137
25%       4687.852000
50%       8006.123000
75%      11894.690000
max      19781.390000
Name: distw, dtype: float64

异常iso_o代码:
Empty DataFrame
Columns: [iso_o, iso_d, distw]
Index: []

异常iso_d代码:
Empty DataFrame
Columns: [iso_o, iso_d, distw]
Index: []


对缺失值的细节

In [32]:
# 找出distw缺失的行
missing_distw = df_dist[df_dist['distw'].isnull()]

# 列出所有涉及的国家代码（去重）
origin_missing = set(missing_distw['iso_o'].unique())
dest_missing = set(missing_distw['iso_d'].unique())
all_missing = origin_missing.union(dest_missing)

print("涉及距离缺失的国家代码（iso3）：")
print(sorted(all_missing))

涉及距离缺失的国家代码（iso3）：
['ABW', 'AFG', 'AGO', 'AIA', 'ALB', 'AND', 'ANT', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COG', 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CXR', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNG', 'MNP', 'MOZ', 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 

In [31]:
total = len(df_dist)
missing = df_dist['distw'].isnull().sum()
percent = missing / total * 100
print(f"distw缺失值数量: {missing}")
print(f"总记录数: {total}")
print(f"缺失值占比: {percent:.2f}%")

distw缺失值数量: 2215
总记录数: 50176
缺失值占比: 4.41%
