测试表格合并（merge）：首先检查各张表的内容。

In [1]:
import pandas as pd

# Directory holding the processed input tables, relative to this notebook.
DATA_DIR = '../data/processed'


def load_and_preview(filename, label):
    """Read one processed CSV and print a short preview of it.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``DATA_DIR``.
    label : str
        Table name used as the prefix of the printed "columns" line.

    Returns
    -------
    pandas.DataFrame
        The table exactly as returned by ``pd.read_csv``.
    """
    table = pd.read_csv(f'{DATA_DIR}/{filename}')
    print(f"{label} columns:", table.columns.tolist())
    print(table.head())
    return table


# Each table is read and previewed immediately, so a missing file fails
# right after the previews of the tables loaded before it.

# 1. Migration flows
mig = load_and_preview('migration_flow_cleaned.csv', "迁移流表")

# 2. GDP
gdp = load_and_preview('gdp_country_year_2019_2022_cleaned.csv', "GDP表")

# 3. Market Access
ma = load_and_preview('market_access_panel.csv', "Market Access表")

# 4. SPEI
spei = load_and_preview('spei03_country_month_cleaned.csv', "SPEI表")

# 5. Border friction
border = load_and_preview('border_friction_panel_ew.csv', "边境摩擦表")

迁移流表 columns: ['origin', 'destination', 'migration_month', 'flow', 'year', 'month', 'log_flow']
  origin destination migration_month  flow  year  month  log_flow
0    AND         ARE      2019-01-01    12  2019      1  2.564949
1    AND         ARE      2019-02-01     2  2019      2  1.098612
2    AND         ARE      2019-03-01     1  2019      3  0.693147
3    AND         ARE      2019-04-01     7  2019      4  2.079442
4    AND         ARE      2019-05-01     0  2019      5  0.000000
GDP表 columns: ['iso3', 'year', 'gdp']
  iso3  year           gdp
0  AFE  2019  1.009747e+12
1  AFW  2019  8.332889e+11
2  ARB  2019  2.949355e+12
3  CSS  2019  6.192930e+10
4  CEB  2019  1.686042e+12
Market Access表 columns: ['iso3', 'year', 'MA']
  iso3  year            MA
0  NLD  2019  8.862921e+08
1  SUR  2019  5.983159e+07
2  EST  2019  2.702608e+08
3  SDN  2019  8.305597e+07
4  NER  2019  8.458146e+07
SPEI表 columns: ['country', 'ISO_A3', 'date', 'spei']
       country ISO_A3        date      spei
0 

In [2]:
def show_info(df, name):
    """Print a diagnostic report for a DataFrame.

    The report consists of five sections, each prefixed with ``name``:
    shape, column list, per-column null counts, dtypes, and the result of
    ``describe(include='all')``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.
    name : str
        Label printed in front of every section.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    column_names = df.columns.tolist()
    null_counts = df.isnull().sum()
    column_types = df.dtypes

    print(f"\n{name}  shape: {df.shape}")
    print(f"{name} columns: {column_names}")
    print(f"{name} 缺失值统计:\n{null_counts}")
    print(f"{name} 数据类型:\n{column_types}")

    # The summary statistics are computed only after the cheaper sections
    # have been printed, matching the order in which output appears.
    summary = df.describe(include='all')
    print(f"{name} 基本统计:\n{summary}")

# Summarise the migration table before any merge; the label marks this
# report as the pre-merge state.
show_info(mig, "迁移流表（merge前）")


迁移流表（merge前）  shape: (1545874, 7)
迁移流表（merge前） columns: ['origin', 'destination', 'migration_month', 'flow', 'year', 'month', 'log_flow']
迁移流表（merge前） 缺失值统计:
origin             0
destination        0
migration_month    0
flow               0
year               0
month              0
log_flow           0
dtype: int64
迁移流表（merge前） 数据类型:
origin              object
destination         object
migration_month     object
flow                 int64
year                 int64
month                int64
log_flow           float64
dtype: object
迁移流表（merge前） 基本统计:
         origin destination migration_month          flow          year  \
count   1545874     1545874         1545874  1.545874e+06  1.545874e+06   
unique      180         180              48           NaN           NaN   
top         AND         ARE      2019-04-01           NaN           NaN   
freq       8592        8592           32207           NaN           NaN   
mean        NaN         NaN             NaN  7.678771e+01  2.0205

In [4]:
# Derive the year/month merge keys from the SPEI date column and rename the
# country-code and value columns so they line up with the origin side of the
# migration table. `spei` is rebound to the result because later cells read
# its 'origin', 'year' and 'month' columns.
spei = (
    spei
    .assign(date=lambda d: pd.to_datetime(d['date']))
    .assign(
        year=lambda d: d['date'].dt.year,
        month=lambda d: d['date'].dt.month,
    )
    .rename(columns={'ISO_A3': 'origin', 'spei': 'spei_origin'})
)

# Keys shared by the migration table and the SPEI table.
merge_keys = ['origin', 'year', 'month']

# Left merge keeps every migration row; rows without a matching SPEI record
# get NaN in 'spei_origin'.
mig1 = mig.merge(spei[merge_keys + ['spei_origin']], on=merge_keys, how='left')

# A left merge silently multiplies rows when the right-hand table has
# duplicate keys that match, so fail loudly if the row count changed.
assert len(mig1) == len(mig), (
    f"left merge changed the row count: {len(mig)} -> {len(mig1)}; "
    "check the SPEI table for duplicate origin-year-month keys"
)

show_info(mig1, "迁移流+origin SPEI（merge后）")


迁移流+origin SPEI（merge后）  shape: (1545874, 8)
迁移流+origin SPEI（merge后） columns: ['origin', 'destination', 'migration_month', 'flow', 'year', 'month', 'log_flow', 'spei_origin']
迁移流+origin SPEI（merge后） 缺失值统计:
origin                  0
destination             0
migration_month         0
flow                    0
year                    0
month                   0
log_flow                0
spei_origin        146016
dtype: int64
迁移流+origin SPEI（merge后） 数据类型:
origin              object
destination         object
migration_month     object
flow                 int64
year                 int64
month                int64
log_flow           float64
spei_origin        float64
dtype: object
迁移流+origin SPEI（merge后） 基本统计:
         origin destination migration_month          flow          year  \
count   1545874     1545874         1545874  1.545874e+06  1.545874e+06   
unique      180         180              48           NaN           NaN   
top         AND         ARE      2019-04-01           NaN

In [5]:
# Rows of the merged table that have no origin-side SPEI value.
missing = mig1.loc[mig1['spei_origin'].isna()]

# 1. Which origin countries are affected.
# NOTE(review): if large countries show up here alongside small islands, the
# cause may be a country-code problem in the SPEI source rather than missing
# climate data — TODO confirm against the raw SPEI file.
print("缺失SPEI的国家数量：", missing['origin'].nunique())
print("缺失SPEI的国家代码：", missing['origin'].unique())

# 2. How the missing rows are distributed over years and months.
for label, column in (
    ("缺失SPEI的年份分布：", 'year'),
    ("缺失SPEI的月份分布：", 'month'),
):
    print(label, missing[column].value_counts().sort_index())


# 3. Check whether the missing origin-year-month combinations exist in the
#    SPEI table at all (as opposed to existing with a null value).
def _key_set(frame):
    """Return the distinct (origin, year, month) tuples found in ``frame``."""
    return set(zip(frame['origin'], frame['year'], frame['month']))


spei_keys = _key_set(spei)
missing_keys = _key_set(missing)
not_in_spei = missing_keys.difference(spei_keys)
print(f"完全在SPEI表中找不到的origin-year-month组合数量: {len(not_in_spei)}")
# Set iteration order is arbitrary, so these ten examples are just a sample.
print("举例：", list(not_in_spei)[:10])

# 4. Origin codes that never appear in the SPEI table, for any period.
missing_origins = set(missing['origin'])
spei_origins = set(spei['origin'])
no_spei_country = missing_origins.difference(spei_origins)
print("SPEI表中完全没有的国家代码：", no_spei_country)

缺失SPEI的国家数量： 17
缺失SPEI的国家代码： ['AND' 'BRB' 'BHR' 'FSM' 'FRA' 'GRD' 'HKG' 'KIR' 'LCA' 'MAC' 'MLT' 'MDV'
 'NOR' 'SGP' 'STP' 'TON' 'XKX']
缺失SPEI的年份分布： year
2019    36504
2020    36504
2021    36504
2022    36504
Name: count, dtype: int64
缺失SPEI的月份分布： month
1     12168
2     12168
3     12168
4     12168
5     12168
6     12168
7     12168
8     12168
9     12168
10    12168
11    12168
12    12168
Name: count, dtype: int64
完全在SPEI表中找不到的origin-year-month组合数量: 816
举例： [('NOR', 2020, 5), ('FRA', 2021, 4), ('MLT', 2021, 3), ('SGP', 2022, 1), ('MLT', 2021, 12), ('BRB', 2020, 9), ('SGP', 2022, 10), ('AND', 2019, 3), ('MAC', 2019, 1), ('STP', 2022, 9)]
SPEI表中完全没有的国家代码： {'STP', 'TON', 'MAC', 'MLT', 'NOR', 'AND', 'HKG', 'XKX', 'BRB', 'BHR', 'FSM', 'FRA', 'MDV', 'GRD', 'SGP', 'KIR', 'LCA'}
