In [86]:
import pandas as pd

# Load the two processed CSV inputs used throughout this notebook.
spei = pd.read_csv(filepath_or_buffer='../data/processed/spei03_country_month_cleaned.csv')
mig = pd.read_csv(filepath_or_buffer='../data/processed/migration_flow_cleaned.csv')

In [73]:
# Print the dtype of every column in both tables so the merge-key columns
# can be compared before joining.
for heading, table in (
    ("迁移数据主键字段类型：\n", mig),
    ("\nSPEI数据主键字段类型：\n", spei),
):
    print(heading, table.dtypes)


迁移数据主键字段类型：
 origin_iso3          object
destination_iso3     object
migration_month      object
year                  int64
month                 int64
flow                  int64
log_flow            float64
origin_iso2          object
destination_iso2     object
dtype: object

SPEI数据主键字段类型：
 country     object
ISO_A3      object
date        object
spei       float64
dtype: object


In [None]:
# Every country code in both key columns must be exactly three characters long.
assert mig['origin_iso3'].str.len().eq(3).all()
assert spei['ISO_A3'].str.len().eq(3).all()

# Origin codes that appear in the migration table but not in the SPEI table.
mig_codes = set(mig['origin_iso3'])
spei_codes = set(spei['ISO_A3'])
missing_codes = mig_codes.difference(spei_codes)
print("以下国家在SPEI数据中缺失：", missing_codes)

以下国家代码在SPEI数据中缺失： {'GRD', 'STP', 'LCA', 'SGP', 'MAC', 'TON', 'MLT', 'MDV', 'BRB', 'HKG', 'AND', 'BHR', 'FSM', 'KIR'}


In [68]:
# 1. Load the spei_missing_countries table (or reuse the in-memory variable
#    if it is already available).
missing_countries_gdf = pd.read_csv('../data/processed/spei_missing_countries.csv')

# 2. Split the origin codes that have no SPEI rows into those listed in
#    spei_missing_countries and those that are not.
#    The literal below is a hard-coded copy of the set printed by the
#    country-code check, so this cell does not depend on that cell having run.
missing_codes = {'GRD', 'STP', 'LCA', 'SGP', 'MAC', 'TON', 'MLT', 'MDV', 'BRB', 'HKG', 'AND', 'BHR', 'FSM', 'KIR'}

# Build the lookup set once instead of once per code inside a comprehension.
known_missing = set(missing_countries_gdf['ISO_A3'])

# Codes listed in spei_missing_countries. Sorted so the printed order is the
# same on every run (plain set iteration order is not stable across sessions).
in_missing = sorted(missing_codes & known_missing)
print("属于spei_missing_countries的：", in_missing)

# Codes NOT listed in spei_missing_countries; these need a separate explanation.
not_in_missing = sorted(missing_codes - known_missing)
print("不属于spei_missing_countries的：", not_in_missing)

属于spei_missing_countries的： ['GRD', 'STP', 'SGP', 'MAC', 'TON', 'MLT', 'BHR', 'MDV', 'BRB', 'AND', 'HKG', 'FSM', 'LCA']
不属于spei_missing_countries的： ['KIR']


经检查，除 KIR 外，其余缺失的国家/地区都是因为面积太小、没有任何 SPEI 网格点覆盖；KIR 不在 spei_missing_countries 列表中，其缺失原因需另行确认。

In [76]:
# Print the number of missing values per column, first for the migration
# table and then for the SPEI table.
for table in (mig, spei):
    print(table.isna().sum())

origin_iso3         0
destination_iso3    0
migration_month     0
year                0
month               0
flow                0
log_flow            0
origin_iso2         0
destination_iso2    0
dtype: int64
country    0
ISO_A3     0
date       0
spei       0
dtype: int64


In [77]:
# Count rows whose key columns repeat an earlier row, for each table.
mig_key = ['origin_iso3', 'destination_iso3', 'year', 'month']
spei_key = ['ISO_A3', 'date']

mig_dupes = mig.duplicated(subset=mig_key)
spei_dupes = spei.duplicated(subset=spei_key)

print(mig_dupes.sum())
print(spei_dupes.sum())

0
0


In [78]:
# Enforce key uniqueness: no row in either table may repeat another row's key.
assert not mig.duplicated(subset=['origin_iso3', 'destination_iso3', 'year', 'month']).any()
assert not spei.duplicated(subset=['ISO_A3', 'date']).any()

In [87]:
# Prepare the SPEI table: derive year/month from the date column, rename the
# code and value columns so they describe the origin country, and keep only
# the join keys plus the SPEI value.
# NOTE: `spei` and `mig` are rebound below and the 'date' / 'ISO_A3' /
# 'migration_month' columns disappear, so this cell cannot be re-run without
# re-running the loading cell first.
spei['date'] = pd.to_datetime(spei['date'])
spei['year'] = spei['date'].dt.year
spei['month'] = spei['date'].dt.month
spei = spei.rename(columns={'ISO_A3': 'origin_iso3','spei':'origin_spei'})
spei = spei[['origin_iso3', 'year', 'month', 'origin_spei']]

# Prepare the migration table: rename the month column and drop the columns
# that are not needed after the merge.
mig = mig.rename(columns={'migration_month':'migration_date'})
mig = mig[['origin_iso3', 'destination_iso3', 'year', 'month', 'migration_date','flow','log_flow']]

# Left join so every migration row is kept even when the origin has no SPEI.
# validate='many_to_one' makes pandas raise MergeError if the SPEI side has
# more than one row per (origin_iso3, year, month); without it such
# duplicates would silently multiply migration rows. Uniqueness was only
# checked on (ISO_A3, date), which does not guarantee uniqueness per month.
mig_spei = mig.merge(
    spei,
    on=['origin_iso3', 'year', 'month'],
    how='left',
    validate='many_to_one',
)

# Inspect the merge result; origin_spei is NaN for origins absent from SPEI.
print(mig_spei.head())
print("合并后缺失值统计：\n", mig_spei.isnull().sum())

  origin_iso3 destination_iso3  year  month migration_date  flow  log_flow  \
0         AND              ARE  2019      1     2019-01-01    12  2.564949   
1         AND              ARE  2019      2     2019-02-01     2  1.098612   
2         AND              ARE  2019      3     2019-03-01     1  0.693147   
3         AND              ARE  2019      4     2019-04-01     7  2.079442   
4         AND              ARE  2019      5     2019-05-01     0  0.000000   

   origin_spei  
0          NaN  
1          NaN  
2          NaN  
3          NaN  
4          NaN  
合并后缺失值统计：
 origin_iso3              0
destination_iso3         0
year                     0
month                    0
migration_date           0
flow                     0
log_flow                 0
origin_spei         120288
dtype: int64


In [89]:
# Write the merged table to disk without the index column, then confirm.
mig_spei.to_csv(path_or_buf='../data/processed/migration_with_spei.csv', index=False)
print("已保存 migration_with_spei.csv")

已保存 migration_with_spei.csv
