**这是用来读取spei nc文件的测试代码**

In [14]:
# Step 1: 导入所需库
import xarray as xr
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
import matplotlib.pyplot as plt

In [15]:
# Step 2: 读取 SPEI NetCDF 文件
spei_nc_path = '../data/raw/spei03.nc'  # 修改为你的实际路径
ds = xr.open_dataset(spei_nc_path)
ds

In [17]:
spei = ds['spei']  
print(spei)
lats = ds['lat'].values
lons = ds['lon'].values
times = pd.to_datetime(ds['time'].values)
print('纬度范围:', lats.min(), '-', lats.max())
print('经度范围:', lons.min(), '-', lons.max())
print('时间范围:', times.min(), '-', times.max())

<xarray.DataArray 'spei' (time: 1476, lat: 360, lon: 720)> Size: 2GB
[382579200 values with dtype=float32]
Coordinates:
  * lon      (lon) float64 6kB -179.8 -179.2 -178.8 -178.2 ... 178.8 179.2 179.8
  * lat      (lat) float64 3kB -89.75 -89.25 -88.75 -88.25 ... 88.75 89.25 89.75
  * time     (time) datetime64[ns] 12kB 1901-01-16 1901-02-15 ... 2023-12-16
Attributes:
    units:         1
    long_name:     Standardized Precipitation-Evapotranspiration Index
    grid_mapping:  crs
纬度范围: -89.75 - 89.75
经度范围: -179.75 - 179.75
时间范围: 1901-01-16 00:00:00 - 2023-12-16 00:00:00


In [18]:
# Step 3: 读取国家边界矢量文件
shapefile_path = '../data/raw/ne_50m_admin_0_countries/ne_50m_admin_0_countries.shp'  # 修改为你的实际路径
countries = gpd.read_file(shapefile_path).to_crs('EPSG:4326')
countries[['ISO_A3', 'geometry']].head()

Unnamed: 0,ISO_A3,geometry
0,ZWE,"POLYGON ((31.28789 -22.40205, 31.19727 -22.344..."
1,ZMB,"POLYGON ((30.39609 -15.64307, 30.25068 -15.643..."
2,YEM,"MULTIPOLYGON (((53.08564 16.64839, 52.58145 16..."
3,VNM,"MULTIPOLYGON (((104.06396 10.39082, 104.08301 ..."
4,VEN,"MULTIPOLYGON (((-60.82119 9.13838, -60.94141 9..."


In [19]:
# Step 4: 构建SPEI网格点GeoDataFrame
lon_grid, lat_grid = np.meshgrid(lons, lats)
points = [Point(lon, lat) for lon, lat in zip(lon_grid.flatten(), lat_grid.flatten())]
points_gdf = gpd.GeoDataFrame({'orig_idx': np.arange(len(points))}, geometry=points, crs='EPSG:4326')

In [20]:
# Step 5: 空间连接，将每个网格点分配到国家
points_gdf = gpd.sjoin(points_gdf, countries[['ADMIN','ISO_A3', 'geometry']], how='inner', predicate='within')
points_gdf = points_gdf.rename(columns={'ADMIN': 'country'})
points_gdf.head



<bound method NDFrame.head of         orig_idx                geometry  index_right     country ISO_A3
0              0  POINT (-179.75 -89.75)          239  Antarctica    ATA
1              1  POINT (-179.25 -89.75)          239  Antarctica    ATA
2              2  POINT (-178.75 -89.75)          239  Antarctica    ATA
3              3  POINT (-178.25 -89.75)          239  Antarctica    ATA
4              4  POINT (-177.75 -89.75)          239  Antarctica    ATA
...          ...                     ...          ...         ...    ...
249423    249423    POINT (-28.25 83.25)          181   Greenland    GRL
249424    249424    POINT (-27.75 83.25)          181   Greenland    GRL
249425    249425    POINT (-27.25 83.25)          181   Greenland    GRL
249426    249426    POINT (-26.75 83.25)          181   Greenland    GRL
249427    249427    POINT (-26.25 83.25)          181   Greenland    GRL

[85699 rows x 5 columns]>

In [21]:
# Step 6: 提取2019-2022每国每月SPEI均值
records = []
for t_idx, t in enumerate(times):
    if t.year < 2019 or t.year > 2022:
        continue
    spei_slice = spei.isel(time=t_idx).values.flatten()
    points_gdf['spei'] = spei_slice[points_gdf['orig_idx'].values]
    grouped = points_gdf.groupby(['country','ISO_A3'])['spei'].mean().reset_index()
    grouped['date'] = t
    records.append(grouped)
result = pd.concat(records, ignore_index=True)
result = result.dropna(subset=['country'])



In [None]:
# Step 7: 保存面板数据表
result = result[['country', 'date', 'spei']]
result.to_csv('../data/processed/spei_country_month_2019_2022.csv', index=False)
print('保存成功，面板数据表 shape:', result.shape)
result.head()

保存成功，面板数据表 shape: (9120, 3)


Unnamed: 0,country,date,spei
0,Afghanistan,2019-01-16,-0.101823
1,Aland,2019-01-16,-0.584555
2,Albania,2019-01-16,-0.229678
3,Algeria,2019-01-16,-1.081419
4,Angola,2019-01-16,0.090749


In [23]:
import pandas as pd

# 读取 SPEI 面板数据
spei = pd.read_csv('../data/processed/spei_country_month_2019_2022.csv')

# 检查整体缺失
print("总行数：", len(spei))
print("每列缺失值数量：\n", spei.isnull().sum())

# 检查哪些国家有缺失
missing_country = spei[spei['spei'].isnull()]
print("有缺失的国家数量：", missing_country['country'].nunique())
print("有缺失的国家代码：", missing_country['country'].unique())

# 检查哪些月份有缺失
spei['date'] = pd.to_datetime(spei['date'])
spei['year'] = spei['date'].dt.year
spei['month'] = spei['date'].dt.month
print("有缺失的年份：", missing_country['date'].dropna().apply(lambda x: str(x)[:4]).unique())
print("有缺失的月份：", missing_country['date'].dropna().apply(lambda x: str(x)[5:7]).unique())

# 查看缺失的前几行
print("缺失值样例：")
print(missing_country.head())

总行数： 9120
每列缺失值数量：
 country      0
date         0
spei       144
dtype: int64
有缺失的国家数量： 3
有缺失的国家代码： ['Antarctica' 'Cayman Islands' 'Kiribati']
有缺失的年份： ['2019' '2020' '2021' '2022']
有缺失的月份： ['01' '02' '03' '04' '05' '06' '07' '08' '09' '10' '11' '12']
缺失值样例：
            country        date  spei
5        Antarctica  2019-01-16   NaN
29   Cayman Islands  2019-01-16   NaN
88         Kiribati  2019-01-16   NaN
195      Antarctica  2019-02-15   NaN
219  Cayman Islands  2019-02-15   NaN


In [1]:
import pandas as pd

# 读取 SPEI 面板数据
spei = pd.read_csv('../data/processed/spei_country_month_2019_2022.csv')

# 删除有缺失的三个国家
drop_countries = ['Antarctica', 'Cayman Islands', 'Kiribati']
spei_clean = spei[~spei['country'].isin(drop_countries)].copy()

# 检查是否还有缺失
print("每列缺失值数量：\n", spei_clean.isnull().sum())

# 保存为 clean 文件
spei_clean.to_csv('../data/processed/spei_country_month_2019_2022_cleaned.csv', index=False)
print("已保存为 spei_country_month_2019_2022_cleaned.csv")

每列缺失值数量：
 country    0
date       0
spei       0
dtype: int64
已保存为 spei_country_month_2019_2022_cleaned.csv


现在完成了数据的清理，得到的spei_country_month_2019_2022_cleaned.csv是没有缺失值的

In [4]:
import pandas as pd

# 读取 clean 后的 SPEI 面板数据
spei = pd.read_csv('../data/processed/spei03_country_month_cleaned.csv')

# 统计所有国家及其观测次数
country_stats = spei['country'].value_counts().reset_index()
country_stats.columns = ['country', 'n_obs']

# 可选：加上 ISO3 代码
iso_map = spei[['country', 'ISO_A3']].drop_duplicates()
country_stats = country_stats.merge(iso_map, on='country', how='left')

# 输出前几行
print(country_stats.head())

# 保存为表格
country_stats.to_csv('../data/processed/spei_country_list.csv', index=False)
print("已保存 spei_country_list.csv，包含所有国家及观测次数。")

       country  n_obs ISO_A3
0  Afghanistan     48    AFG
1        Aland     48    ALA
2      Albania     48    ALB
3      Algeria     48    DZA
4       Angola     48    AGO
已保存 spei_country_list.csv，包含所有国家及观测次数。
