In [1]:
import os
import xarray as xr
import shutil
from tqdm import tqdm
import logging

# 配置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def process_nc_files(input_dir, output_dir, min_time_dim=24, required_vars=None):
    """
    检查NC文件时间维度和必需变量，并移动不符合要求的文件
    
    参数:
    input_dir: 包含NC文件的目录
    output_dir: 移动不符合要求文件的目标目录
    min_time_dim: 时间维度的最小值要求，默认为24
    required_vars: 需要检查的必需变量列表，默认为None
    """
    # 确保输出目录存在
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logger.info(f"创建输出目录: {output_dir}")
    
    # 获取所有NC文件
    nc_files = [f for f in os.listdir(input_dir) if f.endswith('.nc')]
    logger.info(f"找到 {len(nc_files)} 个NC文件")
    
    moved_files = 0
    
    # 设置默认的必需变量列表
    if required_vars is None:
        required_vars = ["TAIR", "UWIN", "VWIN", "PRE", "STATION"]
    
    # 处理每个文件
    for filename in tqdm(nc_files, desc="处理文件"):
        file_path = os.path.join(input_dir, filename)
        try:
            # 打开NC文件
            ds = xr.open_dataset(file_path)
            
            should_move = False
            reason = ""
            
            # 检查时间维度
            time_len = len(ds.time)
            if time_len < min_time_dim:
                should_move = True
                reason = f"时间维度: {time_len} < {min_time_dim}"
            
            # 检查必需变量和它们的维度
            missing_vars = []
            wrong_dims_vars = []
            
            for var in required_vars:
                # 检查变量是否存在
                if var not in ds.data_vars:
                    missing_vars.append(var)
                else:
                    # 检查变量维度是否为 (time, lat, lon)
                    var_dims = ds[var].dims
                    expected_dims = ('time', 'lat', 'lon')
                    if var_dims != expected_dims:
                        wrong_dims_vars.append(f"{var}{var_dims}")
            
            if missing_vars or wrong_dims_vars:
                should_move = True
                if missing_vars:
                    if reason:
                        reason += "; "
                    reason += f"缺少变量: {', '.join(missing_vars)}"
                if wrong_dims_vars:
                    if reason:
                        reason += "; "
                    reason += f"变量维度错误: {', '.join(wrong_dims_vars)}"
            
            # 移动不符合要求的文件
            if should_move:
                dest_path = os.path.join(output_dir, filename)
                shutil.move(file_path, dest_path)
                logger.info(f"已移动: {filename} ({reason})")
                moved_files += 1
            else:
                logger.debug(f"保留: {filename} (符合所有要求)")
            
            # 关闭数据集
            ds.close()
            
        except Exception as e:
            logger.error(f"处理 {filename} 时出错: {str(e)}")
            # 可选：把有问题的文件也移动到输出目录
            try:
                dest_path = os.path.join(output_dir, filename)
                shutil.move(file_path, dest_path)
                logger.info(f"已移动出错文件: {filename}")
                moved_files += 1
            except Exception as move_err:
                logger.error(f"移动出错文件 {filename} 失败: {str(move_err)}")
    
    logger.info(f"处理完成。共移动 {moved_files} 个文件到 {output_dir}")

if __name__ == "__main__":
    # 配置参数
    INPUT_DIR = "/mnt/h/DataSet/Merged_padded"  # 修改为你的NC文件目录
    OUTPUT_DIR = "/mnt/h/DataSet/other"         # 修改为你想移动文件的目标目录
    MIN_TIME_DIM = 24                           # 时间维度的最小要求
    
    # 需要检查的变量列表
    REQUIRED_VARS = ["TAIR", "UWIN", "VWIN", "PRE", "STATION"]
    
    # 执行处理
    process_nc_files(INPUT_DIR, OUTPUT_DIR, MIN_TIME_DIM, REQUIRED_VARS)

2025-05-05 20:13:25,310 - INFO - 找到 271 个NC文件
处理文件:   0%|          | 1/271 [00:00<00:41,  6.50it/s]2025-05-05 20:13:25,574 - INFO - 已移动: 20220406.nc (时间维度: 1 < 24)
处理文件:   1%|▏         | 4/271 [00:00<00:15, 17.05it/s]2025-05-05 20:13:25,610 - INFO - 已移动: 20220407.nc (时间维度: 16 < 24)
处理文件:   4%|▎         | 10/271 [00:00<00:10, 24.30it/s]2025-05-05 20:13:25,861 - INFO - 已移动: 20220414.nc (时间维度: 17 < 24)
2025-05-05 20:13:25,900 - INFO - 已移动: 20220416.nc (时间维度: 21 < 24)
处理文件:  38%|███▊      | 104/271 [00:03<00:05, 28.48it/s]2025-05-05 20:13:29,203 - INFO - 已移动: 20220717.nc (时间维度: 7 < 24)
2025-05-05 20:13:29,246 - INFO - 已移动: 20220719.nc (时间维度: 23 < 24)
处理文件: 100%|██████████| 271/271 [00:09<00:00, 28.59it/s]
2025-05-05 20:13:34,792 - INFO - 处理完成。共移动 6 个文件到 /mnt/h/DataSet/other


In [2]:
import xarray as xr

# ds = xr.open_dataset("/mnt/h/DataSet/3-DEM/CHN_dem_1km_clipped.nc")
ds = xr.open_dataset("/mnt/h/DataSet/station_precipitation_data.nc")
# ds = xr.open_dataset("/mnt/h/DataSet/Pre_mm/2022-04.nc")
# ds
# 查看 rain1h_qc 变量的基本统计特征
print("rain1h_qc 最大值:", ds["rain1h_qc"].max().values)
print("rain1h_qc 平均值:", ds["rain1h_qc"].mean().values)
print("rain1h_qc 最小值:", ds["rain1h_qc"].min().values)
print("rain1h_qc 缺失值数量:", ds["rain1h_qc"].isnull().sum().values)

# 将 rain1h_qc 中的 NaN 替换为 0
ds_filled = ds.copy()
ds_filled["rain1h_qc"] = ds_filled["rain1h_qc"].fillna(0)

# 保存为新文件
output_path = "/mnt/h/DataSet/station_precipitation_data_filled.nc"
ds_filled.to_netcdf(output_path)
print(f"已保存填充后的数据到: {output_path}")

rain1h_qc 最大值: 9.0
rain1h_qc 平均值: 0.12761095652399695
rain1h_qc 最小值: 0.0
rain1h_qc 缺失值数量: 1512392
已保存填充后的数据到: /mnt/h/DataSet/station_precipitation_data_filled.nc


In [None]:
import os
import xarray as xr

def convert_all_tp_to_mm(input_dir, output_dir, months=range(4, 13), year=2022):
    os.makedirs(output_dir, exist_ok=True)
    for month in months:
        input_file = os.path.join(input_dir, f"{year}-{month:02d}.nc")
        output_file = os.path.join(output_dir, f"{year}-{month:02d}.nc")
        if not os.path.exists(input_file):
            print(f"文件不存在: {input_file}")
            continue
        ds = xr.open_dataset(input_file)
        if "tp" not in ds:
            print(f"{input_file} 没有 'tp' 变量，跳过")
            continue
        ds["tp"] = ds["tp"] * 1000
        ds["tp"].attrs["units"] = "mm"
        ds.to_netcdf(output_file)
        print(f"已保存: {output_file}")

# 用法示例
input_dir = "/mnt/h/DataSet/Pre"         # 输入文件夹路径
output_dir = "/mnt/h/DataSet/Pre_mm"     # 输出文件夹路径
convert_all_tp_to_mm(input_dir, output_dir)

已保存: /mnt/h/DataSet/Pre_mm/2022-04.nc
已保存: /mnt/h/DataSet/Pre_mm/2022-05.nc
已保存: /mnt/h/DataSet/Pre_mm/2022-06.nc
已保存: /mnt/h/DataSet/Pre_mm/2022-07.nc
已保存: /mnt/h/DataSet/Pre_mm/2022-08.nc
已保存: /mnt/h/DataSet/Pre_mm/2022-09.nc
已保存: /mnt/h/DataSet/Pre_mm/2022-10.nc
已保存: /mnt/h/DataSet/Pre_mm/2022-11.nc
已保存: /mnt/h/DataSet/Pre_mm/2022-12.nc


In [1]:
import xarray as xr
import pandas as pd
import numpy as np
from datetime import datetime

def extract_zarr_to_netcdf(zarr_path, output_path, 
                          start_date=None, end_date=None, 
                          time_indices=None, 
                          compression_level=5):
    """
    从Zarr数据集提取特定时间段并保存为NetCDF文件
    
    参数:
    zarr_path: Zarr数据集路径
    output_path: 输出的NetCDF文件路径
    start_date: 开始日期，格式为 'YYYY-MM-DD' 或 'YYYY-MM-DD HH:MM:SS'
    end_date: 结束日期，与start_date格式相同
    time_indices: 时间索引范围，如 (0, 10) 或 slice(0, 10)，如果提供则忽略日期范围
    compression_level: NetCDF压缩级别 (1-9)，更高的级别压缩更小但更慢
    """
    # 打开Zarr数据集
    print(f"正在打开Zarr数据集: {zarr_path}")
    ds = xr.open_zarr(zarr_path)
    
    # 显示数据集基本信息
    print(f"数据集时间范围: {ds.time.values[0]} 到 {ds.time.values[-1]}")
    print(f"总时间点数量: {len(ds.time)}")
    
    # 选择数据子集
    if time_indices is not None:
        # 使用时间索引
        if isinstance(time_indices, tuple) and len(time_indices) == 2:
            data_subset = ds.isel(time=slice(time_indices[0], time_indices[1]))
        elif isinstance(time_indices, slice):
            data_subset = ds.isel(time=time_indices)
        else:
            raise ValueError("time_indices 必须是元组 (start, end) 或 slice 对象")
        
        print(f"通过索引提取数据: {time_indices}")
    else:
        # 使用日期范围
        if start_date and end_date:
            # 将字符串转换为datetime64
            start_date = np.datetime64(start_date)
            end_date = np.datetime64(end_date)
            
            # 选择日期范围内的数据
            data_subset = ds.sel(time=slice(start_date, end_date))
            print(f"提取日期范围: {start_date} 到 {end_date}")
        else:
            # 如果没有指定日期范围，使用全部数据
            data_subset = ds
            print("未指定日期范围，使用全部数据")
    
    # 添加地理元数据，便于GIS软件识别
    data_subset = data_subset.assign_coords(
        lon=data_subset.lon,
        lat=data_subset.lat
    )
    
    # 添加变量属性
    for var in data_subset.data_vars:
        if var == "corrected_precip":
            data_subset[var].attrs.update({
                "units": "mm/h",
                "long_name": "Corrected Precipitation",
                "standard_name": "precipitation_amount"
            })
    
    # 添加全局属性
    data_subset.attrs.update({
        "title": "Corrected Precipitation Data",
        "description": "Precipitation data extracted from Zarr dataset",
        "created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "source": f"Original Zarr dataset: {zarr_path}"
    })
    
    # 设置压缩编码
    encoding = {}
    for var in data_subset.data_vars:
        encoding[var] = {
            "zlib": True, 
            "complevel": compression_level, 
            "dtype": "float32"
        }
    
    # 保存为NetCDF文件
    print(f"正在保存到: {output_path}")
    data_subset.to_netcdf(output_path, encoding=encoding)
    print(f"保存完成！提取了 {len(data_subset.time)} 个时间点的数据")
    
    # 返回提取的数据集，以便后续处理
    return data_subset

# 提取特定日期范围的数据
zarr_path = "/mnt/h/DataSet/PreGrids/temp_output.zarr"
output_file = "/mnt/h/DataSet/PreGrids/precip_2022_05_01_to_2022_05_31.nc"

# 提取2022年5月的数据
extract_zarr_to_netcdf(
    zarr_path=zarr_path,
    output_path=output_file,
    start_date="2022-04-02",
    end_date="2022-04-03 21:59:59"
)


正在打开Zarr数据集: /mnt/h/DataSet/PreGrids/temp_output.zarr
数据集时间范围: 2022-04-02T22:00:00.000000000 到 2022-12-31T23:00:00.000000000
总时间点数量: 6554
提取日期范围: 2022-04-02 到 2022-04-03T21:59:59
正在保存到: /mnt/h/DataSet/PreGrids/precip_2022_05_01_to_2022_05_31.nc
保存完成！提取了 24 个时间点的数据


Unnamed: 0,Array,Chunk
Bytes,41.38 MiB,337.50 kiB
Shape,"(24, 570, 793)","(24, 36, 100)"
Dask graph,128 chunks in 3 graph layers,128 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 41.38 MiB 337.50 kiB Shape (24, 570, 793) (24, 36, 100) Dask graph 128 chunks in 3 graph layers Data type float32 numpy.ndarray",793  570  24,

Unnamed: 0,Array,Chunk
Bytes,41.38 MiB,337.50 kiB
Shape,"(24, 570, 793)","(24, 36, 100)"
Dask graph,128 chunks in 3 graph layers,128 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
import xarray as xr
import cfgrib
# flie0 = "/mnt/h/DataSet/2022/202205/20220501/Z_SURF_C_BABJ_20220501080533_P_CMPA_RT_BCGZ_0P01_HOR-PRE-2022050108.GRB2"
# file0 = "/mnt/h/DataSet/station_precipitation_data_filled.nc"
# file1 = "/mnt/h/DataSet/Merge/temp/merged_data_batch_0011.nc"
# file1 = "/mnt/h/DataSet/3-DEM/CHN_dem_1km.nc"
# file1 = "/mnt/h/DataSet/Pre_DEM/2022-04.nc"
file1 = "/mnt/d/Data/train/merged_data_batch_0006.nc"
ds1 = xr.open_dataset(file1)

ds1

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/d/Data/train/merged_data_batch_0263.nc'

In [None]:
import os
import xarray as xr
from tqdm import tqdm
import shutil

def remove_short_time_nc(input_dir, min_time_len=24, backup_dir=None):
    """
    删除time长度不足min_time_len的nc文件，可选备份到backup_dir。

    参数:
    input_dir: nc文件所在目录
    min_time_len: time维度最小长度
    backup_dir: 若指定，则先将文件移动到此目录再删除
    """
    nc_files = [f for f in os.listdir(input_dir) if f.endswith('.nc')]
    print(f"共找到 {len(nc_files)} 个nc文件")
    removed = 0

    if backup_dir:
        os.makedirs(backup_dir, exist_ok=True)

    for fname in tqdm(nc_files, desc="检查文件"):
        fpath = os.path.join(input_dir, fname)
        try:
            ds = xr.open_dataset(fpath)
            if "time" not in ds.dims and "time" not in ds.coords:
                print(f"{fname} 无time维度，跳过")
                ds.close()
                continue
            time_len = len(ds["time"])
            var_len = len(ds.data_vars)
            ds.close()
            if time_len < min_time_len:
                if backup_dir:
                    shutil.move(fpath, os.path.join(backup_dir, fname))
                    print(f"{fname} 已移动到备份目录")
                else:
                    os.remove(fpath)
                    print(f"{fname} 已删除")
                removed += 1
            if var_len < 5:
                if backup_dir:
                    shutil.move(fpath, os.path.join(backup_dir, fname))
                    print(f"{fname} 已移动到备份目录")
                else:
                    os.remove(fpath)
                    print(f"{fname} 已删除")
                removed += 1
        except Exception as e:
            print(f"处理 {fname} 出错: {e}")

    print(f"共处理完成，移除/移动了 {removed} 个文件。")

# 用法示例
input_dir = "/mnt/d/Data/"      # 修改为你的nc文件夹路径
backup_dir = "/mnt/d/Data/"      # 可选：不想直接删除可指定备份目录
remove_short_time_nc(input_dir, min_time_len=24, backup_dir=backup_dir)

共找到 40 个nc文件


检查文件: 100%|██████████| 40/40 [00:01<00:00, 25.34it/s]

共处理完成，移除/移动了 0 个文件。



