## 将下载好的tif文件重新分配至以格网为单元的文件夹里

In [18]:
from glob import glob
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
import os
import shutil
from tqdm import tqdm
import numpy as np


1. 整理下载好图片的metadata (存储到/nas/houce/Alphaearth_embedding/metadata/all_grid_cells_5x5_merged.geojson中)

In [19]:
# Gather every per-download grid-cell metadata table and turn each row's
# lon/lat bounds into a rectangle, giving one GeoDataFrame of download cells.
all_download_records = glob("/nas/houce/Alphaearth_embedding/GEE_extracted/*/metadata/*_grid_cells.csv")
if not all_download_records:
    # Fail here with a clear message instead of a NameError further down.
    raise FileNotFoundError(
        "No *_grid_cells.csv files found under "
        "/nas/houce/Alphaearth_embedding/GEE_extracted/*/metadata/"
    )

# Read all tables first and concatenate once: concatenating inside the loop
# re-copies the accumulated frame on every iteration.
all_download_file_df = pd.concat(
    [pd.read_csv(file_path) for file_path in all_download_records],
    axis=0,
    ignore_index=True,
)

# Closed rectangular ring per row, built from the lon_min/lat_min/lon_max/lat_max
# columns (same vertex order as before: SW, NW, NE, SE, SW).
all_download_file_df['geometry'] = [
    Polygon([
        (west, south),
        (west, north),
        (east, north),
        (east, south),
        (west, south),
    ])
    for west, south, east, north in zip(
        all_download_file_df['lon_min'],
        all_download_file_df['lat_min'],
        all_download_file_df['lon_max'],
        all_download_file_df['lat_max'],
    )
]
all_download_file_gdf = gpd.GeoDataFrame(all_download_file_df, geometry='geometry', crs="EPSG:4326")

In [20]:
# Build a 5x5-degree grid covering the known total_boundary, name each cell's
# target folder, and attach the download cells that intersect each grid cell.
lon_min, lat_min, lon_max, lat_max = [-180, -90, 180, 90]  # total_boundary
grid_step = 5  # cell size in degrees

# Lower-left corners of the grid cells along each axis.
lon_bins = np.arange(np.floor(lon_min), np.ceil(lon_max), grid_step)
lat_bins = np.arange(np.floor(lat_min), np.ceil(lat_max), grid_step)


def _grid_cell_record(west, south, step):
    """Return the bounds and rectangle polygon of the cell whose SW corner is (west, south)."""
    east = west + step
    north = south + step
    ring = [(west, south), (west, north), (east, north), (east, south), (west, south)]
    return {'lon_min': west, 'lon_max': east, 'lat_min': south, 'lat_max': north, 'geometry': Polygon(ring)}


# Longitude is the outer loop, latitude the inner one.
grid_polygons = [
    _grid_cell_record(west, south, grid_step)
    for west in lon_bins
    for south in lat_bins
]

grid_gdf = gpd.GeoDataFrame(grid_polygons, geometry='geometry', crs="EPSG:4326")

# Folder name encodes the integer cell bounds: grid_<lon_min>_<lat_min>_<lon_max>_<lat_max>.
grid_gdf['folder_name'] = [
    f"grid_{int(west)}_{int(south)}_{int(east)}_{int(north)}"
    for west, south, east, north in zip(
        grid_gdf['lon_min'], grid_gdf['lat_min'], grid_gdf['lon_max'], grid_gdf['lat_max']
    )
]

# Left join keeps every grid cell, including those without any download cell.
# NOTE(review): 'intersects' also matches geometries that merely touch along an
# edge or corner, so a download cell lying on a 5-degree boundary may pair with
# neighbouring grid cells as well -- confirm this is intended.
grid_cells_for_join = grid_gdf[['folder_name', 'geometry']]
grid_gdf_merged = gpd.sjoin(
    grid_cells_for_join,
    all_download_file_gdf,
    how='left',
    predicate='intersects',
    lsuffix='grid',
    rsuffix='download',
)
# unique_grid_gdf_merged = grid_gdf_merged[grid_gdf_merged['index_download'].notna()].drop_duplicates(subset='file_path')

In [21]:
# Tiles still in the download area: some sit one sub-folder below an
# Africa_grid_* directory, others directly inside it.
_nested_tifs = glob("/nas/houce/Alphaearth_embedding/GEE_extracted/Africa_grid_*/*/*.tif")
_flat_tifs = glob("/nas/houce/Alphaearth_embedding/GEE_extracted/Africa_grid_*/*.tif")
AEF_file_paths = pd.DataFrame({'file_path': _nested_tifs})
AEF_file_paths2 = pd.DataFrame({'file_path': _flat_tifs})
# ignore_index already yields a fresh 0..n-1 index for the combined table.
AEF_file_paths_all = pd.concat([AEF_file_paths, AEF_file_paths2], ignore_index=True)

# Tiles already present in the AEF_tiles/<dir>/<dir>/ target layout.
_transferred_tifs = glob("/nas/houce/Alphaearth_embedding/AEF_tiles/*/*/*.tif")
AEF_transferred_file_paths = pd.DataFrame({'file_path_copied': _transferred_tifs})

In [22]:
def _grid_name_from_path(path):
    """Return the file name with its first 20 and last 26 characters removed.

    NOTE(review): the fixed offsets presumably strip a constant prefix and the
    trailing date range / extension -- verify against the tile naming scheme.
    """
    file_name = path.rsplit("/", 1)[-1]
    return file_name[20:-26]


def _start_time_from_path(path):
    """Return the second-to-last underscore-separated token of the path."""
    return path.rsplit("_", 2)[-2]


def _end_time_from_path(path):
    """Return the last underscore-separated token of the path minus its 4-character suffix."""
    return path.rsplit("_", 1)[-1][:-4]


# Derive the join key (and, for transferred tiles, the time tokens) from the paths.
AEF_file_paths_all['grid_name'] = [
    _grid_name_from_path(p) for p in AEF_file_paths_all['file_path']
]
AEF_transferred_file_paths['grid_name'] = [
    _grid_name_from_path(p) for p in AEF_transferred_file_paths['file_path_copied']
]
AEF_transferred_file_paths['start_time'] = [
    _start_time_from_path(p) for p in AEF_transferred_file_paths['file_path_copied']
]
AEF_transferred_file_paths['end_time'] = [
    _end_time_from_path(p) for p in AEF_transferred_file_paths['file_path_copied']
]

# Attach pending and already-transferred tiles to the grid/download table.
# Both joins key on grid_name only, so a grid_name with several pending and
# several transferred tiles yields one row per (pending, transferred) pair.
AEF_file_paths_all_merged = (
    grid_gdf_merged
    .merge(AEF_file_paths_all, on='grid_name', how='left')
    .merge(AEF_transferred_file_paths, on='grid_name', how='left')
)
AEF_file_paths_all_merged.to_file("/nas/houce/Alphaearth_embedding/metadata/all_grid_cells_5x5_merged.geojson", driver='GeoJSON')

3. 转移图片到指定目录下，并删除源文件

In [23]:
# Move every pending tile into AEF_tiles/<year>/<folder_name>/ and drop it from
# the download area.
#
# One row per pending source file; drop_duplicates keeps the first merged row,
# so folder_name is that of the first grid cell the tile was joined to.
updated_AEF_path = AEF_file_paths_all_merged[AEF_file_paths_all_merged['file_path'].notna()].drop_duplicates(subset='file_path')

# Decide "already transferred" per file, not per grid cell. The merged
# file_path_copied column is joined on grid_name only, so it is non-null for
# every pending tile of a cell as soon as ANY tile of that cell (e.g. another
# year) has been transferred -- which made those tiles get skipped forever.
already_transferred = {
    os.path.basename(copied_path)
    for copied_path in AEF_transferred_file_paths['file_path_copied']
}

for _, row in tqdm(updated_AEF_path.iterrows(), total=updated_AEF_path.shape[0]):
    src_path = row['file_path']
    file_name = os.path.basename(src_path)
    if file_name in already_transferred:
        continue

    # First four characters of the second-to-last "_" token name the target
    # directory (presumably the tile's start year -- verify against naming).
    year = file_name.split('_')[-2][:4]
    dest_dir = os.path.join("/nas/houce/Alphaearth_embedding/AEF_tiles", year, row['folder_name'])
    os.makedirs(dest_dir, exist_ok=True)
    dest_path = os.path.join(dest_dir, file_name)
    try:
        # move == rename on the same filesystem, copy + delete otherwise.
        shutil.move(src_path, dest_path)
    except OSError as e:
        # Best effort: report and continue with the remaining tiles.
        print(f"Error copying {src_path} to {dest_path}: {e}")

0it [00:00, ?it/s]


In [25]:
# all_dirs = [d for d in glob("/nas/houce/Alphaearth_embedding/AEF_tiles/*") if os.path.isdir(d)]

# for path in tqdm(all_dirs):
#     if not os.listdir(path):
#         os.rmdir(path)