In [9]:
import pandas as pd
from glob import glob
import os 
from tqdm import tqdm
import geopandas as gpd
import numpy as np

# Load the human-settlement grid table. Later cells read the cell bounds from the
# lon_min / lat_min / lon_max / lat_max columns.
data = pd.read_csv("/code/GEE_download/human_settlement_grid_cells_deg_25.csv")

In [10]:
# Expand the grid so that every cell gets one row per year in 2008..2024.
years = pd.DataFrame({'year': range(2008, 2025)})

# A cross merge yields the cartesian product directly (no dummy join key) and
# keeps the rows grouped by grid cell in the original cell order, which the
# later `index // years-per-cell` naming relies on.
# The guard keeps this cell safe to re-run: merging a second time would create
# `year_x` / `year_y` columns and the `data['year']` lookups below would fail.
if 'year' not in data.columns:
    data = data.merge(years, how='cross')

# ISO-formatted first and last day of each row's year.
data['time_start'] = data['year'].astype(str) + '-01-01'
data['time_end'] = data['year'].astype(str) + '-12-31'
data.head()

Unnamed: 0,lon_min,lat_min,lon_max,lat_max,value,year,time_start,time_end
0,167.5,-47.5,170.0,-45.0,18.451653,2008,2008-01-01,2008-12-31
1,167.5,-47.5,170.0,-45.0,18.451653,2009,2009-01-01,2009-12-31
2,167.5,-47.5,170.0,-45.0,18.451653,2010,2010-01-01,2010-12-31
3,167.5,-47.5,170.0,-45.0,18.451653,2011,2011-01-01,2011-12-31
4,167.5,-47.5,170.0,-45.0,18.451653,2012,2012-01-01,2012-12-31


In [11]:
# Inspect the expanded table (one row per grid cell and year).
data

Unnamed: 0,lon_min,lat_min,lon_max,lat_max,value,year,time_start,time_end
0,167.5,-47.5,170.0,-45.0,18.451653,2008,2008-01-01,2008-12-31
1,167.5,-47.5,170.0,-45.0,18.451653,2009,2009-01-01,2009-12-31
2,167.5,-47.5,170.0,-45.0,18.451653,2010,2010-01-01,2010-12-31
3,167.5,-47.5,170.0,-45.0,18.451653,2011,2011-01-01,2011-12-31
4,167.5,-47.5,170.0,-45.0,18.451653,2012,2012-01-01,2012-12-31
...,...,...,...,...,...,...,...,...
18933,160.0,67.5,162.5,70.0,2.504991,2020,2020-01-01,2020-12-31
18934,160.0,67.5,162.5,70.0,2.504991,2021,2021-01-01,2021-12-31
18935,160.0,67.5,162.5,70.0,2.504991,2022,2022-01-01,2022-12-31
18936,160.0,67.5,162.5,70.0,2.504991,2023,2023-01-01,2023-12-31


In [27]:
import json
import numpy as np
import os
import pandas as pd

# kenya_bound = [33.89, -4.68, 41.86, 5.51] # [west, south, east, north] bounds of Kenya
# bound = [2, 4, 16, 14] # [west, south, east, north] bounds 
# bound = [-20, 0, 10, 50]

# bound = [120,30,122.25,32] # [west, south, east, north] bounds of Shanghai
save_name = "Africa_all_2008_2024"

# Single source of truth for the metadata location so that the cache check,
# the write and the read below can never point at different files.
metadata_dir = f'/nas/houce/Alphaearth_embedding/GEE_extracted/{save_name}/metadata'
grid_json_path = f'{metadata_dir}/{save_name}.json'
os.makedirs(metadata_dir, exist_ok=True)

# Number of yearly rows per grid cell (2008..2024 inclusive). Dividing the row
# index by it recovers a per-cell id; this assumes `data` has the default
# 0..N-1 index with each cell's yearly rows stored contiguously.
years_per_cell = 2024 - 2008 + 1

# Build the cell dictionary once and reuse the saved JSON afterwards.
# Delete the JSON file to force regeneration after `data` has changed.
if not os.path.exists(grid_json_path):
    grid_dict = {}
    for i, row in data.iterrows():
        # Name pattern: <save_name>_grid_<cell id>_<year>
        cell_name = f"{save_name}_grid_{i // years_per_cell}_{row['time_start'][:4]}"
        grid_dict[cell_name] = {
            # Coordinates are stored as strings with four decimals.
            "lon_min": f"{float(row['lon_min']):.4f}",
            "lon_max": f"{float(row['lon_max']):.4f}",
            "lat_min": f"{float(row['lat_min']):.4f}",
            "lat_max": f"{float(row['lat_max']):.4f}",
            "start_time": row['time_start'],
            "end_time": row['time_end']
        }

    # Save the grid cells as a JSON file
    with open(grid_json_path, 'w') as f:
        json.dump(grid_dict, f, indent=4)

    print(f"Created {len(grid_dict)} grid cells and saved to {save_name}.json")

else:
    # Context manager so the file handle is closed deterministically.
    with open(grid_json_path, 'r') as f:
        grid_dict = json.load(f)

# Also export the cells as a flat CSV: one row per cell, name in `grid_name`.
pd.DataFrame(grid_dict).T.reset_index().rename(columns={'index':'grid_name'}).to_csv(f'{metadata_dir}/{save_name}_grid_cells.csv', index=False)

Created 18938 grid cells and saved to Africa_all_2008_2024.json


In [None]:
def generate_code_script(batch_size, save_name, grid_dict, mode='python',
                         base_dir='/nas/houce/Alphaearth_embedding/GEE_extracted'):
    """Write the grid cells in ``grid_dict`` to disk in batches.

    Parameters
    ----------
    batch_size : int
        Number of regions per output file; must be positive.
    save_name : str
        Dataset name, used for the output sub-directory and the file names.
    grid_dict : dict
        Maps a cell name to a dict holding the keys ``lon_min``, ``lat_min``,
        ``lon_max``, ``lat_max``, ``start_time`` and ``end_time``.
    mode : {'python', 'GEE'}
        ``'python'`` writes one JSON file per batch (a list of objects with
        ``name``, ``geometry``, ``time_start`` and ``time_end``).
        ``'GEE'`` writes one ``.txt`` file per batch containing a JavaScript
        ``regions`` array followed by an Earth Engine export script.
    base_dir : str, optional
        Root output directory. Files are written to
        ``<base_dir>/<save_name>/metadata/<save_name>_download_<mode>/``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values or ``batch_size`` is
        not positive.
    KeyError
        If an entry of ``grid_dict`` lacks one of the required keys.
    """
    if mode not in ('GEE', 'python'):
        raise ValueError(f"mode must be 'GEE' or 'python', got {mode!r}")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    # One record per cell; `geometry` is the source text of an Earth Engine
    # rectangle constructor, not an evaluated object.
    regions = [
        {
            'name': cell_name,
            'geometry': f"ee.Geometry.Rectangle([{coords['lon_min']}, {coords['lat_min']}, {coords['lon_max']}, {coords['lat_max']}])",
            'time_start': coords['start_time'],
            'time_end': coords['end_time'],
        }
        for cell_name, coords in grid_dict.items()
    ]
    batches = [regions[i:i + batch_size] for i in range(0, len(regions), batch_size)]
    metadata_dir = os.path.join(base_dir, save_name, 'metadata')

    if mode == "GEE":
        # Script appended after each batch's `regions` array. The date filter
        # uses the per-region time_start/time_end and only falls back to the
        # template-wide date_s/date_e when a region carries no dates.
        gee_code = """// =================================================================================
        // **第二步：设置通用的参数**
        // =================================================================================

        // 加载影像集合
        var dataset = ee.ImageCollection('GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL');

        // 开始和结束时间
        var date_s = '2023-01-01';
        var date_e = '2024-01-01';

        // 下载分辨率
        var resolution = 10;

        // 可视化参数
        var visParams = {min: -0.3, max: 0.3, bands: ['A01', 'A16', 'A09']};


        // =================================================================================
        // **第三步：循环遍历每个区域并创建导出任务**
        // =================================================================================

        // 使用 forEach 循环来处理 'regions' 列表中的每一个元素
        regions.forEach(function(region) {

            // 1. 筛选影像
            // 注意：这里的 filterBounds 使用的是当前循环中的 region.geometry
            var image = dataset
            .filterDate(region.time_start || date_s, region.time_end || date_e)
            .filterBounds(region.geometry)
            .first();

            // 2. 在地图上显示影像（可选，方便检查）
            // 为了区分，我们用区域的名字来命名图层
            Map.addLayer(image, visParams, region.name + ' embeddings');
            Map.centerObject(region.geometry, 10); // 将地图中心移动到当前区域

            // 3. 为当前区域创建导出任务
            Export.image.toDrive({
            image: image,
            // 在描述和文件名中加入区域名称，以保证每个任务和文件都独一无二
            description: 'Satellite_Embedding_' + region.name,
            folder: 'GEE_Exports', // Google Drive 中的文件夹
            fileNamePrefix: 'Satellite_Embedding_' + region.name, // 文件名前缀
            scale: resolution,
            region: region.geometry, // 使用当前循环中的区域范围
            maxPixels: 1e13
            });
        });"""

        out_dir = os.path.join(metadata_dir, f'{save_name}_download_GEE')
        os.makedirs(out_dir, exist_ok=True)

        # One JavaScript file per batch: the regions array, then the script.
        for batch_no, batch in enumerate(batches, start=1):
            entries = "".join(
                "  {\n"
                f"    name: '{region['name']}',\n"
                f"    geometry: {region['geometry']},\n"
                f"    time_start: '{region['time_start']}',\n"
                f"    time_end: '{region['time_end']}'\n"
                "  },\n"
                for region in batch
            )
            batch_js = "var regions = [\n" + entries + "];" + "\n\n" + gee_code

            file_name = f"{save_name}_regions_batch_{batch_no}.txt"
            # Explicit UTF-8: the template contains non-ASCII comments.
            with open(os.path.join(out_dir, file_name), 'w', encoding='utf-8') as f:
                f.write(batch_js)

            print(f"Batch {batch_no} with {len(batch)} regions saved to {file_name}")

        # Summary is printed once, after all batches have been written.
        print(f"Generated {len(regions)} regions for Earth Engine")

    else:  # mode == "python"
        out_dir = os.path.join(metadata_dir, f'{save_name}_download_python')
        os.makedirs(out_dir, exist_ok=True)

        # One JSON file per batch; each record already has the output schema.
        for batch_no, batch in enumerate(batches, start=1):
            file_name = f"{save_name}_regions_batch_{batch_no}.json"
            with open(os.path.join(out_dir, file_name), 'w', encoding='utf-8') as f_json:
                json.dump(batch, f_json, ensure_ascii=False, indent=4)
            print(f"Batch {batch_no} with {len(batch)} regions saved to {file_name}")

# save_name = f"Africa_grid_{bound[0]}_{bound[1]}_{bound[2]}_{bound[3]}"
# Write the grid cells as JSON batch files with 25 regions per file.
generate_code_script(25, save_name, grid_dict, mode='python')

Batch 1 with 25 regions saved to Africa_all_2008_2024_regions_batch_1.json
Batch 2 with 25 regions saved to Africa_all_2008_2024_regions_batch_2.json
Batch 3 with 25 regions saved to Africa_all_2008_2024_regions_batch_3.json
Batch 4 with 25 regions saved to Africa_all_2008_2024_regions_batch_4.json
Batch 5 with 25 regions saved to Africa_all_2008_2024_regions_batch_5.json
Batch 6 with 25 regions saved to Africa_all_2008_2024_regions_batch_6.json
Batch 7 with 25 regions saved to Africa_all_2008_2024_regions_batch_7.json
Batch 8 with 25 regions saved to Africa_all_2008_2024_regions_batch_8.json
Batch 9 with 25 regions saved to Africa_all_2008_2024_regions_batch_9.json
Batch 10 with 25 regions saved to Africa_all_2008_2024_regions_batch_10.json
Batch 11 with 25 regions saved to Africa_all_2008_2024_regions_batch_11.json
Batch 12 with 25 regions saved to Africa_all_2008_2024_regions_batch_12.json
Batch 13 with 25 regions saved to Africa_all_2008_2024_regions_batch_13.json
Batch 14 with 25 

In [2]:
# years = pd.DataFrame({'year': range(2008, 2025)})
# data['key'] = 1
# years['key'] = 1

# data = pd.merge(data, years, on='key', how='outer').drop('key', axis=1)
# data['time_start'] = data['year'].astype(str) + '0101'
# data['time_end'] = data['year'].astype(str) + '1231'
# data['download_command'] = data.apply(lambda row: f"grid_{row.name}|{row['lon_min']},{row['lat_min']}|{row['lon_max']},{row['lat_max']}|14|{row['time_start']}|12", axis=1)
# data.head()

# Build one pipe-separated download command per row:
#   grid_<row index>|<lon_min>,<lat_min>|<lon_max>,<lat_max>|14|20080101|12
# The last three fields are fixed constants for every row.
data['download_command'] = data.apply(
    lambda cell: (
        f"grid_{cell.name}"
        f"|{cell['lon_min']},{cell['lat_min']}"
        f"|{cell['lon_max']},{cell['lat_max']}"
        "|14|20080101|12"
    ),
    axis=1,
)

In [3]:
# Country boundaries (the file name suggests Natural Earth 110m admin-0 data);
# keep only the features whose CONTINENT attribute is 'Africa'.
countries_path = "/nas/houce/country_boundary/ne_110m_admin_0_countries.geojson"
global_map = gpd.read_file(countries_path)
is_african = global_map['CONTINENT'].eq('Africa')
africa_map = global_map.loc[is_african]

In [4]:
from shapely.geometry import Polygon

def _cell_to_polygon(cell):
    """Return the closed rectangular outline of one grid-cell row."""
    west, south = cell['lon_min'], cell['lat_min']
    east, north = cell['lon_max'], cell['lat_max']
    # Ring order: SW -> SE -> NE -> NW -> back to SW.
    return Polygon([
        (west, south),
        (east, south),
        (east, north),
        (west, north),
        (west, south),
    ])


# One rectangle per row, aligned with the index of `data`.
polygons = data.apply(_cell_to_polygon, axis=1)

# Attach the rectangles as the geometry column (coordinates are lon/lat, EPSG:4326)
# and persist the table, geometry included, as CSV.
data_gpd = gpd.GeoDataFrame(data, geometry=polygons, crs="EPSG:4326")
data_gpd.to_csv(
    "/nas/houce/global_slum/RS_download_data/human_settlement_grid_cells_deg_05_with_geometry.csv",
    index=False,
)

In [5]:
# Keep the grid cells that intersect an African country and attach that
# country's CONTINENT / SOVEREIGNT attributes to the matching cell row.
africa_countries = africa_map[['CONTINENT', 'SOVEREIGNT', 'geometry']]
africa_data_gpd = gpd.sjoin(
    data_gpd,
    africa_countries,
    how='inner',
    predicate='intersects',
)

In [6]:
# Write the download commands to numbered text files, 25 commands per file.
commands_dir = '/nas/houce/SGDownload/download_commands'
commands_per_file = 25
os.makedirs(commands_dir, exist_ok=True)

# The inner spatial join yields one row per (cell, country) match, so a cell
# that intersects several countries appears several times. De-duplicate so
# each command is written exactly once (first occurrence keeps its position).
commands = africa_data_gpd['download_command'].drop_duplicates().to_numpy()

for i in range(0, len(commands), commands_per_file):
    batch = commands[i:i + commands_per_file]
    np.savetxt(f'{commands_dir}/africa_download_commands_part_{i // commands_per_file + 1}.txt', batch, fmt='%s')

In [7]:
# np.savetxt('/nas/houce/SGDownload/africa_download_commands.txt', africa_data_gpd['download_command'].values, fmt='%s')

In [8]:
# download_files = pd.read_json("/nas/houce/Landsat_30m/metadata/metadata/All_exist_slum.json").T.reset_index().rename(columns={'index':'grid_id'})
# download_files['time_start'] = pd.to_datetime(download_files['time_start'], format='mixed').dt.strftime('%Y%m%d')
# download_files['time_end'] = pd.to_datetime(download_files['time_end'], format='mixed').dt.strftime('%Y%m%d')

# download_files['download_command'] = download_files.apply(lambda row: f"{row['grid_id']}|{row['lon_min']},{row['lat_min']}|{row['lon_max']},{row['lat_max']}|14|{row['time_start']}|6", axis=1)
# # download_files['download_command'].to_csv('download_commands.txt', index=False, header=False)
# np.savetxt('download_commands.txt', download_files['download_command'].values, fmt='%s')