In [1]:
import dask.array as da
import pandas as pd
import numpy as np
import rasterio
from rasterio.windows import Window
import dask
from dask import delayed
import os

# Set working directory so the relative input/output file names below
# resolve inside this folder (machine-specific absolute path).
os.chdir('/Volumes/Kew Back Up 1/KEW/Cluster_Code/March_25_Run/June_Output/Processing')

# Input/output paths, relative to the working directory set above.
# input_tiff: raster whose bands 1 and 2 are read by process_chunk.
input_tiff = 'stacked_numanalog_slope.tif'
# output_csv: destination of the combined table written at the end of the script.
output_csv = 'NumAnalog_Slope.csv'

def process_chunk(window):
    """Convert one window of ``input_tiff`` into a table of valid pixels.

    The raster is opened inside the function so that each call is
    self-contained and can be scheduled independently (e.g. via
    ``dask.delayed``).

    Args:
        window: ``rasterio.windows.Window`` selecting the part of the raster
            to read.

    Returns:
        pandas.DataFrame with columns ``X_Coordinate``, ``Y_Coordinate``,
        ``Climate_Distance`` (band 1) and ``Slope`` (band 2), one row per
        pixel whose value is non-NaN in both bands. Coordinates are pixel
        centres in the raster's coordinate reference system.
    """
    with rasterio.open(input_tiff) as src:
        # Read both bands for this window only.
        distance = src.read(1, window=window)
        slope = src.read(2, window=window)

        # The row/col indices generated below are relative to the window's
        # top-left corner, so they must be mapped through the window's own
        # affine transform. Using src.transform here would assign every
        # chunk the coordinates of the raster's top-left chunk.
        chunk_transform = src.window_transform(window)

    # Window-relative pixel indices, flattened in the same (row-major) order
    # as the band values below so positions and values stay aligned.
    rows, cols = np.indices(distance.shape)
    x_coords, y_coords = rasterio.transform.xy(
        chunk_transform, rows.ravel(), cols.ravel(), offset='center'
    )
    x_coords = np.asarray(x_coords)
    y_coords = np.asarray(y_coords)

    distance_flat = distance.ravel()
    slope_flat = slope.ravel()

    # Keep only valid (non-NaN) pixels in both bands.
    valid_mask = ~np.isnan(distance_flat) & ~np.isnan(slope_flat)

    return pd.DataFrame({
        'X_Coordinate': x_coords[valid_mask],
        'Y_Coordinate': y_coords[valid_mask],
        'Climate_Distance': distance_flat[valid_mask],
        'Slope': slope_flat[valid_mask]
    })

# Chunk size: edge length, in pixels, of the square windows the raster is
# split into.
chunk_size = 1024

# Only the raster dimensions are needed here; process_chunk reopens the file
# itself, so the dataset does not have to stay open during the computation.
with rasterio.open(input_tiff) as src:
    nrows, ncols = src.shape

# Ceiling division gives the exact number of windows per axis. The previous
# `n // chunk_size + 1` over-counted whenever a dimension was an exact
# multiple of chunk_size, so the reported percentage never reached 100%.
total_chunks = (-(-nrows // chunk_size)) * (-(-ncols // chunk_size))
processed_chunks = 0

# Build one lazy task per window.
delayed_dfs = []
for row_off in range(0, nrows, chunk_size):
    for col_off in range(0, ncols, chunk_size):
        # Clip the last window in each row/column to the raster bounds
        # instead of relying on the reader to crop an oversized window.
        win_width = min(chunk_size, ncols - col_off)
        win_height = min(chunk_size, nrows - row_off)
        window = Window(col_off, row_off, win_width, win_height)
        delayed_dfs.append(delayed(process_chunk)(window))
        processed_chunks += 1
        # NOTE: this tracks how many tasks have been *scheduled*; the actual
        # reading happens in dask.compute() below.
        print(f'Progress: {processed_chunks / total_chunks * 100:.2f}%')

# Compute and combine all chunks. ignore_index avoids the repeated 0..k
# per-chunk index in the combined frame (the CSV itself is written without
# an index, so its contents are unaffected).
dfs = dask.compute(*delayed_dfs)
df = pd.concat(dfs, ignore_index=True)

# Save to CSV
df.to_csv(output_csv, index=False)
print(f"CSV file created: {output_csv}")


Progress: 0.11%
Progress: 0.22%
Progress: 0.33%
Progress: 0.44%
Progress: 0.55%
Progress: 0.66%
Progress: 0.78%
Progress: 0.89%
Progress: 1.00%
Progress: 1.11%
Progress: 1.22%
Progress: 1.33%
Progress: 1.44%
Progress: 1.55%
Progress: 1.66%
Progress: 1.77%
Progress: 1.88%
Progress: 1.99%
Progress: 2.10%
Progress: 2.21%
Progress: 2.33%
Progress: 2.44%
Progress: 2.55%
Progress: 2.66%
Progress: 2.77%
Progress: 2.88%
Progress: 2.99%
Progress: 3.10%
Progress: 3.21%
Progress: 3.32%
Progress: 3.43%
Progress: 3.54%
Progress: 3.65%
Progress: 3.77%
Progress: 3.88%
Progress: 3.99%
Progress: 4.10%
Progress: 4.21%
Progress: 4.32%
Progress: 4.43%
Progress: 4.54%
Progress: 4.65%
Progress: 4.76%
Progress: 4.87%
Progress: 4.98%
Progress: 5.09%
Progress: 5.20%
Progress: 5.32%
Progress: 5.43%
Progress: 5.54%
Progress: 5.65%
Progress: 5.76%
Progress: 5.87%
Progress: 5.98%
Progress: 6.09%
Progress: 6.20%
Progress: 6.31%
Progress: 6.42%
Progress: 6.53%
Progress: 6.64%
Progress: 6.76%
Progress: 6.87%
Progress