#### coordinates left=-124.00055555649317, bottom=41.9994444436071, right=-122.99944444340576, top=43.00055555579519

In [1]:
import os
import numpy as np
import rasterio
from rasterio.mask import mask
from shapely.geometry import box, mapping
from datetime import datetime

def read_raster_within_bounds(raster_path, min_lon, max_lon, min_lat, max_lat):
    """Read raster and extract matrix based on the bounding box."""
    with rasterio.open(raster_path) as src:
        # Create a bounding box using Shapely geometry
        bbox = box(min_lon, min_lat, max_lon, max_lat)
        geo_json = [mapping(bbox)]  # Convert to GeoJSON format
        
        # Apply mask (crop raster)
        out_image, out_transform = mask(src, geo_json, crop=True)
        
        return out_image

def process_raster_files(directory, min_lon, max_lon, min_lat, max_lat, start_date, end_date):
    """Process all raster files in the given directory within the date range."""
    all_matrices = []
    
    # Iterate over all the files in the given directory
    for file_name in os.listdir(directory):
        # Only process .tif files that match the pattern
        if file_name.endswith('.tif') and file_name.startswith('Global_Landslide_Nowcast_v1.1'):
            
            # Extract the date part (yyyyMMdd) from the filename
            try:
                # Filename format: Global_Landslide_Nowcast_v1.1_yyyyMMdd.tif
                date_str = file_name.split('_')[-1].split('.')[0]  # Extract yyyyMMdd
                file_date = datetime.strptime(date_str, '%Y%m%d')  # Convert to datetime object
                
                # Check if the file date is within the specified date range
                if start_date <= file_date <= end_date:
                    #print(file_name)
                    raster_path = os.path.join(directory, file_name)
                    # Read the raster and get the matrix
                    matrix = read_raster_within_bounds(raster_path, min_lon, max_lon, min_lat, max_lat)
                    #matrix = matrix.reshape((1, 122, 29641))  # Reshape to desired shape if needed
                    all_matrices.append(matrix)
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
    
    # Convert list to a numpy array (if you want to have it as a 3D numpy array)
    all_matrices = np.array(all_matrices)

    return all_matrices

# Define the bounds (latitudes and longitudes)
# Define the bounds (latitudes and longitudes)
min_lon = 124.00055555649317
max_lon = -122.99944444340576
min_lat = 41.9994444436071
max_lat = 43.00055555579519

# Date range for filtering the files
start_date = datetime(2018, 1, 1)
end_date = datetime(2020, 12, 31)

# Directory containing your raster files
raster_directory = 'GlobalLandslidesfrom2015to2020'

# Process the files and get the appended matrices
all_raster_matrices = process_raster_files(raster_directory, min_lon, max_lon, min_lat, max_lat, start_date, end_date)

# Output the shape of the final 3D numpy array
print(f"Shape of the final 3D matrix: {all_raster_matrices.shape}")


Shape of the final 3D matrix: (1096, 1, 122, 29641)


In [2]:
squeezed_matrix = np.squeeze(all_raster_matrices)
modified_matrix = squeezed_matrix[:, 1:-1, :-1]
print(f"Shape of the modified matrix: {modified_matrix.shape}")
unique_values, counts = np.unique(modified_matrix, return_counts=True)
for value, count in zip(unique_values, counts):
    print(f"Value: {value}, Count: {count}")


Shape of the modified matrix: (1096, 120, 29640)
Value: 0, Count: 3651416987
Value: 1, Count: 21298098
Value: 2, Count: 6744115
Value: 255, Count: 218793600


In [19]:
import numpy as np
from sklearn.impute import KNNImputer

# Reshape the matrix to a 2D array for KNN imputation (Each 2D slice is processed separately)
num_slices = modified_matrix.shape[0]
num_rows = modified_matrix.shape[1]
num_columns = modified_matrix.shape[2]

# KNN Imputer setup with k=3 for less resource consumption
imputer = KNNImputer(n_neighbors=3)

# Create an empty matrix to store the imputed values
imputed_matrix = np.empty_like(modified_matrix, dtype=np.float32)

for i in range(num_slices):
    if i%50 == 0:
        print(i)
    # Extract the 2D slice
    slice_data = modified_matrix[i]

    # Create a mask for missing values (255 signifies missing)
    mask = slice_data == 255

    # Replace 255 with NaN for imputation
    slice_data_with_nan = np.where(mask, np.nan, slice_data)

    # Fix columns that are entirely NaN by replacing them with a default value (e.g., mean of non-NaN values)
    # Check for columns that are entirely NaN and replace them with a fixed value (e.g., 0 or mean of other columns)
    for col in range(slice_data_with_nan.shape[1]):
        if np.all(np.isnan(slice_data_with_nan[:, col])):
            # You can replace the column with a default value like 0 or mean of other columns
            slice_data_with_nan[:, col] = 0  # Or replace with np.nanmean(slice_data_with_nan[:, ~mask.any(axis=0)], axis=0)
    
    # Perform KNN imputation on this 2D slice
    slice_imputed = imputer.fit_transform(slice_data_with_nan)

    # Store the imputed slice back into the result matrix
    imputed_matrix[i] = slice_imputed

# Now, imputed_matrix contains the imputed data
print(f"Shape of the imputed matrix: {imputed_matrix.shape}")


0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
Shape of the imputed matrix: (1096, 120, 29640)


In [20]:
unique_values, counts = np.unique(imputed_matrix, return_counts=True)
for value, count in zip(unique_values, counts):
    print(f"Value: {value}, Count: {count}")


Value: 0.0, Count: 3870210587
Value: 1.0, Count: 21298098
Value: 2.0, Count: 6744115


In [21]:
# Save multiple matrices (or just one matrix) to a compressed .npz file
np.savez_compressed('landslide_labels_stored_sequentially_compressed.npz', matrix=imputed_matrix)

# # To load it later:
# loaded_data = np.load('matrix_compressed.npz')
# loaded_matrix = loaded_data['matrix']
# print(loaded_matrix.shape)


In [22]:
# To load it later: checking
loaded_data = np.load('landslide_labels_stored_sequentially_compressed.npz')
loaded_matrix = loaded_data['matrix']
print(loaded_matrix.shape)


(1096, 120, 29640)
