This code merges the different tiles (tar files) from Landsat data considering one date folder at a time. The tar files are copied from the sharepoint folder and extracted to a local temporary folder. It then processes bands 1 to 11. The mosaicked files are stored in a folder named "output_mosaicked" within the output folder for each date. These output tiff files are named as satellite_merged_date[yyyymmdd]_band.tif (eg. Landsat_merged_20250324_B2.tif). After mosaicking, the code clips all the band tiffs to the three study provinces (which is input as a shape file) and saves the outputs to tiff files (one file for each band) also inside "output_mosaicked". The merged and clipped output tiffs are named as satellite_merged_clipped_date[yyyymmdd]_band.tif (eg. Landsat_merged_clipped_20250324_B2.tif). All the calculations are done locally. Temporary files are created locally during the calculations, but they (including the input tar files) are deleted at the end of the code execution. This has been done so as to conveninently execute the code even in a normal computer with limited storage capacity.

In [1]:
import os
import sys
import subprocess

# This gives the name of the environment directory
print("Environment name:", os.path.basename(sys.prefix))

Environment name: A4I064-ML


In [2]:
# Install necessary packages, if needed

required_packages = ["zipfile", "glob", "rasterio", "geopandas", "datetime", "re", "shutil"]

for package in required_packages:
    try:
        __import__(package if package != "scikit-learn" else "sklearn")
        print(f"{package} is already installed.")
    except ImportError:
        print(f"{package} not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

print("All packages have been installed!")

zipfile is already installed.
glob is already installed.
rasterio is already installed.
geopandas is already installed.
datetime is already installed.
re is already installed.
shutil is already installed.
papermill not found. Installing...
All packages have been installed!


In [None]:
# Import necessary packages
import tempfile
import os
import rasterio
from rasterio.merge import merge
from rasterio.plot import show
import numpy as np
from rasterio.warp import reproject, Resampling
from rasterio.enums import Resampling
import tarfile
import geopandas as gpd
from rasterio.mask import mask
import shutil
from rasterio.crs import CRS

In [None]:
# User defined input and output paths
source_folder = r"C:\Users\U8019357\OneDrive - UniSQ\00_Projects\2025.06.06 A4I Crop Monitoring Vietnam\04_Data\A4I Geospatial Tech - Global Shared Folder - Put Data Here\Satellite Data\Landsat"

# Date string (i.e., sub-folder name)
date_str = "20250324"

# Output folder
output_folder = r"C:\Users\U8019357\UniSQ\A4I Geospatial Tech - UniSQ Internal - UniSQ Internal\1 - Image Processing\Processed_Landsat_VI_TIFFs"

# Read AOI shapefile (e.g., provincial borders)
aoi_path = r"C:\Users\U8019357\UniSQ\A4I Geospatial Tech - UniSQ Internal - UniSQ Internal\1 - Image Processing\Raw Data\GIS Maps and Shapefiles\Three Provinces (Old)\StudyAreaA4I.shp"  

# List of band numbers to process
bands = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B9', 'B10', 'B11']

In [None]:
input_folder_path = os.path.join(source_folder, date_str)
print(input_folder_path)

output_dir_date = os.path.join(output_folder, date_str)
os.makedirs(output_dir_date, exist_ok=True)
print(output_dir_date)

# Temporary folder to extract zip files
temp_folder = tempfile.TemporaryDirectory()
print(temp_folder.name)

In [None]:
# Read AOI shapefile
aoi_gdf = gpd.read_file(aoi_path)
aoi_geom = [feature["geometry"] for feature in aoi_gdf.__geo_interface__["features"]]

In [None]:
# Iterate over each tar file in the input folder
tar_files = []
for tar_file_name in os.listdir(input_folder_path):
    if tar_file_name.endswith('.tar'):

        tar_file_path = os.path.join(input_folder_path, tar_file_name)
        tar_files.append(tar_file_path)
        
        # Create a separate temporary directory for each tar file within the input folder
        tar_dir = os.path.join(temp_folder.name, os.path.splitext(tar_file_name)[0])
        
        # Extract the tar file into its respective temporary directory
        with tarfile.open(tar_file_path, 'r') as tar:
            tar.extractall(path=tar_dir)
            print(f"Contents of {tar_file_name} extracted to {tar_dir}")
        
        # List the contents of the temporary directory
        print(f"Contents of the temporary directory for {tar_file_name}:")
        print(os.listdir(tar_dir))

print("\n All files have been successfully untarred.")

In [None]:
# Revert the tar files to cloud-only
for tar_file in tar_files:
    print(f"Cleaning {os.path.basename(tar_file)} ...")
    subprocess.run(["attrib", "+U", "-P", tar_file], check=True)

In [None]:
# Performing the mosaick
for band in bands:
    print(f"=== Processing Band: {band} ===")

    band_files = []
    for folder_name in os.listdir(temp_folder.name):
        folder_path = os.path.join(temp_folder.name, folder_name)
        if os.path.isdir(folder_path) and folder_name != os.path.basename(output_dir_date):
            for file_name in os.listdir(folder_path):
                if file_name.endswith(f'{band}.TIF'):
                    band_files.append(os.path.join(folder_path, file_name))

    if not band_files:
        print(f"No files found for band {band}. Skipping...\n")
        continue

    print(f"Files found for band {band} ({len(band_files)} files):")
    for f in band_files:
        print(f" - {f}")

    # Step 1: Create base mosaic to determine extent/shape/transform
    src_files = [rasterio.open(f) for f in band_files]
    base_mosaic, base_transform = merge(src_files)
    base_shape = base_mosaic.shape[1:]  # (height, width)
    base_crs = src_files[0].crs
    base_dtype = src_files[0].dtypes[0]

    # Step 2: Create overlap mask and sum arrays
    overlap_sum = np.zeros(base_shape, dtype='float32')
    overlap_count = np.zeros(base_shape, dtype='uint16')

    for src in src_files:
        temp_array = np.zeros(base_shape, dtype='float32')
        reproject(
            source=rasterio.band(src, 1),
            destination=temp_array,
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=base_transform,
            dst_crs=base_crs,
            resampling=Resampling.nearest
        )
        ov_mask = temp_array > 0
        overlap_sum += temp_array * ov_mask
        overlap_count += ov_mask.astype('uint16')

    overlap_mask = overlap_count > 1
    overlap_avg = np.zeros_like(overlap_sum, dtype='float32')
    overlap_avg[overlap_mask] = overlap_sum[overlap_mask] / overlap_count[overlap_mask]

    # Save the overlap average to a temporary file
    temp_overlap_tif = os.path.join(temp_folder.name, f'temp_overlap_{date_str}_{band}.TIF')
    out_meta = src_files[0].profile.copy()
    out_meta.update({
        "height": base_shape[0],
        "width": base_shape[1],
        "transform": base_transform,
        "driver": "GTiff",
        "count": 1,
        "dtype": 'float32'
    })
    
    with rasterio.open(temp_overlap_tif, 'w', **out_meta) as dst:
        dst.write(overlap_avg, 1)

    # Step 3: Create mosaic using first method
    final_mosaic, final_transform = merge(src_files)
    final_output = final_mosaic[0]  # single band

    # Step 4: Patch the overlapping areas
    final_output[overlap_mask] = overlap_avg[overlap_mask]

    # Step 5: Write final output
    final_out_path = os.path.join(output_dir_date, f'Landsat_Merged_{date_str}_{band}.TIF')
    out_meta.update({"dtype": final_output.dtype})
    with rasterio.open(final_out_path, 'w', **out_meta) as dst:
        dst.write(final_output, 1)

    for src in src_files:
        src.close()

    print(f"✅ Mosaic completed for band {band}. Output saved to: {final_out_path}\n")

print("✅ All files have been successfully mosaicked and saved.")

In [None]:
# Clip each band TIF
for file_name in os.listdir(output_dir_date):
    if file_name.startswith(f"Landsat_Merged_{date_str}_B") and file_name.endswith(".TIF"):
        band_path = os.path.join(output_dir_date, file_name)
        band_num = file_name.split("_B")[-1].replace(".TIF", "")  # Extract band number

        with rasterio.open(band_path) as src:
            try:
                clipped_image, clipped_transform = mask(src, aoi_geom, crop=True)
                clipped_meta = src.meta.copy()
                clipped_meta.update({
                    "height": clipped_image.shape[1],
                    "width": clipped_image.shape[2],
                    "transform": clipped_transform
                })

                # Save clipped raster (without 'B' in the band number)
                clipped_out_path = os.path.join(
                    output_dir_date, f"Landsat_Merged_Clipped_{date_str}_B{band_num}.TIF"
                )
                with rasterio.open(clipped_out_path, "w", **clipped_meta) as dst:
                    dst.write(clipped_image)
                print(f"Clipped B{band_num} saved to: {clipped_out_path}")
            except Exception as e:
                print(f"Failed to clip {file_name}: {e}")

In [None]:
# Clean the temporary folder
temp_folder.cleanup()