# Download and process the combined MODIS land cover classification (MCD12Q1) into a GeoTIFF at native resolution

Main page: https://lpdaac.usgs.gov/products/mcd12q1v061/

See here for legend: https://developers.google.com/earth-engine/datasets/catalog/MODIS_061_MCD12Q1#bands

In [16]:
import os
import glob
import sys
import shutil
import tarfile
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling                                                                                        

In [17]:
# Specify where the config file can be found
# (relative path, so it is resolved against the directory the notebook is run from)
config_file = '../0_config/config.txt'

In [18]:
# Get the required info from the config file
# - data_path, geospatial_temp_path, land_path: joined below into the download/raw/doc folders
# - land_url: web page that is searched for the MCD12Q1.061 folders
# - doc_url: document that is downloaded into the doc folder ("the legend")
# - download_area: comma-separated subsetting area, split into a GDAL window below
data_path            = cs.read_from_config(config_file,'data_path')
geospatial_temp_path = cs.read_from_config(config_file,'geospatial_temp_path')
land_path            = cs.read_from_config(config_file,'land2_path')
land_url             = cs.read_from_config(config_file,'land2_url')
doc_url              = cs.read_from_config(config_file,'land2_docs')
download_area        = cs.read_from_config(config_file,'geospatial_area')

### Setup

In [31]:
# Folders
# All three locations live under <data_path>/<geospatial_temp_path>/<land_path>
land_base_folder = Path(data_path) / geospatial_temp_path / land_path
download_folder = land_base_folder / 'download'  # temporary storage for downloaded .hdf files
raw_folder = land_base_folder / 'raw'            # processed GeoTIFFs
doc_folder = land_base_folder                    # documentation goes in the base folder itself

In [6]:
# Create the working folders (including missing parents); existing folders are left as they are
for folder in (download_folder, raw_folder):
    folder.mkdir(parents=True, exist_ok=True)

In [7]:
# Translate the subsetting area from the config file into a GDAL window
# Config order: lon_min, lon_max, lat_min, lat_max           -> indices 0, 1, 2, 3
# GDAL order:   ulx, uly, lrx, lry (upper left x/y, lower right x/y)
#               = lon_min, lat_max, lon_max, lat_min          -> indices 0, 3, 1, 2
# The coordinates stay the strings that split() produces
subset_coor = download_area.split(',')
gdal_order = (0, 3, 1, 2)
window = [subset_coor[idx] for idx in gdal_order]

### Processing
Given the sheer size of the data compared to what we need (300MB per day, 25% or so of which we want), it makes sense to do the whole thing in one big loop and limit disk space usage that way.

In [9]:
# Find all folders we wish to process (each contains a global map of satellite data, annual)
# The result is iterated in the processing loop below, one online folder at a time
folder_urls = cs.find_folders_on_webpage(land_url, product='MCD12Q1.061')

In [18]:
# Process one online folder at a time: download its .hdf files, merge them into a single
# GeoTIFF for the domain of interest, then delete the downloads again to limit disk usage
for folder_url in folder_urls:

    # Find which individual files this online folder contains
    file_urls = cs.find_file_urls_in_webpage_folders([folder_url], extension='.hdf')

    # Guard against folders without any .hdf files. Without this check file_urls[0] below
    # raises an IndexError, and sub_folder would be undefined (or left over from the
    # previous iteration) by the time it is processed and removed
    if len(file_urls) == 0:
        print(f'WARNING: no .hdf files found in {folder_url}. Skipping.')
        continue

    # Check if we already have processed the files contained in this web folder and skip if so
    if cs.check_modis_interrupt_status(file_urls[0],raw_folder):
        print(f'NOTE: {folder_url} has already been processed. Skipping.')
        continue

    # Download the files
    # Only the sub folder returned for the last file is used below, so this relies on
    # all files of one online folder being stored in the same local sub folder
    for url in file_urls:
        sub_folder = cs.download_modis_into_day_folder(download_folder, url)

    # Merge the downloaded files into a single geotiff of domain of interest
    cs.process_daily_modis_hdf_to_tif(sub_folder, raw_folder,
                                      subdataset_front='HDF4_EOS:EOS_GRID',
                                      subdataset_back='MCD12Q1:LC_Type1', # Found by inspecting a download HDF file with gdalinfo - see end of notebook
                                      to_CRS='EPSG:4326',
                                      subset_window=window)

    # Remove the download folder to save space
    shutil.rmtree(sub_folder)

File E:\CAMELS_spat\geospatial_temp\modis_land\download\2001.01.01\MCD12Q1.A2001001.h07v03.061.2022146054828.hdf exists and download_url_into_folder() argument overwrite is False. Skipping file.
File E:\CAMELS_spat\geospatial_temp\modis_land\download\2001.01.01\MCD12Q1.A2001001.h07v05.061.2022146033046.hdf exists and download_url_into_folder() argument overwrite is False. Skipping file.
File E:\CAMELS_spat\geospatial_temp\modis_land\download\2001.01.01\MCD12Q1.A2001001.h07v06.061.2022146033902.hdf exists and download_url_into_folder() argument overwrite is False. Skipping file.
File E:\CAMELS_spat\geospatial_temp\modis_land\download\2001.01.01\MCD12Q1.A2001001.h07v07.061.2022146034917.hdf exists and download_url_into_folder() argument overwrite is False. Skipping file.
File E:\CAMELS_spat\geospatial_temp\modis_land\download\2001.01.01\MCD12Q1.A2001001.h08v03.061.2022146025122.hdf exists and download_url_into_folder() argument overwrite is False. Skipping file.
File E:\CAMELS_spat\geosp

In [19]:
# Get the legend
# Downloads the document at doc_url (land2_docs in the config file) into doc_folder
cs.download_url_into_folder(doc_url, doc_folder)

Successfully downloaded https://lpdaac.usgs.gov/documents/1409/MCD12_User_Guide_V61.pdf


### Find the mode land class

In [25]:
# Find the annual files
# The multi-year mode map is saved into this same folder as '<first>_<last>_mode...tif'.
# Leave any such file out, so that re-running the notebook does not feed an earlier
# result back into the mode calculation or into the output file name.
annual_files = sorted(
    tif_file for tif_file in glob.glob( str(raw_folder / '*.tif') )
    if '_mode' not in os.path.basename(tif_file)
)

In [39]:
# Per-pixel most common land class across all annual files, returned as a masked array
# (find_mode_modis is defined under "GeoTIFF functions")
modis_to_file = find_mode_modis(annual_files)

In [40]:
# Save
first_data_year = os.path.basename(annual_files[0])[0:4] 
last_data_year = os.path.basename(annual_files[-1])[0:4]
des_name = f'{first_data_year}_{last_data_year}_mode{os.path.basename(annual_files[0])[8:]}'
des_file = str( raw_folder / des_name )

In [42]:
# noData value of band 1 of the first annual file; passed on to the output file below
modis_noData = get_geotif_noData(annual_files[0])

In [43]:
# Write the mode map to des_file, taking the georeferencing from the first annual file
# .filled() converts the masked array into a plain array (masked pixels get the array's fill value)
write_geotif_sameDomain(annual_files[0], des_file, modis_to_file.filled(), no_data_value=modis_noData)

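### GeoTIFF functions
Helper functions used by the "Find the mode land class" cells above; run the cells in this section first.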

In [38]:
import numpy as np
import numpy.ma as ma
import os
from osgeo import gdal, osr
import scipy.stats as sc

In [37]:
def find_mode_modis(modis_files):
    """Return the per-pixel most common (mode) value across a set of GeoTIFFs.

    Parameters
    ----------
    modis_files : list of str
        Paths of the GeoTIFFs to combine. Band 1 of every file is read and the
        arrays are stacked, so all files must have the same dimensions.

    Returns
    -------
    numpy.ma.MaskedArray
        2D array (rows, cols) holding the mode per pixel. Pixels whose mode
        equals the noData value of the first file are masked.

    Raises
    ------
    ValueError
        If modis_files is empty.
    """

    if len(modis_files) == 0:
        raise ValueError('find_mode_modis() needs at least one file to process.')

    # Load the data as numpy arrays and stack them along a new third axis: (rows, cols, files)
    data = [get_geotif_data_as_array(file) for file in modis_files]
    stacked = np.dstack(data)

    # Find the most common value per pixel along the file axis
    mode_land_class = np.asarray(sc.mode(stacked, axis=2).mode)

    # Depending on the SciPy version the reduced axis is either kept, giving (rows, cols, 1),
    # or dropped, giving (rows, cols). Always continue with a 2D array, because that is
    # what writing a single GeoTIFF band requires
    if mode_land_class.ndim == 3:
        mode_land_class = mode_land_class[:, :, 0]

    # Get the noData value
    modis_noData = get_geotif_noData(modis_files[0])

    # Transfer into masked array for GeoTIFF writing
    modis_to_file = ma.masked_equal(mode_land_class, modis_noData)

    return modis_to_file

In [12]:
def get_geotif_data_as_array(file, band=1):
    """Read one band of a raster file into a numpy array.

    file is the path handed to gdal.Open(); band is the 1-based band number
    (default: the first band). Returns whatever ReadAsArray() yields for that band.
    """
    dataset = gdal.Open(file)
    raster_band = dataset.GetRasterBand(band)
    return raster_band.ReadAsArray()

In [11]:
def get_geotif_noData(src_file, band=1):
    """Return the noData value stored for one band of a raster file.

    src_file is the path handed to gdal.Open(); band is the 1-based band number
    (default: the first band).
    """
    dataset = gdal.Open(src_file)
    nodata = dataset.GetRasterBand(band).GetNoDataValue()
    dataset = None  # drop the reference so GDAL can close the file
    return nodata

In [35]:
def write_geotif_sameDomain(src_file, des_file, des_data, no_data_value=None):
    """Write a 2D array to a new GeoTIFF that shares the domain of an existing file.

    The geotransform, projection and (if defined) the scale and offset of band 1
    are copied from src_file. The output has a single Float32 band and is
    DEFLATE-compressed.

    Parameters
    ----------
    src_file : str
        Existing raster to copy the georeferencing from.
    des_file : str
        Path of the GeoTIFF to create.
    des_data : 2D array
        Data to write, shaped (rows, cols).
    no_data_value : number, optional
        noData value to store in the output band. Only None means "do not set";
        a value of 0 is stored like any other value.

    Raises
    ------
    RuntimeError
        If the source file cannot be opened or the destination cannot be created.
    """

    # load the source file to get the appropriate attributes
    src_ds = gdal.Open(str(src_file))
    if src_ds is None:
        raise RuntimeError(f'GDAL could not open source file {src_file}')

    # get the geotransform and projection
    des_transform = src_ds.GetGeoTransform()
    wkt = src_ds.GetProjection()

    # Get the scale factor and offset from the source metadata
    src_band = src_ds.GetRasterBand(1)
    scale_factor = src_band.GetScale()
    offset = src_band.GetOffset()

    # get the data dimensions
    nrows = des_data.shape[0]
    ncols = des_data.shape[1]

    # make the file
    driver = gdal.GetDriverByName("GTiff")
    dst_ds = driver.Create(str(des_file), ncols, nrows, 1, gdal.GDT_Float32, options=['COMPRESS=DEFLATE'])
    if dst_ds is None:
        raise RuntimeError(f'GDAL could not create destination file {des_file}')

    # Write the data
    dst_band = dst_ds.GetRasterBand(1)
    dst_band.WriteArray(des_data)

    # Compare against None explicitly: 0 is a legitimate noData value but is falsy
    if no_data_value is not None:
        dst_band.SetNoDataValue(no_data_value)

    # Set the scale factor and offset in the destination band, if they were defined in the source
    if scale_factor is not None:
        dst_band.SetScale(scale_factor)
    if offset is not None:
        dst_band.SetOffset(offset)

    # Set the geotransform
    dst_ds.SetGeoTransform(des_transform)

    # Set the projection
    srs = osr.SpatialReference()
    srs.ImportFromWkt(wkt)
    dst_ds.SetProjection( srs.ExportToWkt() )

    # close files: dropping every reference makes GDAL flush the output to disk
    src_band = None
    dst_band = None
    src_ds = None
    dst_ds = None

    return

### Remove the (empty) download folder

In [20]:
# Delete the download folder itself; rmtree also removes anything that is still inside it
shutil.rmtree(download_folder)

### Find download settings
Only needed to find the correct HDF band names. No need to re-run.

In [11]:
# Find which individual files this online folder contains
# Only the first online folder (folder_urls[0]) is inspected here
file_urls = cs.find_file_urls_in_webpage_folders([folder_urls[0]], extension='.hdf')

In [13]:
# Download every .hdf file of that folder so one of them can be inspected below
# (sub_folder is presumably the local folder the file was stored in - it is what the
# processing loop passes on for merging and later removes)
for url in file_urls:
    sub_folder = cs.download_modis_into_day_folder(download_folder, url)

Successfully downloaded https://e4ftl01.cr.usgs.gov/MOTA/MCD12Q1.061/2001.01.01/MCD12Q1.A2001001.h07v03.061.2022146054828.hdf
Successfully downloaded https://e4ftl01.cr.usgs.gov/MOTA/MCD12Q1.061/2001.01.01/MCD12Q1.A2001001.h07v05.061.2022146033046.hdf
Successfully downloaded https://e4ftl01.cr.usgs.gov/MOTA/MCD12Q1.061/2001.01.01/MCD12Q1.A2001001.h07v06.061.2022146033902.hdf
Successfully downloaded https://e4ftl01.cr.usgs.gov/MOTA/MCD12Q1.061/2001.01.01/MCD12Q1.A2001001.h07v07.061.2022146034917.hdf
Successfully downloaded https://e4ftl01.cr.usgs.gov/MOTA/MCD12Q1.061/2001.01.01/MCD12Q1.A2001001.h08v03.061.2022146025122.hdf
Successfully downloaded https://e4ftl01.cr.usgs.gov/MOTA/MCD12Q1.061/2001.01.01/MCD12Q1.A2001001.h08v04.061.2022146034934.hdf
Successfully downloaded https://e4ftl01.cr.usgs.gov/MOTA/MCD12Q1.061/2001.01.01/MCD12Q1.A2001001.h08v05.061.2022146033917.hdf
Successfully downloaded https://e4ftl01.cr.usgs.gov/MOTA/MCD12Q1.061/2001.01.01/MCD12Q1.A2001001.h08v06.061.2022146035

In [14]:
from osgeo import gdal

In [15]:
# Make GDAL raise Python exceptions on errors
gdal.UseExceptions()

In [17]:
# Print the gdal.Info() report of one downloaded HDF file. This was used to look up the
# subdataset name that the processing loop passes as subdataset_back.
# NOTE: hard-coded absolute path to one specific downloaded file; adjust before re-running
hdf_file = 'E:/CAMELS_spat/geospatial_temp/modis_land/download/2001.01.01/MCD12Q1.A2001001.h07v06.061.2022146033902.hdf'
info = gdal.Info(hdf_file)
print(info)

Driver: HDF4/Hierarchical Data Format Release 4
Files: E:/CAMELS_spat/geospatial_temp/modis_land/download/2001.01.01/MCD12Q1.A2001001.h07v06.061.2022146033902.hdf
Size is 512, 512
Metadata:
  ALGORITHMPACKAGEACCEPTANCEDATE=1998-01-01
  ALGORITHMPACKAGEMATURITYCODE=LAUNCH
  ALGORITHMPACKAGENAME=MOD12Q1
  ALGORITHMPACKAGEVERSION=V2.0
  ASSOCIATEDINSTRUMENTSHORTNAME.1=MODIS
  ASSOCIATEDINSTRUMENTSHORTNAME.2=MODIS
  ASSOCIATEDPLATFORMSHORTNAME.1=Terra
  ASSOCIATEDPLATFORMSHORTNAME.2=Aqua
  ASSOCIATEDSENSORSHORTNAME.1=MODIS
  ASSOCIATEDSENSORSHORTNAME.2=MODIS
  AUTOMATICQUALITYFLAG.1=Passed
  AUTOMATICQUALITYFLAGEXPLANATION.1=To be set as 'Passed' or 'Failed' to indicate failure of PGE test.
  CHARACTERISTICBINANGULARSIZE=15.0
  CHARACTERISTICBINSIZE=463.3127165
  DATACOLUMNS=2400
  DATAROWS=2400
  DAYNIGHTFLAG=Day
  DESCRREVISION=6.1
  EASTBOUNDINGCOORDINATE=-106.408909
  EXCLUSIONGRINGFLAG.1=N
  GEOANYABNORMAL=False
  GEOESTMAXRMSERROR=50.0
  GLOBALGRIDCOLUMNS=86400
  GLOBALGRIDROWS=43200