In [2]:
import os
import pickle
import time
import urllib
import urllib.request
from collections.abc import Iterable

import numpy as np
import pandas as pd
import rasterio as rio
import earthpy.spatial as es
import pyproj
from ipypb import track

### Set directories

In [3]:
## home directory for downloads; set the SDG6_HOME environment variable to run
## on another machine without editing the notebook (the original location is
## kept as the default)
home_dir = os.environ.get("SDG6_HOME", "E:/University College London/O'Sullivan, Aidan - SDG6/")
os.chdir(home_dir)

## path the dsid csvs and exception csvs are read from
d_path = "./Landsat data/Distributed download/"

## path where cropped rasters and scene metadata files are saved
c1_path = "./Landsat data/Cropped level 1 data/"

### Cropper function

In [4]:
def cropper(raster, geoms, outpath):
    """
    Crop a raster to one or more geometries, save the result and reopen it.

    Parameters
    ----------
    raster : the raster object to crop (handed straight to ``es.crop_image``).
    geoms : an iterable of geometries, or a single geometry. A single
        (non-iterable) geometry is wrapped in a list first.
    outpath : file path the cropped raster is written to.

    Returns
    -------
    The cropped raster, reopened from ``outpath`` with ``rio.open``. The
    dataset is returned open, so the caller should close it when finished.
    """
    ## crop_image is given an iterable of geoms, so wrap a lone geometry
    if not isinstance(geoms, Iterable):
        geoms = [geoms]

    ## Next we crop the image
    raster_crop, raster_crop_meta = es.crop_image(raster, geoms)

    ## Size the output from the cropped array (axis 1 -> height, axis 2 -> width)
    ## and record the smallest cropped value as nodata
    raster_crop_meta.update({'height': raster_crop.shape[1],
                             'width': raster_crop.shape[2],
                             'nodata': raster_crop.min()}) # <- This is the 'mask' value

    ## Write the whole cropped array so multi-band rasters keep every band
    ## (for a single-band raster this is the same as writing band 1 only)
    with rio.open(outpath, 'w', **raster_crop_meta) as file:
        file.write(raster_crop)

    return rio.open(outpath)

### Read in sites data

In [5]:
## Load the pickled sites table and make the polygon column the active geometry.
## NOTE(review): unpickling can run arbitrary code - only load a trusted file.
## A set_index(['sid','dt']) step was disabled here, so the pickle presumably
## already carries that index - TODO confirm.
sites = pd.read_pickle("sites4.p").set_geometry('geometry_poly')

#### Filter site data for download

In [6]:
## Keep only rows that have a matched scene and a polygon geometry
has_scene = sites.display_id.notna()
has_polygon = sites.geometry_poly.notna()
dll = sites[has_scene & has_polygon].set_geometry('geometry_poly')

## Optional filter to remove sites with v large polygons
## (area is in squared CRS units - presumably square degrees; TODO confirm)
polygon_area = dll.geometry_poly.area
dll = dll[polygon_area<4]

len(dll)

209393

### Read in list of scenes to download and filter 

In [16]:
## change dsid file to read in
dsids = list(pd.read_csv(d_path+'dsids_14.csv')['0'])

## a scene counts as already downloaded once its MTL metadata file exists
dowloaded_scenes = [ i.replace('_MTL.txt','') for i in os.listdir(c1_path) if 'MTL' in i ]

## scene ids referenced by the updated (filtered) sites data;
## pd.notna drops any missing id regardless of which NaN object it is
dll_dsids = [ i for i in dll.reset_index().display_id.unique() if pd.notna(i) ]

## sets make the membership tests below constant-time instead of list scans
downloaded_lookup = set(dowloaded_scenes)
dll_lookup = set(dll_dsids)

## keep scenes that are not yet downloaded, are not LM05 or LT08 scenes,
## and are in the updated sites data (original order is preserved)
dsids = [ i for i in dsids
          if i not in downloaded_lookup
          and 'LM05' not in i
          and 'LT08' not in i
          and i in dll_lookup ]


f'Number of scenes to download: {len(dsids)}'

'Number of scenes to download: 901'

### Read in exceptions lists

In [8]:
## Read previously recorded exceptions as flat lists of ids, so that the ids
## appended during the download loop sit alongside them (wrapping the whole
## DataFrame in a list would mix a DataFrame object with the id strings).
## NOTE(review): assumes the ids are in the last column of each csv, the layout
## pd.Series.to_csv produces - confirm against the actual files
crop_exceptions = pd.read_csv(d_path+'crop_exceptions.csv').iloc[:, -1].dropna().tolist()
meta_exceptions = pd.read_csv(d_path+'meta_exceptions.csv').iloc[:, -1].dropna().tolist()

### Loop through scene list and sites to download and crop rasters

In [18]:
## set crop buffer distance (300m)
bbox_bufd = 300   # to calculate in degrees 360*300/(40000*1000)

## create bands and metadata lists
bands_5 = ['B1','B2','B3','B4','B5','BQA','B6']
bands_7 = ['B1','B2','B3','B4','B5','BQA','B6_VCID_1','B6_VCID_2']
bands_8 = ['B2','B3','B4','B5','BQA','B10','B11']

meta_file_5 = ['MTL','ANG']
meta_file_7 = ['MTL','ANG','GCP']
meta_file_8 = ['MTL','ANG']

## bands and metadata files to fetch, keyed by the platform code in a scene id
platform_files = {'LT05': (bands_5, meta_file_5),
                  'LE07': (bands_7, meta_file_7),
                  'LC08': (bands_8, meta_file_8)}

for dsid in track(dsids):

    ## pick the band / metadata lists for this scene's platform; a scene from any
    ## other platform is skipped instead of re-using lists left from the last scene
    scene_files = next((files for code, files in platform_files.items() if code in dsid), None)
    if scene_files is None:
        print(f'Unsupported platform for {dsid}, skipping')
        continue
    bands, meta_files = scene_files

    ## url elements shared by every file of this scene
    platform = dsid[0:4]
    key = dsid[10:13]+'/'+dsid[13:16]
    scene_url = f'https://storage.googleapis.com/gcp-public-data-landsat/{platform}/01/{key}/{dsid}/{dsid}'

    #~~~~ TIF download ~~~~#

    sitesdl = dll[dll.display_id==dsid]

    ## crop each site once per band, even when it has several rows for this scene
    site_ids = sitesdl.index.get_level_values(0).unique()

    ## open scene for each band in turn
    for band in bands:

        ## TIF construct source url
        filepath = f'{scene_url}_{band}.TIF'

        ## the with-block closes the remote scene when the sites are done
        with rio.open(filepath) as src:

            ## loop through sites with that scene
            for sid in site_ids:

                ## subset site list for site
                sitedl = sitesdl.loc[sid,:]

                ## setup polygon for cropping
                polygon_bbox = sitedl.envelope.to_crs(src.crs) ## create bounding box and change to source crs
                polygon_bbox = polygon_bbox.buffer(bbox_bufd) ## buffer bounding box by buffer distance e.g. 300m
                polygon_geom = polygon_bbox.geometry          ## select geometry

                try:

                    cropped = cropper(src, polygon_geom, c1_path+sid+'__'+dsid+'_'+band+'.TIF')

                    ## cropper returns the saved raster opened for reading;
                    ## close it so file handles do not pile up over the run
                    cropped.close()

                ## Exception (not a bare except) so Ctrl-C still stops the run
                except Exception:
                    crop_exceptions.append(dsid+band)
                    print(f'Crop exception for {dsid} {band}')

        time.sleep(1)

    #~~~~ metadata download ~~~~#

    ## loop through meta files for the scene
    for mf in meta_files:

        ## Metadata construct source url
        filepath = f'{scene_url}_{mf}.txt'

        try:

            ## download fully before creating the local file: an `_MTL.txt` file
            ## in c1_path marks the scene as downloaded on later runs, so a
            ## failed request must not leave an empty one behind
            with urllib.request.urlopen(filepath) as response:
                remote_file = response.read()

            with open(c1_path+dsid+'_'+mf+'.txt','wb') as local_file:
                local_file.write(remote_file)

        ## Exception (not a bare except) so Ctrl-C still stops the run
        except Exception:
            meta_exceptions.append(dsid)

            time.sleep(1)

            print(f'Scene meta not available {dsid}')


Scene meta not available LE07_L1GT_129208_20100709_20161213_01_T2
Scene meta not available LE07_L1GT_129208_20100319_20161216_01_T2
Scene meta not available LE07_L1GT_129208_20100911_20161212_01_T2
Scene meta not available LE07_L1TP_031036_20120731_20160910_01_T1
Scene meta not available LE07_L1TP_031036_20120731_20160910_01_T1
Scene meta not available LE07_L1TP_030036_20130201_20160909_01_T1
Scene meta not available LE07_L1GT_025037_20131012_20160907_01_T2
Scene meta not available LE07_L1GT_024038_20171101_20171128_01_T2
Crop exception for LT05_L1GS_026037_20090509_20160906_01_T2 B1
Crop exception for LT05_L1GS_026037_20090509_20160906_01_T2 B1
Crop exception for LT05_L1GS_026037_20090509_20160906_01_T2 B2
Crop exception for LT05_L1GS_026037_20090509_20160906_01_T2 B2
Crop exception for LT05_L1GS_026037_20090509_20160906_01_T2 B3
Crop exception for LT05_L1GS_026037_20090509_20160906_01_T2 B3
Crop exception for LT05_L1GS_026037_20090509_20160906_01_T2 B4
Crop exception for LT05_L1GS_02

KeyboardInterrupt: 

### write exceptions list to csv

In [None]:
## Save each exceptions list to its own csv (one pandas Series per file)
exception_outputs = {
    './Landsat data/meta_exceptions.csv': meta_exceptions,
    './Landsat data/crop_exceptions.csv': crop_exceptions,
}

for csv_path, exception_ids in exception_outputs.items():
    pd.Series(exception_ids).to_csv(csv_path)

In [1]:
# NOTE(review): leftover scratch arithmetic; what the two operands represent is
# not recorded in this notebook - consider removing or explaining this cell
732-392

340