<a href="https://colab.research.google.com/github/Drfengze/ANN-CBIR/blob/main/1.2_training_dataset_corn_sentinelonly_june1_julyend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data preparation

In [1]:
# Authenticate this Colab session with the user's Google account.
from google.colab import auth

auth.authenticate_user()
import ee

# Run the interactive Earth Engine authorization flow (prompts for a
# verification code), then initialize the client library for this session.
ee.Authenticate()
ee.Initialize()
import numpy as np
from datetime import datetime, timedelta, date
import random
import pandas as pd


To authorize access needed by Earth Engine, open the following URL in a web browser and follow the instructions. If the web browser does not start automatically, please manually browse the URL below.

    https://code.earthengine.google.com/client-auth?scopes=https%3A//www.googleapis.com/auth/earthengine%20https%3A//www.googleapis.com/auth/devstorage.full_control&request_id=5PEjbZ2a9X3cUUilgpFZiPnX3T09xvwLr8lsUY7SQoY&tc=-8iRbCBH46fqXOHr4n7O8IuMcOZnpFwXOHjTXtvGioE&cc=TxdF4lF8s1xJSXPOgno-lR7TFBozLp-vkjwvBOig7gQ

The authorization workflow will generate a code, which you should paste in the box below.
Enter verification code: [redacted]

Successfully saved authorization token.


In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Cloud masking function.
def maskL8sr(image):
    """Mask cloud and cloud-shadow pixels of a Landsat 8 image.

    A pixel stays visible only when the cloud flag (bit 3) and the
    cloud-shadow flag (bit 4) of the ``QA_PIXEL`` band are both zero and the
    pixel is unmasked in every band of ``image``.

    Args:
        image: image exposing a ``QA_PIXEL`` band.

    Returns:
        ``image`` with its mask updated by the combined condition.
    """
    qa_band = image.select("QA_PIXEL")
    shadow_flag = ee.Number(2).pow(4).int()  # bit 4
    cloud_flag = ee.Number(2).pow(3).int()  # bit 3

    shadow_free = qa_band.bitwiseAnd(shadow_flag).eq(0)
    cloud_free = qa_band.bitwiseAnd(cloud_flag).eq(0)
    clear_sky = shadow_free.And(cloud_free)

    # Minimum over the per-band masks: non-zero only where all bands are valid.
    valid_in_all_bands = image.mask().reduce("min")

    return image.updateMask(clear_sky.And(valid_in_all_bands))


# Cloud masking function for s2
def maskS2clouds(image):
    """Mask cloud and cirrus pixels of a Sentinel-2 image using ``QA60``.

    Bit 10 of ``QA60`` flags clouds and bit 11 flags cirrus; a pixel is kept
    only when both bits are zero.

    Args:
        image: image exposing a ``QA60`` band.

    Returns:
        ``image`` with its mask updated to the clear-sky condition.
    """
    CLOUD_BIT, CIRRUS_BIT = 10, 11
    qa60 = image.select('QA60')
    no_cloud = qa60.bitwiseAnd(1 << CLOUD_BIT).eq(0)
    no_cirrus = qa60.bitwiseAnd(1 << CIRRUS_BIT).eq(0)
    return image.updateMask(no_cloud.And(no_cirrus))


def merge_s2_l8(s2, l8):
    """Combine a Sentinel-2 and a Landsat 8 image by taking their mean.

    Both images are placed in one ``ee.ImageCollection`` and reduced with
    ``mean()``.
    """
    pair = ee.ImageCollection([s2, l8])
    return pair.mean()

def filter_type(cdl):
    """Sum the per-class indicator layers for CDL class codes 1 and 5.

    Each indicator is ``cdl.eq(code)`` self-masked, multiplied by 1 and
    unmasked again; the two layers are then added together.
    (Codes 1 and 5 are presumably corn and soybeans, matching the notebook's
    ``crop`` setting -- confirm against the CDL legend.)
    """
    def _indicator(code):
        return cdl.eq(code).selfMask().multiply(1).unmask()

    return _indicator(1).add(_indicator(5))

def img_vi(img):
    """Append ``ndvi`` and ``nirv`` bands to ``img``.

    ``ndvi`` is the normalized difference of the ``nir`` and ``red`` bands;
    ``nirv`` is ``ndvi`` multiplied by ``nir``.

    Args:
        img: image exposing ``nir`` and ``red`` bands.

    Returns:
        The input image with the two extra bands added.
    """
    ndvi_band = img.normalizedDifference(["nir", "red"]).rename("ndvi")
    with_ndvi = img.addBands(ndvi_band)
    nirv_band = (
        with_ndvi.select(["ndvi"])
        .multiply(with_ndvi.select(["nir"]))
        .rename("nirv")
    )
    return with_ndvi.addBands(nirv_band)

In [12]:
# --- Band naming ------------------------------------------------------------
# Common output names, followed by the matching source bands of each sensor.
bandNamesOut = ["blue", "green", "red", "nir", "swir1", "swir2"]
bandNamesl8 = ["SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B6", "SR_B7"]
bandNamesS2 = ["B2", "B3", "B4", "B8", "B11", "B12"]

# Model inputs are the six reflectance bands plus the two derived indices.
BANDS = bandNamesOut + ["ndvi", "nirv"]
RESPONSE = "cropland"
FEATURES = BANDS + [RESPONSE]

# --- Patch geometry ---------------------------------------------------------
# Size and shape of the patches expected by the model.
KERNEL_SIZE = 128
KERNEL_SHAPE = [KERNEL_SIZE] * 2
MAX_CLOUD_PROBABILITY = 65

# --- Cloud-masked collections with harmonized band names --------------------
l8sr = (
    ee.ImageCollection("LANDSAT/LC08/C02/T1_L2")
    .map(maskL8sr)
    .select(bandNamesl8, bandNamesOut)
)
s2Sr = (
    ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
    .map(maskS2clouds)
    .select(bandNamesS2, bandNamesOut)
)

# --- Study area -------------------------------------------------------------
# (A crop-calendar CSV lookup keyed on country/crop used to live here; it is
# currently disabled.)
country = 'United States of America'
crop = 'Corn Soybean'
us = ee.FeatureCollection("FAO/GAUL_SIMPLIFIED_500m/2015/level1").filter(
    f'ADM0_NAME == "{country}"'
)
states = [
    'Iowa', 'Illinois', 'Indiana', 'Michigan', 'Ohio',
    'Nebraska', 'Kansas', 'Minnesota', 'Missouri',
]
trainingPolys = us.filter(ee.Filter.inList("ADM1_NAME", states))
trainingPolysList = trainingPolys.toList(trainingPolys.size())

# --- Data preparation: export naming ----------------------------------------
FOLDER = f"{crop}_{country}".replace(' ', '_')
TRAINING_BASE = f"training_{crop}".replace(' ', '_')
EVAL_BASE = "eval_patches"

In [13]:
class DATA_EXPORTER:
    """Turn an image/label pair into neighborhood patches and export samples.

    Attributes (all set from the constructor arguments):
        image: predictor image whose bands are stacked with the label band.
        cdl: label image; its ``cropland`` band is used.
        kernel: kernel passed to ``neighborhoodToArray`` to cut patches.
        randomseed: seed for the per-shard sampling seeds. Should be an int
            (or ``None`` for non-reproducible seeds).
        binary: when true the label is the output of ``filter_type``;
            otherwise the raw ``cropland`` band of ``cdl``.
        folder: Drive folder the export task writes to.
        scale: scale passed to ``sample``.
    """

    def __init__(self, image, cdl, kernel, randomseed, binary, scale, FOLDER):
        self.image = image
        self.cdl = cdl
        self.kernel = kernel
        self.randomseed = randomseed
        self.binary = binary
        self.folder = FOLDER
        self.scale = scale

    def export(self):
        """Build the masked patch-array image.

        Returns:
            A tuple ``(arrays_masked, crop_lc)``: the neighborhood-array image
            masked to pixels where ``filter_type(self.cdl)`` is non-zero, and
            the ``filter_type`` output itself.
        """
        crop_lc = filter_type(self.cdl)
        mask = crop_lc.neq(0)
        if self.binary:
            label = crop_lc.select("cropland")
        else:
            label = self.cdl.select("cropland")
        featureStack = ee.Image.cat([self.image, label]).float()
        arrays = featureStack.neighborhoodToArray(self.kernel)
        # Only patches centered on a target-class pixel are kept for sampling.
        arrays_masked = arrays.updateMask(mask)
        return arrays_masked, crop_lc

    def _shard_seeds(self, n):
        """Return ``n`` distinct sampling seeds in ``[0, 10 * n]``.

        The seeds come from a private generator seeded with
        ``self.randomseed``, so the same exporter settings always yield the
        same seeds, and no seed is repeated within one export (a repeated
        seed would presumably repeat the same sample -- see the
        ``ee.Image.sample`` documentation).
        """
        n = max(int(n), 0)
        rng = random.Random(self.randomseed)
        return rng.sample(range(10 * n + 1), n)

    def train_export(self, trainingPoly, desc, n=200):
        """Sample patches inside a region and start a TFRecord export to Drive.

        Args:
            trainingPoly: region to sample from.
            desc: task description suffix; the task is named ``'train_' + desc``.
            n: number of shards, i.e. separate ``sample`` calls of 20 pixels
                each that are merged into the exported collection.

        Returns:
            The started export task.
        """
        arrays, _ = self.export()
        geomSample = ee.FeatureCollection([])
        for seed in self._shard_seeds(n):
            sample = arrays.sample(
                region=trainingPoly,
                numPixels=20,  # Size of the shard.
                seed=seed,
                scale=self.scale,
                tileScale=8,
            )
            geomSample = geomSample.merge(sample)

        task = ee.batch.Export.table.toDrive(
            folder=self.folder,
            collection=geomSample,
            description='train_' + desc,
            fileFormat="TFRecord",
        )
        task.start()
        return task

  # def cdl_export(self,desc,trainingPoly):
  #     _,wheat_lc = self.export()
  #     exportTask = ee.batch.Export.image.toDrive(
  #         image=wheat_lc,
  #         folder=self.folder,
  #         description='cdl_'+desc,
  #         scale=30,
  #         maxPixels = 1e10,
  #         region=trainingPoly.geometry(),
  #         fileFormat='GeoTIFF'
  #         )
  #     exportTask.start()


In [15]:
binary = True
FOLDER = "{}_{}_{}".format(crop, country, str(binary)).replace(' ', '_')

# KERNEL_SIZE x KERNEL_SIZE kernel of ones used to cut square patches.
list1 = ee.List.repeat(1, KERNEL_SIZE)
lists = ee.List.repeat(list1, KERNEL_SIZE)
kernel = ee.Kernel.fixed(KERNEL_SIZE, KERNEL_SIZE, lists)

# These numbers determined experimentally.
n = 200  # Number of shards in each polygon.
N = 1000  # Total sample size in each polygon. NOTE: not used by the export below.
label_kind = 'binary' if binary else 'multiclass'

for YEAR in range(2017, 2023):
    # Compositing window: June 1 up to July 18 for Landsat 8 and up to
    # August 1 for Sentinel-2 (the end date of ee.Filter.date is exclusive).
    dte1 = date(YEAR, 6, 1)
    l8_growend = date(YEAR, 7, 18)
    s2_growend = date(YEAR, 8, 1)
    l8_criteria = ee.Filter.date(
        dte1.strftime("%Y-%m-%d"), l8_growend.strftime("%Y-%m-%d")
    )
    s2_criteria = ee.Filter.date(
        dte1.strftime("%Y-%m-%d"), s2_growend.strftime("%Y-%m-%d")
    )
    l8_reduced = l8sr.filter(l8_criteria).median().multiply(0.0000275).add(-0.2).float()
    # The Sentinel-2 composite must be restricted to the same-year window too;
    # without the filter the median would span the whole archive.
    s2_reduced = s2Sr.filter(s2_criteria).median().divide(10000).float()

    image = merge_s2_l8(s2_reduced, l8_reduced)
    image = img_vi(image).float()

    # Pick the CDL layer of this YEAR explicitly instead of relying on
    # `.first()` over a range that runs through a fixed end year.
    start_year = date(YEAR, 1, 1).strftime("%Y-%m-%d")
    end_year = date(YEAR, 12, 31).strftime("%Y-%m-%d")
    cdl = (
        ee.ImageCollection('USDA/NASS/CDL')
        .filterDate(start_year, end_year)
        .first()
        .select('cropland')
    )

    i = random.randint(0, 100)
    data_export = DATA_EXPORTER(image, cdl, kernel, i, binary, 30, FOLDER)
    for g in states:
        trainingPoly = us.filter(ee.Filter.eq("ADM1_NAME", g))
        desc = '{}_{}_{}_{}'.format(label_kind, crop, g, YEAR).replace(' ', '_')
        data_export.train_export(trainingPoly, desc, n)



## Export task

In [None]:
# Print the Earth Engine batch task list to check on the exports started above.
from pprint import pprint

pprint(ee.batch.Task.list())