In [1]:
import numpy as np
from tqdm import tqdm
from sen12ms_dataLoader import *

In [2]:
def load_data():

    print('Loading data...')

    sen12ms = SEN12MSDataset("/data/PublicData/DF2020/trn/")

    IDs = np.load('/data/PublicData/DF2020/trn/clean.npy')

    N = IDs.shape[0]

#     idx=np.arange(N)
#     np.random.shuffle(idx)

    trn_ids=IDs[:int(0.2*N),:]
 
    s1_trn_name=[]
    s2_trn_name=[]
    y_trn_name=[]


    season_dict={1:Seasons.SPRING,2:Seasons.SUMMER,3:Seasons.FALL,4:Seasons.WINTER}

    print('loading training files...')

    for i in tqdm(range(trn_ids.shape[0])):
        s1_name,s2_name,y_name=sen12ms.get_s1s2lc_triplet(season_dict[trn_ids[i,0]], trn_ids[i,1], trn_ids[i,2],
                                                                               s1_bands=S1Bands.ALL,s2_bands=S2Bands.ALL, lc_bands=LCBands.ALL)
        s1_trn_name.append(s1_name)
        s2_trn_name.append(s2_name)
        y_trn_name.append(y_name)

    s1_trn_name = np.array(s1_trn_name)
    s2_trn_name = np.array(s2_trn_name)
    y_trn_name = np.array(y_trn_name)

    return s1_trn_name,s2_trn_name,y_trn_name

In [3]:
s1_trn_name,s2_trn_name,y_trn_name=load_data()

 17%|█▋        | 3759/22335 [00:00<00:00, 37581.52it/s]

Loading data...
loading training files...


100%|██████████| 22335/22335 [00:00<00:00, 41287.69it/s]


In [4]:
s1_data=np.zeros([s1_trn_name.shape[0],2,256,256],dtype='float16')
s2_data=np.zeros([s1_trn_name.shape[0],13,256,256],dtype='float16')
y_data=np.zeros([s1_trn_name.shape[0],1,256,256],dtype='float16')

In [5]:
import rasterio

In [6]:
for i in tqdm(range(s1_data.shape[0])):
    with rasterio.open(s1_trn_name[i]) as patch:
        s1_data[i] = patch.read(list(range(1,3)))

100%|██████████| 22335/22335 [03:19<00:00, 112.18it/s]


In [7]:
for i in tqdm(range(s1_data.shape[0])):
    with rasterio.open(s2_trn_name[i]) as patch:
        s2_data[i] = patch.read(list(range(1,14)))

100%|██████████| 22335/22335 [07:29<00:00, 49.67it/s]


In [8]:
s2_data=s2_data[:,[1,2,3,4,5,6,7,10,11,12],:,:]

In [9]:
s_data=np.concatenate((s1_data,s2_data),axis=1)

In [10]:
del s1_data,s2_data

In [11]:
for i in tqdm(range(s_data.shape[0])):
    with rasterio.open(y_trn_name[i]) as patch:
        y_data[i] = patch.read(list(range(1,2)))

100%|██████████| 22335/22335 [02:29<00:00, 149.72it/s]


data clean & label mapping

In [12]:
s_data[np.isnan(s_data)] = 0

In [13]:
lab_dict={1:1,2:1,3:1,4:1,5:1,6:2,7:2,8:3,9:3,10:4,11:5,12:6,14:6,13:7,15:8,16:9,17:10}
for i in tqdm(range(s_data.shape[0])):
    tmp=y_data[i,:,:,:]
    tmp=tmp.reshape(-1)
    y = list(map(lambda x: lab_dict[x], tmp))
    y=np.array(y)#list->array
    y=y.reshape(1,256,256)
    y_data[i,:,:,:]=y

100%|██████████| 22335/22335 [52:32<00:00,  7.08it/s] 


In [17]:
y_data=y_data-1

In [18]:
np.unique(y_data)

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], dtype=float16)

In [19]:
y_data=y_data.astype('uint8')

filter

In [16]:
s_data.shape

(22335, 12, 256, 256)

In [21]:
for i in tqdm(range(s_data.shape[0])):
    x=s_data[i]
    R = x[4, :, :]
    G = x[3, :, :]
    B = x[2, :, :]
    Nir = x[8, :, :]  # TM4
    Mir = x[-2, :, :]  # TM5
    SWir = x[-1, :, :]  # TM7

    MSI = SWir / Nir
    NDWI = (G - Nir) / (G + Nir)
    NDVI = (Nir - R) / (Nir + R)
    NDBBI = (1.5 * SWir - (Nir + G) / 2.) / (1.5 * SWir + (Nir + G) / 2.)  # 归一化差值裸地与
    BSI = ((Mir + R) - (Nir + B)) / ((Mir + R) + (Nir + B))  # 裸土指数
    
    y_clean=y_data[i,0,:,:].copy()
    y=y_data[i,0,:,:].copy()
    
    # 修正不符合要求的森林类
    y_clean[np.where((NDVI > 0.75) & (y != 0))] = 10
    # 修正不符合要求的灌木类
    y_clean[np.where((NDVI > 0.2) & (NDVI < 0.35) & (MSI > 1.5) & (y != 1))] = 10
    # 修正不符合要求的草地类
    y_clean[np.where((NDVI > 0.4) & (NDVI < 0.55) & (y != 3))] = 10
    # 修正不符合要求的湿地类
    y_clean[np.where((NDVI > 0.6) & (NDVI < 0.75) & (y != 4))] = 10
    # 修正不符合要求的农田类
    y_clean[np.where((NDVI > 0.2) & (NDVI < 0.35) & (MSI > 1) & (MSI < 1.5) & (y != 5))] = 10
    # 修正不符合要求的建筑类
    y_clean[np.where((NDVI > 0.2) & (NDVI < 0.35) & (MSI > 0.9) & (MSI < 1) & (y != 6))] = 10
    # 修正不符合要求的裸地类
    y_clean[np.where((NDVI > 0) & (NDVI < 0.15) & (y != 8))] = 10
    # 修正裸土建筑错分到其他类
    y_clean[np.where((BSI > -0.4) & (NDVI < 0.15) & (y != 6) & (y != 8))] = 10
    # 修正不符合要求的水体类
    y_clean[np.where((NDWI > 0.) & (y != 10))] = 10
    
    # 修正其他类错标到森林
    y_clean[np.where((NDVI < 0.75) & (y == 0))] = 10

    # shrubland

    # 将灌木标签修正为草地
    y_clean[np.where((NDVI > 0.4) & (NDVI < 0.55) & (y == 1))] = 3

    # savanna

    # 将热带草原标签修正为草地
    y_clean[np.where((NDVI > 0.4) & (NDVI < 0.55) & (y == 2) & (np.sum(y == 9) < 2000))] = 3
    # 将热带草原标签修正为湿地
    y_clean[np.where((NDVI > 0.6) & (NDVI < 0.75) & (y == 2) & (np.sum(y == 9) > 10000))] = 4

    # grassland
    
    # 将草地标签修正为湿地
    y_clean[np.where((NDVI > 0.6) & (NDVI < 0.75) & (y == 3) & (np.sum(y==9)>10000))] = 4

    # wetland
    # 将湿地修正为森林
    y_clean[np.where((NDVI > 0.75) & (y == 4))] = 0
    
    y_data[i,0,:,:]=y_clean

  
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 22335/22335 [16:37<00:00, 22.40it/s]


In [22]:
y_data[y_data==2]=10
y_data[y_data==7]=10

In [23]:
np.unique(y_data)

array([ 0,  1,  3,  4,  5,  6,  8,  9, 10], dtype=uint8)

data norm

In [24]:
#s1
tmp1=s_data[:,:2,:,:]
tmp1[tmp1<-25]=-25
tmp1[tmp1>0]=0

tmp1 = (tmp1 + 25) / 25 * 1.0

In [25]:
np.min(tmp1)

0.0

In [26]:
s_data[:,:2,:,:]=tmp1

In [27]:
del tmp1

In [28]:
#s2
tmp2=s_data[:,2:,:,:]
tmp2[tmp2>10000]=10000
tmp2[tmp2<0]=0

tmp2/=10000*1.0

In [30]:
s_data[:,2:,:,:]=tmp2

In [31]:
np.max(tmp2)

1.0

delete bg data

In [32]:
X=s_data.transpose(0,2,3,1).reshape(-1,12)

In [34]:
y=y_data.reshape(-1)

In [35]:
X.shape

(1463746560, 12)

In [36]:
y.shape

(1463746560,)

In [37]:
idx=y!=10

In [39]:
X=X[idx]

In [40]:
X.shape

(624407841, 12)

In [41]:
y=y[idx]

In [42]:
y.shape

(624407841,)

In [43]:
np.save('/data/PublicData/DF2020/trn/transform/ML_trn.npy',X)

In [44]:
np.save('/data/PublicData/DF2020/trn/transform/ML_lab.npy',y)