In [1]:
from sklearn.model_selection import train_test_split
import geopandas as gpd
from utils import *
import fiona
import rasterio.mask

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# Note that this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
# Root directory whose sub-directories are scanned for .shp / .tif files;
# passed to create_train_ds as `input_dir`.
data_path = 'data/train_summer/images_gt'
# Output directory for the tiles and annotation CSVs; passed to
# create_train_ds as `resulting_ds_dir`.
samples_dir='data/summer_tiles_300'

# Tile size forwarded to split_raster as `patch_size`.
patch_size=300

# When True, each raster is masked with the geometries of its AOI shapefile
# before being tiled.
use_AOI = True

# Processing for training

In [4]:
def create_train_ds(input_dir, resulting_ds_dir):
    """
    Prepare data for training by splitting big input images into tiles.

    Every sub-directory of ``input_dir`` is expected to contain a ``.shp``
    file with the annotations and a ``.tif`` raster. The raster is optionally
    masked with its AOI shapefile (controlled by the module-level ``use_AOI``
    flag), written to a temporary ``three_<name>.tif`` in the current working
    directory and tiled with ``split_raster`` using the module-level
    ``patch_size``. Finally, all per-raster annotation CSVs found in
    ``resulting_ds_dir`` are merged into ``final_df.csv``.

        Args:
            input_dir: directory, which contains directories with data that will be used for training
            resulting_ds_dir: directory for resulting split images

        Raises:
            ValueError: if no annotation CSV is found in ``resulting_ds_dir``
                after tiling, so there is nothing to merge.
    """
    for p in tqdm(os.listdir(input_dir)):
        # Use the function argument, not the notebook-level `data_path`.
        path = os.path.join(input_dir, p)
        if not os.path.isdir(path):
            # Stray files next to the scene directories would break os.listdir.
            continue
        print(path)

        # Reset per-directory state so nothing leaks from the previous scene.
        orig = None
        file_3chanel = None
        for file in os.listdir(path):
            if file.endswith(".shp"):
                orig = gpd.read_file(os.path.join(path, file))
            if file.endswith('.tif'):
                file_3chanel = "three_" + file
                with rasterio.open(os.path.join(path, file)) as rast:
                    meta = rast.meta
                    out_image = rast.read()
                    if use_AOI:
                        # AOI shapefiles live in a sibling 'AOI' tree:
                        # <root>/AOI/<scene>/<scene without '_gpx_H'>_gpx_AOI.shp
                        path_AOI = os.path.join(input_dir.split('/images')[0], 'AOI', p)
                        path_AOI = os.path.join(path_AOI, p.split('_gpx_H')[0] + "_gpx_AOI.shp")
                        with fiona.open(path_AOI, "r") as shapefile:
                            shapes = [feature["geometry"] for feature in shapefile]
                            out_image, _ = rasterio.mask.mask(rast, shapes)

        if orig is None or file_3chanel is None:
            # Without both inputs we would either crash or silently reuse
            # the data of the previously processed directory.
            print(f"Skipping {path}: expected both a .shp and a .tif file")
            continue

        annotations_csv = f"{p}_before_proc.csv"
        try:
            with rasterio.open(file_3chanel, 'w', **meta) as new_dataset:
                new_dataset.write(out_image)

            original_data = orig['geometry'].bounds.apply(
                inv_affine, axis=1, file_name=file_3chanel, meta=meta, label='Tree'
            )
            original_data.to_csv(annotations_csv)
            split_raster(path_to_raster=file_3chanel,
                         annotations_file=annotations_csv,
                         base_dir=resulting_ds_dir,
                         patch_size=patch_size,
                         patch_overlap=0.25, allow_empty=True)
        finally:
            # Always clean up the temporary files, even if tiling failed.
            for tmp_file in (file_3chanel, annotations_csv):
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)

    merged_name = 'final_df.csv'
    frames = []
    # List the directory we actually read from (not the global `samples_dir`);
    # sorting makes the row order, and thus any later seeded split, reproducible.
    # NOTE: any other CSV stored in this directory is merged as well.
    for csv in sorted(os.listdir(resulting_ds_dir)):
        if not csv.endswith(".csv") or csv == merged_name:
            # Skip our own output so that re-running does not duplicate rows.
            continue
        processed = pd.read_csv(os.path.join(resulting_ds_dir, csv)).dropna().reset_index(drop=True)
        frames.append(processed)

    if not frames:
        raise ValueError(f"No annotation CSV files found in {resulting_ds_dir}")

    # DataFrame.append was removed in pandas 2.0; a single concat is also faster.
    final_df = pd.concat(frames)
    final_df.to_csv(os.path.join(resulting_ds_dir, merged_name))
    
# Build the tiled dataset: scenes are read from `data_path`, results go to `samples_dir`.
create_train_ds(data_path, samples_dir)

  0%|          | 0/8 [00:00<?, ?it/s]

data/train_summer/images_gt/trees_afs_summer_6_Almetjevsk_CIMA_gpx_H


 12%|█▎        | 1/8 [00:15<01:50, 15.77s/it]

data/train_summer/images_gt/trees_afs_summer_3_HMAO_CIMA_gpx_H


 25%|██▌       | 2/8 [00:52<02:48, 28.05s/it]

data/train_summer/images_gt/trees_afs_summer_10_Baykalsk_CIMA_gpx_H


 38%|███▊      | 3/8 [01:09<01:56, 23.21s/it]

data/train_summer/images_gt/trees_afs_summer_8_Baykalsk_CIMA_gpx_H


 50%|█████     | 4/8 [01:17<01:08, 17.22s/it]

data/train_summer/images_gt/trees_afs_summer_7_Baykalsk_CIMA_gpx_H


 62%|██████▎   | 5/8 [01:29<00:45, 15.05s/it]

data/train_summer/images_gt/trees_afs_summer_9_Baykalsk_CIMA_gpx_H


 75%|███████▌  | 6/8 [01:51<00:34, 17.48s/it]

data/train_summer/images_gt/trees_afs_summer_1_HMAO_CIMA_gpx_H


 88%|████████▊ | 7/8 [02:00<00:14, 14.77s/it]

data/train_summer/images_gt/trees_afs_summer_4_HMAO_CIMA_gpx_H


100%|██████████| 8/8 [03:04<00:00, 23.09s/it]


In [5]:
# Performing the train/test split at image level: the unique image paths are
# split, so all annotations of one tile end up in the same subset.

train_val_test = pd.read_csv(os.path.join(samples_dir, 'final_df.csv')).assign(label='Tree')

image_paths = train_val_test['image_path'].unique()
train_val_paths, test_paths = train_test_split(image_paths, test_size=0.15, random_state=42)

in_train_val = train_val_test['image_path'].isin(train_val_paths)
in_test = train_val_test['image_path'].isin(test_paths)
train_val = train_val_test[in_train_val]
test = train_val_test[in_test]

for subset, csv_name in ((train_val, "train_val.csv"), (test, 'test.csv')):
    subset.to_csv(os.path.join(samples_dir, csv_name))