## Import Libraries

In [47]:
import src.notebooks.geoutils as geoutils
from pathlib import Path
import pandas as pd
import numpy as np
import gdal
import matplotlib as mpl
import rasterio
import geopandas as gpd

## Configure Paths

In [48]:
# Directory handed to geoutils.get_dataframe_from_data below. The path is
# relative, so it resolves against the kernel's current working directory.
data_path = Path('processedBuildingLabels/data/rasters_vectors')

## Plotting Settings

In [49]:
# Render matplotlib figures inside the notebook output cells.
%matplotlib inline

In [50]:
# Resolution (dots per inch) applied to every figure created after this cell.
dpi = 300
mpl.rc('figure', dpi=dpi)

## Loading Data to DataFrame

In [51]:
# Build the working DataFrame from the files under data_path using the project
# helper. The preview in the next cell shows the columns it produces:
# '3band', '8band', 'geojson' (file paths) and 'image_number'.
df = geoutils.get_dataframe_from_data(data_path=data_path)

In [52]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,3band,8band,geojson,image_number
0,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,1
1,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,2
2,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,3
3,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,4
4,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,5


### Check file extensions

In [53]:
# File extension of each vector label path, stored alongside the path itself.
df['geo_extension'] = df['geojson'].map(geoutils.extract_file_extension)

In [54]:
# File extension of each 3-band raster path.
df['raster_extension_3'] = df['3band'].map(geoutils.extract_file_extension)

In [55]:
# File extension of each 8-band raster path.
df['raster_extension_8'] = df['8band'].map(geoutils.extract_file_extension)


In [56]:
# Confirm the three extension columns were added.
df.head()

Unnamed: 0,3band,8band,geojson,image_number,geo_extension,raster_extension_3,raster_extension_8
0,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,1,geojson,tif,tif
1,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,2,geojson,tif,tif
2,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,3,geojson,tif,tif
3,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,4,geojson,tif,tif
4,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,5,geojson,tif,tif


In [57]:
# Image number removed from the dataset before the remaining checks.
# NOTE(review): the notebook does not record why this image is excluded —
# confirm the reason and document it here.
EXCLUDED_IMAGE_NUMBER = 6940

# Reassign rather than using inplace=True, so this cell produces a new frame
# instead of mutating the object that earlier cells already displayed.
df = df.drop(df[df['image_number'] == EXCLUDED_IMAGE_NUMBER].index)

In [58]:
# List any 3-band raster paths whose extension is not 'tif' (empty result means none).
df.loc[df['raster_extension_3'] != 'tif', '3band'].value_counts()

Series([], Name: 3band, dtype: int64)

In [59]:
# List any 8-band raster paths whose extension is not 'tif' (empty result means none).
df.loc[df['raster_extension_8'] != 'tif', '8band'].value_counts()

Series([], Name: 8band, dtype: int64)

### Drop any rows that do not have the right file extension

In [60]:
# Keep only rows whose vector label file has the expected 'geojson' extension.
# .copy() makes the result an independent frame: a later cell adds a column to
# it, and without the copy that assignment would target a slice of the
# unfiltered frame (the pattern behind pandas' SettingWithCopyWarning).
df = df[df['geo_extension'] == 'geojson'].copy()

In [61]:
# Preview the frame after filtering on the geojson extension.
df.head()

Unnamed: 0,3band,8band,geojson,image_number,geo_extension,raster_extension_3,raster_extension_8
0,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,1,geojson,tif,tif
1,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,2,geojson,tif,tif
2,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,3,geojson,tif,tif
3,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,4,geojson,tif,tif
4,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,5,geojson,tif,tif


### Extract column for geojson shapes

In [62]:
# Shape reported by the project helper for each geojson file, one value per row.
df['geoshape'] = df['geojson'].map(geoutils.get_geojson_shape)

  aout[:] = out


## Save output dataframes as pickle files

In [63]:
# Directory where the pickled DataFrames are written and read back. The path is
# relative, so it resolves against the kernel's current working directory.
artifacts_path = Path('src/notebooks/artifacts')

In [64]:
# Create the artifacts directory if it is missing, so the write below does not
# fail on a fresh checkout; a no-op when the directory already exists.
artifacts_path.mkdir(parents=True, exist_ok=True)

# Persist the full frame (including the 'geoshape' column) for later reuse.
df.to_pickle(artifacts_path / 'df')

In [65]:
# Reload the frame from the pickle written by this notebook.
# Note: only unpickle files you produced yourself; pickle can execute code on load.
df = pd.read_pickle(artifacts_path / 'df')

### Extract dataframe containing non-empty geojson files

In [66]:
# Keep the rows whose 'geoshape' differs from (0, 1) and renumber them from 0.
# The tuple is compared as a single scalar value against each row's shape.
gdf = (
    df.loc[df['geoshape'] != (0, 1)]
    .reset_index(drop=True)
)

In [67]:
# Preview the rows kept after removing the (0, 1)-shaped geojson entries.
gdf.head()

Unnamed: 0,3band,8band,geojson,image_number,geo_extension,raster_extension_3,raster_extension_8,geoshape
0,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,12,geojson,tif,tif,"(5, 19)"
1,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,30,geojson,tif,tif,"(2, 19)"
2,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,44,geojson,tif,tif,"(1, 19)"
3,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,46,geojson,tif,tif,"(11, 19)"
4,processedBuildingLabels/data/rasters_vectors/3...,processedBuildingLabels/data/rasters_vectors/8...,processedBuildingLabels/data/rasters_vectors/g...,47,geojson,tif,tif,"(15, 19)"


In [68]:
# Persist the non-empty subset next to the full frame.
gdf.to_pickle(artifacts_path / 'gdf')

### Percentage of images with non-empty geojson files

In [69]:
# Share of rows in df that survive into gdf, expressed as a percentage.
len(gdf) / len(df) * 100

62.107235514557516