In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

In [2]:
# Input shapefiles, given as paths relative to the notebook's working directory.
# PLOT_LOCS is read into `plot_locs` and ECOREGIONS into `ecoreg` in later cells.
PLOT_LOCS = '../data/processed/blm_usfs_wadnr_plot_footprints.shp'
ECOREGIONS = '../data/raw/epa/us_eco_l3_state_boundaries.shp'

In [3]:
# Load the plot features from the PLOT_LOCS shapefile.
plot_locs = gpd.read_file(PLOT_LOCS)
# Keep only the text before the first '-' of each uuid. The vectorized .str
# accessor avoids a Python-level lambda per row and yields NaN (instead of
# raising) for a missing uuid.
# NOTE(review): presumably this short form matches the 8-character `uuid`
# values in the training tables -- confirm.
plot_locs['uuid_part'] = plot_locs['uuid'].str.split('-').str[0]
# Replace each feature's geometry with its centroid point.
plot_locs['geometry'] = plot_locs.centroid

In [6]:
# Load the ecoregion polygons and keep only the Washington and Oregon features.
ecoreg = gpd.read_file(ECOREGIONS)
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
ecoreg = ecoreg.loc[ecoreg.STATE_NAME.isin(['Washington', 'Oregon'])].copy()
# Build a snake_case key from the level-3 name: lower-case, with spaces and
# slashes turned into underscores. regex=False keeps both replacements literal
# regardless of the pandas version's default for `regex`.
ecoreg['ecoregion3'] = (
    ecoreg.US_L3NAME
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
)
# NOTE(review): assigning `.crs` only relabels the coordinate reference system,
# it does not reproject coordinates -- confirm the shapefile really is stored
# in EPSG:5070. The {'init': ...} form is deprecated by newer pyproj releases.
ecoreg.crs = {'init': 'epsg:5070'}
ecoreg.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 82 entries, 15 to 1630
Data columns (total 15 columns):
US_L3CODE     82 non-null object
US_L3NAME     82 non-null object
NA_L3CODE     82 non-null object
NA_L3NAME     82 non-null object
NA_L2CODE     82 non-null object
NA_L2NAME     82 non-null object
NA_L1CODE     82 non-null object
NA_L1NAME     82 non-null object
STATE_NAME    82 non-null object
EPA_REGION    82 non-null int64
L3_KEY        82 non-null object
L2_KEY        82 non-null object
L1_KEY        82 non-null object
geometry      82 non-null object
ecoregion3    82 non-null object
dtypes: int64(1), object(14)
memory usage: 10.2+ KB


In [8]:
# Paths of the two training tables (relative to the notebook's working directory).
LIDAR_DATA = '../data/processed/lidar_structure_training_data.csv'
SENTINEL_DATA = '../data/processed/sentinel_structure_training_data.csv'

# Read both CSV files, lidar first, into their own DataFrames.
lid_train, sat_train = (
    pd.read_csv(csv_path) for csv_path in (LIDAR_DATA, SENTINEL_DATA)
)

In [9]:
# Print a summary of lid_train: index range, column count, dtypes and memory use.
lid_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3834 entries, 0 to 3833
Columns: 107 entries, uuid to meas_yr
dtypes: float64(100), int64(2), object(5)
memory usage: 3.1+ MB


In [10]:
# Show the column labels of lid_train.
lid_train.columns

Index(['uuid', 'lidar_year', 'lidar_acq', 'strat0_return-proportion',
       'strat1_return-proportion', 'strat2_return-proportion',
       'strat3_return-proportion', 'strat4_return-proportion',
       'strat5_return-proportion', 'height_05-percentile',
       ...
       'JUNIPER', 'LARCH', 'year_diff', 'lat', 'lon', 'ecoregion3', 'agency',
       'distance_to_water_m', 'plot_size_ac', 'meas_yr'],
      dtype='object', length=107)

In [11]:
# Preview the first rows of lid_train.
lid_train.head()

Unnamed: 0,uuid,lidar_year,lidar_acq,strat0_return-proportion,strat1_return-proportion,strat2_return-proportion,strat3_return-proportion,strat4_return-proportion,strat5_return-proportion,height_05-percentile,...,JUNIPER,LARCH,year_diff,lat,lon,ecoregion3,agency,distance_to_water_m,plot_size_ac,meas_yr
0,0080963b,2008,or-south-coast_2008,0.065025,0.070458,0.024363,0.104669,0.16562,0.502377,0.0,...,0.0,0.0,3.0,43.028673,-123.898747,coast_range,BLM,180.194859,0.125,2011
1,008dc9d1,2013,tulalip_2013,0.1,0.031409,0.00127,0.005889,0.270901,0.585797,0.0,...,0.0,0.0,1.0,48.178724,-121.981598,north_cascades,WADNR,1237.148141,0.1,2014
2,008dc9d1,2016,wa-western_2016,0.09486,0.039737,0.001681,0.003923,0.284375,0.570177,0.0,...,0.0,0.0,0.0,48.178724,-121.981598,north_cascades,WADNR,1237.148141,0.1,2014
3,00900e38,2008,or-south-coast_2008,0.021095,0.070268,0.015251,0.076539,0.237885,0.429019,0.7275,...,0.0,0.0,2.0,43.771495,-123.753809,coast_range,BLM,64.7182,0.125,2010
4,00e46fbb,2008,or-south-coast_2008,0.016287,0.015253,0.036194,0.079369,0.19545,0.639736,5.09,...,0.0,0.0,2.0,43.770491,-123.762025,coast_range,BLM,35.985424,0.125,2010


In [60]:
# Count the distinct `uuid` values per (ecoregion3, year) in the lidar table.
# Only `uuid` is tabulated below, so `lidar_acq` is no longer aggregated.
lid_counts = (
    lid_train
    .groupby(by=['ecoregion3', 'year'])[['uuid']]
    .nunique()
    .reset_index()
)
# Lay the counts out as an ecoregion (rows) x year (columns) table.
lid_pivot = pd.pivot_table(
    lid_counts, index=['ecoregion3'], columns=['year'], values=['uuid']
)['uuid']
# Row labels: first two '_'-separated words of the ecoregion key, upper-cased.
lid_pivot.index = [' '.join(idx.split('_')[0:2]).upper() for idx in lid_pivot.index]
lid_pivot.columns = [int(col) for col in lid_pivot.columns]
# The COLUMBIA PLATEAU row is excluded from the table (raises KeyError if absent).
lid_pivot = lid_pivot.drop('COLUMBIA PLATEAU')
# Totals are sums of the per-year distinct counts, so a uuid that appears in
# several years is counted once per year in TOTAL.
lid_pivot['TOTAL'] = lid_pivot.sum(axis=1).astype(int)
lid_pivot.loc['TOTAL'] = lid_pivot.sum(axis=0).astype(int)
# Display only: missing (ecoregion, year) combinations are shown as '--'.
lid_pivot.fillna('--')

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,TOTAL
BLUE MOUNTAINS,--,--,--,--,17,68,--,--,--,85
CASCADES,--,--,--,89,81,27,201,167,116,681
COAST RANGE,456,342,5,283,185,80,130,202,61,1744
EASTERN CASCADES,--,--,--,--,42,31,64,5,56,198
KLAMATH MOUNTAINS,31,50,--,177,--,12,30,8,--,308
NORTH CASCADES,--,--,--,--,13,40,228,88,50,419
NORTHERN ROCKIES,--,--,--,--,--,5,34,25,40,104
PUGET LOWLAND,--,--,--,2,11,43,44,20,7,127
WILLAMETTE VALLEY,--,--,--,--,--,--,1,38,6,45
TOTAL,487,392,5,551,349,306,732,553,336,3711


In [64]:
# Number of distinct `uuid` values in lid_train (across all years).
lid_train['uuid'].nunique()

3594

In [67]:
# Distinct `uuid` count within each ecoregion3 group of the satellite table.
sat_uuid_by_region = sat_train.groupby('ecoregion3')['uuid']
sat_uuid_by_region.nunique()

ecoregion3
blue_mountains                                           87
cascades                                                704
coast_range                                            1760
columbia_plateau                                         10
eastern_cascades_slopes_and_foothills                   415
klamath_mountains_california_high_north_coast_range     291
north_cascades                                          411
northern_rockies                                        103
puget_lowland                                           159
willamette_valley                                        45
Name: uuid, dtype: int64

In [62]:
# (rows, columns) of the lidar pivot, including the added TOTAL row and TOTAL column.
lid_pivot.shape

(10, 10)

In [61]:
# Distinct `uuid` count for every (ecoregion3, year) pair in the satellite table.
sat_counts = sat_train.groupby(by=['ecoregion3', 'year'])[['uuid']].nunique().reset_index()

# Reshape into an ecoregion (rows) x year (columns) table of those counts.
sat_pivot = pd.pivot_table(
    sat_counts,
    index=['ecoregion3'],
    columns=['year'],
    values=['uuid'],
)['uuid']

# Row labels become the first two words of the ecoregion key in upper case;
# column labels become plain integers.
sat_pivot.index = [' '.join(region.split('_')[:2]).upper() for region in sat_pivot.index]
sat_pivot.columns = [int(yr) for yr in sat_pivot.columns]

# Leave the COLUMBIA PLATEAU row out of the table.
sat_pivot = sat_pivot.drop(index='COLUMBIA PLATEAU')

# Append a TOTAL column (sum across years) and then a TOTAL row (sum down rows).
row_totals = sat_pivot.sum(axis=1).astype(int)
sat_pivot['TOTAL'] = row_totals
column_totals = sat_pivot.sum(axis=0).astype(int)
sat_pivot.loc['TOTAL'] = column_totals

# Display only: any missing combination is rendered as '--'.
sat_pivot.fillna('--')

Unnamed: 0,2019,2020,TOTAL
BLUE MOUNTAINS,87,87,174
CASCADES,704,704,1408
COAST RANGE,1758,1304,3062
EASTERN CASCADES,413,415,828
KLAMATH MOUNTAINS,291,260,551
NORTH CASCADES,411,411,822
NORTHERN ROCKIES,102,103,205
PUGET LOWLAND,158,159,317
WILLAMETTE VALLEY,45,45,90
TOTAL,3969,3488,7457


In [29]:
# Number of unique `uuid` values in the lidar and satellite tables, in that order.
tuple(len(pd.unique(table.uuid)) for table in (lid_train, sat_train))

(3594, 3985)

In [34]:
# Lidar-derived predictor columns.
# NOTE(review): 'height_95_percentile' uses '_' where the other height columns
# use '-' (e.g. 'height_05-percentile') -- confirm it matches the real column
# name in the training table before using this list for selection.
USE_LIDAR_COLS = ['strat0_return-proportion', 'strat1_return-proportion', 
                  'strat2_return-proportion', 'strat3_return-proportion', 
                  'strat4_return-proportion', 'strat5_return-proportion', 
                  'height_05-percentile',  'height_25-percentile', 
                  'height_50-percentile', 'height_75-percentile',
                  'height_95_percentile', 'cover', 
                  'potential_volume', 'stddev_height', 
                  'surface_volume', 'kurtosis', 'skewness']
# Full feature list: the lidar columns plus elevation and plot coordinates.
LIDAR_X_COLS = USE_LIDAR_COLS + ['elevation', 'lat', 'lon']
# Number of lidar-only features; counted directly rather than subtracting a
# hard-coded 3 that would silently go stale if the appended list changed.
len(USE_LIDAR_COLS)

17

In [31]:
# Satellite predictor columns, assembled from named groups. The concatenation
# order below defines the final column order.
S2_LEAFOFF_BANDS = [
    'S2_R_LEAFOFF', 'S2_G_LEAFOFF', 'S2_B_LEAFOFF', 'S2_NIR_LEAFOFF',
    'S2_SWIR1_LEAFOFF', 'S2_SWIR2_LEAFOFF',
    'S2_RE1_LEAFOFF', 'S2_RE2_LEAFOFF', 'S2_RE3_LEAFOFF',
]
# NOTE(review): the leaf-on group has an RE4 entry while the leaf-off group
# does not, yet 'S2_dRE4' is listed among the differences -- confirm intended.
S2_LEAFON_BANDS = [
    'S2_R_LEAFON', 'S2_G_LEAFON', 'S2_B_LEAFON', 'S2_NIR_LEAFON',
    'S2_SWIR1_LEAFON', 'S2_SWIR2_LEAFON',
    'S2_RE1_LEAFON', 'S2_RE2_LEAFON', 'S2_RE3_LEAFON', 'S2_RE4_LEAFON',
]
S2_LEAFON_INDICES = [
    'S2_NDVI_LEAFON', 'S2_SAVI_LEAFON', 'S2_BRIGHTNESS_LEAFON',
    'S2_GREENNESS_LEAFON', 'S2_WETNESS_LEAFON',
]
S2_LEAFOFF_INDICES = [
    'S2_NDVI_LEAFOFF', 'S2_SAVI_LEAFOFF', 'S2_BRIGHTNESS_LEAFOFF',
    'S2_GREENNESS_LEAFOFF', 'S2_WETNESS_LEAFOFF',
]
S2_DIFFERENCES = [
    'S2_dR', 'S2_dG', 'S2_dB', 'S2_dNIR', 'S2_dSWIR1', 'S2_dSWIR2',
    'S2_dRE1', 'S2_dRE2', 'S2_dNDVI', 'S2_dSAVI', 'S2_dBRIGHTNESS',
    'S2_dGREENNESS', 'S2_dWETNESS', 'S2_dRE3', 'S2_dRE4',
]
LT_METRICS = [
    'LT_DUR_NBR', 'LT_DUR_SWIR1', 'LT_MAG_NBR', 'LT_MAG_SWIR1',
    'LT_RATE_NBR', 'LT_RATE_SWIR1', 'LT_YSD_NBR', 'LT_YSD_SWIR1',
]

SAT_COLS = (
    S2_LEAFOFF_BANDS
    + S2_LEAFON_BANDS
    + S2_LEAFON_INDICES
    + S2_LEAFOFF_INDICES
    + S2_DIFFERENCES
    + LT_METRICS
    + ['elevation', 'lat', 'lon']
)

In [33]:
# Number of satellite-derived features: SAT_COLS minus its three trailing
# non-satellite entries ('elevation', 'lat', 'lon').
len(SAT_COLS) - 3

52