In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_info_columns', 999)

In [3]:
# INVENTORY FEATURES
FVS = '../data/processed/fvs_outputs/fvs_outputs.csv'

# CLIMATE FEATURES
CLIMATE = '../data/processed/pnw_lidar_plots_climatewna_2000-2017MSY.csv'

# LIDAR FEATURES
CLOUDMETS = '../data/processed/lidar/plot_clips/cloudmetrics.csv'
GRIDSURF = '../data/processed/lidar/plot_clips/gridsurfacestats.csv'
TOPOMETS = '../data/processed/lidar/plot_clips/topometrics.csv'

# SATELLITE FEATURES
L8 = '../data/interim/gee/landsat_2014-2020.csv'
# MODIS = '../data/interim/modis_metrics.csv'
S2 = '../data/interim/gee/sentinel2sr_2019-2020.csv'
S1 = '../data/interim/gee/sentinel1_2015-2020.csv'
PALSAR = '../data/interim/gee/palsar_2010-2018.csv'
LANDTRENDR = '../data/interim/gee/landtrendr_2010-2020.csv'

## Combine features that are time-invariant for a plot
We'll identify the soil types, distance from water, and some attributes like latitude, longitude, potential vegetation type, and ecoregion.

In [4]:
# PLOT FEATURES
PLOT_LOCS = '../data/processed/blm_usfs_wadnr_plot_footprints.shp'
SOILS = '../data/processed/soils/soils_data_forPlots.csv'
POTVEG = '../data/processed/pot_veg/pot_veg_plots.csv'
WATER = '../data/interim/dist_to_water/water_distance_fromPlots.csv'
ECOREGIONS = '../data/raw/epa/us_eco_l3_state_boundaries.shp'

In [5]:
plot_locs = gpd.read_file(PLOT_LOCS).to_crs(5070)
KEEP_PLOT_COLS = ['lat', 'lon', 'orig_id', 'uuid', 'source', 'meas_yr', 'meas_date', 'geometry']
plot_locs['uuid'] = plot_locs.uuid.str[0:8]
plot_locs = plot_locs[KEEP_PLOT_COLS]
plot_locs.info()

  return _prepare_from_string(" ".join(pjargs))


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5089 entries, 0 to 5088
Data columns (total 8 columns):
lat          5089 non-null float64
lon          5089 non-null float64
orig_id      5089 non-null object
uuid         5089 non-null object
source       5089 non-null object
meas_yr      5089 non-null int64
meas_date    3866 non-null object
geometry     5089 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 318.1+ KB


We need to do a spatial join to identify the ecoregion that each plot falls within

In [6]:
ecoreg = gpd.read_file(ECOREGIONS)
ecoreg = ecoreg.loc[ecoreg.STATE_NAME.isin(['Washington', 'Oregon'])]
ecoreg['ecoregion3'] = ecoreg.US_L3NAME.str.lower().str.replace(' ', '_').str.replace('/', '_')
ecoreg.crs = {'init': 'epsg:5070'}
ecoreg.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 82 entries, 15 to 1630
Data columns (total 15 columns):
US_L3CODE     82 non-null object
US_L3NAME     82 non-null object
NA_L3CODE     82 non-null object
NA_L3NAME     82 non-null object
NA_L2CODE     82 non-null object
NA_L2NAME     82 non-null object
NA_L1CODE     82 non-null object
NA_L1NAME     82 non-null object
STATE_NAME    82 non-null object
EPA_REGION    82 non-null int64
L3_KEY        82 non-null object
L2_KEY        82 non-null object
L1_KEY        82 non-null object
geometry      82 non-null object
ecoregion3    82 non-null object
dtypes: int64(1), object(14)
memory usage: 10.2+ KB


In [7]:
# add the ecoregion to the plot table
plot_locs['geometry'] = plot_locs.geometry.centroid
ecoreg_cols = ecoreg.columns.drop('geometry')
ecoreg_join = gpd.sjoin(plot_locs.to_crs(5070), ecoreg.to_crs(5070), how='left').drop(['index_right'], axis=1)
ecoreg_join.loc[ecoreg_join.uuid == '9d4ab655', ecoreg_cols] = ecoreg.loc[ecoreg.ecoregion3 == 'puget_lowland', ecoreg_cols].drop_duplicates().values

  return _prepare_from_string(" ".join(pjargs))


In [8]:
KEEP_PLOT_COLS = ['lat', 'lon', 'orig_id', 'uuid', 'source', 'meas_yr', 'ecoregion3']
plot_info = ecoreg_join[KEEP_PLOT_COLS].copy()

plot_info.loc[plot_info.source.str.contains('USFS'), 'agency'] = 'USFS'
plot_info.loc[plot_info.source.str.contains('BLM'), 'agency'] = 'BLM'
plot_info.loc[plot_info.source.str.contains('WA-DNR'), 'agency'] = 'WADNR'

plot_info.loc[plot_info.agency == 'USFS', 'plot_size_ac'] = 1/4
plot_info.loc[plot_info.agency == 'BLM', 'plot_size_ac'] = 1/8
plot_info.loc[plot_info.agency == 'WADNR', 'plot_size_ac'] = 1/10

plot_info = plot_info.set_index('uuid')
plot_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5089 entries, d7c01e3a to 4214cf54
Data columns (total 8 columns):
lat             5089 non-null float64
lon             5089 non-null float64
orig_id         5089 non-null object
source          5089 non-null object
meas_yr         5089 non-null int64
ecoregion3      5089 non-null object
agency          5089 non-null object
plot_size_ac    5089 non-null float64
dtypes: float64(3), int64(1), object(4)
memory usage: 357.8+ KB


In [9]:
soils = pd.read_csv(SOILS)
soils['uuid'] = soils.uuid.str[0:8]
soils = soils.set_index('uuid')
soils.columns = [col.lower() for col in soils.columns]
KEEP_SOILS = ['bulk_dens', 'soil_depth', 'pct_clay_surf', 'pct_rock_surf', 'pct_sand_surf']
soils = soils[KEEP_SOILS]
soils.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5089 entries, 075cbfbc to 25d94682
Data columns (total 5 columns):
bulk_dens        5089 non-null int64
soil_depth       5089 non-null float64
pct_clay_surf    5089 non-null int64
pct_rock_surf    5089 non-null int64
pct_sand_surf    5089 non-null int64
dtypes: float64(1), int64(4)
memory usage: 238.5+ KB


In [10]:
potveg = pd.read_csv(POTVEG)
potveg['uuid'] = potveg['uuid'].str[0:8]
potveg = potveg.set_index('uuid')
potveg.columns = [col.lower() for col in potveg.columns]
potveg = potveg.rename({'esp2':'pot_veg_type'}, axis=1)[['pot_veg_type']]
potveg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5089 entries, 0fc9bef5 to 25d94682
Data columns (total 1 columns):
pot_veg_type    5089 non-null int64
dtypes: int64(1)
memory usage: 79.5+ KB


In [11]:
water = pd.read_csv(WATER)
water['uuid'] = water.uuid.str[0:8]
water = water.set_index('uuid')
water.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5089 entries, 075cbfbc to 25d94682
Data columns (total 1 columns):
distance_to_water_m    5089 non-null float64
dtypes: float64(1)
memory usage: 79.5+ KB


In [12]:
plot_atts = plot_info.join(soils, how='outer').join(water, how='outer').join(potveg, how='outer')
plot_atts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5089 entries, 00027724 to fff7e1c3
Data columns (total 15 columns):
lat                    5089 non-null float64
lon                    5089 non-null float64
orig_id                5089 non-null object
source                 5089 non-null object
meas_yr                5089 non-null int64
ecoregion3             5089 non-null object
agency                 5089 non-null object
plot_size_ac           5089 non-null float64
bulk_dens              5089 non-null int64
soil_depth             5089 non-null float64
pct_clay_surf          5089 non-null int64
pct_rock_surf          5089 non-null int64
pct_sand_surf          5089 non-null int64
distance_to_water_m    5089 non-null float64
pot_veg_type           5089 non-null int64
dtypes: float64(5), int64(6), object(4)
memory usage: 636.1+ KB


In [13]:
plot_atts.to_csv('../data/processed/plot_features.csv', index=True, header=True)

## Prepare monthly, seasonal, and annual climate features

In [14]:
climate = pd.read_csv(CLIMATE).rename(
    {'Year': 'year', 'ID1':'uuid', 'ID2':'source'},
    axis=1)

In [15]:
climate = climate.set_index(['uuid', 'year'])

In [16]:
# breaking the climatena columns into groups
MONTHLY_COLS = ['Tmax01','Tmax02','Tmax03','Tmax04','Tmax05','Tmax06',
                'Tmax07','Tmax08','Tmax09','Tmax10','Tmax11','Tmax12',
                'Tmin01','Tmin02','Tmin03','Tmin04','Tmin05','Tmin06',
                'Tmin07','Tmin08','Tmin09','Tmin10','Tmin11','Tmin12',
                'Tave01','Tave02','Tave03','Tave04','Tave05','Tave06',
                'Tave07','Tave08','Tave09','Tave10','Tave11','Tave12',
                'PPT01','PPT02','PPT03','PPT04','PPT05','PPT06',
                'PPT07','PPT08','PPT09','PPT10','PPT11','PPT12',
                'Rad01','Rad02','Rad03','Rad04','Rad05','Rad06',
                'Rad07','Rad08','Rad09','Rad10','Rad11','Rad12',
                'DD_0_01','DD_0_02','DD_0_03','DD_0_04','DD_0_05','DD_0_06',
                'DD_0_07','DD_0_08','DD_0_09','DD_0_10','DD_0_11','DD_0_12',
                'DD5_01','DD5_02','DD5_03','DD5_04','DD5_05','DD5_06',
                'DD5_07','DD5_08','DD5_09','DD5_10','DD5_11','DD5_12',
                'DD_18_01','DD_18_02','DD_18_03','DD_18_04','DD_18_05','DD_18_06',
                'DD_18_07','DD_18_08','DD_18_09','DD_18_10','DD_18_11','DD_18_12',
                'DD18_01','DD18_02','DD18_03','DD18_04','DD18_05','DD18_06',
                'DD18_07','DD18_08','DD18_09','DD18_10','DD18_11','DD18_12',
                'NFFD01','NFFD02','NFFD03','NFFD04','NFFD05','NFFD06',
                'NFFD07','NFFD08','NFFD09','NFFD10','NFFD11','NFFD12',
                'PAS01','PAS02','PAS03','PAS04','PAS05','PAS06',
                'PAS07','PAS08','PAS09','PAS10','PAS11','PAS12',
                'Eref01','Eref02','Eref03','Eref04','Eref05','Eref06',
                'Eref07','Eref08','Eref09','Eref10','Eref11','Eref12',
                'CMD01','CMD02','CMD03','CMD04','CMD05','CMD06',
                'CMD07','CMD08','CMD09','CMD10','CMD11','CMD12',
                'RH01','RH02','RH03','RH04','RH05','RH06',
                'RH07','RH08','RH09','RH10','RH11','RH12']
SEASON_COLS = ['Tmax_wt','Tmax_sp','Tmax_sm','Tmax_at',
               'Tmin_wt','Tmin_sp','Tmin_sm','Tmin_at',
               'Tave_wt','Tave_sp','Tave_sm','Tave_at',
               'PPT_wt','PPT_sp','PPT_sm','PPT_at',
               'Rad_wt','Rad_sp','Rad_sm','Rad_at',
               'DD_0_wt','DD_0_sp','DD_0_sm','DD_0_at',
               'DD5_wt','DD5_sp','DD5_sm','DD5_at',
               'DD_18_wt','DD_18_sp','DD_18_sm','DD_18_at',
               'DD18_wt','DD18_sp','DD18_sm','DD18_at',
               'NFFD_wt','NFFD_sp','NFFD_sm','NFFD_at',
               'PAS_wt','PAS_sp','PAS_sm','PAS_at',
               'Eref_wt','Eref_sp','Eref_sm','Eref_at',
               'CMD_wt','CMD_sp','CMD_sm','CMD_at',
               'RH_wt','RH_sp','RH_sm','RH_at']
ANN_COLS = ['MAT','MWMT','MCMT','TD','MAP','MSP',
            'AHM','SHM','DD_0','DD5','DD_18','DD18',
            'NFFD','bFFP','eFFP','FFP','PAS','EMT',
            'EXT','MAR','Eref','CMD','RH']

In [17]:
climate_monthly = climate[MONTHLY_COLS].replace(-9999.0, np.nan)
climate_seasonal = climate[SEASON_COLS].replace(-9999.0, np.nan).drop(['Rad_wt','Rad_sp','Rad_sm','Rad_at'], axis=1)
climate_annual = climate[ANN_COLS].replace(-9999.0, np.nan).drop('MAR', axis=1)

In [18]:
# replace current year values for fall and winter with previous year values
# to reflect run-up to current growing season
FALL_WINT_COLS = [x for x in climate_monthly.columns if x[-2:] in ['09', '10', '11', '12']]
climate_monthly[FALL_WINT_COLS] = climate_monthly.groupby(level=[0,1])[FALL_WINT_COLS].sum().shift()
climate_monthly.loc[climate_monthly.index.get_level_values(1) == 2000, FALL_WINT_COLS] = np.nan

In [19]:
climate_annual.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MAT,MWMT,MCMT,TD,MAP,MSP,AHM,SHM,DD_0,DD5,DD_18,DD18,NFFD,bFFP,eFFP,FFP,PAS,EMT,EXT,Eref,CMD,RH
uuid,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
d7c01e3a,2000,5.0,15.4,-2.8,18.2,1019,119,14.7,129.9,542,1233,4733,30,172,156,261,105,365,-33.4,34.5,655,428,62
c16be14e,2000,6.6,17.3,-1.3,18.6,680,89,24.5,194.2,395,1566,4199,71,184,151,267,116,159,-32.3,36.9,789,536,58
b4e059b8,2000,4.4,14.7,-3.2,17.9,1019,127,14.2,116.1,596,1106,4948,21,165,161,259,98,403,-33.9,33.9,625,395,63
e3b77390,2000,7.0,17.7,-1.4,19.0,583,96,29.2,184.2,380,1666,4086,88,185,148,267,119,125,-33.2,37.5,811,551,56
5588b367,2000,4.5,14.8,-3.2,18.0,1033,128,14.0,115.1,596,1112,4940,22,167,159,260,101,407,-33.8,33.8,621,390,64


In [20]:
climate_monthly.to_csv('../data/processed/climate_features_monthly.csv', index=True, header=True)
climate_seasonal.to_csv('../data/processed/climate_features_seasonal.csv', index=True, header=True)
climate_annual.to_csv('../data/processed/climate_features_annual.csv', index=True, header=True)

## Combine LiDAR Features
We've got cloudmetrics, gridsurfacestats, and topometrics attributes to combine

In [21]:
cloudmets = pd.read_csv(CLOUDMETS, low_memory=False)
cloudmets['uuid'] = cloudmets.FileTitle.apply(lambda x: x.split('_')[0]).str[0:8]
cloudmets['lidar_year'] = cloudmets.FileTitle.apply(lambda x: int(x.split('_')[-1].split('-')[-1]))
cloudmets['lidar_acq'] = cloudmets.FileTitle.apply(lambda x: '_'.join(x.split('_')[1:3]))

In [22]:
RENAME_COLS = {'Total return count': 'num_returns',
               'Elev strata (below 0.15) return proportion':'strat0_return-proportion',
               'Elev strata (0.15 to 1.37) return proportion':'strat1_return-proportion',
               'Elev strata (5.00 to 10.00) return proportion':'strat2_return-proportion',
               'Elev strata (10.00 to 20.00) return proportion':'strat3_return-proportion',
               'Elev strata (20.00 to 30.00) return proportion':'strat4_return-proportion',
               'Elev strata (above 30.00) return proportion':'strat5_return-proportion',
               'Elev skewness': 'skewness',
               'Elev kurtosis': 'kurtosis',
               'Int strata (below 0.15) median':'strat0_intensity-median',
               'Int strata (0.15 to 1.37) median':'strat1_intensity-median',
               'Int strata (1.37 to 5.00) median':'strat2_intensity-median',
               'Int strata (5.00 to 10.00) median':'strat3_intensity-median',
               'Int strata (10.00 to 20.00) median':'strat4_intensity-median',
               'Int strata (above 30.00) median':'strat5_intensity-median',
               'Elev P05':'height_05-percentile','Elev P25':'height_25-percentile',
               'Elev P50':'height_50-percentile',
               'Elev P75':'height_75-percentile',
               'Elev P95':'height_95_percentile',
               'Elev maximum':'height_max',
               'Percentage all returns above 1.37':'cover'}

In [23]:
cloudmets = cloudmets.rename(RENAME_COLS, axis=1)
KEEP_COLS = [col for col in RENAME_COLS.values()] + ['uuid', 'lidar_acq', 'lidar_year']
cloudmets = cloudmets[KEEP_COLS].set_index(['uuid', 'lidar_year'])
cloudmets.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5135 entries, (136a045a, 2016) to (814487b1, 2016)
Data columns (total 23 columns):
num_returns                 5135 non-null int64
strat0_return-proportion    5135 non-null float64
strat1_return-proportion    5135 non-null float64
strat2_return-proportion    5135 non-null float64
strat3_return-proportion    5135 non-null float64
strat4_return-proportion    5135 non-null float64
strat5_return-proportion    5135 non-null float64
skewness                    5135 non-null object
kurtosis                    5135 non-null object
strat0_intensity-median     5135 non-null float64
strat1_intensity-median     5135 non-null float64
strat2_intensity-median     5135 non-null float64
strat3_intensity-median     5135 non-null float64
strat4_intensity-median     5135 non-null float64
strat5_intensity-median     5135 non-null float64
height_05-percentile        5135 non-null float64
height_25-percentile        5135 non-null float64
height_50-percentile

In [24]:
cloudmets.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_returns,strat0_return-proportion,strat1_return-proportion,strat2_return-proportion,strat3_return-proportion,strat4_return-proportion,strat5_return-proportion,skewness,kurtosis,strat0_intensity-median,strat1_intensity-median,strat2_intensity-median,strat3_intensity-median,strat4_intensity-median,strat5_intensity-median,height_05-percentile,height_25-percentile,height_50-percentile,height_75-percentile,height_95_percentile,height_max,cover,lidar_acq
uuid,lidar_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
136a045a,2016,29835,0.037774,0.028758,0.048064,0.293079,0.484632,0.088587,-0.651806,2.968496,768.0,16.0,1507.5,1720.0,2899.0,10218.0,0.28,15.38,21.610001,26.68,32.209999,40.32,93.34674,wa-western_2016
43215ef3,2012,10622,0.226793,0.038411,0.034551,0.212201,0.442101,0.031538,-0.414682,1.6657939999999998,7.0,7.0,4.0,3.0,3.0,5.0,0.0,0.35,19.244999,24.139999,28.629,35.720001,73.479571,chehalis_2012
b7710811,2014,7949,0.042395,0.053592,0.100013,0.089948,0.182664,0.38596,-0.148031,1.486143,19.0,47.0,48.0,40.0,18.0,62.0,0.23,5.33,24.969999,34.52,41.366001,47.84,90.401308,willapa-delivery2_2014
bf16c38f,2006,1402,0.35806,0.097004,0.067047,0.268188,0.177603,0.008559,0.445194,1.702418,39.0,35.0,13.0,11.0,12.0,25.0,0.0,0.0,6.675,18.15,25.4785,32.32,54.493581,ahtanum_2006
853abb5f,2005,517,0.174081,0.116054,0.015474,0.284333,0.410058,0.0,-0.489827,1.632783,446.5,505.0,-9999.0,542.5,632.0,-9999.0,0.0,0.44,18.27,23.129999,26.257999,28.629999,70.98646,lower-columbia_2005


In [25]:
gridsurf = pd.read_csv(GRIDSURF)
gridsurf['uuid'] = gridsurf.plot_id.apply(lambda x: x.split('_')[0]).str[0:8]
gridsurf['lidar_year'] = gridsurf.plot_id.apply(lambda x: int(x.split('_')[-1].split('-')[-1]))
KEEP_COLS = ['uuid', 'lidar_year', 'potential_volume', 'stddev_height',
             'surface_area_ratio', 'surface_volume', 'surface_volume_ratio']
gridsurf = gridsurf[KEEP_COLS].set_index(['uuid', 'lidar_year'])
gridsurf.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5135 entries, (00027724, 2015) to (fff7e1c3, 2008)
Data columns (total 5 columns):
potential_volume        5135 non-null float64
stddev_height           5135 non-null float64
surface_area_ratio      5135 non-null float64
surface_volume          5135 non-null float64
surface_volume_ratio    4830 non-null float64
dtypes: float64(5)
memory usage: 249.2+ KB


In [26]:
gridsurf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,potential_volume,stddev_height,surface_area_ratio,surface_volume,surface_volume_ratio
uuid,lidar_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00027724,2015,94.0,0.079546,1.024375,1.406838,0.025674
0080963b,2008,5046.0,11.256773,7.121909,3391.452148,0.717014
0083eb5e,2006,2658.0,8.932764,6.092628,831.681885,0.312898
008dc9d1,2013,3718.5,2.579358,2.883042,3173.707031,0.860225
008dc9d1,2016,3895.0,2.444401,2.553517,3310.491699,0.869114


In [27]:
topomets = pd.read_csv(TOPOMETS)
topomets['uuid'] = topomets.plot_id.apply(lambda x: x.split('_')[0]).str[0:8]
topomets['lidar_year'] = topomets.plot_id.apply(lambda x: int(x.split('_')[-1].split('-')[-1]))
topomets = topomets.drop('plot_id', axis=1).set_index(['uuid', 'lidar_year'])
topomets.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5135 entries, (00027724, 2015) to (fff7e1c3, 2008)
Data columns (total 7 columns):
aspect                   5069 non-null float64
elevation                5135 non-null float64
overall_curvature        5069 non-null float64
plan_curvature           5069 non-null float64
profile_curvature        5069 non-null float64
slope                    5069 non-null float64
solar_radiation_index    5069 non-null float64
dtypes: float64(7)
memory usage: 329.5+ KB


In [28]:
lidar_features = cloudmets.join(gridsurf, how='outer').join(topomets, how='outer')
lidar_features.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5411 entries, (00027724, 2015) to (fff7e1c3, 2008)
Data columns (total 35 columns):
num_returns                 5411 non-null int64
strat0_return-proportion    5411 non-null float64
strat1_return-proportion    5411 non-null float64
strat2_return-proportion    5411 non-null float64
strat3_return-proportion    5411 non-null float64
strat4_return-proportion    5411 non-null float64
strat5_return-proportion    5411 non-null float64
skewness                    5411 non-null object
kurtosis                    5411 non-null object
strat0_intensity-median     5411 non-null float64
strat1_intensity-median     5411 non-null float64
strat2_intensity-median     5411 non-null float64
strat3_intensity-median     5411 non-null float64
strat4_intensity-median     5411 non-null float64
strat5_intensity-median     5411 non-null float64
height_05-percentile        5411 non-null float64
height_25-percentile        5411 non-null float64
height_50-percentile

In [29]:
lidar_features.to_csv('../data/processed/lidar_features.csv', index=True, header=True)

## Inventory features

In [30]:
fvs = pd.read_csv(FVS)
fvs['uuid'] = fvs['uuid'].str[0:8]
fvs = fvs.set_index(['uuid', 'year'])
fvs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 49410 entries, (ba510248, 2010) to (c4f7f099, 2025)
Data columns (total 75 columns):
tpa                    49410 non-null int64
ba                     49410 non-null int64
sdi                    49410 non-null int64
ccf                    49410 non-null int64
qmd                    49410 non-null float64
tcuft                  49410 non-null int64
topht                  49410 non-null int64
number_of_strata       49410 non-null int64
total_cover            49410 non-null int64
structure_class        49410 non-null object
canopy_baseheight      49410 non-null int64
canopy_bulkdensity     49410 non-null float64
aboveground_biomass    49410 non-null int64
aboveground_carbon     49410 non-null int64
gs_tpa                 49410 non-null int64
AF                     49410 non-null int64
AS                     49410 non-null int64
BM                     49410 non-null int64
BO                     49410 non-null int64
CH                     4

In [31]:
fvs.to_csv('../data/processed/inventory_features.csv', index=True, header=True)

## Combine satellite-derived features
We've got data from Sentinel-1, Sentinel-2, LANDSAT 8, PALSAR, and Landtrendr time-series.

In [32]:
s1 = pd.read_csv(S1)
s1['uuid'] = s1['uuid'].str[0:8]
s1 = s1.set_index(['uuid', 'year'])
s1.columns = ['_'.join(['S1', col.upper()]) for col in s1.columns]
s1.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 30534 entries, (d7c01e3a, 2015) to (0c3b98d7, 2020)
Data columns (total 4 columns):
S1_VV_LEAFOFF    30534 non-null float64
S1_VH_LEAFOFF    28860 non-null float64
S1_VV_LEAFON     30534 non-null float64
S1_VH_LEAFON     29115 non-null float64
dtypes: float64(4)
memory usage: 1.1+ MB


In [33]:
s2 = pd.read_csv(S2)
s2['uuid'] = s2['uuid'].str[0:8]
s2 = s2.set_index(['uuid', 'year'])
s2.columns = ['_'.join(['S2', col.upper()]) for col in s2.columns]
s2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10178 entries, (c16be14e, 2019) to (dcf80683, 2020)
Data columns (total 20 columns):
S2_R_LEAFOFF        10178 non-null float64
S2_G_LEAFOFF        10178 non-null float64
S2_B_LEAFOFF        10178 non-null float64
S2_NIR_LEAFOFF      10178 non-null float64
S2_SWIR1_LEAFOFF    10178 non-null float64
S2_SWIR2_LEAFOFF    10178 non-null float64
S2_RE1_LEAFOFF      10178 non-null float64
S2_RE2_LEAFOFF      10178 non-null float64
S2_RE3_LEAFOFF      10178 non-null float64
S2_RE4_LEAFOFF      10178 non-null float64
S2_R_LEAFON         10178 non-null float64
S2_G_LEAFON         10178 non-null float64
S2_B_LEAFON         10178 non-null float64
S2_NIR_LEAFON       10178 non-null float64
S2_SWIR1_LEAFON     10178 non-null float64
S2_SWIR2_LEAFON     10178 non-null float64
S2_RE1_LEAFON       10178 non-null float64
S2_RE2_LEAFON       10178 non-null float64
S2_RE3_LEAFON       10178 non-null float64
S2_RE4_LEAFON       10178 non-null float64
dtype

In [34]:
landsat = pd.read_csv(L8)
landsat['uuid'] = landsat['uuid'].str[0:8]
landsat = landsat.set_index(['uuid', 'year'])
landsat.columns = ['_'.join(['L8', col.upper()]) for col in landsat.columns]
landsat = landsat/1000.
landsat.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 35623 entries, (5588b367, 2014) to (0c3b98d7, 2020)
Data columns (total 12 columns):
L8_R_LEAFOFF        35425 non-null float64
L8_G_LEAFOFF        35425 non-null float64
L8_B_LEAFOFF        35425 non-null float64
L8_NIR_LEAFOFF      35425 non-null float64
L8_SWIR1_LEAFOFF    35425 non-null float64
L8_SWIR2_LEAFOFF    35425 non-null float64
L8_R_LEAFON         35621 non-null float64
L8_G_LEAFON         35621 non-null float64
L8_B_LEAFON         35621 non-null float64
L8_NIR_LEAFON       35621 non-null float64
L8_SWIR1_LEAFON     35621 non-null float64
L8_SWIR2_LEAFON     35621 non-null float64
dtypes: float64(12)
memory usage: 3.4+ MB


In [35]:
palsar = pd.read_csv(PALSAR)
palsar['uuid'] = palsar['uuid'].str[0:8]
palsar = palsar.set_index(['uuid', 'year'])
palsar = palsar[['HH', 'HV']]
palsar.columns = ['_'.join(['P', col.upper()]) for col in palsar.columns]
palsar.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 45801 entries, (5b5e205d, 2010) to (dcf80683, 2018)
Data columns (total 2 columns):
P_HH    20356 non-null float64
P_HV    20356 non-null float64
dtypes: float64(2)
memory usage: 889.8+ KB


In [36]:
landtrendr = pd.read_csv(LANDTRENDR)
landtrendr['uuid'] = landtrendr['uuid'].str[0:8]
landtrendr = landtrendr.set_index(['uuid', 'year'])
landtrendr.columns = ['_'.join(['LT', col.upper()]) for col in landtrendr.columns]
landtrendr = landtrendr.astype(int)
landtrendr.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 55979 entries, (d7c01e3a, 2010) to (4214cf54, 2020)
Data columns (total 8 columns):
LT_DUR_NBR       55979 non-null int64
LT_DUR_SWIR1     55979 non-null int64
LT_MAG_NBR       55979 non-null int64
LT_MAG_SWIR1     55979 non-null int64
LT_RATE_NBR      55979 non-null int64
LT_RATE_SWIR1    55979 non-null int64
LT_YSD_NBR       55979 non-null int64
LT_YSD_SWIR1     55979 non-null int64
dtypes: int64(8)
memory usage: 3.6+ MB


In [47]:
landtrendr.columns

Index(['LT_DUR_NBR', 'LT_DUR_SWIR1', 'LT_MAG_NBR', 'LT_MAG_SWIR1',
       'LT_RATE_NBR', 'LT_RATE_SWIR1', 'LT_YSD_NBR', 'LT_YSD_SWIR1'],
      dtype='object')

In [37]:
satellite_features = s1.join(s2, how='outer').join(palsar, how='outer').join(landsat, how='outer').join(landtrendr, how='outer')

In [44]:
df = satellite_features
for season in ['LEAFON', 'LEAFOFF']:
    for sensor in ['L8', 'S2']:
        R, G, B = f'{sensor}_R_{season}', f'{sensor}_G_{season}', f'{sensor}_B_{season}'
        NIR, SWIR1, SWIR2 =  f'{sensor}_NIR_{season}', f'{sensor}_SWIR1_{season}', f'{sensor}_SWIR2_{season}'

        NDVI = f'{sensor}_NDVI_{season}'
        df[NDVI] = (df[NIR] - df[R])/(df[NIR] + df[R])

        ENDVI = f'{sensor}_ENDVI_{season}'
        df[NDVI] = (df[NIR] + df[G] - 2*df[B])/(df[NIR] + df[G] + 2*df[B])

        SAVI = f'{sensor}_SAVI_{season}'
        df[SAVI] = 1.5*(df[NIR] - df[R])/(df[NIR] + df[R] + 0.5)
        
        BRIGHTNESS = f'{sensor}_BRIGHTNESS_{season}'
        df[BRIGHTNESS] = 0.3029*df[B] + 0.2786*df[G] + 0.4733*df[R] + 0.5599*df[NIR] + 0.508*df[SWIR1] + 0.1872*df[SWIR2]
        
        GREENNESS = f'{sensor}_GREENNESS_{season}'
        df[GREENNESS] = -0.2941*df[B] + -0.243*df[G] + -0.5424*df[R] + 0.7276*df[NIR] + 0.0713*df[SWIR1] + -0.1608*df[SWIR2]
        
        WETNESS = f'{sensor}_WETNESS_{season}'
        df[WETNESS] = 0.1511*df[B] + 0.1973*df[G] + 0.3283*df[R] + 0.3407*df[NIR] + -0.7117*df[SWIR1] + -0.4559*df[SWIR2]

for sensor in ['L8', 'S2']:
    if sensor == 'S2':
        addl_bands = ['RE1', 'RE2', 'RE3', 'RE4']
    else:
        addl_bands =  []
    for band in ['R', 'G', 'B', 'NIR', 'SWIR1', 'SWIR2', 'NDVI', 'SAVI', 'BRIGHTNESS', 'GREENNESS', 'WETNESS'] + addl_bands:
        df[f'{sensor}_d{band}'] = df[f'{sensor}_{band}_LEAFON'] - df[f'{sensor}_{band}_LEAFOFF']

In [45]:
satellite_features.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 55979 entries, (00027724, 2010) to (fff7e1c3, 2020)
Data columns (total 92 columns):
S1_VV_LEAFOFF            30534 non-null float64
S1_VH_LEAFOFF            28860 non-null float64
S1_VV_LEAFON             30534 non-null float64
S1_VH_LEAFON             29115 non-null float64
S2_R_LEAFOFF             10178 non-null float64
S2_G_LEAFOFF             10178 non-null float64
S2_B_LEAFOFF             10178 non-null float64
S2_NIR_LEAFOFF           10178 non-null float64
S2_SWIR1_LEAFOFF         10178 non-null float64
S2_SWIR2_LEAFOFF         10178 non-null float64
S2_RE1_LEAFOFF           10178 non-null float64
S2_RE2_LEAFOFF           10178 non-null float64
S2_RE3_LEAFOFF           10178 non-null float64
S2_RE4_LEAFOFF           10178 non-null float64
S2_R_LEAFON              10178 non-null float64
S2_G_LEAFON              10178 non-null float64
S2_B_LEAFON              10178 non-null float64
S2_NIR_LEAFON            10178 non-null float64
S

In [46]:
satellite_features.to_csv('../data/processed/satellite_features.csv', index=True, header=True)