# Wildfire Smoke Controls on Gross Primary Production in Central Canada

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats.stats import pearsonr

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# get Sam's stepwise selection function
import statsmodels.api as sm
%run ../lab02/Tutorial_2_2021_functions2.ipynb

# show full dataframes
pd.set_option('max_columns', None)

In [2]:
# get data
data_in = pd.read_csv("drf_timeseries.csv", parse_dates=True, index_col=0)

In [3]:
# select just the growing/wildfire season (may-sept, inclusive)
#data_in["datetime"] = pd.to_datetime(data_in["datetime"])
growing_season = pd.DataFrame()
for month in [5,6,7,8,9]:
    growing_season = growing_season.append(data_in[pd.to_datetime(data_in['datetime']).dt.month == month])

## Find which measurements have the most complete data

In [4]:
# check data for completeness
def check_complete(data_in):
    """
    Prints out the percentage of non-NaN values in a dataset
    """
    data_len = np.shape(data_in)[0]
    for key in data_in.keys():
        if key != "datetime":
            not_nans = np.shape((data_in[np.isnan(data_in[key]) == False]))[0]
            completeness = not_nans / data_len * 100
            print(f"{key}:  {round(completeness,2)} %")
 
def keep_complete(data_in, thres):
    """
    returns a dataframe that contains a percentage of non-NaNs above
    a specified threshhold
    """
    data_out = pd.DataFrame()
    data_out["datetime"] = data_in["datetime"]
    data_len = np.shape(data_in)[0]
    for key in data_in.keys():
        if key != "datetime":
            not_nans = np.shape((data_in[np.isnan(data_in[key]) == False]))[0]
            completeness = not_nans / data_len * 100
            if completeness >= thres:
                data_out[key] = data_in[key]
    return data_out

In [5]:
check_complete(growing_season)

smoke:  98.84 %
AOD_1640nm:  5.23 %
AOD_1020nm:  6.23 %
AOD_870nm:  6.45 %
AOD_865nm:  0.0 %
AOD_779nm:  0.0 %
AOD_675nm:  6.44 %
AOD_667nm:  0.0 %
AOD_620nm:  0.0 %
AOD_560nm:  0.0 %
AOD_555nm:  0.0 %
AOD_551nm:  0.0 %
AOD_532nm:  0.0 %
AOD_531nm:  0.0 %
AOD_510nm:  0.0 %
AOD_500nm:  5.51 %
AOD_490nm:  0.0 %
AOD_443nm:  0.0 %
AOD_440nm:  6.32 %
AOD_412nm:  0.0 %
AOD_400nm:  0.0 %
AOD_380nm:  6.43 %
AOD_340nm:  6.38 %
Precipitable_Water(cm):  6.44 %
AOD_681nm:  0.0 %
AOD_709nm:  0.0 %
Ozone(Dobson):  6.45 %
NO2(Dobson):  6.45 %
USTAR:  35.13 %
TA:  36.0 %
WD:  36.01 %
WS:  36.02 %
NEE_PI:  26.2 %
FC:  34.41 %
SC:  34.61 %
H:  34.89 %
SH:  0.0 %
LE:  34.41 %
SLE:  0.0 %
G:  0.0 %
TS_1:  36.02 %
TS_2:  36.02 %
P:  36.63 %
RH:  35.81 %
PA:  36.6 %
CO2_1:  34.64 %
CO2_2:  27.2 %
VPD_PI:  35.81 %
SWC_1:  9.05 %
SWC_2:  9.05 %
NETRAD:  35.51 %
PPFD_IN:  36.57 %
SW_IN:  35.65 %
SW_DIF:  0.0 %
PPFD_OUT:  34.95 %
SW_OUT:  35.49 %
LW_IN:  36.0 %
LW_OUT:  35.93 %
H2O:  34.64 %
RECO_PI:  4.32 %
AP

Pretty dismal. We need the AOD data, so set a cutoff at 5%. We can hopefully interpolate the rest

In [7]:
comp_data = keep_complete(growing_season, 5.0)
check_complete(comp_data) # see if that worked

smoke:  98.84 %
AOD_1640nm:  5.23 %
AOD_1020nm:  6.23 %
AOD_870nm:  6.45 %
AOD_675nm:  6.44 %
AOD_500nm:  5.51 %
AOD_440nm:  6.32 %
AOD_380nm:  6.43 %
AOD_340nm:  6.38 %
Precipitable_Water(cm):  6.44 %
Ozone(Dobson):  6.45 %
NO2(Dobson):  6.45 %
USTAR:  35.13 %
TA:  36.0 %
WD:  36.01 %
WS:  36.02 %
NEE_PI:  26.2 %
FC:  34.41 %
SC:  34.61 %
H:  34.89 %
LE:  34.41 %
TS_1:  36.02 %
TS_2:  36.02 %
P:  36.63 %
RH:  35.81 %
PA:  36.6 %
CO2_1:  34.64 %
CO2_2:  27.2 %
VPD_PI:  35.81 %
SWC_1:  9.05 %
SWC_2:  9.05 %
NETRAD:  35.51 %
PPFD_IN:  36.57 %
SW_IN:  35.65 %
PPFD_OUT:  34.95 %
SW_OUT:  35.49 %
LW_IN:  36.0 %
LW_OUT:  35.93 %
H2O:  34.64 %
PPFD_DIF:  34.87 %


In [8]:
# get all the AOD 
aod_list = []
for key in data.keys():
    if "AOD" in key:
        aod_list.append(key)
        print(key)

NameError: name 'data' is not defined

In [None]:
data.plot("datetime", aod_list, alpha=0.5);

In [None]:
# interpolation method I settled on (subject to change). Try messing around with this, 
# lots of methods available, most arent appropriate
data["AOD_500nm"].interpolate(method='pchip', limit=500, limit_direction='both').plot();

In [None]:
# now apply the interpolation scheme to all AOD sets
for key in aod_list:
    data[key] = data["AOD_500nm"].interpolate(method='pchip', limit=500, limit_direction='both')

In [None]:
# Blanket interpolate all data using the same scheme (this is a bad idea)
data_interp = data.interpolate(method='linear')#.dropna()
data_interp.plot()

In [None]:
# after we finish messing about with interpolations, drop all rows that still have missing data
data_complete = data.dropna()
data_complete

## Part 2: Try PCA 

In [None]:
data = data_complete.drop("datetime", axis=1)
data = data_interp#.drop("datetime", axis=1)

n_modes = np.min(np.shape(data))
pca = PCA(n_components = n_modes)
PCs = pca.fit_transform(data)
eigvecs = pca.components_
fracVar = pca.explained_variance_ratio_

In [None]:
#plot fraction of variance explained by each mode
plt.figure(figsize=(10,5))

plt.subplot(1,2,1)
plt.scatter(range(len(fracVar)),fracVar)
plt.xlabel('Mode Number')
plt.ylabel('Fraction Variance Explained')
plt.title('Variance Explained by All Modes')

plt.subplot(1,2,2)
n_modes_show = 10
plt.scatter(range(n_modes_show),fracVar[:n_modes_show])
plt.xlabel('Mode Number')
plt.ylabel('Fraction Variance Explained')
plt.title('Variance Explained by First ' + str(n_modes_show) + ' Modes')

plt.tight_layout()

plt.show()

In [None]:
plt.plot(PCs[...,:4]);

This works about as we expect, the vast majority is explained by the first few modes. Now, what are the biggest contributors to these modes?

## Stepwise Linear Regression

Perform stepwise linear regression on the PCs to create a predictive model with inputs

**PCs $\rightarrow$ CO2 Fluxes**

or, if we do rotated PCA:

**smoke, PCs $\rightarrow$ CO2 Fluxes**

In [None]:
# assign predictors and predictands
X = pd.DataFrame(PCs)
Y = data["FC"]

print(X.shape)
print(Y.shape)
# do stepwise regression
#result = stepwise_selection(X, Y)

#print('resulting features:')
#print(result)

In [None]:
plt.plot(eigvecs[0], label=data.columns[0])
plt.legend()

## To Do

- Interpolate all columns, eyeball-optimizing between coverage and faithful interpolated values
- Figure out how to represent smoke as input and CO2 flux data as output
- Try it again with rotated PCA
- Write up the presentation


- find characteristic patterns after grouping by smoke level?