# Diffuse Radiation Fertilization Statistics Function Library

Helper functions for Andrew Loeppky's EOSC 510 final project are collected here.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats.stats import pearsonr

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# show full dataframes
# NOTE: use the fully-qualified option name. The bare "max_columns" is treated
# as a pattern, and in recent pandas it matches more than one option
# (display.* and styler.render.*), which raises OptionError.
pd.set_option('display.max_columns', None)

# for temporarily displaying function output
from IPython.display import clear_output

In [None]:
def check_complete(data_in):
    """
    Print the percentage of non-missing values in each column of a dataset.

    The column named "datetime" is skipped. Missing values are detected with
    pandas' ``notna`` so that non-numeric (object / string) columns are
    handled as well as float columns.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Table whose columns are checked.

    Returns
    -------
    None
        The result is only printed, one ``"<column>:  <percent> %"`` line per
        column. For a frame with zero rows the percentage is reported as nan.
    """
    data_len = len(data_in)
    for key in data_in.keys():
        if key == "datetime":
            continue
        if data_len:
            # int() keeps the arithmetic in plain Python numbers so the
            # printed value is formatted like a builtin float
            present = int(data_in[key].notna().sum())
            completeness = present / data_len * 100
        else:
            # no rows: completeness is undefined rather than a division error
            completeness = float("nan")
        print(f"{key}:  {round(completeness, 2)} %")
    return None

In [None]:
def keep_complete(data_in, thres):
    """
    Return only the columns whose share of non-missing values meets a threshold.

    The column named "datetime" is never included in the result. Missing
    values are detected with pandas' ``notna`` so that non-numeric columns
    can be screened too.

    Parameters
    ----------
    data_in : pandas.DataFrame
        Table whose columns are screened.
    thres : float
        Minimum percentage (0-100) of non-missing values a column needs in
        order to be kept; the comparison is inclusive (``>=``).

    Returns
    -------
    pandas.DataFrame
        New frame holding the retained columns in their original order.
        It is empty when no column qualifies or when ``data_in`` has no rows.
    """
    data_len = len(data_in)
    kept = {}
    for key in data_in.keys():
        if key == "datetime":
            continue
        if data_len:
            completeness = int(data_in[key].notna().sum()) / data_len * 100
        else:
            # no rows: completeness is undefined, so nothing can pass
            completeness = float("nan")
        if completeness >= thres:
            kept[key] = data_in[key]
    # build the frame once instead of inserting column by column
    return pd.DataFrame(kept)

In [1]:
def do_PCA(data, figtitle):
    """
    Standardize the predictor columns of ``data`` and run a full PCA on them.

    The CO2 flux columns ("FC", "SC", "NEE_PI") are split off as outputs and
    the "smoke" column is dropped. Every remaining column is centred and
    scaled by its population standard deviation before the PCA is fit. Two
    scatter plots of the fraction of variance explained are shown, the user is
    prompted for the number of modes to retain, and the retained PCs and
    eigenvectors are plotted.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "FC", "SC", "NEE_PI" and "smoke"; all other
        columns are used as PCA inputs. The caller's frame is not modified.
    figtitle : str
        Title of the PC / eigenvector figure.

    Returns
    -------
    eigvecs : numpy.ndarray
        ``pca.components_``; row ``i`` is the eigenvector of mode ``i``.
    PCs : numpy.ndarray
        Output of ``pca.fit_transform``; column ``i`` belongs to mode ``i``.
    outputs : pandas.DataFrame
        The CO2 flux columns taken from ``data``.

    Raises
    ------
    ValueError
        If the value typed at the prompt is not an integer between 1 and the
        number of available modes.
    """
    # separate data into inputs (almost everything) and outputs (CO2 Flux data)
    CO2_vars = ["FC", "SC", "NEE_PI"]
    outputs = data[CO2_vars]
    inputs = data.drop(CO2_vars + ["smoke"], axis=1)

    # normalize data: zero mean, unit (population, ddof=0) standard deviation
    inputs = (inputs - inputs.mean()) / inputs.std(ddof=0)

    # do PCA, keeping every available mode
    n_modes = min(inputs.shape)
    pca = PCA(n_components=n_modes)
    PCs = pca.fit_transform(inputs)
    eigvecs = pca.components_
    fracVar = pca.explained_variance_ratio_

    # plot fraction of variance explained by each mode
    plt.figure(figsize=(10, 5))

    plt.subplot(1, 2, 1)
    plt.scatter(range(len(fracVar)), fracVar)
    plt.xlabel('Mode Number')
    plt.ylabel('Fraction Variance Explained')
    plt.title('Variance Explained by All Modes')

    plt.subplot(1, 2, 2)
    # never ask for more modes than exist, otherwise x and y differ in length
    n_modes_show = min(10, len(fracVar))
    plt.scatter(range(n_modes_show), fracVar[:n_modes_show])
    plt.xlabel('Mode Number')
    plt.ylabel('Fraction Variance Explained')
    plt.title('Variance Explained by First ' + str(n_modes_show) + ' Modes')

    plt.tight_layout()
    plt.show()

    # ask for user input upon seeing fractional variance
    keep_modes = int(input("How many Modes to Retain? "))
    if not 1 <= keep_modes <= n_modes:
        raise ValueError(
            f"number of modes must be between 1 and {n_modes}, got {keep_modes}"
        )
    clear_output(wait=True)

    # show the resulting PCs and eigvecs
    fig, ax = plt.subplots(2, figsize=(15, 4))
    fig.suptitle(figtitle)
    for i in range(keep_modes):
        ax[0].plot(PCs[:, i], label=f"PC{i}")
        # eigenvectors are the ROWS of pca.components_, so index by row
        ax[1].plot(eigvecs[i], label=f"eigenvector {i}")
    ax[0].legend(loc="upper left")
    ax[1].legend(loc="upper left")

    return eigvecs, PCs, outputs

In [4]:
def do_MLR(PCs, outputs, plot_title, keep_modes):
    """
    Fit a multiple linear regression of "NEE_PI" on the leading PCs and plot it.

    Each retained PC is scaled by its maximum absolute value, a
    ``LinearRegression`` model is fit, the coefficients are printed, and two
    panels are drawn: measured vs. reconstructed flux by row, and a scatter of
    measured against modeled flux annotated with the R-squared value.

    Parameters
    ----------
    PCs : array-like, 2-D
        Principal components, one mode per column. The array is not modified;
        scaling is done on a copy.
    outputs : pandas.DataFrame
        Must contain the column "NEE_PI", used as the predictand.
    plot_title : str
        Title of the figure.
    keep_modes : int
        Number of leading PCs (columns of ``PCs``) used as predictors.

    Returns
    -------
    None
    """
    # normalize each retained PC on a copy so the caller's array is untouched
    retained = np.asarray(PCs, dtype=float)[:, :keep_modes]
    retained = retained / np.max(np.abs(retained), axis=0)

    # assign predictors and predictands
    X = pd.DataFrame(retained)
    Y = outputs["NEE_PI"]

    # Do MLR
    model = LinearRegression().fit(X, Y)
    coef_MLR = model.coef_.flatten()  # regression coefficients in MLR model
    R2_MLR = model.score(X, Y)  # R-squared value from MLR model

    # Display the results
    print("=== MLR Coefficients ===")
    for i, coef in enumerate(coef_MLR):
        print(f"PC{i}: {coef}")
    print("========================")

    # Plot reconstructed and measured data. The model prediction is
    # intercept + sum(coef_i * PC_i) over ALL retained modes, so this works
    # for any keep_modes rather than exactly four.
    ind = np.arange(X.shape[0])
    Y_mod = model.predict(X)
    fig, ax = plt.subplots(2, 1, figsize=(15, 5))
    fig.suptitle(plot_title)
    ax[0].plot(ind, Y, label="Measured CO2 Flux")
    ax[0].plot(ind, Y_mod, label="Reconstructed From PCs")
    # raw string for the mathtext part: "\m" is not a valid escape sequence
    ax[0].set_ylabel("Net Ecosystem Exchange\n" r"($\mu mol CO_2/(m^2s^1)$)")
    ax[0].legend()

    ax[1].scatter(Y, Y_mod, alpha=0.6)
    ax[1].annotate(f"$R^2$: {round(R2_MLR,2)}", (min(Y), min(Y_mod)))
    ax[1].set_xlabel("measured CO2 Flux")
    ax[1].set_ylabel("modeled CO2 Flux")
    plt.tight_layout()

    return None

In [None]:
def rank_inputs(data, eigvecs, keep_modes, rank_mode=0):
    """
    Rank the PCA input variables by the magnitude of their eigenvector loading.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame the PCA was run on. The columns "FC", "SC", "NEE_PI" and
        "smoke" are dropped; the remaining column names label the rows of the
        result, so they must match the features of ``eigvecs`` in order.
    eigvecs : array-like, 2-D
        Eigenvectors laid out with one mode per ROW and one input variable per
        column (the layout of scikit-learn's ``PCA.components_``).
    keep_modes : int
        Number of leading modes to include as columns of the result.
    rank_mode : int, optional
        Mode whose absolute loadings determine the sort order (default 0,
        the leading mode). Must be smaller than ``keep_modes``.

    Returns
    -------
    pandas.DataFrame
        One row per input variable and one column ``"eigvec <i>"`` per kept
        mode holding absolute loadings, sorted in descending order of the
        ``rank_mode`` column.
    """
    drop_me = ["FC", "SC", "NEE_PI", "smoke"]
    pc_data = data.drop(drop_me, axis=1)

    eigvecs = np.asarray(eigvecs)
    # row i of eigvecs is eigenvector i; its entries are the per-variable loadings
    rank_eig = pd.DataFrame(
        {f"eigvec {i}": np.abs(eigvecs[i]) for i in range(keep_modes)},
        index=list(pc_data.columns),
    )

    return rank_eig.sort_values(by=f"eigvec {rank_mode}", ascending=False)