Extract the metrics of importance 

e.g. temperature, time, lat, lon, etc..

In [11]:
from kelp_metrics import main as extract_metrics

kelp_data = extract_metrics(lower_lat=27, upper_lat=37)

data loaded from file


## Clean the data and format arrays

In [12]:
import numpy as np

features = {
    # name : variable
    'Temperature [C]': kelp_data['temp'] - 273.15,
    '1Q Lag Temperature [C]': kelp_data['temp_lag'] - 273.15,
    '2Q Lag Temperature [C]': kelp_data['temp_lag2'] - 273.15,
    'Longitude [deg]': kelp_data['lon'], 
    'Latitude [deg]': kelp_data['lat'],
    'Elevation [m]': kelp_data['elevation'],
    'Sunlight [day]': kelp_data['sunlight']
    #'time': kelp_data['time'],
}

# make arrays
X = np.array([features[k] for k in features]).T
y = kelp_data['kelp']
t = kelp_data['time']

# Remove nans from lagged values
nanmask = np.isnan(features['1Q Lag Temperature [C]']) | np.isnan(features['2Q Lag Temperature [C]'])
X = X[~nanmask]
y = y[~nanmask]
t = t[~nanmask]

## Save data to disk

In [13]:
import pandas as pd
import numpy as np

feat_df = {}
for i, k in enumerate(features):
    feat_df[k] = X[:,i]

feat_df['time'] = t
feat_df['kelp'] = y

# Convert the data to a pandas DataFrame and ensure 'time' is in datetime format
df = pd.DataFrame(feat_df)

# Save data to disk using pandas to_csv function with column names as the first row
df.to_csv('extracted_kelp_27_37.csv', index=False)

## Split data into training/validation sets

In [22]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

print(f"Shape of all the data: {X.shape}")

# Split data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of training: {X_train.shape}")
print(f"Shape of testing: {X_test.shape}")

Shape of all the data: (70239, 3)
Shape of training: (56191, 3)
Shape of testing: (14048, 3)


## Fit a regression model using linear least-squares

In [24]:
# Fit OLS regressor on training data
res = sm.OLS(y_train, sm.add_constant(X_train)).fit()

# Predict on training data and compute the average absolute error
y_ols_train = res.predict(sm.add_constant(X_train))
abs_err_ols_train = np.abs(y_train - y_ols_train).mean()
print(f"Avg. Absolute Error Train: {abs_err_ols_train:.3f} m^2")

# Predict on testing data and compute the average absolute error
y_ols_test = res.predict(sm.add_constant(X_test))
abs_err_ols_test = np.abs(y_test - y_ols_test).mean()
print(f"Avg. Absolute Error Test: {abs_err_ols_test:.3f} m^2")

# Regression coefficients
print("Coefficients:")
for feat, coef in zip(features.keys(), res.params[1:]):  # Exclude the constant term
    print(f"  {feat:<25} : {coef:.3f}")

print(res.summary())

Avg. Absolute Error Train: 59.162 m^2
Avg. Absolute Error Test: 59.227 m^2
Coefficients:
  Temperature [C]           : 5.082
  1Q Lag Temperature [C]    : -9.739
  2Q Lag Temperature [C]    : 1.993
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.057
Method:                 Least Squares   F-statistic:                     1123.
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00
Time:                        16:19:41   Log-Likelihood:            -3.3156e+05
No. Observations:               56191   AIC:                         6.631e+05
Df Residuals:                   56187   BIC:                         6.632e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
            

In [None]:
res.params

## analyze correlation metrics for each feature

In [14]:
from sklearn.feature_selection import mutual_info_regression
from scipy import stats
import json

def correlation_tests(x, y, input_name, output_name):
    # measure the significance of the correlation
    correlations = {'input': input_name, 'output':output_name}

    # Pearson's correlation
    corr, pval = stats.pearsonr(x, y)
    correlations['pearsonr'] = {'corr': round(corr, 3), 'pval': round(pval, 3)}

    # Kendall's tau
    tau, pval = stats.kendalltau(x, y)
    correlations['kendalltau'] = {'tau': round(tau, 3), 'pval': round(pval, 3)}

    # Calculate the Spearman rank correlation
    corr, pval = stats.spearmanr(x, y)
    correlations['spearmanr'] = {'corr': round(corr, 3), 'pval': round(pval, 3)}

    # Mann-Kendall
    tau, pval = stats.mstats.kendalltau(x, y)
    correlations['mann.kendall'] = {'tau': round(tau, 3), 'pval': round(pval, 3)}

    # Mutual Information (Regression)
    mi = mutual_info_regression(X.reshape(-1, 1), y)
    correlations['mutual_info_regression'] = {'mi': round(mi[0], 3)}
    
    return correlations

# for each feature examine the correlations
for i, feat in enumerate(features):
    corr = correlation_tests(X[:,i],y, feat, 'Kelp Area [m^2]')
    print(json.dumps(corr,indent=4))

ValueError: Input X contains NaN.