### Bootstrapped sampling with replacement of trait prediction

This Jupyter notebook resamples the forest plot data with replacement and re-runs the predictive model *n* times, storing the predictions from every run so that confidence intervals and coefficients of variation can be calculated.

In [None]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

**Pre-process and merge dfs**

In [None]:
# Function to rename CHELSA_bio columns consistently
def rename_chelsa_columns(df):
    """Shorten CHELSA bioclim column labels to the form 'CHELSA_bio<suffix>'.

    Any label that starts with 'CHELSA_bio' is cut back to its first two
    underscore-separated tokens (e.g. 'CHELSA_bio10_extra' -> 'CHELSA_bio10');
    every other label is left as it is.

    The rename is applied to ``df`` in place, and the same object is returned
    so the call can also be used in an assignment.
    """
    prefix = 'CHELSA_bio'

    def _short_name(label):
        # Labels outside the CHELSA_bio family pass through untouched
        if not label.startswith(prefix):
            return label
        tokens = label.split('_')
        if len(tokens) <= 1 or not tokens[1].startswith('bio'):
            return label
        # tokens[1] looks like 'bio<suffix>'; keep only what follows 'bio'
        return f'CHELSA_bio{tokens[1][3:]}'

    df.columns = list(map(_short_name, df.columns))
    return df

# Function to get a consistent numerical order for CHELSA_bio columns
def get_consistent_chelsa_order(*dfs):
    """Return the union of CHELSA_bio column names across ``dfs``, sorted numerically.

    Each name is ordered by the integer that follows 'CHELSA_bio', so
    'CHELSA_bio7' sorts before 'CHELSA_bio12'. A name whose remainder is not
    an integer makes ``int()`` raise ValueError.
    """
    unique_names = {
        name
        for frame in dfs
        for name in frame.columns
        if name.startswith('CHELSA_bio')
    }

    def _bio_number(name):
        return int(name.replace('CHELSA_bio', ''))

    ordered_names = list(unique_names)
    ordered_names.sort(key=_bio_number)
    return ordered_names

# Function to reorder columns based on a given order
def reorder_columns(df, desired_order):
    """Return ``df`` with non-CHELSA columns first and CHELSA_bio columns last.

    Non-CHELSA columns keep their current relative order. CHELSA_bio columns
    follow in the sequence given by ``desired_order``; names in
    ``desired_order`` that ``df`` lacks are skipped, and CHELSA_bio columns of
    ``df`` that are not listed in ``desired_order`` are left out of the result.
    """
    leading = []
    present_chelsa = set()
    for name in df.columns:
        if name.startswith('CHELSA_bio'):
            present_chelsa.add(name)
        else:
            leading.append(name)

    # Walk the requested order, keeping only columns this frame actually has
    trailing = [name for name in desired_order if name in present_chelsa]
    return df[leading + trailing]

# Function to move 'lat' and 'lon' to specific positions
def move_lat_lon_columns(df, lat_pos=None, lon_pos=None):
    """Return ``df`` with 'lat' and/or 'lon' moved to the given column positions.

    'lat' is repositioned first, then 'lon'. Each column is removed from the
    column list before being re-inserted, so the position refers to the list
    without that column. A column is left where it is when its position is
    None or when it is absent from ``df``.
    """
    names = list(df.columns)
    for label, target in (('lat', lat_pos), ('lon', lon_pos)):
        if target is None or label not in names:
            continue
        names.remove(label)
        names.insert(target, label)
    return df[names]

# Load the current-climate table and the three future-scenario tables, then
# normalise their CHELSA_bio column names so all four share the same labels.
current_df = pd.read_csv('data/precomputed/plot_and_abiotic_data_current.csv')
current_df = rename_chelsa_columns(current_df)

future_climate_ssp126 = pd.read_csv('data/precomputed/plot_and_abiotic_data_ssp126.csv')
future_climate_ssp126 = rename_chelsa_columns(future_climate_ssp126)

future_climate_ssp370 = pd.read_csv('data/precomputed/plot_and_abiotic_data_ssp370.csv')
future_climate_ssp370 = rename_chelsa_columns(future_climate_ssp370)

# (A second, redundant rename call on the ssp585 table was removed here;
# the rename is idempotent, so the resulting columns are unchanged.)
future_climate_ssp585 = pd.read_csv('data/precomputed/plot_and_abiotic_data_ssp585.csv')
future_climate_ssp585 = rename_chelsa_columns(future_climate_ssp585)

# Determine one numerically sorted CHELSA_bio column order across all tables
desired_order = get_consistent_chelsa_order(current_df, future_climate_ssp126, future_climate_ssp370, future_climate_ssp585)

# Apply the same column order to all DataFrames (non-CHELSA columns first)
current_df = reorder_columns(current_df, desired_order)
future_climate_ssp126 = reorder_columns(future_climate_ssp126, desired_order)
future_climate_ssp370 = reorder_columns(future_climate_ssp370, desired_order)
future_climate_ssp585 = reorder_columns(future_climate_ssp585, desired_order)

# Move 'lat' and 'lon' columns to specific positions.
# NOTE(review): 26/27 presumably keep columns 1-24 of current_df free for the trait
# columns that are later selected by position - confirm against the CSV layout.
current_df = move_lat_lon_columns(current_df, lat_pos=26, lon_pos=27)
future_climate_ssp126 = move_lat_lon_columns(future_climate_ssp126, lat_pos=1, lon_pos=2)
future_climate_ssp370 = move_lat_lon_columns(future_climate_ssp370, lat_pos=1, lon_pos=2)
future_climate_ssp585 = move_lat_lon_columns(future_climate_ssp585, lat_pos=1, lon_pos=2)

current_df

**Bootstrapping: Resample (with replacement) current forest plot data, retrain model and collect predictions**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.utils import resample
from tqdm import tqdm
from scipy import stats
from scipy.stats import chi2_contingency, wilcoxon
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt

# Trait (response) columns are selected by position: columns 1-24 of current_df
trait_names = list(current_df.columns[1:25])

# Predictor columns used by every model fit and prediction in this notebook
# (judging by their names: topography, soil and a subset of CHELSA bioclim variables)
climate_features = [
    'EarthEnvTopoMed_Eastness',
    'EarthEnvTopoMed_Elevation',
    'EarthEnvTopoMed_Northness',
    'EarthEnvTopoMed_Slope',
    'SG_Bulk_density_015cm',
    'SG_Clay_Content_015cm',
    'SG_Coarse_fragments_015cm',
    'SG_Depth_to_bedrock',
    'SG_Sand_Content_015cm',
    'SG_Silt_Content_015cm',
    'CHELSA_bio1',
    'CHELSA_bio7',
    'CHELSA_bio9',
    'CHELSA_bio11',
    'CHELSA_bio12',
    'CHELSA_bio15',
    'CHELSA_bio17',
    'CHELSA_bio19',
]

# Shared preprocessing objects: the helper functions and the training code below
# fit and reuse these same instances, so their fitted state changes over time
scaler_features = StandardScaler()
scaler_targets = StandardScaler()
poly = PolynomialFeatures(degree=2, include_bias=False)

# Preprocessing functions
def preprocess_data(features_df, trait_names):
    """Fit the shared preprocessing objects on ``features_df`` and transform it.

    Refits the module-level ``scaler_features``, ``poly`` and ``scaler_targets``
    as a side effect.

    Parameters
    ----------
    features_df : DataFrame containing the ``climate_features`` columns and
        the columns listed in ``trait_names``.
    trait_names : list of target column names.

    Returns
    -------
    (features_poly, traits_scaled) : standardized predictors expanded with
        degree-2 polynomial terms, and standardized targets.
    """
    predictor_matrix = features_df[climate_features].values
    expanded_predictors = poly.fit_transform(
        scaler_features.fit_transform(predictor_matrix)
    )

    scaled_targets = scaler_targets.fit_transform(features_df[trait_names].values)

    return expanded_predictors, scaled_targets

def preprocess_features(features_df):
    """Transform the ``climate_features`` columns with the already-fitted tools.

    Uses the current fitted state of the module-level ``scaler_features`` and
    ``poly`` (no refitting) and returns the polynomial-expanded feature array.
    """
    raw_predictors = features_df[climate_features].values
    return poly.transform(scaler_features.transform(raw_predictors))

def predict_traits(future_df, model):
    """Predict traits for every row of ``future_df`` with the supplied model.

    Parameters
    ----------
    future_df : DataFrame with the ``climate_features`` columns and a 'pid' column.
    model : fitted estimator exposing ``predict``; it must have been trained on
        features produced by the currently fitted ``scaler_features`` / ``poly``.

    Returns
    -------
    DataFrame indexed by 'pid' with one column per entry of ``trait_names``.
    The values are whatever ``model.predict`` returns; no inverse scaling is
    applied here.
    """
    # Prepare future climate data with the shared, already-fitted preprocessors
    features_poly = preprocess_features(future_df)
    # Use the model passed in by the caller rather than a module-level global
    predicted_traits = model.predict(features_poly)
    return pd.DataFrame(predicted_traits, columns=trait_names, index=future_df['pid'])

# Split data
# Raw predictor and trait arrays taken from the current-climate table
features = current_df[climate_features].values
traits = current_df[trait_names].values

# 80/20 split stratified on 'FIA_group'; the row index is split alongside the
# arrays so train/test rows can be traced back to current_df
train_indices, test_indices, features_train, features_test, traits_train, traits_test = train_test_split(
    current_df.index, features, traits, test_size=0.2, random_state=42, stratify=current_df['FIA_group']
)

# Preprocess train/test data
# Scalers and the polynomial expansion are fitted on the training rows only and
# then applied to the test rows
features_train_scaled = scaler_features.fit_transform(features_train)
features_test_scaled = scaler_features.transform(features_test)
features_train_poly = poly.fit_transform(features_train_scaled)
features_test_poly = poly.transform(features_test_scaled)
traits_train_scaled = scaler_targets.fit_transform(traits_train)
traits_test_scaled = scaler_targets.transform(traits_test)

# Train the model
# Base estimator; bootstrap=False means each tree is built on the full training
# set it is given rather than on an internal bootstrap sample
rf_regressor = RandomForestRegressor(
    n_estimators=1800, random_state=42, n_jobs=-1, min_samples_split=2,
    min_samples_leaf=1, max_features='log2', max_depth=20, bootstrap=False
)

# MultiOutputRegressor fits one copy of the base estimator per trait column
multioutput_rf = MultiOutputRegressor(rf_regressor)
multioutput_rf.fit(features_train_poly, traits_train_scaled)

# Evaluate on the test set
# Both metrics are computed on the standardized trait values, not original units
rf_pred_scaled = multioutput_rf.predict(features_test_poly)
rf_r2_per_trait = np.array([r2_score(traits_test_scaled[:, i], rf_pred_scaled[:, i]) for i in range(len(trait_names))])
rf_rmse_per_trait = np.sqrt(mean_squared_error(traits_test_scaled, rf_pred_scaled, multioutput='raw_values'))

# Store evaluation metrics (one row per trait)
evaluation_metrics_rf = pd.DataFrame({
    'Trait': trait_names,
    'R²': rf_r2_per_trait,
    'RMSE': rf_rmse_per_trait
})
print("\nEvaluation Metrics:")
print(evaluation_metrics_rf)


# Make plot-level predictions for each SSP across bootstrap runs

n_bootstrap = 500  # Number of bootstrap iterations
# One list of per-run prediction DataFrames per scenario
predicted_traits_bootstrap = {'SSP126': [], 'SSP370': [], 'SSP585': []}

for i in tqdm(range(n_bootstrap), desc="Bootstrapping progress"):
    # Resample current data with replacement within each FIA_group, keeping every
    # group at its original size. The iteration number seeds the draw; the same
    # seed is passed to the sampler for every group.
    stratified_sample = current_df.groupby('FIA_group').apply(
        lambda x: x.sample(frac=1.0, replace=True, random_state=i)
    ).reset_index(drop=True)

    # Preprocess resampled data
    # This refits the shared scalers / polynomial expansion on the bootstrap sample,
    # so preprocess_features() (used by predict_traits) applies this run's scaling.
    boot_features = scaler_features.fit_transform(stratified_sample[climate_features].values)
    boot_traits = scaler_targets.fit_transform(stratified_sample[trait_names].values)
    boot_features_poly = poly.fit_transform(boot_features)

    # Train model on resampled data
    # Rebinds the module-level multioutput_rf, replacing the model evaluated above.
    multioutput_rf = MultiOutputRegressor(rf_regressor)
    multioutput_rf.fit(boot_features_poly, boot_traits)

    # Predict traits for each SSP
    # NOTE(review): the model is trained on targets standardized by this run's
    # scaler_targets and no inverse_transform is applied, so the stored predictions
    # appear to be in per-run standardized units - confirm this is intended before
    # computing confidence intervals or coefficients of variation across runs.
    for ssp, future_df in zip(['SSP126', 'SSP370', 'SSP585'], 
                              [future_climate_ssp126, future_climate_ssp370, future_climate_ssp585]):
        pred = predict_traits(future_df, multioutput_rf)
        
        # Add a bootstrap_run column to keep track of each iteration
        pred['bootstrap_run'] = i  
        predicted_traits_bootstrap[ssp].append(pred)

# Combine all bootstrap runs for each SSP scenario into one DataFrame.
# Every per-run frame already carries a 'bootstrap_run' column (set inside the
# bootstrap loop), so a plain concat is enough. Adding a 'bootstrap_run' index level
# via keys=/names= and resetting it would raise
# "ValueError: cannot insert bootstrap_run, already exists".
# The combined frames stay indexed by 'pid'.
predicted_all_runs_ssp126 = pd.concat(predicted_traits_bootstrap['SSP126'])
predicted_all_runs_ssp370 = pd.concat(predicted_traits_bootstrap['SSP370'])
predicted_all_runs_ssp585 = pd.concat(predicted_traits_bootstrap['SSP585'])

# Add metadata to the combined DataFrames
def add_metadata(predicted_df, source_df, ssp_label):
    """Attach plot metadata and a scenario label to a predictions table.

    Parameters
    ----------
    predicted_df : DataFrame of predictions with 'pid' either as its index
        or as a regular column.
    source_df : DataFrame providing 'pid', 'lat', 'lon', 'FIA_group',
        'ECO_NAME' and 'BIOME'.
    ssp_label : value written to the new 'SSP' column.

    Returns
    -------
    A new DataFrame: the predictions inner-joined to the metadata on 'pid'
    (compared as strings) plus an 'SSP' column. Neither input is modified.
    """
    # Lift 'pid' out of the index only when it is not already a column; resetting
    # an already-flat frame would add a stray 'index' column to the output.
    if 'pid' not in predicted_df.columns:
        predicted_df = predicted_df.reset_index()
    # assign() returns a new frame, so the caller's predictions are left untouched
    predicted_df = predicted_df.assign(pid=predicted_df['pid'].astype(str))

    # Cast the join key on a copy so the caller's source_df keeps its original dtype
    metadata = source_df[['pid', 'lat', 'lon', 'FIA_group', 'ECO_NAME', 'BIOME']].copy()
    metadata['pid'] = metadata['pid'].astype(str)

    predicted_df = predicted_df.merge(metadata, on='pid')
    predicted_df['SSP'] = ssp_label
    return predicted_df

# Attach plot metadata (taken from current_df) and the scenario label to each table
predicted_all_runs_ssp126 = add_metadata(predicted_all_runs_ssp126, current_df, 'SSP126')
predicted_all_runs_ssp370 = add_metadata(predicted_all_runs_ssp370, current_df, 'SSP370')
predicted_all_runs_ssp585 = add_metadata(predicted_all_runs_ssp585, current_df, 'SSP585')

# Write the full bootstrap predictions, one CSV per scenario, without the row index
for combined_frame, csv_name in (
    (predicted_all_runs_ssp126, "predicted_traits_ssp126_all_bootstraps.csv"),
    (predicted_all_runs_ssp370, "predicted_traits_ssp370_all_bootstraps.csv"),
    (predicted_all_runs_ssp585, "predicted_traits_ssp585_all_bootstraps.csv"),
):
    combined_frame.to_csv(csv_name, index=False)


In [None]:
# Concatenate the bootstrap DataFrames for each SSP scenario.
# Each per-run frame is indexed by 'pid' and carries its own 'bootstrap_run' column.
predicted_all_runs_ssp126 = pd.concat(predicted_traits_bootstrap['SSP126'])
predicted_all_runs_ssp370 = pd.concat(predicted_traits_bootstrap['SSP370'])
predicted_all_runs_ssp585 = pd.concat(predicted_traits_bootstrap['SSP585'])

# No reset_index() here: add_metadata() moves 'pid' from the index into a column
# itself, and resetting twice would write a stray 'index' column to the CSVs.

# Add metadata
predicted_all_runs_ssp126 = add_metadata(predicted_all_runs_ssp126, current_df, 'SSP126')
predicted_all_runs_ssp370 = add_metadata(predicted_all_runs_ssp370, current_df, 'SSP370')
predicted_all_runs_ssp585 = add_metadata(predicted_all_runs_ssp585, current_df, 'SSP585')

# Save the full bootstrap predictions
predicted_all_runs_ssp126.to_csv("predicted_traits_ssp126_all_bootstraps.csv", index=False)
predicted_all_runs_ssp370.to_csv("predicted_traits_ssp370_all_bootstraps.csv", index=False)
predicted_all_runs_ssp585.to_csv("predicted_traits_ssp585_all_bootstraps.csv", index=False)
