In [6]:
import flask
import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

In [4]:
# Record the library versions this notebook was run with (one version string per line).
# `sklearn` must be bound by a plain `import sklearn` in the import cell:
# `from sklearn.<sub> import ...` alone does not define the name `sklearn`.
for library in (pd, np, joblib, sklearn, flask):
    print(library.__version__)

1.5.2
1.24.0
1.2.0
1.2.1
2.2.2


In [2]:
# Read the source CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='EcologicalRestoration_FinalDataset.csv')

In [3]:
# Discard any row that is missing a value in at least one of the listed columns;
# the raw frame `df` is left untouched.
df_cleaned = df.dropna(
    subset=[
        'Climate',
        'Ecosystem',
        'Latitude',
        'Longitude',
        'MAT (C)',
        'MAP(mm)',
        'Soil depth (cm)',
        'Ambient SOC(g kg-1)',
        'Restoration time (years)',
        'lnRR.SOC',
    ]
)

In [4]:
# Row filter shared by all climate/ecosystem subsets
def filter_data(df, climate, ecosystem):
    """Return the rows of ``df`` whose 'Climate' and 'Ecosystem' columns match exactly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Climate' and 'Ecosystem' columns.
    climate : value compared for equality against the 'Climate' column.
    ecosystem : value compared for equality against the 'Ecosystem' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    climate_matches = df['Climate'] == climate
    ecosystem_matches = df['Ecosystem'] == ecosystem
    return df[climate_matches & ecosystem_matches]

# One subset of the cleaned data per (climate, ecosystem) combination
df_subtropical_forest = filter_data(
    df_cleaned, climate='Subtropical', ecosystem='Forest'
)
df_subtropical_non_forest = filter_data(
    df_cleaned, climate='Subtropical', ecosystem='Non-forest'
)
df_temperate_forest = filter_data(
    df_cleaned, climate='Temperate', ecosystem='Forest'
)
df_temperate_non_forest = filter_data(
    df_cleaned, climate='Temperate', ecosystem='Non-forest'
)
df_tropical_forest = filter_data(
    df_cleaned, climate='Tropical', ecosystem='Forest'
)
df_tropical_non_forest = filter_data(
    df_cleaned, climate='Tropical', ecosystem='Non-forest'
)

In [5]:
# Define baseline model evaluation function
def evaluate_baseline(data, target_variable):
    """Return the MSE of a constant predictor that always outputs the target's mean.

    The mean and the error are both computed over all rows of ``data``
    (no train/test split is applied here).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``target_variable`` column.
    target_variable : str
        Name of the column to evaluate.

    Returns
    -------
    float or None
        The baseline mean squared error, or ``None`` (after printing a notice)
        when ``data`` has no rows.
    """
    if data.shape[0] == 0:
        print(f"No data available for baseline evaluation of {target_variable}.")
        return None

    y = data[target_variable]
    mean_value = y.mean()
    # Build the constant prediction as floats. np.full_like(y, ...) would reuse
    # y's dtype and silently truncate the mean when the target column is integer.
    y_pred_baseline = np.full(y.shape, mean_value, dtype=float)
    return mean_squared_error(y, y_pred_baseline)

In [6]:
# Define model training and evaluation function for single target
def train_and_evaluate_models(data, target_variable, model_name, category_name, baseline_mse):
    """Fit one regressor on a data subset, print its hold-out metrics, and save it.

    The six feature columns are split 80/20 (``random_state=42``); the model is
    fitted on the training part and MSE, MAE and R² are computed on the test part.
    The fitted model is written with ``joblib.dump`` to a ``.pkl`` file whose
    relative path is built from the model, target and category names.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to train on; must contain the feature columns and ``target_variable``.
    target_variable : str
        Column to predict.
    model_name : str
        One of 'Random Forest', 'Linear Regression', 'Gradient Boosting',
        'Support Vector Regression'.
    category_name : str
        Label used in the printed line and in the saved file name.
    baseline_mse : float or None
        Baseline error printed next to the model's metrics; ``None`` is shown
        as ``N/A`` instead of failing in the format spec.

    Returns
    -------
    None
        Results are printed; when ``data`` has no rows a notice is printed and
        nothing is trained or saved.

    Raises
    ------
    KeyError
        If ``model_name`` is not one of the supported names.
    """
    if data.shape[0] == 0:
        print(f"No data available for {category_name} - {target_variable} - {model_name}. Skipping.")
        return

    # Factories rather than instances: only the requested estimator is constructed.
    model_factories = {
        'Random Forest': lambda: RandomForestRegressor(n_estimators=100, random_state=42),
        'Linear Regression': lambda: LinearRegression(),
        'Gradient Boosting': lambda: GradientBoostingRegressor(n_estimators=100, random_state=42),
        'Support Vector Regression': lambda: SVR(kernel='rbf'),
    }
    model = model_factories[model_name]()

    X = data[['Latitude', 'Longitude', 'MAT (C)', 'MAP(mm)', 'Soil depth (cm)', 'Ambient SOC(g kg-1)']]
    y = data[target_variable]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # A missing baseline (e.g. a dict .get() miss in the caller) must not crash the report.
    baseline_text = 'N/A' if baseline_mse is None else f'{baseline_mse:.4f}'
    print(f'{category_name} - {target_variable} - {model_name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}, Baseline MSE: {baseline_text}')

    # Save model (file name is unchanged so existing consumers of the .pkl files keep working)
    joblib.dump(model, f'{model_name.replace(" ", "_")}_{target_variable}_{category_name}.pkl')

In [7]:
# Pair each category label with its filtered subset (insertion order is preserved)
categories = dict(zip(
    (
        'Subtropical Forest',
        'Subtropical Non-Forest',
        'Temperate Forest',
        'Temperate Non-Forest',
        'Tropical Forest',
        'Tropical Non-Forest',
    ),
    (
        df_subtropical_forest,
        df_subtropical_non_forest,
        df_temperate_forest,
        df_temperate_non_forest,
        df_tropical_forest,
        df_tropical_non_forest,
    ),
))

# Columns to predict, and the names of the regressors to fit for each of them
target_variables = [
    'Restoration time (years)',
    'lnRR.SOC',
]
models = [
    'Random Forest',
    'Linear Regression',
    'Gradient Boosting',
    'Support Vector Regression',
]

In [8]:
# Compute and report the baseline MSE for every (target, category) pair,
# keeping the values in a lookup keyed by that pair.
baseline_mses = {}
for target_variable in target_variables:
    for category_name, data in categories.items():
        mse_baseline = evaluate_baseline(data, target_variable)
        if mse_baseline is None:
            # No baseline was produced for this subset; leave it out of the lookup.
            continue
        baseline_mses[target_variable, category_name] = mse_baseline
        print(f'Baseline Model for {category_name} - {target_variable} - MSE: {mse_baseline:.4f}')

Baseline Model for Subtropical Forest - Restoration time (years) - MSE: 184.9201
Baseline Model for Subtropical Non-Forest - Restoration time (years) - MSE: 43.4756
Baseline Model for Temperate Forest - Restoration time (years) - MSE: 158.2584
Baseline Model for Temperate Non-Forest - Restoration time (years) - MSE: 173.9965
No data available for baseline evaluation of Restoration time (years).
No data available for baseline evaluation of Restoration time (years).
Baseline Model for Subtropical Forest - lnRR.SOC - MSE: 0.3489
Baseline Model for Subtropical Non-Forest - lnRR.SOC - MSE: 0.1728
Baseline Model for Temperate Forest - lnRR.SOC - MSE: 0.3370
Baseline Model for Temperate Non-Forest - lnRR.SOC - MSE: 0.3007
No data available for baseline evaluation of lnRR.SOC.
No data available for baseline evaluation of lnRR.SOC.


In [9]:
# Fit every model for every (category, target) combination, passing along the
# matching baseline MSE (None when no baseline was recorded for that pair).
for category_name, data in categories.items():
    for target_variable in target_variables:
        baseline_mse = baseline_mses.get((target_variable, category_name))
        for model_name in models:
            train_and_evaluate_models(
                data=data,
                target_variable=target_variable,
                model_name=model_name,
                category_name=category_name,
                baseline_mse=baseline_mse,
            )

Subtropical Forest - Restoration time (years) - Random Forest - MSE: 204.7567, MAE: 8.3083, R²: 0.0543, Baseline MSE: 184.9201
Subtropical Forest - Restoration time (years) - Linear Regression - MSE: 168.1148, MAE: 8.2299, R²: 0.2235, Baseline MSE: 184.9201
Subtropical Forest - Restoration time (years) - Gradient Boosting - MSE: 195.9080, MAE: 7.9795, R²: 0.0952, Baseline MSE: 184.9201
Subtropical Forest - Restoration time (years) - Support Vector Regression - MSE: 200.9181, MAE: 7.6212, R²: 0.0720, Baseline MSE: 184.9201
Subtropical Forest - lnRR.SOC - Random Forest - MSE: 0.1200, MAE: 0.2682, R²: 0.6102, Baseline MSE: 0.3489
Subtropical Forest - lnRR.SOC - Linear Regression - MSE: 0.2541, MAE: 0.4174, R²: 0.1745, Baseline MSE: 0.3489
Subtropical Forest - lnRR.SOC - Gradient Boosting - MSE: 0.1232, MAE: 0.2718, R²: 0.5996, Baseline MSE: 0.3489
Subtropical Forest - lnRR.SOC - Support Vector Regression - MSE: 0.3126, MAE: 0.4388, R²: -0.0155, Baseline MSE: 0.3489
Subtropical Non-Forest 