In [212]:
#libs
import numpy as np
import pandas as pd
import random

#regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

#classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
#import xgboost as xgb

from sklearn.metrics import accuracy_score, classification_report

In [204]:
## load data
df = pd.read_csv('../data/trees_data_train.csv', index_col=0)
#load pcs
tree_pcs = pd.read_csv('../data/single_tree_pcs.csv', index_col=0)
xval_zones = pd.read_csv('../data/Zone_splits_OFFICIAL.txt')

df.HEIGHT = pd.to_numeric(df.HEIGHT, errors='coerce')
df.DBH = pd.to_numeric(df.DBH, errors='coerce')

In [205]:
#Label taxa categories
conifer_fams = ['Pinaceae', 'Arucarailes', 'Podocarpaceae', 'Sciadopityaceae', 'Cupressaceae', 'Cephalotaxaceae', 'Taxaceae']
genus_codes = {'PINUS':1, 'ACER':2, 'QUERCUS':3, 'MALUS':4, 'GLEDITSIA': 5, 
               'PICEA': 6, 'ULMUS': 7, 'THUJA': 8, 'PLANTANUS': 9, 'FRAXINUS': 10}
df['CONIFER_CAT'] = df.FAMILY.isin(conifer_fams).map({True: 1, False: 2})
df['GENUS_CAT'] = df.GENUS.map(genus_codes).fillna(11).astype(int)

In [207]:
df.sample(3)

Unnamed: 0,FID,ACC_NUM_AN,ACC_NUM,ACC_NUM_QU,ACC_NUM__1,NAME_NUM,FAMILY,NAME,NAME_HTML,ALT_SYN_NA,...,LABEL_HAVE,LABEL_NEED,DEDIC_LABE,GPS_Point,Zone_ID,IN_ZONE,Plot_ID,IN_PLOT,CONIFER_CAT,GENUS_CAT
17175,17176,CC5895*03,CC5895,3,CC5895*03,1086,Ulmaceae,Ulmus rubra,<I>Ulmus rubra</I>,,...,,,,POINT (-84.480467 42.727727),zone_I,POLYGON ((-84.47787524919971 42.72750980584095...,X15-Y37,"<GeometryArray>\n[<POLYGON ((-84.48 42.728, -8...",2,7
8176,8177,20060280*06,20060280,6,20060280*06,1736,Pinaceae,Abies balsamea var. phanerolepis,<I>Abies balsamea</I> var. <I>phanerolepis</I>,,...,,,,POINT (-84.480884 42.729811),zone_D,POLYGON ((-84.4808943047544 42.734305411463055...,X11-Y36,"<GeometryArray>\n[<POLYGON ((-84.481 42.73, -8...",1,11
14125,14126,CC5837*01,CC5837,1,CC5837*01,76,Sapindaceae,Acer saccharum,<I>Acer saccharum</I>,,...,,,,POINT (-84.470149 42.726685),zone_Q,POLYGON ((-84.46685388916796 42.72693262734432...,X17-Y53,"<GeometryArray>\n[<POLYGON ((-84.47 42.727, -8...",2,2


In [120]:
train_zones = list(xval_zones['ZONE'].loc[xval_zones['ASSIGNMENT'] == 'train'])
val_zones = list(xval_zones['ZONE'].loc[xval_zones['ASSIGNMENT'] == 'val'])

In [150]:
def single_tree_tts(df=df, pc_df = tree_pcs, train_zones=train_zones, val_zones=val_zones, y_metric='HEIGHT'):
    train_df = df[['FID', y_metric]][df.Zone_ID.isin(train_zones)].dropna()
    val_df = df[['FID', y_metric]][df.Zone_ID.isin(val_zones)].dropna()

    train_fids = list(train_df['FID'])
    val_fids = list(val_df['FID'])
    
    y_train = np.array(train_df[y_metric])
    y_val = np.array(val_df[y_metric])

    X_train = pc_df.loc[train_fids]
    X_val = pc_df.loc[val_fids]
    
    return X_train, X_val, y_train, y_val                 

### Regression models

In [151]:
X_train, X_val, y_train, y_val  = single_tree_tts()

In [154]:
### HEIGHT models
## Linear regression 
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = linear_reg.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(mse, r2)

194.9438532977864 -0.1558992841915281


In [163]:
#Ridge regression model
ridge_reg = Ridge(alpha=1.0)  # Adjust alpha for more or less regularization
ridge_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = ridge_reg.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(f"Ridge Regression - Mean Squared Error: {mse:.4f}, R-squared: {r2:.4f}")

Ridge Regression - Mean Squared Error: 194.9439, R-squared: -0.1559


In [164]:
#Lasso regression model
lasso_reg = Lasso(alpha=0.1)  # Adjust alpha for more or less regularization
lasso_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = lasso_reg.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(f"Lasso Regression - Mean Squared Error: {mse:.4f}, R-squared: {r2:.4f}")

Lasso Regression - Mean Squared Error: 194.9017, R-squared: -0.1556


In [165]:
# Train a Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(f"Random Forest - Mean Squared Error: {mse:.4f}, R-squared: {r2:.4f}")


Random Forest - Mean Squared Error: 196.4491, R-squared: -0.1648


In [167]:
### DBH models
X_train, X_val, y_train, y_val  = single_tree_tts(y_metric='DBH')

In [168]:
## Linear regression 
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = linear_reg.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(mse, r2)

54.53531996746001 0.1977861458975394


In [169]:
#Ridge regression model
ridge_reg = Ridge(alpha=1.0)  # Adjust alpha for more or less regularization
ridge_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = ridge_reg.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(f"Ridge Regression - Mean Squared Error: {mse:.4f}, R-squared: {r2:.4f}")

Ridge Regression - Mean Squared Error: 54.5353, R-squared: 0.1978


In [170]:
#Lasso regression model
lasso_reg = Lasso(alpha=0.1)  # Adjust alpha for more or less regularization
lasso_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = lasso_reg.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(f"Lasso Regression - Mean Squared Error: {mse:.4f}, R-squared: {r2:.4f}")

Lasso Regression - Mean Squared Error: 54.5337, R-squared: 0.1978


In [171]:
# Train a Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(f"Random Forest - Mean Squared Error: {mse:.4f}, R-squared: {r2:.4f}")

Random Forest - Mean Squared Error: 57.8000, R-squared: 0.1498


### Classification models

In [213]:
### Conifer vs. Broadleaf
## Logistic Regression
X_train, X_val, y_train, y_val = single_tree_tts(y_metric='CONIFER_CAT')

# Instantiate the model
logistic_reg = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
logistic_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_reg.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")
print(classification_report(y_val, y_pred))

Logistic Regression Accuracy: 0.7300
              precision    recall  f1-score   support

           1       0.35      0.21      0.27       999
           2       0.79      0.88      0.83      3364

    accuracy                           0.73      4363
   macro avg       0.57      0.55      0.55      4363
weighted avg       0.69      0.73      0.70      4363



In [217]:
### Top genera classification
X_train, X_val, y_train, y_val = single_tree_tts(y_metric='GENUS_CAT')

# logistic regression
logistic_reg = LogisticRegression(max_iter=10000, random_state=42)

# Train the model
logistic_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_reg.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")
print(classification_report(y_val, y_pred))


Logistic Regression Accuracy: 0.3495
              precision    recall  f1-score   support

           1       0.13      0.16      0.15       438
           2       0.34      0.19      0.24       712
           3       0.30      0.09      0.14       476
           4       0.11      0.02      0.04       327
           5       0.09      0.02      0.03       152
           6       0.13      0.08      0.10       187
           7       0.50      0.01      0.03       147
           8       0.00      0.00      0.00       207
          10       0.00      0.00      0.00        43
          11       0.41      0.75      0.53      1674

    accuracy                           0.35      4363
   macro avg       0.20      0.13      0.13      4363
weighted avg       0.29      0.35      0.28      4363



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [279]:
shuffler = np.random.default_rng(42)
y_train_random = y_train.copy()
shuffler.shuffle(y_train_random)
y_train, y_train_random

(array([ 1, 11,  4, ...,  2,  2,  1]), array([11,  1,  2, ..., 11,  2, 11]))

### YOLO classifier