In [1]:
import pandas as pd
import numpy as np
import rasterio
from rasterio import *
from rasterio.plot import show
from pyspatialml import Raster
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import joblib

In [2]:
predictors = pd.read_csv("out_data/forest_data_poland_lucas.csv", sep=" ",  index_col=False, na_values = 48,
                         header = None)
pd.set_option('display.max_columns',None)


Better variable names

In [7]:
predictors_names = pd.read_csv("out_data/available_features_names.csv", sep=",",  index_col=False)
predictors.columns = predictors_names.name.tolist()
predictors.tail(5)

Unnamed: 0,GLAD_mean100m,GLAD_median100m,GLAD_mode100m,GLAD_q1,GLAD_q3,CLC10m_mean,CLC10m_median,CLC10m_mode.,CLC10m_q1,CLC10m_q3,CLC_01.vrt,C-GLOPS_mean,C-GLOPS_median,C-GLOPS_mode,C-GLOPS_q1,C-GLOPS_q3,JAXA_q3,JAXA_q1,JAXA_mode,JAXA_median,JAXA_mean
23079,0.246136,0,0,0,1,0.6,1,1,0,1,0,0.191063,0.07,0.07,0.04,0.08,0.5,0.0,0.0,0.0,0.277736
23080,0.0,0,0,0,0,0.0,0,0,0,0,0,0.016844,0.01,0.01,0.01,0.03,0.0,0.0,0.0,0.0,0.0
23081,0.932173,1,1,1,1,0.95,1,1,1,1,1,0.402705,0.44,0.39,0.39,0.62,1.0,1.0,1.0,1.0,0.961699
23082,0.0,0,0,0,0,0.0,0,0,0,0,0,0.043004,0.03,0.03,0.02,0.03,0.0,0.0,0.0,0.0,0.023998
23083,0.0,0,0,0,0,0.09,0,0,0,0,0,0.098559,0.09,0.06,0.06,0.11,0.0,0.0,0.0,0.0,0.0


In [10]:
(predictors.columns)

Index(['GLAD_mean100m', 'GLAD_median100m', 'GLAD_mode100m', 'GLAD_q1',
       'GLAD_q3', 'CLC10m_mean', 'CLC10m_median', 'CLC10m_mode.', 'CLC10m_q1',
       'CLC10m_q3', 'CLC_01.vrt', 'C-GLOPS_mean', 'C-GLOPS_median',
       'C-GLOPS_mode', 'C-GLOPS_q1', 'C-GLOPS_q3', 'JAXA_q3', 'JAXA_q1',
       'JAXA_mode', 'JAXA_median', 'JAXA_mean'],
      dtype='object')

Now the left hand side

In [4]:
outcome = pd.read_csv("out_data/LUCAS/LUCAS_locations.csv", sep=",",  index_col=False)
outcome.tail(5)

Unnamed: 0,X,Y,SURVEY_DATE,LC1,LC1_PERC,LC2,LC2_PERC,forest,forestperc
23079,5278000.0,3114000.0,2028/06/18,C22,100,8,0,1,100.0
23080,5278000.0,3136000.0,2005/07/18,B11,100,8,0,0,
23081,5294000.0,3150000.0,1931/07/18,C10,100,8,88888,1,100.0
23082,5294000.0,3172000.0,2008/08/18,B11,92,E30,5,0,
23083,5286000.0,3116000.0,1931/07/18,Bx1,98,8,88888,0,


In [5]:
print (len(outcome))
print (len(predictors))

23084
23084


Split the Dataset

In [6]:
X    = predictors
Y    = outcome.iloc[:,7].to_frame()
feat = predictors.columns.values
print(X.shape)
print(Y.shape)
print(type(X))
print(type(Y))

(23084, 21)
(23084, 1)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


There are a handful of NAs

In [7]:
na_rows = X.apply(lambda x: np.any(pd.isna(x)), axis=1)
result = np.where(na_rows)[0]
print('NA rows: ' + str(result))

NA rows: [   33    54    58 ... 23032 23046 23077]


In [8]:
X = X.drop(X.index[result])
Y = Y.drop(Y.index[result])


In [9]:
print(X.shape)
print(Y.shape)

(21819, 21)
(21819, 1)


Create 4 dataset for training and testing the algorithm 

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=2023)
y_train = np.ravel(Y_train)
y_test = np.ravel(Y_test)

In [11]:
print(np.mean(y_train==1))
print(np.mean(y_test==1))

0.3208360069667247
0.3242896425297892


Use wandb to seach the model space 

In [12]:
import wandb

In [13]:
wandb.init(project="RF_Foresthybrid_sweep")

[34m[1mwandb[0m: Currently logged in as: [33mhofer1991[0m ([33mnodes[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
#wandb.sklearn.plot_confusion_matrix(y_train, dic_pred['train'])


In [15]:
sweep_config = {
    'method': 'bayes',
    'metric': {
      'name': 'BIC/val',
      'goal': 'maximize'   
    },
    'parameters': {
        'rf__max_features': {
            'distribution': 'int_uniform',
            'min' : 3 ,
            'max' : 10
        },
        'rf__max_samples':{
            'distribution': 'uniform',
            'min' : 0.3 ,
            'max' : 0.7
        },
        'rf__n_estimators':{
            'distribution': 'int_uniform',
            'min' : 500 ,
            'max' : 5000
        },
        'rf__max_depth':{
            'distribution': 'int_uniform',
            'min' : 50 ,
            'max' : 500
        }
    }
}

In [None]:
# Create the sweep configuration dictionary
sweep_configuration = {
    'method': 'bayes',
    'metric': {
        'goal': 'maximize',
        'name': 'BIC/val'
    },
    'parameters': {}
}

# Add parameters based on column names
for column_name in column_names:
    sweep_configuration['parameters'][column_name] = {'values': [0, 1]}

print(sweep_configuration)

In [16]:
# Initialize a new sweep
# Arguments:
#     – sweep_config: the sweep config dictionary defined above
#     – entity: Set the username for the sweep
#     – project: Set the project name for the sweep
sweep_id = wandb.sweep(sweep_config, entity="hofer1991", project="RF_Foresthybrid_sweep")

Create sweep with ID: uab16tjq
Sweep URL: https://wandb.ai/hofer1991/RF_Foresthybrid_sweep/sweeps/uab16tjq


In [17]:
# Training and Evaluation routines for Sweeping
def performance_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    acc = (tp + tn) / (tn + tp + fp + fn)
    tpr = tp / (tp + fn)
    tnr = tn / (tn + fn)
    bacc= (tpr + tnr) * 0.5
#    f1 = (tpr * tnr) / (tpr + tnr)
    return(acc)

def evaluate(model, X, y):
    y_pred = model.predict(X) 
    metric  = performance_metrics(y, y_pred)
    return(metric)
    
def train(config=None):
    with wandb.init(config=config):
        
        # Config is a variable that holds and saves hyperparameters and inputs
        config = wandb.config
        
        rfReg = RandomForestClassifier(n_estimators=config.rf__n_estimators,
                              max_features=config.rf__max_features,
                              max_depth=config.rf__max_depth,
                              max_samples=config.rf__max_samples,
                              n_jobs=-1,
                             oob_score = True) 
        rfReg.fit(X_train, y_train)
        
        
        acc_train = evaluate(rfReg,X_train, y_train)
        acc_test = evaluate(rfReg, X_test, y_test)
        
        wandb.log({
                "Accuracy/train": acc_train,
                "Accuracy/val": acc_test
        })
        

In [18]:
wandb.agent(sweep_id, train)



<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[34m[1mwandb[0m: Agent Starting Run: mr90q1zj with config:
[34m[1mwandb[0m: 	rf__max_depth: 141
[34m[1mwandb[0m: 	rf__max_features: 10
[34m[1mwandb[0m: 	rf__max_samples: 0.6694717618608634
[34m[1mwandb[0m: 	rf__n_estimators: 4899
Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/user/.local/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 275, in check_stop_status
    self._loop_check_status(
  File "/home/user/.local/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 213, in _loop_check_status
    local_handle = request()
  File "/home/user/.local/lib/python3.10/site-packages/wandb/sdk/interface/interface.py", line 789, in deliver_stop_status
    return self._deliver_stop_status(status)
  File "/home/user/.local/lib/python3.10/sit

Widget Javascript not detected.  It may not be installed or enabled properly. Reconnecting the current kernel may help.


0,1
Accuracy/train,▁
Accuracy/val,▁

0,1
Accuracy/train,0.99477
Accuracy/val,0.90009


[34m[1mwandb[0m: [32m[41mERROR[0m Run mr90q1zj errored: UnboundLocalError("local variable 'best_acc' referenced before assignment")
[34m[1mwandb[0m: Agent Starting Run: wgurh3ta with config:
[34m[1mwandb[0m: 	rf__max_depth: 210
[34m[1mwandb[0m: 	rf__max_features: 5
[34m[1mwandb[0m: 	rf__max_samples: 0.4580305095833963
[34m[1mwandb[0m: 	rf__n_estimators: 1440
[34m[1mwandb[0m: Currently logged in as: [33mhofer1991[0m. Use [1m`wandb login --relogin`[0m to force relogin


Widget Javascript not detected.  It may not be installed or enabled properly. Reconnecting the current kernel may help.


0,1
Accuracy/train,▁
Accuracy/val,▁

0,1
Accuracy/train,0.96581
Accuracy/val,0.90073


[34m[1mwandb[0m: [32m[41mERROR[0m Run wgurh3ta errored: UnboundLocalError("local variable 'best_acc' referenced before assignment")
[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: uztm6ayb with config:
[34m[1mwandb[0m: 	rf__max_depth: 453
[34m[1mwandb[0m: 	rf__max_features: 9
[34m[1mwandb[0m: 	rf__max_samples: 0.5459961159046715
[34m[1mwandb[0m: 	rf__n_estimators: 1187


Widget Javascript not detected.  It may not be installed or enabled properly. Reconnecting the current kernel may help.


0,1
Accuracy/train,▁
Accuracy/val,▁

0,1
Accuracy/train,0.97727
Accuracy/val,0.90009


[34m[1mwandb[0m: [32m[41mERROR[0m Run uztm6ayb errored: UnboundLocalError("local variable 'best_acc' referenced before assignment")
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Error in callback <function _WandbInit._pause_backend at 0x7f38618e0280> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

In [19]:
a=1

Error in callback <function _WandbInit._resume_backend at 0x7f38618cfd90> (for pre_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

Error in callback <function _WandbInit._pause_backend at 0x7f38618e0280> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe