In [1]:
import os
import subprocess
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio as rio
from pyspatialml import Raster
from rasterio.plot import show
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split,GridSearchCV


In [2]:
# Training and Evaluation routines for Sweeping
def performance_metrics(y_true, y_pred, metric="acc"):
    """Score binary predictions from their confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted binary labels.
    metric : {"acc", "tpr", "tnr", "bacc"}, default "acc"
        Which score to return: overall accuracy, true-positive rate
        (sensitivity), true-negative rate (specificity) or balanced accuracy.
        The default keeps the historical behaviour of returning accuracy.

    Returns
    -------
    float
        The requested score.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names, or (from the
        unpacking below) if the confusion matrix does not have exactly four
        cells, i.e. the labels are not binary.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    acc = (tp + tn) / (tn + tp + fp + fn)
    tpr = tp / (tp + fn)
    # Specificity is taken over all actual negatives (tn + fp); dividing by
    # (tn + fn) would give the negative predictive value instead.
    tnr = tn / (tn + fp)
    bacc = (tpr + tnr) * 0.5

    scores = {"acc": acc, "tpr": tpr, "tnr": tnr, "bacc": bacc}
    if metric not in scores:
        raise ValueError(f"Unknown metric {metric!r}; choose one of {sorted(scores)}")
    return scores[metric]

def evaluate(model, X, y):
    """Predict on ``X`` with ``model`` and score the predictions against ``y``.

    The score is whatever ``performance_metrics`` returns for the pair
    (``y``, predictions).
    """
    return performance_metrics(y, model.predict(X))

In [3]:
# Study area: full name and short code, both interpolated into data file paths.
country, country_abbr = "poland", "pl"

In [4]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Predictor table: space-separated, no header row; the value 48 is read as NaN.
predictors = pd.read_csv(
    f"out_data/forest_data_{country}_lucas.csv",
    sep=" ",
    header=None,
    index_col=False,
    na_values=48,
    low_memory=False,
)

# Better variable names: read the name table, then strip surrounding
# whitespace and drop double quotes from every entry.
predictors_names_pd = pd.read_csv(
    f"out_data/names_{country_abbr}.csv",
    sep=",",
    index_col=False,
)
predictors_names = predictors_names_pd["name"].tolist()
predictors_names_clean = [raw_name.strip().replace('"', '') for raw_name in predictors_names]
predictors_names_pd["name"] = predictors_names_clean

# Use the cleaned names as column labels of the predictor table.
predictors.columns = predictors_names_clean
predictors.head()

Unnamed: 0,GLAD_mean100m,GLAD_median100m,GLAD_mode100m,GLAD_q1,GLAD_q3,CLC10m_mean,CLC10m_median,CLC10m_mode,CLC10m_q1,CLC10m_q3,CLC_01,C-GLOPS_mean,C-GLOPS_median,C-GLOPS_mode,C-GLOPS_q1,C-GLOPS_q3,JAXA_q3,JAXA_q1,JAXA_mode,JAXA_median,JAXA_mean,FML_01
0,0.0,0,0,0,0,0.0,0,0,0,0,0,0.022032,0.04,0.01,0.01,0.05,0.0,0.0,0.0,0.0,0.0,0
1,0.620165,1,1,0,1,0.62,1,1,0,1,1,0.699817,0.53,0.53,0.53,0.75,1.0,0.5,0.5,0.5,0.654317,1
2,0.925006,1,1,1,1,0.9,1,1,1,1,1,0.528377,0.57,0.57,0.42,0.62,1.0,0.5,0.5,0.5,0.636479,1
3,0.0,0,0,0,0,0.0,0,0,0,0,0,0.076326,0.07,0.07,0.05,0.14,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0,0,0,0,0.0,0,0,0,0,0,0.05656,0.04,0.04,0.04,0.09,0.0,0.0,0.0,0.0,0.0,0


Now load the left-hand side: the outcome variable.

In [5]:
# Outcome table read from the LUCAS locations file of the selected country.
outcome = pd.read_csv(
    f"out_data/LUCAS/LUCAS_{country_abbr}_locations.csv",
    sep=",",
    index_col=False,
)
outcome.tail()

Unnamed: 0,X,Y,LC1,LC1_PERC,forest
23079,5278000.0,3114000.0,C22,100,1
23080,5278000.0,3136000.0,B11,100,0
23081,5294000.0,3150000.0,C10,100,1
23082,5294000.0,3172000.0,B11,92,0
23083,5286000.0,3116000.0,Bx1,98,0


In [6]:
# Predictors and outcome are paired row by row in the following cells, so the
# two tables must have the same number of rows. Fail loudly if they do not.
n_outcome, n_predictors = len(outcome), len(predictors)
print(n_outcome)
print(n_predictors)
assert n_outcome == n_predictors, (
    f"outcome has {n_outcome} rows but predictors has {n_predictors}; "
    "the two tables cannot be paired row by row"
)

23084
23084


In [7]:
# Predictor columns used as model features. The order matters: later cells
# look up one raster path per entry, in this order.
predictor_selection = [
    "JAXA_mean",
    "C-GLOPS_mode",
    "CLC10m_mean",
    "C-GLOPS_mean",
    "GLAD_mean100m",
]


Split the Dataset

In [8]:
# Feature matrix: the selected predictor columns.
X = predictors.loc[:, predictor_selection]
# Target: the 'forest' column, kept as a one-column DataFrame.
Y = outcome[['forest']]
# Names of all available predictor columns.
feat = predictors.columns.to_numpy()
print(X.shape, Y.shape, type(X), type(Y), sep="\n")

(23084, 5)
(23084, 1)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Some rows contain NAs in the selected predictors; locate them so they can be dropped.

In [9]:
# Boolean mask of rows with at least one missing predictor value. The
# vectorised isna().any() avoids calling a Python lambda once per row.
na_rows = X.isna().any(axis=1)
# Positions (0-based row numbers) of the flagged rows.
result = np.where(na_rows)[0]
print('NA rows: ' + str(result))

NA rows: [   33    54    58 ... 23032 23046 23077]


In [10]:
# Keep only the rows in which every selected predictor is present, then align
# the target to the surviving rows by index label.
# The mask is recomputed from X itself instead of reusing stored row positions,
# so re-running this cell is a no-op rather than dropping further rows.
# NOTE(review): assumes X and Y still share the same index labels (both come
# from tables read with a default index and equal length) - confirm.
complete_rows = X.notna().all(axis=1)
X = X.loc[complete_rows]
Y = Y.loc[X.index]


In [11]:
# Shapes of features and target after the NA rows were removed.
print(X.shape, Y.shape, sep="\n")

(21819, 5)
(21819, 1)


Create four datasets (training and test features, training and test labels) for training and testing the algorithm.

In [12]:
# 50/50 split of features and target with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=2023,
)
# Flatten the one-column target frames into 1-D arrays.
y_train = Y_train.to_numpy().ravel()
y_test = Y_test.to_numpy().ravel()

In [13]:
# Random forest classifier. random_state fixes the bootstrap samples and the
# per-split feature draws so that re-fitting reproduces the same model and the
# same scores; the seed matches the one used for the train/test split.
rfReg = RandomForestClassifier(
    n_estimators=2799,
    max_features=3,
    max_depth=250,
    max_samples=0.3335,
    n_jobs=-1,
    oob_score=True,
    random_state=2023,
)
rfReg.fit(X_train, y_train)


RandomForestClassifier(max_depth=250, max_features=3, max_samples=0.3335,
                       n_estimators=2799, n_jobs=-1, oob_score=True)

In [14]:
# Class predictions of the fitted model for both halves of the split.
dic_pred = {
    'train': rfReg.predict(X_train),
    'test': rfReg.predict(X_test),
}

In [15]:
# Score of the held-out predictions.
performance_metrics(y_true=y_test, y_pred=dic_pred['test'])

0.9008249312557287

Get the predictor names and paths

In [16]:
# Look up the raster path (column 'V1') of every selected predictor, keeping
# the order of predictor_selection. For a name listed more than once the first
# entry wins.
path_by_name = predictors_names_pd.drop_duplicates(subset="name").set_index("name")["V1"]

# A predictor without a path must not be skipped silently: the path list would
# then be shorter than predictor_selection and names and paths would no longer
# line up position by position.
missing_predictors = [name for name in predictor_selection if name not in path_by_name.index]
if missing_predictors:
    raise KeyError(f"No raster path listed for predictor(s): {missing_predictors}")

selected_predictor_paths = [path_by_name[name] for name in predictor_selection]

for name, path in zip(predictor_selection, selected_predictor_paths):
    print(name + ":   " + path)





JAXA_mean:   out_data/JAXA/poland_mean.tif
C-GLOPS_mode:   out_data/C-GLOPS/2019/EU_mode.vrt
CLC10m_mean:   out_data/CLC10m/EU_mean.tif
C-GLOPS_mean:   out_data/C-GLOPS/2019/EU_mean.vrt
GLAD_mean100m:   out_data/GLADGLCextent/reproj3035_100m_tifs/EU_mean100m.tif


In [17]:
file_path = "/tmp/selected_features.txt"

data = selected_predictor_paths
with open(file_path, "w") as file:
    for layer in data:
        file.write(str(layer) + '\n')
        
!cat /tmp/selected_features.txt



out_data/JAXA/poland_mean.tif
out_data/C-GLOPS/2019/EU_mode.vrt
out_data/CLC10m/EU_mean.tif
out_data/C-GLOPS/2019/EU_mean.vrt
out_data/GLADGLCextent/reproj3035_100m_tifs/EU_mean100m.tif


In [18]:
%%bash

# Text file with one raster path per line
layer_list="/tmp/selected_features.txt"

# For each listed raster, keep only the gdalinfo lines that report the pixel
# size, the two corner coordinates and the first band, followed by a separator
while IFS= read -r raster_path; do
    gdalinfo "$raster_path" | grep -E "Upper Left|Lower Right|Pixel Size|Band 1"
    echo "--------------------------------"
done < "$layer_list"


Pixel Size = (100.000000000000000,-100.000000000000000)
Upper Left  ( 4598000.000, 3554000.000) ( 14d19'53.70"E, 55d 0'59.58"N)
Lower Right ( 5310000.000, 2948000.000) ( 23d33' 6.72"E, 48d49'23.11"N)
Band 1 Block=7120x1 Type=Float64, ColorInterp=Gray
--------------------------------
Pixel Size = (100.000000000000000,-100.000000000000000)
Upper Left  (  900000.000, 5500000.000) ( 56d30'18.51"W, 56d29' 4.75"N)
Lower Right ( 7400000.000,  900000.000) ( 40d39'45.75"E, 25d32'40.96"N)
Band 1 Block=128x128 Type=Float64, ColorInterp=Gray
--------------------------------
Pixel Size = (100.000000000000000,-100.000000000000000)
Upper Left  (  900000.000, 5500000.000) ( 56d30'18.51"W, 56d29' 4.75"N)
Lower Right ( 7400000.000,  900000.000) ( 40d39'45.75"E, 25d32'40.96"N)
Band 1 Block=65000x1 Type=Float64, ColorInterp=Gray
--------------------------------
Pixel Size = (100.000000000000000,-100.000000000000000)
Upper Left  (  900000.000, 5500000.000) ( 56d30'18.51"W, 56d29' 4.75"N)
Lower Right ( 7400

In [None]:
# Display the list of raster paths resolved for the selected predictors.
selected_predictor_paths

Cut the raster stack into smaller tiles so that each one can be loaded into RAM.

TODO: the tile edges need to be fixed; they seem to extend slightly out of bounds when i is 0.

In [19]:
# Bounding box of each supported study area as (xstart, ystart, xend, yend).
# Keys are lower case and the lookup lower-cases `country`, so both "poland"
# and "Spain" resolve.
COUNTRY_EXTENTS = {
    "spain": (2760600, 1530900, 3833000, 2467200),
    "poland": (4598000, 2948000, 5310000, 3554000),
}
# Pixel size of the input rasters as reported by gdalinfo (100 x 100).
PIXEL_SIZE = 100

print(country)
if country.lower() not in COUNTRY_EXTENTS:
    raise ValueError(
        f"No extent defined for country {country!r}; known: {sorted(COUNTRY_EXTENTS)}"
    )
xstart, ystart, xend, yend = COUNTRY_EXTENTS[country.lower()]

splits = 4

# Tile edges are placed on whole pixels and the outermost edges are pinned to
# the bounding box. An extent that does not divide evenly therefore neither
# loses its last rows/columns nor yields tiles shifted off the pixel grid.
n_pixels_x = (xend - xstart) // PIXEL_SIZE
n_pixels_y = (yend - ystart) // PIXEL_SIZE
# x edges run west -> east, y edges run north -> south (tile row j counts from the top).
x_edges = [xstart + (n_pixels_x * k // splits) * PIXEL_SIZE for k in range(splits)] + [xend]
y_edges = [yend - (n_pixels_y * k // splits) * PIXEL_SIZE for k in range(splits)] + [ystart]

for i in range(splits):
    for j in range(splits):
        print(f"{i} {j}------------------------------------------------------")

        temp_xstart, temp_xend = x_edges[i], x_edges[i + 1]
        temp_yend, temp_ystart = y_edges[j], y_edges[j + 1]

        outfile = f"out_data/window4prediction_{i}_{j}.vrt"
        # Argument list instead of a shell string: no quoting pitfalls.
        command = [
            "gdalbuildvrt",
            "-te", str(temp_xstart), str(temp_ystart), str(temp_xend), str(temp_yend),
            "-separate",
            "-input_file_list", "/tmp/selected_features.txt",
            outfile,
        ]
        print(" ".join(command))

        completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        # Surface GDAL failures here instead of failing later on a missing VRT.
        if completed.returncode != 0:
            raise RuntimeError(
                f"gdalbuildvrt failed for {outfile} (exit code {completed.returncode}):\n"
                f"{completed.stderr}"
            )


poland
0 0------------------------------------------------------
gdalbuildvrt -te 4598000 3402500 4776000 3554000 -separate -input_file_list /tmp/selected_features.txt out_data/window4prediction_0_0.vrt
0 1------------------------------------------------------
gdalbuildvrt -te 4598000 3251000 4776000 3402500 -separate -input_file_list /tmp/selected_features.txt out_data/window4prediction_0_1.vrt
0 2------------------------------------------------------
gdalbuildvrt -te 4598000 3099500 4776000 3251000 -separate -input_file_list /tmp/selected_features.txt out_data/window4prediction_0_2.vrt
0 3------------------------------------------------------
gdalbuildvrt -te 4598000 2948000 4776000 3099500 -separate -input_file_list /tmp/selected_features.txt out_data/window4prediction_0_3.vrt
1 0------------------------------------------------------
gdalbuildvrt -te 4776000 3402500 4954000 3554000 -separate -input_file_list /tmp/selected_features.txt out_data/window4prediction_1_0.vrt
1 1----------

In [20]:
# Predict every tile with the fitted classifier and write one GeoTIFF per tile.
# The output directory is created up front so the first write cannot fail on a
# missing folder.
os.makedirs("out_data/predictions", exist_ok=True)

for pred_i in range(splits):
    for pred_j in range(splits):
        in_file = f"out_data/window4prediction_{pred_i}_{pred_j}.vrt"
        out_tif_filename = f'out_data/predictions/{country}_gridsplits{splits}{splits}_{pred_i}_{pred_j}.tif'

        # Tiles that already exist are skipped, so the cell can be re-run
        # after an interruption without recomputing them.
        if os.path.exists(out_tif_filename):
            print(f"The file {out_tif_filename} exists already. Continue")
            continue

        start_time = time.time()
        start_time_formatted = time.strftime('%H:%M', time.localtime(start_time))
        print(f"Start at {start_time_formatted}: {out_tif_filename} ------------------------------------------------------")

        stack = Raster(in_file)
        # NOTE(review): presumably the band names must match the feature names
        # and order the model was trained on - confirm.
        stack.names = predictor_selection

        # Named `prediction` rather than `result`, so the array of NA row
        # positions computed earlier under that name is not overwritten.
        prediction = stack.predict(estimator=rfReg, dtype='int16', nodata=48)
        prediction.write(file_path=out_tif_filename, driver="GTiff")

        end_time = time.time()
        print("Done estimating and writing after --- %s seconds ---" % (end_time - start_time))


Start at 17:54: out_data/predictions/poland_gridsplits44_0_0.tif ------------------------------------------------------
Done estimating and writing after --- 351.3073034286499 seconds ---
Start at 18:00: out_data/predictions/poland_gridsplits44_0_1.tif ------------------------------------------------------
Done estimating and writing after --- 476.529625415802 seconds ---
Start at 18:08: out_data/predictions/poland_gridsplits44_0_2.tif ------------------------------------------------------
Done estimating and writing after --- 478.1296486854553 seconds ---
Start at 18:16: out_data/predictions/poland_gridsplits44_0_3.tif ------------------------------------------------------
Done estimating and writing after --- 453.1867995262146 seconds ---
Start at 18:23: out_data/predictions/poland_gridsplits44_1_0.tif ------------------------------------------------------
Done estimating and writing after --- 518.2296357154846 seconds ---
Start at 18:32: out_data/predictions/poland_gridsplits44_1_1.