Extract point data from the binary (0/1) maps,
then show the accuracy of each map individually.

In [5]:
%%bash

# Sample every band of the stacked VRT at the coordinates read from
# $coordinates_file and write one space-separated row per point to
# $output_file, preceded by a header row. Prints the elapsed seconds at the end.

start=$(date +%s)

# Input files
features_file="out_data/all01_tif.vrt"
coordinates_file="out_data/LUCAS/LUCAS_XY.csv"

# Output file
output_file="out_data/forest_binary_data_poland_lucas.csv"

# Run gdalinfo once and keep the lines that mention out_data. The first of
# these lines is skipped everywhere below (presumably it is the VRT itself);
# the remaining ones are counted as one entry per band.
file_lines=$(gdalinfo "$features_file" | grep out_data)

BB=$(( $(printf '%s\n' "$file_lines" | wc -l) - 1 ))    # band number

# NR%BB in the awk step below would divide by zero without at least one band.
if [ "$BB" -lt 1 ]; then
    echo "No band files found in $features_file" >&2
    exit 1
fi

# write header: second '/'-separated field of each band line, joined by single
# spaces with no trailing separator, so the header has as many fields as a data row
printf '%s\n' "$file_lines" \
    | awk -F'/' 'NR>1 { printf "%s%s", sep, $2; sep=" " } END { printf "\n" }' \
    > "$output_file"

# The awk step joins every BB consecutive output lines into one row:
# ORS is a space, except after each BB-th line where it is a newline.
gdallocationinfo -geoloc -valonly "$features_file" < "$coordinates_file" \
    | awk -v BB="$BB" 'ORS=NR%BB?FS:RS' >> "$output_file"

end=$(date +%s)
echo $(( end - start ))


174


In [6]:
%%bash
# Preview the header and the first rows of the extracted table.
head -n 10 out_data/forest_binary_data_poland_lucas.csv

GLADGLCextent CLC10m CLC C-GLOPS JAXA 
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 0 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0


In [9]:
# Basic data manipulation
import pandas as pd
import numpy as np

# Basic data visualization
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import seaborn as sns



In [10]:

def performance_metrics(y_true, y_pred):
    """Return the overall accuracy of ``y_pred`` against ``y_true``.

    Accuracy is the share of positions where both inputs hold the same label.
    For binary labels this equals ``(tp + tn) / (tp + tn + fp + fn)``.
    Computing it as a direct agreement rate also works when only one class
    is present, where unpacking a 2x2 confusion matrix would fail.

    Parameters
    ----------
    y_true, y_pred : array-like
        Reference and predicted labels. Anything ``np.asarray`` accepts
        (lists, arrays, Series, one-column DataFrames); both are flattened
        to 1-D before comparison.

    Returns
    -------
    numpy.float64
        Fraction of matching labels, between 0 and 1.

    Raises
    ------
    ValueError
        If the flattened inputs differ in length or are empty.
    """
    truth = np.asarray(y_true).ravel()
    guess = np.asarray(y_pred).ravel()

    if truth.shape != guess.shape:
        raise ValueError(
            f"y_true and y_pred differ in length: {truth.size} vs {guess.size}"
        )
    if truth.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    # Mean of the boolean agreement mask == share of correct predictions.
    return np.mean(truth == guess)


In [11]:
# Sampled map values, one row per point; the value 48 is read as missing (NaN).
# Only the first four columns are kept as predictors.
X = (
    pd.read_csv(
        "out_data/forest_binary_data_poland_lucas.csv",
        sep=' ',
        na_values=48,
    )
    .iloc[:, :4]
)

X.head()

Unnamed: 0,GLADGLCextent,CLC10m,CLC,C-GLOPS
0,0,0,0,0.0
1,1,1,1,1.0
2,1,1,1,1.0
3,0,0,0,0.0
4,0,0,0,0.0


In [12]:
# Reference table of LUCAS points; index_col=False keeps every CSV column as data.
outcome = pd.read_csv(
    "out_data/LUCAS/LUCAS_locations.csv",
    sep=",",
    index_col=False,
)
outcome.tail()

Unnamed: 0,X,Y,LC1,LC1_PERC,forest
23079,5278000.0,3114000.0,C22,100,1
23080,5278000.0,3136000.0,B11,100,0
23081,5294000.0,3150000.0,C10,100,1
23082,5294000.0,3172000.0,B11,92,0
23083,5286000.0,3116000.0,Bx1,98,0


In [13]:
# Keep the column at position 4 (`forest` in the preview of `outcome`)
# as a one-column DataFrame of labels.
Y = outcome.iloc[:, [4]]


In [14]:
# Flag every row of X that has at least one missing value (vectorised,
# instead of calling a Python lambda once per row).
na_rows = X.isna().any(axis=1)
result = np.flatnonzero(na_rows)  # positions of the flagged rows
print('NA rows: ' + str(result))

# The same row positions are removed from X and Y, which is only meaningful
# if both tables list the same points in the same order.
assert len(X) == len(Y), f"X has {len(X)} rows but Y has {len(Y)}"

X = X.drop(X.index[result])
Y = Y.drop(Y.index[result])

print(X.shape)
print(Y.shape)

NA rows: [   33    54    58 ... 23032 23046 23077]
(21819, 4)
(21819, 1)


In [15]:
# Accuracy of each map on its own, compared against the labels in Y.
# Iterating over the columns of X avoids repeating the column count here.
for column_name, map_values in X.items():
    metric_value = 100 * performance_metrics(Y, map_values)
    print(f"{column_name:<15}: {metric_value:.2f}%")


GLADGLCextent  : 88.43%
CLC10m         : 88.62%
CLC            : 88.51%
C-GLOPS        : 88.85%


Now use all of the binary maps together as predictors in a single model.

In [16]:
# Imported in this cell because it runs before the cell that imports the
# other scikit-learn helpers; without it a fresh Run All stops with a NameError.
from sklearn.model_selection import train_test_split

# 50/50 split with a fixed seed so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=2023)

# Flatten the label frames to 1-D arrays.
y_train = np.ravel(Y_train)
y_test = np.ravel(Y_test)

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

In [18]:
# Linear regression on the training labels; test predictions above 0.5 are
# scored as the positive class.
OLSmodel = LinearRegression().fit(X_train, y_train)
y_pred = OLSmodel.predict(X_test)

performance_metrics(y_test, y_pred > 0.5)


0.8931255728689276

In [20]:
# Fitted coefficients of the linear model (one per predictor column of X_train).
OLSmodel.coef_

array([0.20924354, 0.19440516, 0.2325991 , 0.23067852])

In [19]:
# Logistic regression with default settings, fitted on the training half.
model = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the test half, scored against the test labels.
y_pred = model.predict(X_test)
performance_metrics(y_test, y_pred)

0.8932172318973419

In [21]:
# Fitted coefficients of the logistic model (one per predictor column of X_train).
model.coef_

array([[1.31117514, 1.20508543, 1.38683806, 1.297646  ]])