# K-fold cross-validation (k = number of samples = 16, i.e. leave-one-image-out)

For each run, we test on a single image and train on the remaining n_samples-1 images. The final score, pooled over the predictions of all runs, is reported in the last cell.

In [1]:
import numpy as np
from sklearn import metrics

from utils.data import Data
from utils.estimators import Dataset, Classifier

%matplotlib inline

In [2]:
# Input directories handed to the project's Data loader
# (presumably GeoTIFF rasters and shapefile labels, judging by the names).
tiff_location = "./Data/Images/"
shp_location = "./Data/Labels/"

# Build the loader for the two target classes, then read rasters and masks.
all_data = Data(
    tiff_location,
    shp_location,
    classes=["water", "land"],
)
all_tiff = all_data.read_tiff()
all_mask = all_data.get_mask()

# k_fold=True is expected to keep a leading per-image axis on X and y
# (later cells index X[i] and use X.shape[2]) -- TODO confirm in utils.data.
X, y = all_data.get_Xy(
    all_tiff,
    all_mask,
    n_sample=200_000,
    k_fold=True,
)

  return _prepare_from_string(" ".join(pjargs))


In [3]:
# Leave-one-image-out cross-validation: each pass holds out image i for
# testing and trains a random forest on the samples of all remaining images.
N_ESTIMATORS = 10  # forwarded as n_estimators (presumably the number of trees)
MAX_DEPTH = 10     # forwarded as max_depth (presumably the per-tree depth cap)

classifier = Classifier()
n_splits = X.shape[0]
fold_preds = []  # one prediction array per held-out image
fold_y = []      # matching ground-truth labels per held-out image
for i in range(n_splits):
    print(f"Training on split {i+1} out of split {n_splits}")
    # Copy only the held-out slice so nothing downstream can write through to
    # X / y. np.delete already returns a new array, so duplicating the whole
    # of X and y on every split (as done previously) was redundant work.
    X_test, y_test = np.copy(X[i]), np.copy(y[i])
    X_train = np.delete(X, i, 0).reshape(-1, X.shape[2])
    y_train = np.delete(y, i, 0).reshape(-1, y.shape[2])
    dataset = Dataset(X_train, X_test, y_train, y_test)
    # Snapshot the labels as exposed by Dataset before training starts.
    fold_y.append(np.array(dataset.testY))
    preds = classifier.random_forest(trainX=dataset.trainX, trainY=dataset.trainY,
                                     testX=dataset.testX, testY=dataset.testY,
                                     grid_search=False, train=True,
                                     n_estimators=N_ESTIMATORS, max_depth=MAX_DEPTH)
    fold_preds.append(np.asarray(preds))

# Join the per-split arrays once instead of growing a Python list of millions
# of scalars; fall back to an empty array when there were no splits at all.
all_preds = np.concatenate(fold_preds) if fold_preds else np.asarray([])
all_y = np.concatenate(fold_y) if fold_y else np.asarray([])

Training on split 1 out of split 16

Random Forest
Elapsed_time training  57.601113 
Accuracy on train Set: 
0.9962285
Accuracy on Test Set: 
0.8947825
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.79      0.88    200000
           1       0.83      1.00      0.90    200000

    accuracy                           0.89    400000
   macro avg       0.91      0.89      0.89    400000
weighted avg       0.91      0.89      0.89    400000

Confusion Matrix: 
[[158530  41470]
 [   617 199383]]
Training on split 2 out of split 16

Random Forest
Elapsed_time training  57.452603 
Accuracy on train Set: 
0.9964711666666667
Accuracy on Test Set: 
0.1480325
Classification Report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00    200000
           1       0.23      0.30      0.26    200000

    accuracy                           0.15    400000
   macro avg       0.11      0.15      0

[[182980  17020]
 [184519  15481]]
Training on split 16 out of split 16

Random Forest
Elapsed_time training  57.721899 
Accuracy on train Set: 
0.9961815
Accuracy on Test Set: 
0.9895325
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    200000
           1       0.98      1.00      0.99    200000

    accuracy                           0.99    400000
   macro avg       0.99      0.99      0.99    400000
weighted avg       0.99      0.99      0.99    400000

Confusion Matrix: 
[[195813   4187]
 [     0 200000]]


In [4]:
# Pooled evaluation over every held-out image: all_y / all_preds hold the
# test labels and predictions gathered across all cross-validation splits.
# (`metrics` is imported with the other dependencies in the first cell.)
print("Classification Report: ")
print(metrics.classification_report(all_y, all_preds))
print("Confusion Matrix: ")
print(metrics.confusion_matrix(all_y, all_preds))

Classification Report: 
              precision    recall  f1-score   support

           0       0.72      0.51      0.60   3200000
           1       0.62      0.80      0.70   3200000

    accuracy                           0.66   6400000
   macro avg       0.67      0.66      0.65   6400000
weighted avg       0.67      0.66      0.65   6400000

Confusion Matrix: 
[[1628494 1571506]
 [ 636213 2563787]]
