In [232]:
import xgboost as xgb
import pandas as pd
import numpy as np
import dask.array as da
import h5py as h5
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
# import dask_xgboost as dxgb
from dask_ml.xgboost import XGBClassifier
from dask_ml.preprocessing import DummyEncoder
from tqdm import tqdm

In [109]:
# Start client
# client = Client('10.70.1.160:8786')
# client = Client(processes=False)

In [309]:
# Parameters
DATA_PATH = r'data/eightieth.h5' # Path to the learning dataset
CHUNK_SIZE = 5000 # Number of images to process in one batch (must fit comfortably in memory)
TRAIN_SIZE = 7/8 # Size of the train dataset (compared to the total dataset)


# data_h5 = h5.File('data/eightieth.h5', mode='r')
# landcover = da.from_array(data_h5['TOP_LANDCOVER'], chunks=(1000,1))
# images = da.from_array(data_h5['S2'], chunks=(1000,16,16,4))

In [310]:
# Setup generator yielding X, y chunks (default size 1000)

classes = da.unique(da.from_array(h5.File(DATA_PATH)['TOP_LANDCOVER'], chunks=(CHUNK_SIZE,1))).compute()

def generator(h5_path, chunk_size, train_size):
    f = h5.File(h5_path, 'r')
    X = f['S2']
    y = f['TOP_LANDCOVER']
    
    # Compute number of chunks needed
    no_chunks = len(X) // chunk_size
    if len(X) % chunk_size > 0:
        # len(X) is not an exact multiple of chunk_size
        no_chunks += 1
    
    print(len(X), '-', no_chunks)
    for c in range(no_chunks):
        X_to_yield = X[c*chunk_size:(c+1)*chunk_size,:,:,:]
        y_to_yield = y[c*chunk_size:(c+1)*chunk_size,:]
        
        # Wrangle data
        X_to_yield = np.array(X_to_yield).reshape((-1, 16*16*4)) # Flatten each element in the array
#         y_to_yield = pd.get_dummies(np.array(y_to_yield).astype(int).reshape((-1,)))
        y_to_yield = np.array(y_to_yield).astype(int).reshape((-1,))
        y_to_yield = label_binarize(y_to_yield, classes=classes)
#         y_to_yield = pd.DataFrame(y_to_yield, index=None, columns=['landcover']).astype(int)
#         y_to_yield = pd.get_dummies(y_to_yield, columns=['landcover'])
        
        
        yield train_test_split(X_to_yield, y_to_yield, train_size=train_size, test_size=1-train_size)
    
# Test
# for X, y in generator('data/eightieth.h5', 1000):
#     pass

In [312]:
forests_list = [] # List of forests to aggregate

for X_train, X_test, y_train, y_test in tqdm(generator(DATA_PATH, CHUNK_SIZE, TRAIN_SIZE)):
    cf = RandomForestClassifier(n_estimators=100, n_jobs=-1) # parameters to define
    cf.fit(X=X_train, y=y_train)
    
#     Reshape y_test to make it understandable (1D series)
#     y_test = y_test.idxmax(axis=1)
    score = cf.score(X_test, y_test)
    
    forests_list.append((cf, score))
    
forests_list = np.array(forests_list)
print('Mean score:', forests_list[:,1].mean())


0it [00:00, ?it/s]

234000 - 47



Exception in thread Thread-20:
Traceback (most recent call last):
  File "C:\Program Files\Miniconda3\lib\threading.py", line 914, in _bootstrap_inner
    self.run()
  File "C:\Program Files\Miniconda3\lib\site-packages\tqdm\_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "C:\Program Files\Miniconda3\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

4it [01:00, 15.09s/it]


KeyboardInterrupt: 

In [305]:
# Merge all the trees back in a single RF
merged_cf = RandomForestClassifier()
merged_cf.estimators_ = []

# Comppute the total number of distinct classes in the dataset
n_classes = len(classes)

for cf in forests_list[:,0]:
    merged_cf.estimators_.extend(cf.estimators_)
    merged_cf.n_estimators = len(merged_cf.estimators_)
    
merged_cf.n_classes_ = [2] * n_classes
merged_cf.classes_ = [np.array([0,1])] * n_classes
merged_cf.n_outputs_ = n_classes

In [280]:
probas = merged_cf.predict_proba(X_test)

In [308]:
merged_cf.predict(X_test)[9]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [283]:
probas[0].shape[0]

200

In [None]:
merged_cf.classes_[k].take(np.argmax(proba[k],axis=1),
                           axis=0)

In [285]:
merged_cf.classes_[0].take()

array([0, 1])

In [292]:
np.argmax(probas[0], axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int64)

In [301]:
merged_cf.predict_proba(X_test)[1]

array([[0.82606838, 0.43034188],
       [0.85769231, 0.39871795],
       [0.83418803, 0.42222222],
       [0.8508547 , 0.40555556],
       [0.85555556, 0.4008547 ],
       [0.82307692, 0.43333333],
       [0.83760684, 0.41880342],
       [0.82820513, 0.42820513],
       [0.8482906 , 0.40811966],
       [0.84145299, 0.41495726],
       [0.86452991, 0.39188034],
       [0.88194444, 0.37446581],
       [0.86965812, 0.38675214],
       [0.81880342, 0.43760684],
       [0.86153846, 0.39487179],
       [0.85213675, 0.4042735 ],
       [0.82649573, 0.42991453],
       [0.85213675, 0.4042735 ],
       [0.83931624, 0.41709402],
       [0.82905983, 0.42735043],
       [0.82606838, 0.43034188],
       [0.85555556, 0.4008547 ],
       [0.8534188 , 0.40299145],
       [0.85854701, 0.39786325],
       [0.84017094, 0.41623932],
       [0.82692308, 0.42948718],
       [0.86837607, 0.38803419],
       [0.83675214, 0.41965812],
       [0.83803419, 0.41837607],
       [0.84273504, 0.41367521],
       [0.

In [297]:
cf.predict_proba(X_test)

[array([[0.7, 0.3],
        [0.9, 0.1],
        [0.5, 0.5],
        [1. , 0. ],
        [0.9, 0.1],
        [0.9, 0.1],
        [0.9, 0.1],
        [0.9, 0.1],
        [0.8, 0.2],
        [0.9, 0.1],
        [1. , 0. ],
        [0.9, 0.1],
        [0.8, 0.2],
        [0.7, 0.3],
        [0.9, 0.1],
        [0.9, 0.1],
        [0.9, 0.1],
        [1. , 0. ],
        [0.9, 0.1],
        [1. , 0. ],
        [1. , 0. ],
        [0.9, 0.1],
        [0.9, 0.1],
        [1. , 0. ],
        [0.7, 0.3],
        [0.6, 0.4],
        [1. , 0. ],
        [0.9, 0.1],
        [0.7, 0.3],
        [0.7, 0.3],
        [0.7, 0.3],
        [1. , 0. ],
        [0.7, 0.3],
        [1. , 0. ],
        [1. , 0. ],
        [0.8, 0.2],
        [0.8, 0.2],
        [1. , 0. ],
        [0.7, 0.3],
        [0.9, 0.1],
        [0.9, 0.1],
        [1. , 0. ],
        [0.9, 0.1],
        [0.8, 0.2],
        [0.8, 0.2],
        [0.9, 0.1],
        [0.8, 0.2],
        [1. , 0. ],
        [0.5, 0.5],
        [1. , 0. ],
