# Read Data


In [2]:
from pathlib import Path
from PIL import Image
import numpy as np

In [3]:
data_path = Path("/home/user/data")

train_data_node_A_path = data_path / "FL" / "train_node_A"
train_data_node_B_path = data_path / "FL" / "train_node_B"
train_data_node_C_path = data_path / "FL" / "train_node_C"

test_data_path = data_path / "test"

In [4]:
img_height = 180
img_width  = 180
image_size = (img_height, img_width)

# Images and labels for training data

In [5]:
def get_XY(path, image_size):
    train_list = []
    label_list = []
    for directory in list(path.iterdir()):
        files = list(directory.glob('*.jpg'))
    
        print('Directory: ' + directory.as_posix() + ' (' + str(len(files)) + ' files)')
    
        # Get labels from directory name (0 = normal, 1 = tumor)
        label  = int(directory.as_posix()[-1])
        labels = np.repeat(label, len(files))
        label_list.append(labels)
   
        # 1-dim flattened array per image (feature length = 24300)
        arr = np.array([(np.array(Image.open(file).resize(image_size)) / 255).flatten() for file in files])
        train_list.append(arr)
    
    # merge the two train image matrices into one
    X = np.concatenate((train_list[0], train_list[1]))
    
    # merge the output labels vectors into one 
    Y = np.concatenate((label_list[0], label_list[1]))
    
    print('Shape X:', X.shape, 'Shape Y:', Y.shape)
          
    return X, Y

In [6]:
X_node_A, Y_node_A = get_XY(train_data_node_A_path, image_size)
X_node_B, Y_node_B = get_XY(train_data_node_B_path, image_size)
X_node_C, Y_node_C = get_XY(train_data_node_C_path, image_size)

Directory: /home/user/data/FL/train_node_A/target_1 (1382 files)
Directory: /home/user/data/FL/train_node_A/target_0 (3618 files)
Shape X: (5000, 97200) Shape Y: (5000,)
Directory: /home/user/data/FL/train_node_B/target_1 (1875 files)
Directory: /home/user/data/FL/train_node_B/target_0 (3125 files)
Shape X: (5000, 97200) Shape Y: (5000,)
Directory: /home/user/data/FL/train_node_C/target_1 (1223 files)
Directory: /home/user/data/FL/train_node_C/target_0 (3777 files)
Shape X: (5000, 97200) Shape Y: (5000,)


In [7]:
# keep the data from the nodes separate
X_list = [X_node_A, X_node_B, X_node_C]
Y_list = [Y_node_A, Y_node_B, Y_node_C]

In [8]:
# ... or merge them into one train and test array!
X = np.concatenate((X_list[0], X_list[1], X_list[2]))
Y = np.concatenate((Y_list[0], Y_list[1], Y_list[2]))

In [9]:
X.shape

(15000, 97200)

# Images and labels for test data

In [10]:
test_list = []
test_label_list = []
test_files_list = []
for directory in list(test_data_path.iterdir()):
    files = list(directory.glob('*.jpg'))
    
    print('Directory: ' + directory.as_posix() + ' (' + str(len(files)) + ' files)')
    
    # Get labels from directory name (0 = normal, 1 = tumor)
    label  = int(directory.as_posix()[-1])
    labels = np.repeat(label, len(files))
    test_label_list.append(labels)
   
    # 1-dim flattened array per image (feature length = 24300)
    arr = np.array([(np.array(Image.open(file).resize(image_size)) / 255).flatten() for file in files])
    test_list.append(arr)
    
    # keep file names in a list
    test_files_list.append([file.as_posix() for file in files])

file_paths = test_files_list[0] + test_files_list[1]
print('#Images/files:', len(file_paths))

# merge the two test image matrices into one
X_test = np.concatenate((test_list[0], test_list[1]))

# merge the output labels vectors into one 
Y_test = np.concatenate((test_label_list[0], test_label_list[1]))

print('Shape X-test:', X_test.shape, 'Shape Y-test:', Y_test.shape)

Directory: /home/user/data/test/target_1 (2751 files)
Directory: /home/user/data/test/target_0 (3249 files)
#Images/files: 6000
Shape X-test: (6000, 97200) Shape Y-test: (6000,)


# Random Forest Classifier 

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

Let's tune the RF method using a randomized search grid:

In [13]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2', 50, 100]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2', 50, 100], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [21]:
# choose 1000 images at random from the whole 15000 image set

np.random.seed(42)
indices = np.random.choice(X.shape[0], replace = False, size = 1000)
indices.shape

(1000,)

In [23]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfc = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid,
                               n_iter = 200, cv = 3, verbose = 2, random_state = 42, n_jobs = -1)

In [24]:
# Fit the random search model
rf_random.fit(X[indices,], Y[indices])

Fitting 3 folds for each of 200 candidates, totalling 600 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt', 'log2',
                                                         50, 100],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [25]:
print ('Best Parameters: ', rf_random.best_params_, ' \n')

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 50,
 'max_depth': 110,
 'bootstrap': False}

In [26]:
# Test predictions using the RF with the best parameters (trained on the 1000 images)

pred = rf_random.predict_proba(X_test)
roc_auc_score(Y_test, pred[:,1])

0.7308990524612948

[CV] END bootstrap=True, max_depth=30, max_features=50, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=  21.5s
[CV] END bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time= 3.9min
[CV] END bootstrap=True, max_depth=110, max_features=50, min_samples_leaf=2, min_samples_split=5, n_estimators=2000; total time=  44.0s
[CV] END bootstrap=True, max_depth=90, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  10.6s
[CV] END bootstrap=True, max_depth=None, max_features=100, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=  57.7s
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time= 7.1min
[CV] END bootstrap=False, max_depth=50, max_features=50, min_samples_leaf=1, min_samples_split=10, n_estimators=600; total time=  22.4s
[CV] END bootstrap=False, max_depth=None,

[CV] END bootstrap=True, max_depth=30, max_features=50, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=  21.3s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time= 1.7min
[CV] END bootstrap=True, max_depth=100, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  16.4s
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1600; total time= 4.6min
[CV] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   8.0s
[CV] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   8.0s
[CV] END bootstrap=True, max_depth=100, max_features=50, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time=  13.6s
[CV] END bootstrap=True, max_depth=

[CV] END bootstrap=False, max_depth=None, max_features=50, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   8.5s
[CV] END bootstrap=True, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1200; total time= 2.1min
[CV] END bootstrap=True, max_depth=100, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  16.4s
[CV] END bootstrap=False, max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time= 1.7min
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time= 1.9min
[CV] END bootstrap=False, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time= 7.1min
[CV] END bootstrap=False, max_depth=50, max_features=50, min_samples_leaf=1, min_samples_split=10, n_estimators=600; total time=  22.4s
[CV] END bootstrap=True, max_depth=60

[CV] END bootstrap=False, max_depth=None, max_features=50, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   8.4s
[CV] END bootstrap=True, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1200; total time= 2.1min
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1600; total time= 4.6min
[CV] END bootstrap=True, max_depth=None, max_features=50, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time=  44.3s
[CV] END bootstrap=False, max_depth=80, max_features=100, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=False, max_depth=110, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time= 2.8min
[CV] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   3.4s
[CV] END bootstrap=False, max_depth

[CV] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=1600; total time= 4.6min
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  42.4s
[CV] END bootstrap=True, max_depth=90, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time=  19.4s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=  18.1s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=  18.3s
[CV] END bootstrap=True, max_depth=None, max_features=50, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time=  44.8s
[CV] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   7.9s
[CV] END bootstrap=True, max_depth=10

In [38]:
# the rf classifier with the best parameters
best_rfc = RandomForestClassifier(**rf_random.best_params_, random_state = 42, n_jobs = -1, verbose = 1)
best_rfc

RandomForestClassifier(bootstrap=False, max_depth=110, max_features=50,
                       min_samples_leaf=4, n_estimators=600, n_jobs=-1,
                       random_state=42, verbose=1)

In [39]:
# fit all training data (ALL images) using a RF classifier with the best tuned parameters
best_rfc.fit(X, Y)

# test performance
pred = best_rfc.predict_proba(X_test)
roc_auc_score(Y_test, pred[:,1])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   44.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 600 out of 600 | elapsed:    0.5s finished


0.7278314195380868

In [46]:
# simulate the Federated learning: train on each node's training data and increase n_trees!
best_rfc = RandomForestClassifier(**rf_random.best_params_, random_state = 42, 
                                  n_jobs = -1, verbose = 1, warm_start = True)
best_rfc

RandomForestClassifier(bootstrap=False, max_depth=110, max_features=50,
                       min_samples_leaf=4, n_estimators=600, n_jobs=-1,
                       random_state=42, verbose=1, warm_start=True)

In [48]:
n_trees = rf_random.best_params_['n_estimators']
n_trees

600

In [49]:
for X, Y in zip(X_list, Y_list):
    best_rfc.fit(X, Y)
    best_rfc.n_estimators += n_trees

best_rfc.n_estimators -= n_trees

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   13.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   12.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 

In [50]:
pred = best_rfc.predict_proba(X_test)
roc_auc_score(Y_test, pred[:,1])

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    0.6s
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed:    0.9s
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed:    1.3s
[Parallel(n_jobs=16)]: Done 1800 out of 1800 | elapsed:    1.3s finished


0.7646246659906765

In [51]:
import pandas as pd

df_submission = pd.DataFrame(data={'file_paths': file_paths, 'predictions': pred[:,1]})
df_submission["file_paths"] = df_submission["file_paths"].apply(lambda x: x.replace("/home/user/data/test","/data/challenges_data/test"))

In [52]:
df_submission.to_csv('rf_results_pseudo_fed.csv', index=False)
df_submission.head()

Unnamed: 0,file_paths,predictions
0,/data/challenges_data/test/target_1/tumor_cent...,0.518774
1,/data/challenges_data/test/target_1/tumor_cent...,0.605361
2,/data/challenges_data/test/target_1/tumor_cent...,0.443389
3,/data/challenges_data/test/target_1/tumor_cent...,0.51694
4,/data/challenges_data/test/target_1/tumor_cent...,0.382402
