## How to train the network

- This notebook creates and populates train, test and validation folders
- Once the folders are populated, train the network with: `python train.py --network 0 --dataset-path ~/melanoma_images/2020_12_10_set --testset-path ~/melanoma_images/2020_12_10_set/test > out.txt`
- A separate validation set is used for monitoring loss/accuracy and for hyperparameter tuning.
- The validation set must be inside the dataset folder passed as `--dataset-path`, i.e. in a `validation` subfolder next to `train`.

## Prepare training

In [1]:
'Load metadata'
# Root folder of the image data. `home` is not defined in this cell;
# presumably it is the user's home directory - verify where it is set.
fld = os.path.join(home, 'melanoma_images')

# The metadata CSVs are looked up under ../data/TCGA_data (a relative path).
data_fld = '../data'
sub_fld = os.path.join(data_fld, 'TCGA_data')

# Read the slide-level and tile-level tables the same way: the first CSV
# column becomes the DataFrame index.
df_wsi, df_tile = (
    pd.read_csv(os.path.join(sub_fld, csv_name), index_col=0)
    for csv_name in ('tcga_wsi_meta.csv', 'tcga_tile_meta.csv')
)

# `path` ends up pointing at the last file that was read.
path = os.path.join(sub_fld, 'tcga_tile_meta.csv')

In [2]:
'Get date for training folder name'
from datetime import datetime

# Take a single timestamp so that year, month and day all come from the same instant.
now = datetime.now()

# The components are plain str(int) values, so month and day are NOT
# zero-padded (January is '1', not '01').
year, month, day = (str(part) for part in (now.year, now.month, now.day))

# Underscore-separated date string, e.g. '2020_12_10'.
date = '_'.join((year, month, day))

In [3]:
'Create training/validation folder'
# Dataset root for this run: <fld>/<date>_set
ds_fld = os.path.join(fld, date + '_set')

if os.path.isdir(ds_fld):
    print('Folder already there')

# Make sure both split folders exist even when the dataset root was already
# present (e.g. after an interrupted earlier run); otherwise the copy step
# cannot create its per-class folders. makedirs also creates any missing
# parent folder and is a no-op for folders that already exist.
# Per-class subfolders are not created here; the image-copy cell creates
# them on demand.
for split in ('train', 'validation'):
    tmp_fld = os.path.join(ds_fld, split)
    os.makedirs(tmp_fld, exist_ok=True)

In [4]:
'Create training/validation dataframes (print num of entries)'
class_size = 1000  # number of tiles sampled per class (training + validation together)
trn_class_size = int(class_size * 0.9)  # the first 90% of each sample goes to training
split_seed = 0  # fixed seed so that re-running the cell reproduces the same split
classes = ['CBT3', 'CBTA', 'CBTP', 'CBTP3', 'CBT', 'CBTPA']

trn_dfs = []
tst_dfs = []  # holds the validation rows (name kept for the cells that use df_tst)
g = df_tile.groupby('genotype')
keys = g.groups.keys()
for key in keys:
    if key not in classes:
        continue
    df = g.get_group(key)
    # Sampling without replacement is only possible when the class has at
    # least class_size tiles. The threshold must therefore be class_size,
    # not trn_class_size: a class with 900-999 tiles would otherwise raise
    # ValueError in DataFrame.sample.
    # For oversampled classes the same tile can land in both splits,
    # so do not trust validation metrics on these classes.
    oversample = len(df) < class_size
    df = df.sample(n=class_size, replace=oversample, random_state=split_seed)
    # Positional split of the shuffled sample.
    trn_dfs.append(df.iloc[:trn_class_size])
    tst_dfs.append(df.iloc[trn_class_size:])

df_trn = pd.concat(trn_dfs)
df_tst = pd.concat(tst_dfs)
len(df_trn), len(df_tst)

(5400, 600)

In [6]:
'Populate folder with images'
from shutil import copyfile
img_fld = os.path.join(fld, 'mouse_tiles')

# Source tiles are read from <img_fld>/<genotype>/<file> and copied to
# <ds_fld>/<split>/<genotype>/<file>.
# NOTE(review): rows that were sampled with replacement share the same file
# name, so they are copied onto the same destination and the class folder
# ends up with fewer files than rows - confirm whether oversampling was
# meant to add physical copies.
for split, df_split in (('train', df_trn), ('validation', df_tst)):
    # total= lets tqdm show percentage and ETA; iterrows() has no length.
    for ix, entry in tqdm(df_split.iterrows(), total=len(df_split)):
        src_path = os.path.join(img_fld, entry.genotype, entry.file)
        if not os.path.isfile(src_path):
            # Explicit error (not a bare assert) so the missing path is reported.
            raise FileNotFoundError('Tile image not found: ' + src_path)
        dst_fld = os.path.join(ds_fld, split, entry.genotype)
        # Create the class folder on demand; also works if the split folder
        # itself is missing, and is a no-op when the folder already exists.
        os.makedirs(dst_fld, exist_ok=True)
        dst_path = os.path.join(dst_fld, entry.file)
        copyfile(src_path, dst_path)

5400it [01:47, 50.36it/s]
600it [00:12, 46.16it/s]
