In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# This file contains all the main external libs we'll use
from fastai.imports import *

In [3]:
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

In [4]:
PATH = "data/histopathologic"

# for today, lets try using the full image

In [None]:
print(torch.cuda.is_available())

print(torch.backends.cudnn.enabled)

os.listdir(PATH)

In [None]:
dirs = os.listdir(f'{PATH}/train')
dirs

In [None]:
label_csv = f'{PATH}/train_labels.csv'
n = len(list(open(label_csv))) - 1 # header is not counted (-1)
val_idxs = get_cv_idxs(n) # random 20% data for validation set

In [None]:
print(n)
print(len(val_idxs))

## Data Exploration

In [None]:
sz = 224
arch = resnet34
bs = 64

In [None]:
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
data = ImageClassifierData.from_csv(PATH, 'train', f'{PATH}/train_labels.csv', test_name='test', # we need to specify where the test set is if you want to submit to Kaggle competitions
                                   val_idxs=val_idxs, suffix='.tif', tfms=tfms, bs=bs)

In [None]:
fn = PATH + '/' + data.trn_ds.fnames[0]; fn

In [None]:
#this will fail if it hits a validation idx
idx = random.randint(0, n-1)

fn = PATH + '/' + data.trn_ds.fnames[idx]; fn

img = PIL.Image.open(fn); img

In [None]:
size_d = {k: PIL.Image.open(PATH + '/' + k).size for k in data.trn_ds.fnames}
row_sz, col_sz = list(zip(*size_d.values()))
row_sz = np.array(row_sz); col_sz = np.array(col_sz)

plt.hist(row_sz);

In [None]:
plt.hist(col_sz)

## initial, naive model

In [None]:
label_csv = f'{PATH}/train_labels.csv'
n = len(list(open(label_csv))) - 1 # header is not counted (-1)
val_idxs = get_cv_idxs(n) # random 20% data for validation set

In [None]:
sz=224
arch=resnet34
bs=64
tfms = tfms_from_model(arch, sz)
data = ImageClassifierData.from_csv(PATH, 'train', f'{PATH}/train_labels.csv', test_name='test', # we need to specify where the test set is if you want to submit to Kaggle competitions
                                   val_idxs=val_idxs, suffix='.tif', tfms=tfms, bs=bs)

In [None]:
learn = ConvLearner.pretrained(arch, data, precompute=True)
lrf=learn.lr_find()

In [None]:
learn.sched.plot_lr()

In [None]:
learn.sched.plot()

got 2 little humps, but 10^-3 (0.001) seems like the right thing.

# naive model first

In [None]:
learn = ConvLearner.pretrained(arch, data, precompute=True)
learn.fit(0.001, 3)

naive model after 3 epochs:
(trn_loss, val_loss, acc):
0.346283,   0.314564,   0.865833

## now lets restart and do data augmentation

In [5]:
label_csv = f'{PATH}/train_labels.csv'
n = len(list(open(label_csv))) - 1 # header is not counted (-1)
val_idxs = get_cv_idxs(n) # random 20% data for validation set

sz=224
arch=resnet34
bs=64
tfms = tfms_from_model(arch, sz, aug_tfms=[RandomDihedral(), RandomLighting(0.05, 0.05)])
data = ImageClassifierData.from_csv(PATH, 'train', f'{PATH}/train_labels.csv', test_name='test', # we need to specify where the test set is if you want to submit to Kaggle competitions
                                   val_idxs=val_idxs, suffix='.tif', tfms=tfms, bs=bs)
learn = ConvLearner.pretrained(arch, data, precompute=False, ps=0.5)
#learn = ConvLearner.pretrained(arch, data, precompute=True, ps=0.5)

In [None]:
lrf=learn.lr_find()

In [None]:
learn.sched.plot_lr()

In [None]:
learn.sched.plot()

In [None]:
# train last layer with Precompute=True for 1-2 epochs
learn.fit(0.0005, 2)

In [None]:
# Train last layer with data augmentation (i.e. precompute=False) for 2-3 epochs with cycle_len=1
learn.precompute=False
learn.fit(0.0005, 3, cycle_len=1)

In [None]:
learn.unfreeze()
lr=np.array([0.0005/9,0.0005/3,0.0005])
learn.fit(lr, 4, cycle_len=1, cycle_mult=2)

In [None]:
learn.save('resnet34_nocrop')

In [None]:
learn.fit(lr, 1, cycle_len=10)

In [23]:
model_name = 'resnet34_nocrop_more_epochs'

In [None]:
learn.save(model_name)

In [6]:
learn.load(model_name)

## Test and Create Submission

In [7]:
data.classes

['0', '1']

In [8]:
log_preds, y = learn.TTA(is_test=True)
probs = np.mean(np.exp(log_preds), 0)

                                              

In [10]:
probs[:5]

array([[0.34309, 0.65691],
       [0.00533, 0.99467],
       [0.95428, 0.04572],
       [0.00455, 0.99545],
       [0.82393, 0.17607]], dtype=float32)

In [11]:
data.test_ds.fnames

['test/9b32131762faa1fc49554aeb0e31265142094a1d.tif',
 'test/f1dbdfd6ce0642daa1b534693b7d0abf3a7d33b7.tif',
 'test/669aee125b4e91bda5b2de980a6153e682984668.tif',
 'test/00d7ec734099f2ba82dbb857c46b121ed1384938.tif',
 'test/dd3c9b6a7b1ee3581df11b9d9fd048943ccf39d3.tif',
 'test/af2305a288a0a4b78f1974cda2e8f60029f2cd24.tif',
 'test/1d8a3f4dd7af07d4b72c260374e6e4beb304e36d.tif',
 'test/2b6939fdd64f9e689ee80e163189679ce8c245da.tif',
 'test/f5098e4bd14a4c317f42d39cd1e6987b66698196.tif',
 'test/1c08b4d94b96a9d4d5f5438e3670fb2148dde96f.tif',
 'test/6a13e6b0a11fcc3d47a548ef60e62575ad04c004.tif',
 'test/ac1ad5abd804d695a4c8512e520c69445103c27a.tif',
 'test/0cfe1de2bd45d776694d9ff5ef0dac2acf3238da.tif',
 'test/dfaacd2b04a09a4a0eb9661a0e393e2219f1bc32.tif',
 'test/08d2d47c12d5182ae35e390e683ffcd85a8e1dcd.tif',
 'test/57fb364199fbe79a3e4c1434d4c827f3bff80918.tif',
 'test/c8ec841ae20b99c00862739cc172b1f9aefb5888.tif',
 'test/dc7e09ec2fc8fd0e88014ecf40415425f16b8672.tif',
 'test/5deda2f23c5c661b70506

In [14]:
df = pd.DataFrame(probs)
df.columns = data.classes

In [15]:
df['label'] = df.idxmax(axis=1); df.head()

Unnamed: 0,0,1,label
0,0.343086,0.656914,1
1,0.005331,0.994669,1
2,0.954284,0.045716,0
3,0.004553,0.995447,1
4,0.823932,0.176068,0


In [16]:
df = df[['label']]; df.head()

Unnamed: 0,label
0,1
1,1
2,0
3,1
4,0


In [20]:
[o[5:-4] for o in data.test_ds.fnames]

['9b32131762faa1fc49554aeb0e31265142094a1d',
 'f1dbdfd6ce0642daa1b534693b7d0abf3a7d33b7',
 '669aee125b4e91bda5b2de980a6153e682984668',
 '00d7ec734099f2ba82dbb857c46b121ed1384938',
 'dd3c9b6a7b1ee3581df11b9d9fd048943ccf39d3',
 'af2305a288a0a4b78f1974cda2e8f60029f2cd24',
 '1d8a3f4dd7af07d4b72c260374e6e4beb304e36d',
 '2b6939fdd64f9e689ee80e163189679ce8c245da',
 'f5098e4bd14a4c317f42d39cd1e6987b66698196',
 '1c08b4d94b96a9d4d5f5438e3670fb2148dde96f',
 '6a13e6b0a11fcc3d47a548ef60e62575ad04c004',
 'ac1ad5abd804d695a4c8512e520c69445103c27a',
 '0cfe1de2bd45d776694d9ff5ef0dac2acf3238da',
 'dfaacd2b04a09a4a0eb9661a0e393e2219f1bc32',
 '08d2d47c12d5182ae35e390e683ffcd85a8e1dcd',
 '57fb364199fbe79a3e4c1434d4c827f3bff80918',
 'c8ec841ae20b99c00862739cc172b1f9aefb5888',
 'dc7e09ec2fc8fd0e88014ecf40415425f16b8672',
 '5deda2f23c5c661b70506d2b51533db84ce33e6b',
 'e9cefa195bb47d2b3791b6c31245278726e24985',
 'd9e507205402f0cb92c337e2f920c740d7e80f46',
 'f13096408fa0f141b1516c48e8b301d36ddf4914',
 '74fa9935

In [21]:
df.insert(0, 'id', [o[5:-4] for o in data.test_ds.fnames]); df.head()

In [24]:
SUBM = f'{PATH}/subm/'
os.makedirs(SUBM, exist_ok=True)
df.to_csv(f'{SUBM}{model_name}.gz', compression='gzip', index=False)