In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
# (Re)load the autoreload extension; `reload_ext` is safe to run again on an already-loaded extension.
%reload_ext autoreload
# Mode 2: re-import edited modules before each cell executes.
%autoreload 2
# Render matplotlib figures directly in the cell output.
%matplotlib inline

In [2]:
import os
import random
import re

import numpy as np
import pandas as pd
import torch

import fastai
from fastai import *
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *

# Echo the installed fastai version so the notebook records which release produced its outputs.
fastai.version.__version__

'1.0.39'

In [3]:
# make sure CUDA is available and enabled
# Both values are only reported here; nothing below branches on them.
print('CUDA enabled:',torch.cuda.is_available()) 
print('CUDNN enabled:', torch.backends.cudnn.enabled)

CUDA enabled: True
CUDNN enabled: True


# Dataset preprocessing

In [5]:
# Every location is derived from the directory the notebook is launched from.
current_dir = os.getcwd()

# Input data: <cwd>/data holds the training images, their label CSV and the test images.
input_path = '/'.join((current_dir, 'data'))
train_dir = '/'.join((input_path, 'train'))
train_labels = '/'.join((input_path, 'labels.csv'))
test_dir = '/'.join((input_path, 'test'))

# Passed to the learner as `model_dir`.
model_dir = '/'.join((current_dir, 'models'))

## Train model

In [6]:
SZ = 224           # image size handed to the transform step (`size=SZ`)
BS = 64            # batch size handed to the databunch (`bs=BS`)
NUM_WORKERS = 8    # data-loading workers handed to the databunch (`num_workers=NUM_WORKERS`)
SEED = 0           # applied to the random number generators just below
arch = models.resnet50

# SEED used to be defined but never applied, so every run drew different
# augmentations and head initialisation. Seed the generators up front.
# NOTE(review): GPU kernels may still be non-deterministic; this only pins the RNG state.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [7]:
# Load the training label table; the recorded output shows the columns `index`, `Image` and `Id`.
df = pd.read_csv(train_labels)
# Uncomment to iterate faster on a random quarter of the rows.
# df = df.sample(frac=0.25)

# Quick sanity check of the size and the first rows.
print(df.shape)
print(df.head())

(25361, 3)
   index          Image         Id
0      0  0000e88ab.jpg  w_f48451c
1      1  0001f9222.jpg  w_c3d896a
2      2  00029d126.jpg  w_20df2c5
3      3  00050a15a.jpg  new_whale
4      4  0005c1ef8.jpg  new_whale


In [11]:
# Image file name -> whale Id, built straight from the two columns
# (same mapping as iterating the rows; on duplicate file names the last row wins).
fn2label = dict(zip(df.Image, df.Id))

# Raw string so `\w` and `\.` reach the regex engine untouched instead of
# triggering Python's invalid-escape-sequence warning; compiled once for reuse.
_JPG_NAME_RE = re.compile(r'\w*\.jpg$')


def path2fn(path):
    """Return the trailing ``<word characters>.jpg`` file name of `path`.

    `path` may be a string or a path-like object; it is converted with `str()`
    before matching. Raises AttributeError when `path` does not end in `.jpg`
    (there is no match to take `.group(0)` from).
    """
    return _JPG_NAME_RE.search(str(path)).group(0)

In [13]:
# Collect the unlabeled test images found under test_dir; attached to the databunch via add_test later.
test_files = ImageItemList.from_folder(test_dir)

In [14]:
# TODO label from df?
# Build the DataBunch from the label table.
# * The file names live in the `Image` column; the frame has no `breed` column,
#   so the previous `cols=['breed']` could not be resolved.
# * The data block chain needs a split step before labelling; `no_split()` keeps
#   every image in the training set, which matches the recorded runs (no validation metrics).
#   NOTE(review): confirm that training without a validation set is intended.
data = (
    ImageItemList
        .from_df(df, train_dir, cols=['Image'])
        .no_split()
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(test_files)
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path=input_path)
        .normalize(imagenet_stats)
)

In [15]:
# Preview a 3x3 grid of training images. The keyword is `figsize` (in inches);
# the previous `fig_size=(SZ, SZ)` was a misspelled keyword carrying a pixel size.
data.show_batch(rows=3, figsize=(9, 9))

# Learning rate

In [16]:
# Learner used only for the learning-rate search below; `model_dir` is passed through to the learner.
learn = create_cnn(data, arch, metrics=accuracy, model_dir=model_dir)

In [17]:
# Run the learning-rate finder; its results are plotted in the next cell.
learn.lr_find()

In [18]:
# Plot what the recorder captured during lr_find (presumably loss against learning rate).
learn.recorder.plot()

# Precompute

In [None]:
# Fresh learner for the actual training run; ShowGraph is registered as a callback factory.
# NOTE(review): presumably ShowGraph plots the losses during training - confirm.
learn = create_cnn(data, arch, metrics=accuracy, model_dir=model_dir, callback_fns=ShowGraph)

In [None]:
# Train for 2 epochs; no learning rate is passed, so the library default applies.
learn.fit(2)

epoch,train_loss,valid_loss,accuracy
1,5.773535,Unnamed: 2_level_1,Unnamed: 3_level_1
2,4.922956,Unnamed: 2_level_2,Unnamed: 3_level_2


In [None]:
# Train a few more epochs with the model unfrozen.
# NOTE(review): presumably this makes all layer groups trainable - confirm.
learn.unfreeze()

In [None]:
# Base learning rate for the unfrozen model.
lr_rate = 1e-4
# 5 epochs with three learning rates, each 10x larger than the previous one.
# NOTE(review): presumably one rate per layer group, earliest layers first - confirm.
learn.fit(5, [lr_rate/100, lr_rate/10, lr_rate])

epoch,train_loss,valid_loss,accuracy
1,4.353786,Unnamed: 2_level_1,Unnamed: 3_level_1
2,4.161839,Unnamed: 2_level_2,Unnamed: 3_level_2
3,4.087887,Unnamed: 2_level_3,Unnamed: 3_level_3
4,3.942353,Unnamed: 2_level_4,Unnamed: 3_level_4


# Prediction & Submission - TODO

In [None]:
# Test-time-augmented predictions on the test set.
# NOTE(review): despite the name `log_preds`, confirm whether TTA returns probabilities or log-probabilities;
# only the per-row ranking is used downstream, so either works for the top-5 submission.
log_preds,y = learn.TTA(ds_type=DatasetType.Test)

In [None]:
def top_preds(preds, classes, k=5):
    """Return, per row of `preds`, the `k` best class names joined by spaces.

    Args:
        preds: 2-D scores, one row per sample and one column per class. Either
            an object exposing `.numpy()` (e.g. a CPU torch tensor) or anything
            `np.asarray` accepts.
        classes: sequence mapping a column index to its class name.
        k: how many labels to keep per row, best first (default 5, the
            previously hard-coded value).

    Returns:
        A list with one string per row, e.g. ``'w_1 new_whale w_7 w_3 w_9'``.
    """
    scores = preds.numpy() if hasattr(preds, 'numpy') else np.asarray(preds)
    # argsort is ascending: reverse each row to get best-first, then keep k columns.
    top = np.argsort(scores)[:, ::-1][:, :k]
    return [' '.join(classes[idx] for idx in row) for row in top]

def create_submission(preds, data):
    """Assemble the submission table for the test set.

    Produces a DataFrame with an `Image` column (the `.name` of every test
    item in `data.test_ds.x.items`) and an `Id` column holding the
    space-separated labels that `top_preds` derives from `preds` and
    `data.classes`.
    """
    image_names = [item.name for item in data.test_ds.x.items]
    submission = pd.DataFrame({'Image': image_names})
    submission['Id'] = top_preds(preds, data.classes)
    return submission

In [None]:
# Turn the test predictions into the Image/Id submission table.
submittion_df = create_submission(log_preds, learn.data)

In [None]:
# Output file name, relative to the working directory; reused by the kaggle CLI call below.
submittion_file = 'submission-3.csv'
# index=False: only the Image and Id columns are written.
submittion_df.to_csv(submittion_file, index=False)

In [None]:
# Sanity check before uploading: first rows and (rows, columns) of the submission.
print(submittion_df.head())
print(submittion_df.shape)

In [None]:
# Upload the CSV with the kaggle CLI; {submittion_file} is interpolated from the Python variable.
!kaggle competitions submit -c humpback-whale-identification -f {submittion_file} -m "less overfitter model submittion"