In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import fastai
from fastai import * 
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *

import pandas as pd
import numpy as np
import os

fastai.version.__version__

'1.0.39'

In [3]:
# make sure CUDA is available and enabled
print('CUDA enabled:',torch.cuda.is_available()) 
print('CUDNN enabled:', torch.backends.cudnn.enabled)

CUDA enabled: True
CUDNN enabled: True


In [4]:
def recreate_directory(directory):
    !rm -R {directory} 2>nul
    !mkdir {directory}

# Dataset preprocessing

In [5]:
current_dir = os.getcwd()
input_path =f'{current_dir}/data'
train_dir = f"{input_path}/train"
train_labels = f"{input_path}/train.csv"
test_dir = f"{input_path}/test"
model_dir = f'{current_dir}/models'

## Train model

In [6]:
SZ = 224
BS = 64
NUM_WORKERS = 8
SEED=0
arch = models.resnet50

In [7]:
df = pd.read_csv(train_labels)
df = df[df['Id']!='new_whale']
# df = df.sample(frac=0.25)
df = df.reset_index()

print(df.shape)
print(df.head())

(15697, 3)
   index          Image         Id
0      0  0000e88ab.jpg  w_f48451c
1      1  0001f9222.jpg  w_c3d896a
2      2  00029d126.jpg  w_20df2c5
3      6  000a6daec.jpg  w_dd88965
4      8  0016b897a.jpg  w_64404ac


In [8]:
grouped_df = df.groupby('Id')
grouped_counted = grouped_df.count().sort_values(by=['Image'], ascending=False)
grouped_counted = grouped_counted[(grouped_counted['Image']>5) & (grouped_counted['Image']<1000)]
print(len(grouped_counted))
print(grouped_counted.sum().Image, 'of', len(df))
print(grouped_counted.head())

633
7398 of 15697
           index  Image
Id                     
w_23a388d     73     73
w_9b5109b     65     65
w_9c506f6     62     62
w_0369a5c     61     61
w_700ebb4     57     57


In [9]:
valid_pct = 0.2

valid_filenames = pd.DataFrame(columns=df.columns)

for name, group in enumerate(grouped_df):
    sub_df = group[1]
    sample = sub_df.sample(frac=valid_pct)
    valid_filenames = valid_filenames.append(sample, ignore_index=True)

In [10]:
valid_filenames.drop(labels=['index'], axis=1, inplace=True, errors='ignore')

print(valid_filenames.shape)
print(valid_filenames.head())

(2456, 2)
           Image         Id
0  b2acbd8df.jpg  w_0027efa
1  7b9a5e4d8.jpg  w_0027efa
2  204c7a64b.jpg  w_003bae6
3  40c421fb6.jpg  w_003bae6
4  cd650e905.jpg  w_007fefa


In [11]:
fn2label = {row[1].Image: row[1].Id for row in df.iterrows()}
path2fn = lambda path: re.search('\w*\.jpg$', path).group(0)

In [12]:
valid_files = ItemList.from_df(df=valid_filenames, path=train_dir, cols=['Image'])

In [13]:
test_files = ImageItemList.from_folder(test_dir)

In [14]:
# TODO label from df?
data = (
    ImageItemList
        .from_df(df, train_dir, cols=['Image'])
        .no_split()
#         .split_by_files(valid_files)
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(test_files)
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path=input_path)
        .normalize(imagenet_stats)
)

In [15]:
#data.show_batch(rows=3, fig_size=(SZ, SZ))

# Learning rate

In [16]:
#learn = create_cnn(data, arch, metrics=accuracy, model_dir=model_dir)

In [17]:
#learn.lr_find()

In [18]:
#learn.recorder.plot()

# Precompute

In [19]:
learn = create_cnn(data, arch, metrics=accuracy, model_dir=model_dir, callback_fns=ShowGraph)

In [20]:
learn.fit(2)

epoch,train_loss,valid_loss,accuracy
1,7.523934,Unnamed: 2_level_1,Unnamed: 3_level_1
2,6.249856,Unnamed: 2_level_2,Unnamed: 3_level_2


In [21]:
# learn few epochs with unfreeze
learn.unfreeze()

In [22]:
lr_rate = 1e-4
learn.fit(5, [lr_rate/100, lr_rate/10, lr_rate])

epoch,train_loss,valid_loss,accuracy
1,5.059197,Unnamed: 2_level_1,Unnamed: 3_level_1
2,4.782656,Unnamed: 2_level_2,Unnamed: 3_level_2
3,4.631011,Unnamed: 2_level_3,Unnamed: 3_level_3
4,4.432190,Unnamed: 2_level_4,Unnamed: 3_level_4
5,4.275361,Unnamed: 2_level_5,Unnamed: 3_level_5
6,4.093031,Unnamed: 2_level_6,Unnamed: 3_level_6


KeyboardInterrupt: 

# Prediction & Summition - TODO

In [23]:
log_preds,y = learn.TTA(ds_type=DatasetType.Test)

In [24]:
def top_preds(preds, classes):
    top = np.argsort(preds.numpy())[:, ::-1][:, :4]
    labels = []
    for i in range(top.shape[0]):
        labels.append(' '.join([classes[idx-1] for idx in top[i]]+['new_whale']))
    return labels

def create_submission(preds, data):
    classes = data.classes
    sub_df = pd.DataFrame({'Image': [path.name for path in data.test_ds.x.items]})
    sub_df['Id'] = top_preds(preds, classes)
    return sub_df

In [26]:
submittion_df = create_submission(log_preds, learn.data)

In [27]:
submittion_file = 'submission-3.csv'
submittion_df.to_csv(submittion_file, index=False)

In [28]:
print(submittion_df.head())
print(submittion_df.shape)

           Image                                                 Id
0  a82ba2f76.jpg  w_89f521e w_d066c0a w_5a0ef9f w_0350bf4 new_whale
1  ab62db5a5.jpg  w_fdc6e6d w_c865cfd w_5a0ef9f w_06619ff new_whale
2  6219200df.jpg  w_c1acf72 w_90a7b56 w_ff1de02 w_5b16177 new_whale
3  0f3c1f2ca.jpg  w_190b9bb w_9b46d56 w_396c07d w_df27de9 new_whale
4  7d139670e.jpg  w_f5fcb45 w_b938e96 w_051764c w_5c4f274 new_whale
(7960, 2)


In [None]:
!kaggle competitions submit -c humpback-whale-identification -f {submittion_file} -m "less overfitter model submittion"