In [1]:
%matplotlib inline
from fastai import *
from fastai.vision import *
from fastai.vision.models.wrn import wrn_22

# Turn on cuDNN benchmark mode so PyTorch auto-tunes convolution algorithms.
torch.backends.cudnn.benchmark = True

In [9]:
# From Init27's notebook: helpers for a generic training run.
# https://github.com/EricPerbos/RTX-2080Ti-Vs-GTX-1080Ti-CIFAR-100-Benchmarks/blob/master/1080Ti%20Notebook.ipynb

import functools
import traceback
def get_ref_free_exc_info():
    """Return ``sys.exc_info()`` for the active exception after clearing its frames.

    ``traceback.clear_frames`` is applied to the traceback first, so the frames
    it references no longer keep their local variables alive. This is meant to
    break the reference cycles that would otherwise stop ``gc.collect()`` from
    reclaiming memory.

    Returns:
        The ``(exception type, exception value, traceback)`` tuple.
    """
    exc_info = sys.exc_info()
    # Index 2 is the traceback object; its frames are cleared in place.
    traceback.clear_frames(exc_info[2])
    return exc_info

def gpu_mem_restore(func):
    """Decorator that clears traceback frames before an exception leaves `func`.

    Intended use: reclaim GPU RAM if CUDA out of memory happened, or execution
    was interrupted. The wrapper itself does not touch the GPU; it only calls
    `get_ref_free_exc_info()` so the frames referenced by the traceback stop
    holding on to their locals, then re-raises.

    Args:
        func: the callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as `func` that returns
        whatever `func` returns.

    Raises:
        The very exception instance raised by `func` (same type, same `args`),
        with exception chaining suppressed.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except BaseException:
            # BaseException on purpose: KeyboardInterrupt must be handled too.
            _, val, tb = get_ref_free_exc_info() # must!
            # Re-raise the original instance. Building a new one with
            # `type(val)` would alter `args` and fails outright for exception
            # classes whose constructor needs more than one argument.
            raise val.with_traceback(tb) from None
    return wrapper

In [11]:
class gpu_mem_restore_ctx():
    """Context manager that clears traceback frames before an exception leaves the block.

    Intended use: reclaim GPU RAM if CUDA out of memory happened, or execution
    was interrupted. On an exception, the frames referenced by its traceback
    are cleared with `traceback.clear_frames` and the same exception instance
    is re-raised with chaining suppressed. Without an exception it does nothing.
    """
    def __enter__(self): return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        # Compare against None instead of relying on truthiness: an exception
        # instance can be falsy (e.g. it defines __len__) and would otherwise
        # be swallowed silently. Python ignores the return value when no
        # exception occurred.
        if exc_val is None: return True
        traceback.clear_frames(exc_tb)
        # Re-raise the original instance; `exc_type(exc_val)` would change its
        # `args` and breaks for constructors that need more than one argument.
        raise exc_val.with_traceback(exc_tb) from None

In [2]:
# Make CUDA device 0 the current device, then display the current device index.
torch.cuda.set_device(0)
torch.cuda.current_device()

0

In [3]:
# Display the name reported for CUDA device 0.
torch.cuda.get_device_name(0)

'GeForce GTX 1080 Ti'

In [4]:
# Get the local path of the dataset identified by URLs.CIFAR and display it.
path = untar_data(URLs.CIFAR)
path

PosixPath('/home/eric/Link_fastaiV1/data/cifar10')

## WideResNet_22 in FP32
https://docs.fast.ai/vision.models.html

In [5]:
# Batch size passed to ImageDataBunch.from_folder for this experiment.
bs = 512

In [6]:
# Pair of transform lists: rand_pad(4, 32) plus a left-right flip applied with
# probability 0.5 in the first list, and an empty second list.
ds_tfms = ([*rand_pad(4, 32), flip_lr(p=0.5)], [])
# Build the data bunch from the folders under `path`, using the 'test' folder
# for validation, then normalize with cifar_stats.
data = ImageDataBunch.from_folder(path, valid='test', ds_tfms=ds_tfms, bs=bs).normalize(cifar_stats)

In [7]:
# Learner around a freshly constructed wrn_22 model, reporting accuracy.
learn = Learner(data, wrn_22(), metrics=accuracy)

In [8]:
# One-cycle training for 30 epochs with explicit hyper-parameters: 3e-3 as the
# second positional argument (the learning rate in fastai v1), wd=0.4,
# div_factor=10, pct_start=0.5.
learn.fit_one_cycle(30, 3e-3, wd=0.4, div_factor=10, pct_start=0.5)

Total time: 22:55
epoch  train_loss  valid_loss  accuracy
1      1.455381    1.565406    0.461500  (00:50)
2      1.089067    1.129607    0.596500  (00:44)
3      0.873657    0.865207    0.702100  (00:45)
4      0.719008    0.672240    0.768100  (00:45)
5      0.625094    0.739563    0.743600  (00:45)
6      0.550976    0.557014    0.806600  (00:45)
7      0.500237    0.782475    0.748300  (00:45)
8      0.459986    0.648515    0.783100  (00:45)
9      0.444404    0.747558    0.763800  (00:45)
10     0.411688    0.632976    0.797500  (00:45)
11     0.396282    0.475634    0.844800  (00:45)
12     0.372070    0.533911    0.822000  (00:45)
13     0.360920    0.491856    0.840000  (00:45)
14     0.351502    0.486486    0.839900  (00:45)
15     0.345132    0.666289    0.778700  (00:45)
16     0.326271    0.667956    0.790000  (00:45)
17     0.306654    0.455599    0.848500  (00:45)
18     0.287279    0.362420    0.879300  (00:45)
19     0.256889    0.599531    0.809600  (00:45)
20     0.24

#### Using Init27 generic training.

In [13]:
# Batch size passed to ImageDataBunch.from_folder for this experiment.
bs = 512

In [14]:
# Pair of transform lists: rand_pad(4, 32) plus a left-right flip (p=0.5) in
# the first list, nothing in the second.
ds_tfms = ([*rand_pad(4, 32), flip_lr(p=0.5)], [])
# Rebuild the data bunch ('test' folder as validation) and normalize with cifar_stats.
data = ImageDataBunch.from_folder(path, valid='test', ds_tfms=ds_tfms, bs=bs).normalize(cifar_stats)

In [15]:
# Fresh Learner with a new wrn_22 model, reporting accuracy.
learn = Learner(data, wrn_22(), metrics=accuracy)

In [16]:
# Init27 version:
# 30 epochs of one-cycle training with the library's default hyper-parameters
# (no explicit learning rate, wd, div_factor or pct_start). The context manager
# clears traceback frames before any exception raised by training propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

Total time: 23:37
epoch  train_loss  valid_loss  accuracy
1      1.486746    1.310544    0.533500  (00:44)
2      1.114782    1.186048    0.588900  (00:46)
3      0.873583    1.413428    0.567500  (00:46)
4      0.722459    0.833492    0.718000  (00:46)
5      0.620652    0.759559    0.757600  (00:46)
6      0.557817    0.848302    0.734200  (00:46)
7      0.491300    0.536562    0.819200  (00:46)
8      0.454952    0.639612    0.801100  (00:46)
9      0.415620    0.565566    0.816300  (00:48)
10     0.381543    0.582206    0.813500  (00:47)
11     0.341282    0.504624    0.839600  (00:47)
12     0.315171    0.488063    0.839500  (00:46)
13     0.276689    0.372598    0.881700  (00:46)
14     0.248158    0.335076    0.888800  (00:46)
15     0.224190    0.328903    0.895700  (00:47)
16     0.193702    0.388996    0.886600  (00:47)
17     0.167116    0.348772    0.895500  (00:47)
18     0.143342    0.344119    0.900300  (00:47)
19     0.122058    0.359529    0.895800  (00:47)
20     0.10

## WideResNet_22 in FP16 (Mixed-Precision)
https://docs.fast.ai/vision.models.html

In [17]:
# Batch size passed to ImageDataBunch.from_folder for this experiment.
bs = 512

In [18]:
# Pair of transform lists: rand_pad(4, 32) plus a left-right flip (p=0.5) in
# the first list, nothing in the second.
ds_tfms = ([*rand_pad(4, 32), flip_lr(p=0.5)], [])
# Rebuild the data bunch ('test' folder as validation) and normalize with cifar_stats.
data = ImageDataBunch.from_folder(path, valid='test', ds_tfms=ds_tfms, bs=bs).normalize(cifar_stats)

In [19]:
# Same wrn_22 Learner as the FP32 runs, converted with to_fp16() for the
# FP16 (mixed-precision) experiment.
learn = Learner(data, wrn_22(), metrics=accuracy).to_fp16()

In [20]:
# 30 epochs of one-cycle training with default hyper-parameters; the context
# manager clears traceback frames before any exception propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

Total time: 22:04
epoch  train_loss  valid_loss  accuracy
1      1.482857    1.366357    0.510600  (00:48)
2      1.100611    1.357880    0.565700  (00:44)
3      0.884122    1.040865    0.650400  (00:42)
4      0.716015    1.024709    0.680900  (00:43)
5      0.621266    1.177766    0.649500  (00:43)
6      0.551212    0.794662    0.734500  (00:43)
7      0.497174    1.118847    0.692100  (00:42)
8      0.460304    0.800202    0.749400  (00:43)
9      0.427465    0.978180    0.711300  (00:43)
10     0.390403    0.567522    0.816200  (00:43)
11     0.347311    0.515598    0.833200  (00:43)
12     0.310705    0.622131    0.816600  (00:44)
13     0.281159    0.507776    0.841100  (00:44)
14     0.252079    0.340388    0.886400  (00:43)
15     0.217083    0.480148    0.856400  (00:43)
16     0.195928    0.348758    0.893600  (00:44)
17     0.167910    0.328058    0.900400  (00:44)
18     0.141568    0.304546    0.907600  (00:44)
19     0.124718    0.348586    0.904800  (00:44)
20     0.10

Let's check whether FP16 allows a larger batch size on a GTX 1080 Ti.

*(it's not supposed to in theory, FP16 only works with the Tensor Cores of the new RTX cards, according to Nvidia marketing...)*.

In [21]:
# Doubled batch size (1024 instead of 512) to test whether FP16 leaves room for it.
bs = 1024

In [22]:
# Pair of transform lists: rand_pad(4, 32) plus a left-right flip (p=0.5) in
# the first list, nothing in the second.
ds_tfms = ([*rand_pad(4, 32), flip_lr(p=0.5)], [])
# Rebuild the data bunch ('test' folder as validation) and normalize with cifar_stats.
data = ImageDataBunch.from_folder(path, valid='test', ds_tfms=ds_tfms, bs=bs).normalize(cifar_stats)

In [23]:
# wrn_22 Learner converted with to_fp16(), now fed by the larger-batch data bunch.
learn = Learner(data, wrn_22(), metrics=accuracy).to_fp16()

In [24]:
# 30 epochs of one-cycle training with default hyper-parameters; the context
# manager clears traceback frames before any exception propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

Total time: 21:52
epoch  train_loss  valid_loss  accuracy
1      1.707921    1.660548    0.380100  (00:54)
2      1.351769    1.201732    0.572700  (00:43)
3      1.102821    1.404987    0.544700  (00:43)
4      0.910935    1.226915    0.583400  (00:42)
5      0.768682    1.129752    0.651300  (00:42)
6      0.655608    1.436298    0.584900  (00:42)
7      0.571285    2.841699    0.478600  (00:42)
8      0.512306    0.855302    0.733500  (00:42)
9      0.461308    0.633544    0.796100  (00:42)
10     0.416123    0.711148    0.790200  (00:42)
11     0.379831    0.759356    0.767500  (00:44)
12     0.339906    0.681459    0.791600  (00:43)
13     0.307886    0.621493    0.816700  (00:43)
14     0.279879    0.475473    0.847900  (00:44)
15     0.245759    0.399877    0.876900  (00:44)
16     0.219292    0.373359    0.885700  (00:43)
17     0.195365    0.407256    0.874700  (00:44)
18     0.172704    0.362101    0.889200  (00:44)
19     0.147960    0.373115    0.890100  (00:42)
20     0.12

## ResNet 34 in FP32


In [37]:
# Batch size for the ResNet 34 FP32 run (256, unlike the 512 used for WideResNet).
bs = 256

In [38]:
# Pair of transform lists: rand_pad(4, 32) plus a left-right flip (p=0.5) in
# the first list, nothing in the second.
ds_tfms = ([*rand_pad(4, 32), flip_lr(p=0.5)], [])
# Rebuild the data bunch ('test' folder as validation) and normalize with cifar_stats.
data = ImageDataBunch.from_folder(path, valid='test', ds_tfms=ds_tfms, bs=bs).normalize(cifar_stats)

In [39]:
# Learner built by create_cnn from the models.resnet34 architecture, reporting accuracy.
learn = create_cnn(data, models.resnet34, metrics=accuracy)

In [40]:
# 30 epochs of one-cycle training with default hyper-parameters; the context
# manager clears traceback frames before any exception propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

Total time: 05:00
epoch  train_loss  valid_loss  accuracy
1      1.903524    1.575454    0.459800  (00:10)
2      1.525152    1.312011    0.542300  (00:09)
3      1.299418    1.145065    0.590900  (00:09)
4      1.130673    1.010769    0.639600  (00:09)
5      1.041472    0.919672    0.669100  (00:09)
6      0.960568    0.843034    0.702100  (00:10)
7      0.897726    0.814014    0.714800  (00:09)
8      0.868248    0.766644    0.731000  (00:09)
9      0.829007    0.740382    0.738700  (00:09)
10     0.794627    0.701024    0.752900  (00:09)
11     0.766401    0.691995    0.755700  (00:09)
12     0.758658    0.682084    0.760000  (00:10)
13     0.740958    0.662152    0.765000  (00:09)
14     0.721740    0.654098    0.766600  (00:10)
15     0.706843    0.637763    0.775500  (00:10)
16     0.688595    0.639393    0.776100  (00:09)
17     0.683826    0.623874    0.783700  (00:10)
18     0.676737    0.618360    0.782900  (00:10)
19     0.660001    0.611963    0.783500  (00:10)
20     0.64

## ResNet 34 in FP16


##### Note: temporarily using Init27's code from his GitHub notebook

In [60]:
# Batch size passed to ImageDataBunch.from_folder for this experiment.
bs = 512

In [61]:
# Seed NumPy's global RNG before building the data bunch (presumably so the
# random validation split below is repeatable).
np.random.seed(42)
# NOTE(review): this data bunch differs from every other experiment in this
# notebook: a random 20% validation split (valid_pct=0.2) instead of
# valid='test', get_transforms() instead of the pad/flip transforms, and
# size=224. Its accuracy and timings are therefore not directly comparable
# with the other runs.
data = ImageDataBunch.from_folder(path, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=bs).normalize(cifar_stats)

In [62]:
# resnet34 Learner from create_cnn, converted with to_fp16() for the FP16 run.
learn = create_cnn(data, models.resnet34, metrics=accuracy).to_fp16()

In [63]:
# 30 epochs of one-cycle training with default hyper-parameters; the context
# manager clears traceback frames before any exception propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

Total time: 48:40
epoch  train_loss  valid_loss  accuracy
1      1.273838    0.593700    0.803167  (01:44)
2      0.691573    0.350773    0.880917  (01:39)
3      0.445259    0.257182    0.913417  (01:38)
4      0.330243    0.208854    0.931500  (01:40)
5      0.267394    0.182757    0.938333  (01:39)
6      0.232294    0.169746    0.940500  (01:38)
7      0.206453    0.159884    0.945000  (01:38)
8      0.186955    0.154321    0.948583  (01:38)
9      0.179224    0.145601    0.951667  (01:38)
10     0.167452    0.142818    0.953917  (01:37)
11     0.157094    0.138956    0.954750  (01:37)
12     0.146853    0.136196    0.954167  (01:36)
13     0.142043    0.133387    0.954250  (01:37)
14     0.136752    0.125431    0.958917  (01:37)
15     0.126130    0.127923    0.957500  (01:37)
16     0.121769    0.120689    0.958667  (01:38)
17     0.114473    0.121626    0.959833  (01:37)
18     0.110525    0.121856    0.960083  (01:37)
19     0.103957    0.122863    0.960750  (01:37)
20     0.09

##### Back to my original code (source Fastai v1.0 GH)

In [65]:
# Batch size passed to ImageDataBunch.from_folder for this experiment.
bs = 512

In [66]:
# Back to the pad/flip setup: rand_pad(4, 32) plus a left-right flip (p=0.5)
# in the first transform list, nothing in the second.
ds_tfms = ([*rand_pad(4, 32), flip_lr(p=0.5)], [])
# Rebuild the data bunch ('test' folder as validation) and normalize with cifar_stats.
data = ImageDataBunch.from_folder(path, valid='test', ds_tfms=ds_tfms, bs=bs).normalize(cifar_stats)

In [67]:
# resnet34 Learner from create_cnn, converted with to_fp16() for the FP16 run.
learn = create_cnn(data, models.resnet34, metrics=accuracy).to_fp16()

In [68]:
# 30 epochs of one-cycle training with default hyper-parameters; the context
# manager clears traceback frames before any exception propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

Total time: 04:50
epoch  train_loss  valid_loss  accuracy
1      2.098484    1.662046    0.434300  (00:09)
2      1.714002    1.390969    0.517400  (00:10)
3      1.462106    1.228116    0.567800  (00:09)
4      1.282736    1.098585    0.611700  (00:09)
5      1.151833    1.001352    0.641400  (00:09)
6      1.052983    0.932771    0.667100  (00:09)
7      0.979860    0.867787    0.694300  (00:09)
8      0.920166    0.824125    0.708500  (00:09)
9      0.868191    0.778441    0.726700  (00:09)
10     0.835321    0.747957    0.737500  (00:09)
11     0.803417    0.729577    0.743100  (00:09)
12     0.779405    0.706350    0.747300  (00:09)
13     0.758510    0.700714    0.748600  (00:09)
14     0.745068    0.677067    0.764300  (00:09)
15     0.728893    0.666314    0.765600  (00:09)
16     0.711512    0.661447    0.764400  (00:09)
17     0.696236    0.655620    0.768800  (00:09)
18     0.692409    0.647173    0.772000  (00:09)
19     0.675899    0.649554    0.770700  (00:09)
20     0.67

In [5]:
# Batch size for the next experiment.
bs = 512