diff --git a/.travis.yml b/.travis.yml index ab560b3b4..044dee26a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,24 +9,9 @@ notifications: email: false git: depth: 99999999 +sudo: required +dist: trusty -## uncomment the following lines to allow failures on nightly julia -## (tests will run but not make your overall status red) -#matrix: -# allow_failures: -# - julia: nightly - -## uncomment and modify the following lines to manually install system packages -#addons: -# apt: # apt-get for linux -# packages: -# - gfortran -#before_script: # homebrew for mac -# - if [ $TRAVIS_OS_NAME = osx ]; then brew install gcc; fi - -## uncomment the following lines to override the default test script -#script: -# - julia -e 'Pkg.clone(pwd()); Pkg.build("Metalhead"); Pkg.test("Metalhead"; coverage=true)' after_success: # push coverage results to Coveralls - julia -e 'cd(Pkg.dir("Metalhead")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())' diff --git a/README.md b/README.md index bd251a5e4..c723fa802 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,22 @@ julia> vgg.layers[1:21](x) ⋮ ``` +# Available Models for Object Classification + +1. VGG - VGG11, VGG13, VGG16, VGG19 +2. ResNet - ResNet18, ResNet34, ResNet50, ResNet101, ResNet152 +3. GoogleNet +4. SqueezeNet - v1.0 and v1.1 +5. DenseNet - DenseNet121, DenseNet169, DenseNet201, DenseNet264 + +# Available Pretrained Models for Object Classification + +1. VGG19 +2. ResNet50 +3. GoogleNet +4. SqueezeNet v1.1 +5. DenseNet121 + # Working with common datasets Metalhead includes support for working with several common object recognition datasets. The `datasets()` function will attempt to auto-detect any common dataset placed in diff --git a/src/Metalhead.jl b/src/Metalhead.jl index 151ed87c4..b7f8b1281 100644 --- a/src/Metalhead.jl +++ b/src/Metalhead.jl @@ -5,7 +5,12 @@ using Flux, Images, ImageFiltering, BSON, REPL, Requires, Statistics using Flux: @treelike # Models -export VGG19, SqueezeNet, DenseNet, ResNet, GoogleNet +export VGG19, VGG16, VGG13, VGG11, SqueezeNet, DenseNet121, + DenseNet169, DenseNet201, DenseNet264, ResNet18, + ResNet34, ResNet50, ResNet101, ResNet152, GoogleNet + +# Trained Models Loader +export trained + # Useful re-export from Images export load @@ -30,7 +35,7 @@ include("display/terminal.jl") include("datasets/imagenet.jl") include("datasets/cifar10.jl") include("datasets/autodetect.jl") -include("vgg19.jl") +include("vgg.jl") include("squeezenet.jl") include("densenet.jl") include("resnet.jl") diff --git a/src/densenet.jl b/src/densenet.jl index edee81d10..a4a134489 100644 --- a/src/densenet.jl +++ b/src/densenet.jl @@ -14,7 +14,7 @@ Bottleneck(in_planes, growth_rate) = Bottleneck( Transition(chs::Pair{<:Int, <:Int}) = Chain(BatchNorm(chs[1], relu), Conv((1, 1), chs), - x -> meanpool(x, (2, 2))) + MeanPool((2, 2))) function _make_dense_layers(block, in_planes, growth_rate, nblock) local layers = [] @@ -25,70 +25,130 @@ function _make_dense_layers(block, in_planes, growth_rate, nblock) Chain(layers...)
end -function _densenet(nblocks = [6, 12, 24, 16]; block = Bottleneck, growth_rate = 32, reduction = 0.5, num_classes = 1000) - num_planes = 2growth_rate - layers = [] - push!(layers, Conv((7, 7), 3=>num_planes, stride = (2, 2), pad = (3, 3))) - push!(layers, BatchNorm(num_planes, relu)) - push!(layers, x -> maxpool(x, (3, 3), stride = (2, 2), pad = (1, 1))) - - for i in 1:3 - push!(layers, _make_dense_layers(block, num_planes, growth_rate, nblocks[i])) - num_planes += nblocks[i] * growth_rate - out_planes = Int(floor(num_planes * reduction)) - push!(layers, Transition(num_planes=>out_planes)) - num_planes = out_planes - end - - push!(layers, _make_dense_layers(block, num_planes, growth_rate, nblocks[4])) - num_planes += nblocks[4] * growth_rate - push!(layers, BatchNorm(num_planes, relu)) - - Chain(layers..., x -> meanpool(x, (7, 7)), - x -> reshape(x, :, size(x, 4)), - Dense(num_planes, num_classes), softmax) -end - -function densenet_layers() +function trained_densenet121_layers() weight = Metalhead.weights("densenet.bson") weights = Dict{Any, Any}() for ele in keys(weight) - weights[string(ele)] = convert(Array{Float64, N} where N ,weight[ele]) + weights[string(ele)] = weight[ele] end - ls = _densenet() - ls[1].weight.data .= weights["conv1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:] + ls = load_densenet(densenet_configs["densenet121"]...) + ls[1].weight.data .= flipkernel(weights["conv1_w_0"]) ls[2].β.data .= weights["conv1/bn_b_0"] ls[2].γ.data .= weights["conv1/bn_w_0"] + ls[2].σ² .= weights["conv1/bn_var_0"] + ls[2].μ .= weights["conv1/bn_mean_0"] l = 4 for (c, n) in enumerate([6, 12, 24, 16]) for i in 1:n for j in [2, 4] - ls[l][i].layer[j].weight.data .= weights["conv$(c+1)_$i/x$(j÷2)_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:] + ls[l][i].layer[j].weight.data .= flipkernel(weights["conv$(c+1)_$i/x$(j÷2)_w_0"]) ls[l][i].layer[j-1].β.data .= weights["conv$(c+1)_$i/x$(j÷2)/bn_b_0"] ls[l][i].layer[j-1].γ.data .= weights["conv$(c+1)_$i/x$(j÷2)/bn_w_0"] + ls[l][i].layer[j-1].σ² .= weights["conv$(c+1)_$i/x$(j÷2)/bn_var_0"] + ls[l][i].layer[j-1].μ .= weights["conv$(c+1)_$i/x$(j÷2)/bn_mean_0"] end end l += 2 end for i in [5, 7, 9] # Transition Block Conv Layers - ls[i][2].weight.data .= weights["conv$(i÷2)_blk_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:] + ls[i][2].weight.data .= flipkernel(weights["conv$(i÷2)_blk_w_0"]) ls[i][1].β.data .= weights["conv$(i÷2)_blk/bn_b_0"] ls[i][1].γ.data .= weights["conv$(i÷2)_blk/bn_w_0"] + ls[i][1].σ² .= weights["conv$(i÷2)_blk/bn_var_0"] + ls[i][1].μ .= weights["conv$(i÷2)_blk/bn_mean_0"] end + ls[11].β.data .= weights["conv5_blk/bn_b_0"] + ls[11].γ.data .= weights["conv5_blk/bn_w_0"] + ls[11].σ² .= weights["conv5_blk/bn_var_0"] + ls[11].μ .= weights["conv5_blk/bn_mean_0"] ls[end-1].W.data .= transpose(dropdims(weights["fc6_w_0"], dims = (1, 2))) # Dense Layers ls[end-1].b.data .= weights["fc6_b_0"] Flux.testmode!(ls) return ls end -struct DenseNet <: ClassificationModel{ImageNet.ImageNet1k} +function load_densenet(block, nblocks; growth_rate = 32, reduction = 0.5, num_classes = 1000) + num_planes = 2growth_rate + layers = [] + push!(layers, Conv((7, 7), 3=>num_planes, stride = (2, 2), pad = (3, 3))) + push!(layers, BatchNorm(num_planes, relu)) + push!(layers, MaxPool((3, 3), stride = (2, 2), pad = (1, 1))) + + for i in 1:3 + push!(layers, _make_dense_layers(block, num_planes, growth_rate, nblocks[i])) + num_planes += nblocks[i] * growth_rate + out_planes = Int(floor(num_planes * reduction)) + push!(layers, Transition(num_planes=>out_planes)) + num_planes = out_planes + 
end + + push!(layers, _make_dense_layers(block, num_planes, growth_rate, nblocks[4])) + num_planes += nblocks[4] * growth_rate + push!(layers, BatchNorm(num_planes, relu)) + + Chain(layers..., MeanPool((7, 7)), + x -> reshape(x, :, size(x, 4)), + Dense(num_planes, num_classes), softmax) +end + +densenet_configs = + Dict("densenet121" => (Bottleneck, [6, 12, 24, 16]), + "densenet169" => (Bottleneck, [6, 12, 32, 32]), + "densenet201" => (Bottleneck, [6, 12, 48, 32]), + "densenet264" => (Bottleneck, [6, 12, 64, 48])) + +struct DenseNet121 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +DenseNet121() = DenseNet121(load_densenet(densenet_configs["densenet121"]...)) + +trained(::Type{DenseNet121}) = DenseNet121(trained_densenet121_layers()) + +Base.show(io::IO, ::DenseNet121) = print(io, "DenseNet121()") + +@treelike DenseNet121 + +(m::DenseNet121)(x) = m.layers(x) + +struct DenseNet169 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +DenseNet169() = DenseNet169(load_densenet(densenet_configs["densenet169"]...)) + +trained(::Type{DenseNet169}) = error("Pretrained Weights for DenseNet169 are not available") + +Base.show(io::IO, ::DenseNet169) = print(io, "DenseNet169()") + +@treelike DenseNet169 + +(m::DenseNet169)(x) = m.layers(x) + +struct DenseNet201 <: ClassificationModel{ImageNet.ImageNet1k} layers::Chain end -DenseNet() = DenseNet(densenet_layers()) +DenseNet201() = DenseNet201(load_densenet(densenet_configs["densenet201"]...)) + +trained(::Type{DenseNet201}) = error("Pretrained Weights for DenseNet201 are not available") + +Base.show(io::IO, ::DenseNet201) = print(io, "DenseNet201()") + +@treelike DenseNet201 + +(m::DenseNet201)(x) = m.layers(x) + +struct DenseNet264 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +DenseNet264() = DenseNet264(load_densenet(densenet_configs["densenet264"]..., growth_rate=48)) + +trained(::Type{DenseNet264}) = error("Pretrained Weights for DenseNet264 are not available") -Base.show(io::IO, ::DenseNet) = print(io, "DenseNet()") +Base.show(io::IO, ::DenseNet264) = print(io, "DenseNet264()") -@treelike DenseNet +@treelike DenseNet264 -(m::DenseNet)(x) = m.layers(x) +(m::DenseNet264)(x) = m.layers(x) diff --git a/src/googlenet.jl b/src/googlenet.jl index 80fa29f4a..c2f250fff 100644 --- a/src/googlenet.jl +++ b/src/googlenet.jl @@ -9,61 +9,57 @@ end function InceptionBlock(in_chs, chs_1x1, chs_3x3_reduce, chs_3x3, chs_5x5_reduce, chs_5x5, pool_proj) path_1 = Conv((1, 1), in_chs=>chs_1x1, relu) - - path_2 = (Conv((1, 1), in_chs=>chs_3x3_reduce, relu), - Conv((3, 3), chs_3x3_reduce=>chs_3x3, relu, pad = (1, 1))) - - path_3 = (Conv((1, 1), in_chs=>chs_5x5_reduce, relu), - Conv((5, 5), chs_5x5_reduce=>chs_5x5, relu, pad = (2, 2))) - - path_4 = (x -> maxpool(x, (3,3), stride = (1, 1), pad = (1, 1)), - Conv((1, 1), in_chs=>pool_proj, relu)) - + path_2 = Chain(Conv((1, 1), in_chs=>chs_3x3_reduce, relu), + Conv((3, 3), chs_3x3_reduce=>chs_3x3, relu, pad = (1, 1))) + path_3 = Chain(Conv((1, 1), in_chs=>chs_5x5_reduce, relu), + Conv((5, 5), chs_5x5_reduce=>chs_5x5, relu, pad = (2, 2))) + path_4 = Chain(MaxPool((3,3), stride = (1, 1), pad = (1, 1)), + Conv((1, 1), in_chs=>pool_proj, relu)) InceptionBlock(path_1, path_2, path_3, path_4) end function (m::InceptionBlock)(x) - cat(m.path_1(x), m.path_2[2](m.path_2[1](x)), m.path_3[2](m.path_3[1](x)), m.path_4[2](m.path_4[1](x)), dims = 3) + cat(m.path_1(x), m.path_2(x), m.path_3(x), m.path_4(x), dims = 3) end -_googlenet() = Chain(Conv((7, 7), 3=>64, stride = (2, 2), 
relu, pad = (3, 3)), - x -> maxpool(x, (3, 3), stride = (2, 2), pad = (1, 1)), +load_googlenet() = Chain(Conv((7, 7), 3=>64, stride = (2, 2), relu, pad = (3, 3)), + MaxPool((3, 3), stride = (2, 2), pad = (1, 1)), Conv((1, 1), 64=>64, relu), Conv((3, 3), 64=>192, relu, pad = (1, 1)), - x -> maxpool(x, (3, 3), stride = (2, 2), pad = (1, 1)), + MaxPool((3, 3), stride = (2, 2), pad = (1, 1)), InceptionBlock(192, 64, 96, 128, 16, 32, 32), InceptionBlock(256, 128, 128, 192, 32, 96, 64), - x -> maxpool(x, (3, 3), stride = (2, 2), pad = (1, 1)), + MaxPool((3, 3), stride = (2, 2), pad = (1, 1)), InceptionBlock(480, 192, 96, 208, 16, 48, 64), InceptionBlock(512, 160, 112, 224, 24, 64, 64), InceptionBlock(512, 128, 128, 256, 24, 64, 64), InceptionBlock(512, 112, 144, 288, 32, 64, 64), InceptionBlock(528, 256, 160, 320, 32, 128, 128), - x -> maxpool(x, (3, 3), stride = (2, 2), pad = (1, 1)), + MaxPool((3, 3), stride = (2, 2), pad = (1, 1)), InceptionBlock(832, 256, 160, 320, 32, 128, 128), InceptionBlock(832, 384, 192, 384, 48, 128, 128), - x -> meanpool(x, (7, 7), stride = (1, 1), pad = (0, 0)), + MeanPool((7, 7), stride = (1, 1), pad = (0, 0)), x -> reshape(x, :, size(x, 4)), Dropout(0.4), Dense(1024, 1000), softmax) -function googlenet_layers() +function trained_googlenet_layers() weight = Metalhead.weights("googlenet.bson") weights = Dict{Any, Any}() for ele in keys(weight) - weights[string(ele)] = convert(Array{Float64, N} where N, weight[ele]) + weights[string(ele)] = weight[ele] end - ls = _googlenet() - ls[1].weight.data .= weights["conv1/7x7_s2_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[1].bias.data .= weights["conv1/7x7_s2_b_0"] - ls[3].weight.data .= weights["conv2/3x3_reduce_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[3].bias.data .= weights["conv2/3x3_reduce_b_0"] - ls[4].weight.data .= weights["conv2/3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[4].bias.data .= weights["conv2/3x3_b_0"] + ls = load_googlenet() + ls[1].weight.data .= flipkernel(weights["conv1/7x7_s2_w_0"]); ls[1].bias.data .= weights["conv1/7x7_s2_b_0"] + ls[3].weight.data .= flipkernel(weights["conv2/3x3_reduce_w_0"]); ls[3].bias.data .= weights["conv2/3x3_reduce_b_0"] + ls[4].weight.data .= flipkernel(weights["conv2/3x3_w_0"]); ls[4].bias.data .= weights["conv2/3x3_b_0"] for (a, b) in [(6, "3a"), (7, "3b"), (9, "4a"), (10, "4b"), (11, "4c"), (12, "4d"), (13, "4e"), (15, "5a"), (16, "5b")] - ls[a].path_1.weight.data .= weights["inception_$b/1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[a].path_1.bias.data .= weights["inception_$b/1x1_b_0"] - ls[a].path_2[1].weight.data .= weights["inception_$b/3x3_reduce_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[a].path_2[1].bias.data .= weights["inception_$b/3x3_reduce_b_0"] - ls[a].path_2[2].weight.data .= weights["inception_$b/3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[a].path_2[2].bias.data .= weights["inception_$b/3x3_b_0"] - ls[a].path_3[1].weight.data .= weights["inception_$b/5x5_reduce_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[a].path_3[1].bias.data .= weights["inception_$b/5x5_reduce_b_0"] - ls[a].path_3[2].weight.data .= weights["inception_$b/5x5_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[a].path_3[2].bias.data .= weights["inception_$b/5x5_b_0"] - ls[a].path_4[2].weight.data .= weights["inception_$b/pool_proj_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:]; ls[a].path_4[2].bias.data .= weights["inception_$b/pool_proj_b_0"] + ls[a].path_1.weight.data .= flipkernel(weights["inception_$b/1x1_w_0"]); ls[a].path_1.bias.data .= weights["inception_$b/1x1_b_0"] + ls[a].path_2[1].weight.data .= 
flipkernel(weights["inception_$b/3x3_reduce_w_0"]); ls[a].path_2[1].bias.data .= weights["inception_$b/3x3_reduce_b_0"] + ls[a].path_2[2].weight.data .= flipkernel(weights["inception_$b/3x3_w_0"]); ls[a].path_2[2].bias.data .= weights["inception_$b/3x3_b_0"] + ls[a].path_3[1].weight.data .= flipkernel(weights["inception_$b/5x5_reduce_w_0"]); ls[a].path_3[1].bias.data .= weights["inception_$b/5x5_reduce_b_0"] + ls[a].path_3[2].weight.data .= flipkernel(weights["inception_$b/5x5_w_0"]); ls[a].path_3[2].bias.data .= weights["inception_$b/5x5_b_0"] + ls[a].path_4[2].weight.data .= flipkernel(weights["inception_$b/pool_proj_w_0"]); ls[a].path_4[2].bias.data .= weights["inception_$b/pool_proj_b_0"] end ls[20].W.data .= transpose(weights["loss3/classifier_w_0"]); ls[20].b.data .= weights["loss3/classifier_b_0"] Flux.testmode!(ls) @@ -74,7 +70,9 @@ struct GoogleNet <: ClassificationModel{ImageNet.ImageNet1k} layers::Chain end -GoogleNet() = GoogleNet(googlenet_layers()) +GoogleNet() = GoogleNet(load_googlenet()) + +trained(::Type{GoogleNet}) = GoogleNet(trained_googlenet_layers()) Base.show(io::IO, ::GoogleNet) = print(io, "GoogleNet()") diff --git a/src/resnet.jl b/src/resnet.jl index 3277632ee..729627697 100644 --- a/src/resnet.jl +++ b/src/resnet.jl @@ -1,83 +1,87 @@ struct ResidualBlock - conv_layers - norm_layers + layers shortcut end @treelike ResidualBlock function ResidualBlock(filters, kernels::Array{Tuple{Int,Int}}, pads::Array{Tuple{Int,Int}}, strides::Array{Tuple{Int,Int}}, shortcut = identity) - local conv_layers = [] - local norm_layers = [] + layers = [] for i in 2:length(filters) - push!(conv_layers, Conv(kernels[i-1], filters[i-1]=>filters[i], pad = pads[i-1], stride = strides[i-1])) - push!(norm_layers, BatchNorm(filters[i])) + push!(layers, Conv(kernels[i-1], filters[i-1]=>filters[i], pad = pads[i-1], stride = strides[i-1])) + if i != length(filters) + push!(layers, BatchNorm(filters[i], relu)) + else + push!(layers, BatchNorm(filters[i])) + end end - ResidualBlock(Tuple(conv_layers),Tuple(norm_layers),shortcut) + ResidualBlock(Chain(layers...), shortcut) end -function ResidualBlock(filters, kernels::Array{Int}, pads::Array{Int}, strides::Array{Int}, shortcut = identity) +ResidualBlock(filters, kernels::Array{Int}, pads::Array{Int}, strides::Array{Int}, shortcut = identity) = ResidualBlock(filters, [(i,i) for i in kernels], [(i,i) for i in pads], [(i,i) for i in strides], shortcut) -end -function (block::ResidualBlock)(input) - local value = copy.(input) - for i in 1:length(block.conv_layers)-1 - value = relu.((block.norm_layers[i])((block.conv_layers[i])(value))) +(r::ResidualBlock)(input) = relu.(r.layers(input) + r.shortcut(input)) + +function BasicBlock(filters::Int, downsample::Bool = false, res_top::Bool = false) + # NOTE: res_top is set to true if this is the first residual connection of the architecture + # If the number of channels is to be halved set the downsample argument to true + if !downsample || res_top + return ResidualBlock([filters for i in 1:3], [3,3], [1,1], [1,1]) end - relu.(((block.norm_layers[end])((block.conv_layers[end])(value))) + block.shortcut(input)) + shortcut = Chain(Conv((3,3), filters÷2=>filters, pad = (1,1), stride = (2,2)), BatchNorm(filters)) + ResidualBlock([filters÷2, filters, filters], [3,3], [1,1], [1,2], shortcut) end function Bottleneck(filters::Int, downsample::Bool = false, res_top::Bool = false) - if(!downsample && !res_top) - return ResidualBlock([4 * filters, filters, filters, 4 * filters], [1,3,1], [0,1,0], [1,1,1]) - 
elseif(downsample && res_top) - return ResidualBlock([filters, filters, filters, 4 * filters], [1,3,1], [0,1,0], [1,1,1], Chain(Conv((1,1), filters=>4 * filters, pad = (0,0), stride = (1,1)), BatchNorm(4 * filters))) + # NOTE: res_top is set to true if this is the first residual connection of the architecture + # If the number of channels is to be halved set the downsample argument to true + if !downsample && !res_top + ResidualBlock([4 * filters, filters, filters, 4 * filters], [1,3,1], [0,1,0], [1,1,1]) + elseif downsample && res_top + ResidualBlock([filters, filters, filters, 4 * filters], [1,3,1], [0,1,0], [1,1,1], Chain(Conv((1,1), filters=>4 * filters, pad = (0,0), stride = (1,1)), BatchNorm(4 * filters))) else shortcut = Chain(Conv((1,1), 2 * filters=>4 * filters, pad = (0,0), stride = (2,2)), BatchNorm(4 * filters)) - return ResidualBlock([2 * filters, filters, filters, 4 * filters], [1,3,1], [0,1,0], [1,1,2], shortcut) - end -end - -function resnet50() - local layers = [3, 4, 6, 3] - local layer_arr = [] - - push!(layer_arr, Conv((7,7), 3=>64, pad = (3,3), stride = (2,2))) - push!(layer_arr, x -> maxpool(x, (3,3), pad = (1,1), stride = (2,2))) - - initial_filters = 64 - for i in 1:length(layers) - push!(layer_arr, Bottleneck(initial_filters, true, i==1)) - for j in 2:layers[i] - push!(layer_arr, Bottleneck(initial_filters)) - end - initial_filters *= 2 + ResidualBlock([2 * filters, filters, filters, 4 * filters], [1,3,1], [0,1,0], [1,1,2], shortcut) end - - push!(layer_arr, x -> meanpool(x, (7,7))) - push!(layer_arr, x -> reshape(x, :, size(x,4))) - push!(layer_arr, (Dense(2048, 1000))) - push!(layer_arr, softmax) - - Chain(layer_arr...) end -function resnet_layers() +function trained_resnet50_layers() weight = Metalhead.weights("resnet.bson") weights = Dict{Any ,Any}() for ele in keys(weight) - weights[string(ele)] = convert(Array{Float64, N} where N, weight[ele]) + weights[string(ele)] = weight[ele] end - ls = resnet50() - ls[1].weight.data .= weights["gpu_0/conv1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:] + ls = load_resnet(resnet_configs["resnet50"]...) 
+ ls[1][1].weight.data .= flipkernel(weights["gpu_0/conv1_w_0"]) + ls[1][2].σ² .= weights["gpu_0/res_conv1_bn_riv_0"] + ls[1][2].μ .= weights["gpu_0/res_conv1_bn_rm_0"] + ls[1][2].β.data .= weights["gpu_0/res_conv1_bn_b_0"] + ls[1][2].γ.data .= weights["gpu_0/res_conv1_bn_s_0"] count = 2 for j in [3:5, 6:9, 10:15, 16:18] for p in j - ls[p].conv_layers[1].weight.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2a_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:] - ls[p].conv_layers[2].weight.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2b_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:] - ls[p].conv_layers[3].weight.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2c_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:] + ls[p].layers[1].weight.data .= flipkernel(weights["gpu_0/res$(count)_$(p-j[1])_branch2a_w_0"]) + ls[p].layers[2].σ² .= weights["gpu_0/res$(count)_$(p-j[1])_branch2a_bn_riv_0"] + ls[p].layers[2].μ .= weights["gpu_0/res$(count)_$(p-j[1])_branch2a_bn_rm_0"] + ls[p].layers[2].β.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2a_bn_b_0"] + ls[p].layers[2].γ.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2a_bn_s_0"] + ls[p].layers[3].weight.data .= flipkernel(weights["gpu_0/res$(count)_$(p-j[1])_branch2b_w_0"]) + ls[p].layers[4].σ² .= weights["gpu_0/res$(count)_$(p-j[1])_branch2b_bn_riv_0"] + ls[p].layers[4].μ .= weights["gpu_0/res$(count)_$(p-j[1])_branch2b_bn_rm_0"] + ls[p].layers[4].β.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2b_bn_b_0"] + ls[p].layers[4].γ.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2b_bn_s_0"] + ls[p].layers[5].weight.data .= flipkernel(weights["gpu_0/res$(count)_$(p-j[1])_branch2c_w_0"]) + ls[p].layers[6].σ² .= weights["gpu_0/res$(count)_$(p-j[1])_branch2c_bn_riv_0"] + ls[p].layers[6].μ .= weights["gpu_0/res$(count)_$(p-j[1])_branch2c_bn_rm_0"] + ls[p].layers[6].β.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2c_bn_b_0"] + ls[p].layers[6].γ.data .= weights["gpu_0/res$(count)_$(p-j[1])_branch2c_bn_s_0"] end + ls[j[1]].shortcut[1].weight.data .= flipkernel(weights["gpu_0/res$(count)_0_branch1_w_0"]) + ls[j[1]].shortcut[2].σ² .= weights["gpu_0/res$(count)_0_branch1_bn_riv_0"] + ls[j[1]].shortcut[2].μ .= weights["gpu_0/res$(count)_0_branch1_bn_rm_0"] + ls[j[1]].shortcut[2].β.data .= weights["gpu_0/res$(count)_0_branch1_bn_b_0"] + ls[j[1]].shortcut[2].γ.data .= weights["gpu_0/res$(count)_0_branch1_bn_s_0"] count += 1 end ls[21].W.data .= transpose(weights["gpu_0/pred_w_0"]); ls[21].b.data .= weights["gpu_0/pred_b_0"] @@ -85,14 +89,108 @@ function resnet_layers() return ls end -struct ResNet <: ClassificationModel{ImageNet.ImageNet1k} +function load_resnet(Block, layers, initial_filters::Int = 64, nclasses::Int = 1000) + local top = [] + local residual = [] + local bottom = [] + + push!(top, Chain(Conv((7,7), 3=>initial_filters, pad = (3,3), stride = (2,2)), + BatchNorm(initial_filters))) + push!(top, MaxPool((3,3), pad = (1,1), stride = (2,2))) + + for i in 1:length(layers) + push!(residual, Block(initial_filters, true, i==1)) + for j in 2:layers[i] + push!(residual, Block(initial_filters)) + end + initial_filters *= 2 + end + + push!(bottom, MeanPool((7,7))) + push!(bottom, x -> reshape(x, :, size(x,4))) + if Block == Bottleneck + push!(bottom, (Dense(2048, nclasses))) + else + push!(bottom, (Dense(512, nclasses))) + end + push!(bottom, softmax) + + Chain(top..., residual..., bottom...) 
+end + +resnet_configs = + Dict("resnet18" => (BasicBlock, [2, 2, 2, 2]), + "resnet34" => (BasicBlock, [3, 4, 6, 3]), + "resnet50" => (Bottleneck, [3, 4, 6, 3]), + "resnet101" => (Bottleneck, [3, 4, 23, 3]), + "resnet152" => (Bottleneck, [3, 8, 36, 3])) + +struct ResNet18 <: ClassificationModel{ImageNet.ImageNet1k} layers::Chain end -ResNet() = ResNet(resnet_layers()) +ResNet18() = ResNet18(load_resnet(resnet_configs["resnet18"]...)) + +trained(::Type{ResNet18}) = error("Pretrained Weights for ResNet18 are not available") + +Base.show(io::IO, ::ResNet18) = print(io, "ResNet18()") + +@treelike ResNet18 + +(m::ResNet18)(x) = m.layers(x) + +struct ResNet34 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +ResNet34() = ResNet34(load_resnet(resnet_configs["resnet34"]...)) + +trained(::Type{ResNet34}) = error("Pretrained Weights for ResNet34 are not available") + +Base.show(io::IO, ::ResNet34) = print(io, "ResNet34()") + +@treelike ResNet34 + +(m::ResNet34)(x) = m.layers(x) + +struct ResNet50 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +ResNet50() = ResNet50(load_resnet(resnet_configs["resnet50"]...)) + +trained(::Type{ResNet50}) = ResNet50(trained_resnet50_layers()) + +Base.show(io::IO, ::ResNet50) = print(io, "ResNet50()") + +@treelike ResNet50 + +(m::ResNet50)(x) = m.layers(x) + +struct ResNet101 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +ResNet101() = ResNet101(load_resnet(resnet_configs["resnet101"]...)) + +trained(::Type{ResNet101}) = error("Pretrained Weights for ResNet101 are not available") + +Base.show(io::IO, ::ResNet101) = print(io, "ResNet101()") + +@treelike ResNet101 + +(m::ResNet101)(x) = m.layers(x) + +struct ResNet152 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +ResNet152() = ResNet152(load_resnet(resnet_configs["resnet152"]...)) + +trained(::Type{ResNet152}) = error("Pretrained Weights for ResNet152 are not available") -Base.show(io::IO, ::ResNet) = print(io, "ResNet()") +Base.show(io::IO, ::ResNet152) = print(io, "ResNet152()") -@treelike ResNet +@treelike ResNet152 -(m::ResNet)(x) = m.layers(x) +(m::ResNet152)(x) = m.layers(x) diff --git a/src/squeezenet.jl b/src/squeezenet.jl index e9fcce46f..40d5981b0 100644 --- a/src/squeezenet.jl +++ b/src/squeezenet.jl @@ -1,63 +1,117 @@ -function squeezenet_layers() +struct Fire + squeeze + expand1x1 + expand3x3 +end + +@treelike Fire + +Fire(inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes) = + Fire(Conv((1, 1), inplanes=>squeeze_planes, relu), + Conv((1, 1), squeeze_planes=>expand1x1_planes, relu), + Conv((3, 3), squeeze_planes=>expand3x3_planes, relu, pad=(1, 1))) + +function (f::Fire)(x) + x = f.squeeze(x) + cat(f.expand1x1(x), f.expand3x3(x), dims=3) + end + +# NOTE: The initialization of the Conv layers is different in the paper.
They are Kaiming Normal +load_squeezenetv1_0() = Chain(Conv((7, 7), 3=>96, relu, stride = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Fire(96, 16, 64, 64), + Fire(128, 16, 64, 64), + Fire(128, 32, 128, 128), + MaxPool((3, 3), stride = (2, 2)), + Fire(256, 32, 128, 128), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + MaxPool((3, 3), stride = (2, 2)), + Fire(512, 64, 256, 256), + Dropout(0.5), + Conv((1, 1), 512=>1000, relu), + MeanPool((12, 12), stride = (1, 1)), + x -> reshape(x, :, size(x, 4)), + softmax) + +load_squeezenetv1_1() = Chain(Conv((3, 3), 3=>64, relu, stride = (2, 2)), + MaxPool((3, 3), stride = (2, 2)), + Fire(64, 16, 64, 64), + Fire(128, 16, 64, 64), + MaxPool((3, 3), stride = (2, 2)), + Fire(128, 32, 128, 128), + Fire(256, 32, 128, 128), + MaxPool((3, 3), stride = (2, 2)), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + Fire(512, 64, 256, 256), + Dropout(0.5), + Conv((1, 1), 512=>1000, relu), + MeanPool((13, 13), stride = (1, 1)), + x -> reshape(x, :, size(x, 4)), + softmax) + +function trained_squeezenetv1_1_layers() weight = Metalhead.weights("squeezenet.bson") weights = Dict{Any ,Any}() for ele in keys(weight) weights[string(ele)] = weight[ele] end - c_1 = Conv(weights["conv10_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["conv10_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_1 = Conv(flipkernel(weights["conv10_w_0"]), weights["conv10_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) c_2 = Dropout(0.5f0) - c_3 = Conv(weights["fire9/expand1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire9/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_4 = Conv(weights["fire9/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire9/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_5 = Conv(weights["fire8/expand1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire8/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_6 = Conv(weights["fire8/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire8/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_7 = Conv(weights["fire7/expand1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire7/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_8 = Conv(weights["fire7/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire7/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_9 = Conv(weights["fire6/expand1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire6/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_10 = Conv(weights["fire6/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire6/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_11 = Conv(weights["fire5/expand1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire5/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_12 = Conv(weights["fire5/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire5/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_13 = Conv(weights["fire4/expand1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire4/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_14 = Conv(weights["fire4/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire4/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_15 = Conv(weights["fire3/expand1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire3/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), 
dilation = (1, 1)) - c_16 = Conv(weights["fire3/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire3/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_17 = Conv(weights["fire2/expand1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire2/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_18 = Conv(weights["fire2/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire2/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) - c_19 = Conv(weights["conv1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["conv1_b_0"], stride=(2, 2), pad=(0, 0), dilation = (1, 1)) - c_20 = Conv(weights["fire2/expand3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire2/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) - c_21 = Conv(weights["fire3/expand3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire3/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) - c_22 = Conv(weights["fire4/expand3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire4/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) - c_23 = Conv(weights["fire5/expand3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire5/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) - c_24 = Conv(weights["fire6/expand3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire6/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) - c_25 = Conv(weights["fire7/expand3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire7/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) - c_26 = Conv(weights["fire8/expand3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire8/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) - c_27 = Conv(weights["fire9/expand3x3_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire9/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) + c_3 = Conv(flipkernel(weights["fire9/expand1x1_w_0"]), weights["fire9/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_4 = Conv(flipkernel(weights["fire9/squeeze1x1_w_0"]), weights["fire9/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_5 = Conv(flipkernel(weights["fire8/expand1x1_w_0"]), weights["fire8/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_6 = Conv(flipkernel(weights["fire8/squeeze1x1_w_0"]), weights["fire8/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_7 = Conv(flipkernel(weights["fire7/expand1x1_w_0"]), weights["fire7/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_8 = Conv(flipkernel(weights["fire7/squeeze1x1_w_0"]), weights["fire7/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_9 = Conv(flipkernel(weights["fire6/expand1x1_w_0"]), weights["fire6/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_10 = Conv(flipkernel(weights["fire6/squeeze1x1_w_0"]), weights["fire6/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_11 = Conv(flipkernel(weights["fire5/expand1x1_w_0"]), weights["fire5/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_12 = Conv(flipkernel(weights["fire5/squeeze1x1_w_0"]), weights["fire5/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_13 = Conv(flipkernel(weights["fire4/expand1x1_w_0"]), weights["fire4/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_14 = Conv(flipkernel(weights["fire4/squeeze1x1_w_0"]), weights["fire4/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_15 = 
Conv(flipkernel(weights["fire3/expand1x1_w_0"]), weights["fire3/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_16 = Conv(flipkernel(weights["fire3/squeeze1x1_w_0"]), weights["fire3/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_17 = Conv(flipkernel(weights["fire2/expand1x1_w_0"]), weights["fire2/expand1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_18 = Conv(flipkernel(weights["fire2/squeeze1x1_w_0"]), weights["fire2/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)) + c_19 = Conv(flipkernel(weights["conv1_w_0"]), weights["conv1_b_0"], stride=(2, 2), pad=(0, 0), dilation = (1, 1)) + c_20 = Conv(flipkernel(weights["fire2/expand3x3_w_0"]), weights["fire2/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) + c_21 = Conv(flipkernel(weights["fire3/expand3x3_w_0"]), weights["fire3/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) + c_22 = Conv(flipkernel(weights["fire4/expand3x3_w_0"]), weights["fire4/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) + c_23 = Conv(flipkernel(weights["fire5/expand3x3_w_0"]), weights["fire5/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) + c_24 = Conv(flipkernel(weights["fire6/expand3x3_w_0"]), weights["fire6/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) + c_25 = Conv(flipkernel(weights["fire7/expand3x3_w_0"]), weights["fire7/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) + c_26 = Conv(flipkernel(weights["fire8/expand3x3_w_0"]), weights["fire8/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) + c_27 = Conv(flipkernel(weights["fire9/expand3x3_w_0"]), weights["fire9/expand3x3_b_0"], stride=(1, 1), pad=(1, 1), dilation = (1, 1)) - ls = Chain(Conv(weights["conv1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["conv1_b_0"], stride=(2, 2), pad=(0, 0), dilation = (1, 1)), - x -> relu.(x), x->maxpool(x, (3,3), pad=(0,0), stride=(2,2)), - Conv(weights["fire2/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire2/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + ls = Chain(Conv(flipkernel(weights["conv1_w_0"]), weights["conv1_b_0"], stride=(2, 2), pad=(0, 0), dilation = (1, 1)), + x -> relu.(x), MaxPool((3,3), pad=(0,0), stride=(2,2)), + Conv(flipkernel(weights["fire2/squeeze1x1_w_0"]), weights["fire2/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> relu.(x), x->cat(relu.(c_17(x)), relu.(c_20(x)), dims=3), - Conv(weights["fire3/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire3/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + Conv(flipkernel(weights["fire3/squeeze1x1_w_0"]), weights["fire3/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> relu.(x), x->cat(relu.(c_15(x)), relu.(c_21(x)), dims=3), - x->maxpool(x, (3, 3), pad=(0, 0), stride=(2, 2)), - Conv(weights["fire4/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire4/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + MaxPool((3, 3), pad=(0, 0), stride=(2, 2)), + Conv(flipkernel(weights["fire4/squeeze1x1_w_0"]), weights["fire4/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> relu.(x), x->cat(relu.(c_13(x)), relu.(c_22(x)), dims=3), - Conv(weights["fire5/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire5/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + Conv(flipkernel(weights["fire5/squeeze1x1_w_0"]), weights["fire5/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> 
relu.(x), x->cat(relu.(c_11(x)), relu.(c_23(x)), dims=3), - x->maxpool(x, (3, 3), pad=(0, 0), stride=(2, 2)), - Conv(weights["fire6/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire6/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + MaxPool((3, 3), pad=(0, 0), stride=(2, 2)), + Conv(flipkernel(weights["fire6/squeeze1x1_w_0"]), weights["fire6/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> relu.(x), x->cat(relu.(c_9(x)), relu.(c_24(x)), dims=3), - Conv(weights["fire7/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire7/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + Conv(flipkernel(weights["fire7/squeeze1x1_w_0"]), weights["fire7/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> relu.(x), x->cat(relu.(c_7(x)), relu.(c_25(x)), dims=3), - Conv(weights["fire8/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire8/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + Conv(flipkernel(weights["fire8/squeeze1x1_w_0"]), weights["fire8/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> relu.(x), x->cat(relu.(c_5(x)), relu.(c_26(x)), dims=3), - Conv(weights["fire9/squeeze1x1_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["fire9/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + Conv(flipkernel(weights["fire9/squeeze1x1_w_0"]), weights["fire9/squeeze1x1_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> relu.(x), x->cat(relu.(c_3(x)), relu.(c_27(x)), dims=3), Dropout(0.5f0), - Conv(weights["conv10_w_0"][end:-1:1,:,:,:][:,end:-1:1,:,:], weights["conv10_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), + Conv(flipkernel(weights["conv10_w_0"]), weights["conv10_b_0"], stride=(1, 1), pad=(0, 0), dilation = (1, 1)), x -> relu.(x), x->mean(x, dims=[1,2]), - vec, softmax + x -> reshape(x, :, size(x, 4)), softmax ) -#end Flux.testmode!(ls) return ls end @@ -66,7 +120,25 @@ struct SqueezeNet <: ClassificationModel{ImageNet.ImageNet1k} layers::Chain end -SqueezeNet() = SqueezeNet(squeezenet_layers()) +function SqueezeNet(version::String = "1.1") + if version == "1.0" + SqueezeNet(load_squeezenetv1_0()) + elseif version == "1.1" + SqueezeNet(load_squeezenetv1_1()) + else + error("Only SqueezeNet versions 1.1 and 1.0 available") + end +end + +function trained(::Type{SqueezeNet}, version = "1.1") + if version == "1.0" + error("Pretrained Weights for SqueezeNet v1.0 are not available") + elseif version == "1.1" + SqueezeNet(trained_squeezenetv1_1_layers()) + else + error("Only SqueezeNet versions 1.1 and 1.0 available") + end +end Base.show(io::IO, ::SqueezeNet) = print(io, "SqueezeNet()") diff --git a/src/utils.jl b/src/utils.jl index 4cd55f57f..21e5e594b 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -21,6 +21,9 @@ function weights(name) BSON.load(joinpath(deps, name)) end +# TODO: Remove after NNlib supports flip kernel through https://github.com/FluxML/NNlib.jl/pull/53 +flipkernel(x::AbstractArray) = x[end:-1:1, end:-1:1, :, :] + load_img(im::AbstractMatrix{<:Color}) = im load_img(str::AbstractString) = load(str) load_img(val::ValidationImage) = load_img(val.img) diff --git a/src/vgg.jl b/src/vgg.jl new file mode 100644 index 000000000..043fcc030 --- /dev/null +++ b/src/vgg.jl @@ -0,0 +1,120 @@ +function trained_vgg19_layers() + ws = weights("vgg19.bson") + ls = Chain( + Conv(flipkernel(ws[:conv1_1_w_0]), ws[:conv1_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv1_2_w_0]), ws[:conv1_2_b_0], relu, pad 
= (1,1), stride = (1,1), dilation = (1,1)), + MaxPool((2,2)), + Conv(flipkernel(ws[:conv2_1_w_0]), ws[:conv2_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv2_2_w_0]), ws[:conv2_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + MaxPool((2,2)), + Conv(flipkernel(ws[:conv3_1_w_0]), ws[:conv3_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv3_2_w_0]), ws[:conv3_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv3_3_w_0]), ws[:conv3_3_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv3_4_w_0]), ws[:conv3_4_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + MaxPool((2,2)), + Conv(flipkernel(ws[:conv4_1_w_0]), ws[:conv4_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv4_2_w_0]), ws[:conv4_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv4_3_w_0]), ws[:conv4_3_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv4_4_w_0]), ws[:conv4_4_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + MaxPool((2,2)), + Conv(flipkernel(ws[:conv5_1_w_0]), ws[:conv5_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv5_2_w_0]), ws[:conv5_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv5_3_w_0]), ws[:conv5_3_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + Conv(flipkernel(ws[:conv5_4_w_0]), ws[:conv5_4_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), + MaxPool((2,2)), + x -> reshape(x, :, size(x, 4)), + Dense(ws[:fc6_w_0]', ws[:fc6_b_0], relu), + Dropout(0.5f0), + Dense(ws[:fc7_w_0]', ws[:fc7_b_0], relu), + Dropout(0.5f0), + Dense(ws[:fc8_w_0]', ws[:fc8_b_0]), + softmax) + Flux.testmode!(ls) + return ls +end + +function load_vgg(arr, batchnorm::Bool = false) + layers = [] + in_chs = 3 + for i in arr + if i != 0 + push!(layers, Conv((3, 3), in_chs=>i, pad = (1, 1))) + if batchnorm + push!(layers, BatchNorm(i)) + end + push!(layers, x -> relu.(x)) + in_chs = i + else + push!(layers, MaxPool((2, 2))) + end + end + push!(layers, [x -> reshape(x, :, size(x, 4)), Dense(25088, 4096, relu), Dropout(0.5), + Dense(4096, 4096, relu), Dropout(0.5), Dense(4096, 1000), softmax]...) + Chain(layers...) +end + +vgg_configs = + Dict("vgg11" => [64, 0, 128, 0, 256, 256, 0, 512, 512, 0, 512, 512, 0], + "vgg13" => [64, 64, 0, 128, 128, 0, 256, 256, 0, 512, 512, 0, 512, 512, 0], + "vgg16" => [64, 64, 0, 128, 128, 0, 256, 256, 256, 0, 512, 512, 512, 0, 512, 512, 512, 0], + "vgg19" => [64, 64, 0, 128, 128, 0, 256, 256, 256, 256, 0, 512, 512, 512, 512, 0, 512, 512, 512, 512, 0]) + +struct VGG11 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +VGG11(batchnorm::Bool = false) = VGG11(load_vgg(vgg_configs["vgg11"], batchnorm)) + +trained(::Type{VGG11}, batchnorm::Bool = false) = + batchnorm ? error("Pretrained Weights for VGG11 BatchNorm are not available") : error("Pretrained Weights for VGG11 are not available") + +Base.show(io::IO, ::VGG11) = print(io, "VGG11()") + +@treelike VGG11 + +(m::VGG11)(x) = m.layers(x) + +struct VGG13 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +VGG13(batchnorm::Bool = false) = VGG13(load_vgg(vgg_configs["vgg13"], batchnorm)) + +trained(::Type{VGG13}, batchnorm::Bool = false) = + batchnorm ? 
error("Pretrained Weights for VGG13 BatchNorm are not available") : error("Pretrained Weights for VGG13 are not available") + +Base.show(io::IO, ::VGG13) = print(io, "VGG13()") + +@treelike VGG13 + +(m::VGG13)(x) = m.layers(x) + +struct VGG16 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +VGG16(batchnorm::Bool = false) = VGG16(load_vgg(vgg_configs["vgg16"], batchnorm)) + +trained(::Type{VGG16}, batchnorm::Bool = false) = + batchnorm ? error("Pretrained Weights for VGG16 BatchNorm are not available") : error("Pretrained Weights for VGG16 are not available") + +Base.show(io::IO, ::VGG16) = print(io, "VGG16()") + +@treelike VGG16 + +(m::VGG16)(x) = m.layers(x) + +struct VGG19 <: ClassificationModel{ImageNet.ImageNet1k} + layers::Chain +end + +VGG19(batchnorm::Bool = false) = VGG19(load_vgg(vgg_configs["vgg19"], batchnorm)) + +trained(::Type{VGG19}, batchnorm::Bool = false) = + batchnorm ? error("Pretrained Weights for VGG19 BatchNorm are not available") : VGG19(trained_vgg19_layers()) + +Base.show(io::IO, ::VGG19) = print(io, "VGG19()") + +@treelike VGG19 + +(m::VGG19)(x) = m.layers(x) diff --git a/src/vgg19.jl b/src/vgg19.jl deleted file mode 100644 index 071e52313..000000000 --- a/src/vgg19.jl +++ /dev/null @@ -1,46 +0,0 @@ -function vgg19_layers() - ws = weights("vgg19.bson") - ls = Chain( - Conv(ws[:conv1_1_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv1_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv1_2_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv1_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - x -> maxpool(x, (2,2)), - Conv(ws[:conv2_1_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv2_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv2_2_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv2_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - x -> maxpool(x, (2,2)), - Conv(ws[:conv3_1_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv3_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv3_2_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv3_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv3_3_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv3_3_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv3_4_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv3_4_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - x -> maxpool(x, (2,2)), - Conv(ws[:conv4_1_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv4_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv4_2_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv4_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv4_3_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv4_3_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv4_4_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv4_4_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - x -> maxpool(x, (2,2)), - Conv(ws[:conv5_1_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv5_1_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv5_2_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv5_2_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv5_3_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv5_3_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - Conv(ws[:conv5_4_w_0][end:-1:1,:,:,:][:,end:-1:1,:,:], ws[:conv5_4_b_0], relu, pad = (1,1), stride = (1,1), dilation = (1,1)), - x -> maxpool(x, (2,2)), - x -> reshape(x, :, size(x, 
4)), - Dense(ws[:fc6_w_0]', ws[:fc6_b_0], relu), - Dropout(0.5f0), - Dense(ws[:fc7_w_0]', ws[:fc7_b_0], relu), - Dropout(0.5f0), - Dense(ws[:fc8_w_0]', ws[:fc8_b_0]), - softmax) - Flux.testmode!(ls) - return ls -end - -struct VGG19 <: ClassificationModel{ImageNet.ImageNet1k} - layers::Chain -end - -VGG19() = VGG19(vgg19_layers()) - -Base.show(io::IO, ::VGG19) = print(io, "VGG19()") - -@treelike VGG19 - -(m::VGG19)(x) = m.layers(x) diff --git a/test/runtests.jl b/test/runtests.jl index 4fd9fd7dc..2a564c35e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,14 +1,112 @@ -using Metalhead, Test +using Metalhead, Flux, Test, InteractiveUtils # Standardized testing for the models of tomorrow -@testset "Basic Model Tests" begin +@testset "Untrained Model Tests" begin for (T, MODEL) in [ - (Float32, VGG19), - (Float32, SqueezeNet), - (Float64, DenseNet), - (Float64, GoogleNet), + (Float64, VGG11), + (Float64, VGG13), + (Float64, VGG16), + (Float64, VGG19), + (Float64, ResNet18), + (Float64, ResNet34), + (Float64, ResNet50), + (Float64, ResNet101), + (Float64, ResNet152), + (Float64, DenseNet121), + (Float64, DenseNet169), + (Float64, DenseNet201), + (Float64, DenseNet264), + (Float64, GoogleNet) ] + GC.gc() + model = MODEL() + model = Flux.mapleaves(Flux.Tracker.data, model) + + x_test = rand(T, 224, 224, 3, 1) + y_test = model(x_test) + + # Test that types and shapes work out as we expect + @test y_test isa AbstractArray + @test length(y_test) == 1000 + + # Test that the models can be indexed + @test length(model.layers[1:4].layers) == 4 + + # Make all the allocations nothing for GC to free them + model = nothing + x_test = nothing + y_test = nothing + end + GC.gc() + # Test if batchnorm models work properly + for (T, MODEL) in [ + (Float64, VGG19), + (Float64, VGG16), + (Float64, VGG13), + (Float64, VGG11) + ] + GC.gc() + + model = MODEL(true) + model = Flux.mapleaves(Flux.Tracker.data, model) + + x_test = rand(T, 224, 224, 3, 1) + y_test = model(x_test) + + # Test that types and shapes work out as we expect + @test y_test isa AbstractArray + @test length(y_test) == 1000 + + # Test that the models can be indexed + @test length(model.layers[1:4].layers) == 4 + + # Make all the allocations nothing for GC to free them + model = nothing + x_test = nothing + y_test = nothing + end + GC.gc() + # Test models which have a version parameter + for (T, version, MODEL) in [ + (Float64, "1.0", SqueezeNet), + (Float64, "1.1", SqueezeNet) + ] + GC.gc() + + model = MODEL(version) + model = Flux.mapleaves(Flux.Tracker.data, model) + + x_test = rand(T, 224, 224, 3, 1) + y_test = model(x_test) + + # Test that types and shapes work out as we expect + @test y_test isa AbstractArray + @test length(y_test) == 1000 + + # Test that the models can be indexed + @test length(model.layers[1:4].layers) == 4 + + # Make all the allocations nothing for GC to free them + model = nothing + x_test = nothing + y_test = nothing + end + GC.gc() +end + +@testset "Trained Model Tests" begin + for (T, MODEL) in [ + (Float32, SqueezeNet), + (Float32, VGG19), + (Float32, ResNet50), + (Float32, DenseNet121), + (Float32, GoogleNet) + ] + GC.gc() + + model = trained(MODEL) + model = Flux.mapleaves(Flux.Tracker.data, model) x_test = rand(T, 224, 224, 3, 1) y_test = model(x_test) @@ -19,6 +117,11 @@ using Metalhead, Test # Test that the models can be indexed @test length(model.layers[1:4].layers) == 4 + + # Make all the allocations nothing for GC to free them + model = nothing + x_test = nothing + y_test = nothing end end @@ -35,13 
+138,14 @@ end # Test printing of prediction @testset "Prediction table display" begin x = valimgs(CIFAR10)[1] - m = VGG19() + m = trained(VGG19) predict(m, x) end # Just run the prediction code end-to-end # TODO: Set up travis to actually run these if length(datasets()) == 2 + vgg19 = trained(VGG19) for dataset in (ImageNet, CIFAR10) val1 = valimgs(dataset)[1] predict(vgg19, val1)
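
For quick reference while reviewing, a minimal usage sketch of the API introduced by this diff, assuming the package and its bundled weight files are installed; every identifier below (the model constructors, `trained`, `predict`, `valimgs`, `CIFAR10`) comes from the exports and tests in the patch, and the chosen models are only illustrative:

using Metalhead

# Randomly initialised models: each exported architecture has a zero-argument constructor.
resnet = ResNet34()

# Pretrained ImageNet weights load through `trained`; per the README list above only
# VGG19, ResNet50, GoogleNet, SqueezeNet v1.1 and DenseNet121 ship weights, the rest error.
vgg = trained(VGG19)

# VGG constructors take an optional batch-norm flag; SqueezeNet takes a version string.
vgg16_bn = VGG16(true)
squeeze  = trained(SqueezeNet, "1.1")

# Classify a validation image, mirroring the updated test suite.
img = valimgs(CIFAR10)[1]
predict(vgg, img)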