In [3]:
if (typeof(Base.find_package("UnicodePlots")) == Nothing)
    println("Package Unicode not installed")
    using Pkg;
    Pkg.add("UnicodePlots")
end
using UnicodePlots

## Using Packages

In [4]:
using .Threads
using BenchmarkTools
using SparseArrays
using LinearAlgebra

## Including External Files|

In [163]:
include("global_curved_multithreading.jl")

plot_blocks (generic function with 1 method)

## Setting SBP operator Orders

In [6]:
SBPp = 6

6

## Reading .inp files and initializing boundary conditions

In [7]:
bc_map = [BC_DIRICHLET, BC_DIRICHLET, BC_NEUMANN, BC_NEUMANN, BC_JUMP_INTERFACE]
# 1 refers to Dirichlet boundary condition
# 2 refers to Neumann boundary condition
# 7 refers to Jump interface condition

5-element Array{Int64,1}:
 1
 1
 2
 2
 7

In [255]:
(verts, EToV, EToF, FToB, EToDomain) = read_inp_2d("../meshes/16_16_block.inp", bc_map=bc_map)

([0.5 -0.5 … 0.1875 0.0625; 0.5 0.5 … -0.3125 -0.3125], [24 72 … 69 65; 255 256 … 239 224; 256 252 … 234 239; 257 257 … 241 241], [1 5 … 525 538; 2 4 … 544 542; 3 6 … 539 500; 4 7 … 543 544], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0  …  0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [256]:
verts'

289×2 Array{Float64,2}:
  0.5      0.5   
 -0.5      0.5   
 -0.5     -0.5   
  0.5     -0.5   
  0.0      0.5   
 -0.5      0.0   
  0.0     -0.5   
  0.5      0.0   
  0.0      0.0   
  0.25     0.5   
  0.0      0.25  
  0.25     0.0   
  0.5      0.25  
  ⋮              
  0.1875  -0.1875
  0.1875  -0.0625
  0.0625  -0.5   
  0.125   -0.4375
  0.0625  -0.375 
  0.0625  -0.4375
  0.1875  -0.375 
  0.1875  -0.5   
  0.1875  -0.4375
  0.125   -0.3125
  0.1875  -0.3125
  0.0625  -0.3125

In [257]:
EToV

4×256 Array{Int64,2}:
  24   72   74   73    4   70   74   73  …   65   69   58   16   47   69   65
 255  256  252  244  242  243  244  245     239  236  199  161  234  239  224
 256  252  244  255  245  242  243  244     228  239  236  224  161  234  239
 257  257  257  257  246  246  246  246     240  240  240  241  241  241  241

In [258]:
FToB

544-element Array{Int64,1}:
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 1
 0
 1
 ⋮
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [259]:
EToDomain

256-element Array{Int64,1}:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1

In [260]:
(nelems, nfaces) = (size(EToV, 2), size(FToB, 1))
@show (nelems, nfaces)

(nelems, nfaces) = (256, 544)


(256, 544)

In [261]:
# This is needed to fix up points that should be on the boundary of the
# circle, but the mesh didn't quite put them there
for v in 1:size(verts, 2)
    x,y = verts[1,v], verts[2,v]
    if abs(hypot(x,y) - 1) < 1e-5
        Q = atan(y,x)
        verts[1,v], verts[2,v] = cos(Q), sin(Q)
    end
end


In [262]:
plot_connectivity(verts, EToV)

[1m                                        connectivity[22m
[90m      ┌────────────────────────────────────────────────────────────────────────────────┐[39m 
    [90m1[39m[90m │[39m[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[90m│[39m 
     [90m │[39m[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀

## Setting base mesh sizes in each direction

In [263]:
N1 = N0 = 16

16

In [264]:
EToN0 = zeros(Int64, 2, nelems)
EToN0[1,:] .= N0;
@show EToN0
EToN0[2,:] .= N1;
@show EToN0

EToN0 = [16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16; 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

2×256 Array{Int64,2}:
 16  16  16  16  16  16  16  16  16  …  16  16  16  16  16  16  16  16  16
 16  16  16  16  16  16  16  16  16     16  16  16  16  16  16  16  16  16

In [318]:
# Checking types and sizes 
@assert typeof(EToV) == Array{Int, 2} && size(EToV) == (4, nelems)
@assert typeof(EToF) == Array{Int, 2} && size(EToF) == (4, nelems)
@assert maximum(maximum(EToF)) == nfaces   # The maximum number of EToF element should equal the number of faces 

# Determine secondary arrays
# FToE : Unique Global Face to Element Number
#        (the i'th column of this stores the element numbers that share the
#        global face number i)
# FToLF: Unique Global Face to Element local face number
#        (the i'th column of this stores the element local face numbers that
#        shares the global face number i)
# EToO : Element to Unique Global Faces Orientation
#        (the i'th column of this stores the whether the element and global
#        face are oriented in the same way in physical memory or need to be
#        rotated)
# EToS : Element to Unique Global Face Side
#        (the i'th column of this stores whether an element is on the
#        plus side or minus side of the global face)


In [319]:
# connectivity arrays
(FToE, FToLF, EToO, EToS) = connectivityarrays(EToV, EToF)

([1 1 … 254 255; 28 4 … 255 256], [1 2 … 2 2; 3 4 … 4 4], Bool[1 1 … 1 1; 1 1 … 1 1; 1 1 … 1 1; 1 1 … 1 1], [1 1 … 2 2; 1 2 … 1 2; 1 1 … 2 2; 1 1 … 2 2])

## Forming exact solutions

In [320]:
Lx = maximum(verts[1,:])
@show Lx
Ly = maximum(abs.(verts[2,:]))
@show Ly

Lx = 0.5
Ly = 0.5


0.5

In [321]:
(kx, ky) = (2*π / Lx, 4*π / Ly)

(12.566370614359172, 25.132741228718345)

In [322]:
vex(x,y,e) = begin
    if EToDomain[e] == 1
        return cos.(kx * x) .* cosh.(ky * y)
    elseif EToDomain[e] == 2
        return 10 .+ cos.(kx * x) .* cosh.(ky * y)
    else
        error("invalid block")
    end
end

vex (generic function with 1 method)

In [323]:
vex_x(x,y,e) = begin
  if EToDomain[e] == 1
    return -kx * sin.(kx * x) .* cosh.(ky * y)
  elseif EToDomain[e] == 2
    return -kx * sin.(kx * x) .* cosh.(ky * y)
  else
    error("invalid block")
  end
end

vex_x (generic function with 1 method)

In [324]:
vex_y(x,y,e) = begin
  if EToDomain[e] == 1
    return ky * cos.(kx * x) .* sinh.(ky * y)
  elseif EToDomain[e] == 2
    return ky * cos.(kx * x) .* sinh.(ky * y)
  else
    error("invalid block")
  end
end

vex_y (generic function with 1 method)

In [325]:
vex_xx(x,y,e) = begin
  if EToDomain[e] == 1
    return -kx^2 * cos.(kx * x) .* cosh.(ky * y)
  elseif EToDomain[e] == 2
    return -kx^2 * cos.(kx * x) .* cosh.(ky * y)
  else
    error("invalid block")
  end
end

vex_xx (generic function with 1 method)

In [326]:
vex_xy(x,y,e) = begin
  if EToDomain[e] == 1
    return -kx * ky * sin.(kx * x) .* sinh.(ky * y)
  elseif EToDomain[e] == 2
    return -kx * ky * sin.(kx * x) .* sinh.(ky * y)
  else
    error("invalid block")
  end
end

vex_xy (generic function with 1 method)

In [327]:
vex_yy(x,y,e) = begin
  if EToDomain[e] == 1
    return ky^2 * cos.(kx * x) .* cosh.(ky * y)
  elseif EToDomain[e] == 2
    return ky^2 * cos.(kx * x) .* cosh.(ky * y)
  else
    error("invalid block")
  end
end

vex_yy (generic function with 1 method)

In [328]:
ϵ = zeros(4)

4-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0

In [329]:
lvl = 1

1

In [330]:
# generate base mesh size for each element
Nr = EToN0[1,:] * (2^(lvl - 1))
Ns = EToN0[2,:] * (2^(lvl - 1))

256-element Array{Int64,1}:
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
  ⋮
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16

In [331]:
OPTYPE = typeof(locoperator(2,8,8))

NamedTuple{(:M̃, :F, :coord, :facecoord, :JH, :sJ, :nx, :ny, :Hf, :HfI, :τ, :bctype),Tuple{SparseMatrixCSC{Float64,Int64},NTuple{4,SparseMatrixCSC{Float64,Int64}},Tuple{Array{Float64,2},Array{Float64,2}},Tuple{Tuple{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true}},Tuple{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true}}},SparseMatrixCSC{Float64,Int64},NTuple{4,Array{Float64,1}},NTuple{4,Array{Float64,1}},NTuple{4,A

In [332]:
lop = Dict{Int64, OPTYPE}()

Dict{Int64,NamedTuple{(:M̃, :F, :coord, :facecoord, :JH, :sJ, :nx, :ny, :Hf, :HfI, :τ, :bctype),Tuple{SparseMatrixCSC{Float64,Int64},NTuple{4,SparseMatrixCSC{Float64,Int64}},Tuple{Array{Float64,2},Array{Float64,2}},Tuple{Tuple{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true}},Tuple{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true}}},SparseMatrixCSC{Float64,Int64},NTuple{4,Array{Float64,1}},NTuple{4,Array{Float64,1}}

In [333]:
@benchmark for e = 1:nelems
      # Get the element corners
      (x1, x2, x3, x4) = verts[1, EToV[:, e]]
      (y1, y2, y3, y4) = verts[2, EToV[:, e]]

      # Initialize the block transformations as transfinite between the corners
      ex = [(α) -> x1 * (1 .- α) / 2 + x3 * (1 .+ α) / 2,
            (α) -> x2 * (1 .- α) / 2 + x4 * (1 .+ α) / 2,
            (α) -> x1 * (1 .- α) / 2 + x2 * (1 .+ α) / 2,
            (α) -> x3 * (1 .- α) / 2 + x4 * (1 .+ α) / 2]
      exα = [(α) -> -x1 / 2 + x3 / 2,
             (α) -> -x2 / 2 + x4 / 2,
             (α) -> -x1 / 2 + x2 / 2,
             (α) -> -x3 / 2 + x4 / 2]
      ey = [(α) -> y1 * (1 .- α) / 2 + y3 * (1 .+ α) / 2,
            (α) -> y2 * (1 .- α) / 2 + y4 * (1 .+ α) / 2,
            (α) -> y1 * (1 .- α) / 2 + y2 * (1 .+ α) / 2,
            (α) -> y3 * (1 .- α) / 2 + y4 * (1 .+ α) / 2]
      eyα = [(α) -> -y1 / 2 + y3 / 2,
             (α) -> -y2 / 2 + y4 / 2,
             (α) -> -y1 / 2 + y2 / 2,
             (α) -> -y3 / 2 + y4 / 2]

      # For blocks on the circle, put in the curved edge transform
      if FToB[EToF[1, e]] == BC_JUMP_INTERFACE
        error("curved face 1 not implemented yet")
      end
      if FToB[EToF[2, e]] == BC_JUMP_INTERFACE
        error("curved face 2 not implemented yet")
      end
      if FToB[EToF[3, e]] == BC_JUMP_INTERFACE
        Q1 = atan(y1, x1)
        Q2 = atan(y2, x2)
        if !(-π/2 < Q1 - Q2 < π/2)
          Q2 -= sign(Q2) * 2 * π
        end
        ex[3] = (α) -> cos.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        ey[3] = (α) -> sin.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        β3 = (Q2 - Q1) / 2
        exα[3] = (α) -> -β3 .* sin.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        eyα[3] = (α) -> +β3 .* cos.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
      end
      if FToB[EToF[4, e]] == BC_JUMP_INTERFACE
        Q3 = atan(y3, x3)
        Q4 = atan(y4, x4)
        if !(-π/2 < Q3 - Q4 < π/2)
          error("curved face 4 angle correction not implemented yet")
        end
        ex[4] = (α) -> cos.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        ey[4] = (α) -> sin.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        β4 = (Q4 - Q3) / 2
        exα[4] = (α) -> -β4 .* sin.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        eyα[4] = (α) -> +β4 .* cos.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
      end

      # Create the volume transform as the transfinite blending of the edge
      # transformations
      xt(r,s) = transfinite_blend(ex[1], ex[2], ex[3], ex[4],
                                  exα[1], exα[2], exα[3], exα[4],
                                  r, s)
      yt(r,s) = transfinite_blend(ey[1], ey[2], ey[3], ey[4],
                                  eyα[1], eyα[2], eyα[3], eyα[4],
                                  r, s)


      metrics = create_metrics(SBPp, Nr[e], Ns[e], xt, yt)

      # Build local operators
      lop[e] = locoperator(SBPp, Nr[e], Ns[e], metrics, FToB[EToF[:, e]])
    end

BenchmarkTools.Trial: 
  memory estimate:  1.82 GiB
  allocs estimate:  3618623
  --------------
  minimum time:     697.362 ms (12.46% GC)
  median time:      738.130 ms (13.47% GC)
  mean time:        748.747 ms (13.64% GC)
  maximum time:     854.041 ms (16.12% GC)
  --------------
  samples:          7
  evals/sample:     1

In [334]:
@benchmark @threads for e = 1:nelems
      # Get the element corners
      (x1, x2, x3, x4) = verts[1, EToV[:, e]]
      (y1, y2, y3, y4) = verts[2, EToV[:, e]]

      # Initialize the block transformations as transfinite between the corners
      ex = [(α) -> x1 * (1 .- α) / 2 + x3 * (1 .+ α) / 2,
            (α) -> x2 * (1 .- α) / 2 + x4 * (1 .+ α) / 2,
            (α) -> x1 * (1 .- α) / 2 + x2 * (1 .+ α) / 2,
            (α) -> x3 * (1 .- α) / 2 + x4 * (1 .+ α) / 2]
      exα = [(α) -> -x1 / 2 + x3 / 2,
             (α) -> -x2 / 2 + x4 / 2,
             (α) -> -x1 / 2 + x2 / 2,
             (α) -> -x3 / 2 + x4 / 2]
      ey = [(α) -> y1 * (1 .- α) / 2 + y3 * (1 .+ α) / 2,
            (α) -> y2 * (1 .- α) / 2 + y4 * (1 .+ α) / 2,
            (α) -> y1 * (1 .- α) / 2 + y2 * (1 .+ α) / 2,
            (α) -> y3 * (1 .- α) / 2 + y4 * (1 .+ α) / 2]
      eyα = [(α) -> -y1 / 2 + y3 / 2,
             (α) -> -y2 / 2 + y4 / 2,
             (α) -> -y1 / 2 + y2 / 2,
             (α) -> -y3 / 2 + y4 / 2]

      # For blocks on the circle, put in the curved edge transform
      if FToB[EToF[1, e]] == BC_JUMP_INTERFACE
        error("curved face 1 not implemented yet")
      end
      if FToB[EToF[2, e]] == BC_JUMP_INTERFACE
        error("curved face 2 not implemented yet")
      end
      if FToB[EToF[3, e]] == BC_JUMP_INTERFACE
        Q1 = atan(y1, x1)
        Q2 = atan(y2, x2)
        if !(-π/2 < Q1 - Q2 < π/2)
          Q2 -= sign(Q2) * 2 * π
        end
        ex[3] = (α) -> cos.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        ey[3] = (α) -> sin.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        β3 = (Q2 - Q1) / 2
        exα[3] = (α) -> -β3 .* sin.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        eyα[3] = (α) -> +β3 .* cos.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
      end
      if FToB[EToF[4, e]] == BC_JUMP_INTERFACE
        Q3 = atan(y3, x3)
        Q4 = atan(y4, x4)
        if !(-π/2 < Q3 - Q4 < π/2)
          error("curved face 4 angle correction not implemented yet")
        end
        ex[4] = (α) -> cos.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        ey[4] = (α) -> sin.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        β4 = (Q4 - Q3) / 2
        exα[4] = (α) -> -β4 .* sin.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        eyα[4] = (α) -> +β4 .* cos.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
      end

      # Create the volume transform as the transfinite blending of the edge
      # transformations
      xt(r,s) = transfinite_blend(ex[1], ex[2], ex[3], ex[4],
                                  exα[1], exα[2], exα[3], exα[4],
                                  r, s)
      yt(r,s) = transfinite_blend(ey[1], ey[2], ey[3], ey[4],
                                  eyα[1], eyα[2], eyα[3], eyα[4],
                                  r, s)


      metrics = create_metrics(SBPp, Nr[e], Ns[e], xt, yt)

      # Build local operators
      lop[e] = locoperator(SBPp, Nr[e], Ns[e], metrics, FToB[EToF[:, e]])
    end

BenchmarkTools.Trial: 
  memory estimate:  1.82 GiB
  allocs estimate:  3618401
  --------------
  minimum time:     420.250 ms (39.05% GC)
  median time:      582.145 ms (50.26% GC)
  mean time:        546.094 ms (48.04% GC)
  maximum time:     636.791 ms (56.39% GC)
  --------------
  samples:          10
  evals/sample:     1

Conclusing 1: Using @threads macro could improve performance for building local operators

In [335]:
(M, FbarT, D, vstarts, FToλstarts) = threaded_LocalGlobalOperators(lop, Nr, Ns, FToB, FToE, FToLF, EToO, EToS, (x) -> cholesky(Symmetric(x)))

(SBPLocalOperator1{Float64,SuiteSparse.CHOLMOD.Factor{Float64}}([1, 290, 579, 868, 1157, 1446, 1735, 2024, 2313, 2602  …  71384, 71673, 71962, 72251, 72540, 72829, 73118, 73407, 73696, 73985], [1.5231905732462273e-6, 6.703087536232299e-6, 3.0254008675144863e-6, 5.980495481007056e-6, 4.395256848655774e-6, 4.888070210180819e-6, 4.821000275788484e-6, 4.821000275788484e-6, 4.821000275788484e-6, 4.821000275788484e-6  …  4.821000275788484e-6, 4.821000275788484e-6, 4.821000275788484e-6, 4.821000275788484e-6, 4.888070210180819e-6, 4.395256848655774e-6, 5.980495481007056e-6, 3.0254008675144863e-6, 6.703087536232299e-6, 1.5231905732462273e-6], [0.25, 0.25390625, 0.2578125, 0.26171875, 0.265625, 0.26953125, 0.2734375, 0.27734375, 0.28125, 0.28515625  …  -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125], [-0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5  …  -0.09765625, -0.09375, -0.08984375, -0.0859375, -0.08203125, -0.078125, -0.07421875, -0.0703

In [336]:
@show lvl

lvl = 1


1

In [337]:
M

SBPLocalOperator1{Float64,SuiteSparse.CHOLMOD.Factor{Float64}}([1, 290, 579, 868, 1157, 1446, 1735, 2024, 2313, 2602  …  71384, 71673, 71962, 72251, 72540, 72829, 73118, 73407, 73696, 73985], [1.5231905732462273e-6, 6.703087536232299e-6, 3.0254008675144863e-6, 5.980495481007056e-6, 4.395256848655774e-6, 4.888070210180819e-6, 4.821000275788484e-6, 4.821000275788484e-6, 4.821000275788484e-6, 4.821000275788484e-6  …  4.821000275788484e-6, 4.821000275788484e-6, 4.821000275788484e-6, 4.821000275788484e-6, 4.888070210180819e-6, 4.395256848655774e-6, 5.980495481007056e-6, 3.0254008675144863e-6, 6.703087536232299e-6, 1.5231905732462273e-6], [0.25, 0.25390625, 0.2578125, 0.26171875, 0.265625, 0.26953125, 0.2734375, 0.27734375, 0.28125, 0.28515625  …  -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125, -0.3125], [-0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5  …  -0.09765625, -0.09375, -0.08984375, -0.0859375, -0.08203125, -0.078125, -0.07421875, -0.07031

In [338]:
locfactors = M.F

256-element Array{SuiteSparse.CHOLMOD.Factor{Float64},1}:
 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
meth

In [339]:
lop[1]

(M̃ = 
  [1  ,   1]  =  11.5809
  [2  ,   1]  =  0.801039
  [3  ,   1]  =  -0.820124
  [4  ,   1]  =  0.337643
  [5  ,   1]  =  -0.0274042
  [6  ,   1]  =  -0.0128202
  [7  ,   1]  =  -1.09617e-18
  [9  ,   1]  =  3.42553e-20
  [18 ,   1]  =  0.801039
  [35 ,   1]  =  -0.820124
  [52 ,   1]  =  0.337643
  [69 ,   1]  =  -0.0274042
  ⋮
  [204, 289]  =  -0.0128202
  [221, 289]  =  -0.0274042
  [238, 289]  =  0.337643
  [255, 289]  =  -0.820124
  [272, 289]  =  0.801039
  [281, 289]  =  3.42553e-20
  [283, 289]  =  -1.09617e-18
  [284, 289]  =  -0.0128202
  [285, 289]  =  -0.0274042
  [286, 289]  =  0.337643
  [287, 289]  =  -0.820124
  [288, 289]  =  0.801039
  [289, 289]  =  11.5809, F = (
  [1  ,  1]  =  -6.06879
  [2  ,  1]  =  -1.2638
  [3  ,  1]  =  0.947847
  [4  ,  1]  =  -0.421265
  [5  ,  1]  =  0.0789873
  [18 ,  2]  =  -26.7069
  [19 ,  2]  =  -5.56157
  [20 ,  2]  =  4.17118
  [21 ,  2]  =  -1.85386
  [22 ,  2]  =  0.347598
  [35 ,  3]  =  -12.054
  [36 ,  3]  =  -2.51019
  ⋮

In [340]:
FToδstarts = bcstarts(FToB, FToE, FToLF, BC_JUMP_INTERFACE, Nr, Ns);

In [341]:
VNp = vstarts[nelems+1] - 1
λNp = FToλstarts[nfaces+1] - 1
δNp = FToδstarts[nfaces+1] - 1

0

In [342]:
vstarts

257-element Array{Int64,1}:
     1
   290
   579
   868
  1157
  1446
  1735
  2024
  2313
  2602
  2891
  3180
  3469
     ⋮
 70806
 71095
 71384
 71673
 71962
 72251
 72540
 72829
 73118
 73407
 73696
 73985

In [343]:
@benchmark B = assembleλmatrix(FToλstarts,vstarts,EToF,FToB,locfactors,D,FbarT)

BenchmarkTools.Trial: 
  memory estimate:  2.06 GiB
  allocs estimate:  9583222
  --------------
  minimum time:     4.022 s (2.56% GC)
  median time:      4.174 s (2.49% GC)
  mean time:        4.174 s (2.49% GC)
  maximum time:     4.326 s (2.42% GC)
  --------------
  samples:          2
  evals/sample:     1

In [344]:
@benchmark B = threaded_assembleλmatrix(FToλstarts,vstarts,EToF,FToB,locfactors,D,FbarT)

BenchmarkTools.Trial: 
  memory estimate:  2.06 GiB
  allocs estimate:  9583222
  --------------
  minimum time:     4.081 s (2.53% GC)
  median time:      4.108 s (2.51% GC)
  mean time:        4.108 s (2.51% GC)
  maximum time:     4.135 s (2.50% GC)
  --------------
  samples:          2
  evals/sample:     1

 assembleλmatrix can not be parallelized easily with @threads despite it has for loop over elements

In [345]:
B

408×408 SparseMatrixCSC{Float64,Int64} with 36992 stored entries:
  [1  ,   1]  =  6.95753
  [2  ,   1]  =  0.808076
  [3  ,   1]  =  -0.76754
  [4  ,   1]  =  0.245415
  [5  ,   1]  =  0.0275435
  [6  ,   1]  =  -0.022875
  [7  ,   1]  =  -0.000761631
  [8  ,   1]  =  -1.48435e-5
  [9  ,   1]  =  -1.12919e-5
  [10 ,   1]  =  -2.60621e-6
  [11 ,   1]  =  -8.78924e-7
  [12 ,   1]  =  -3.7289e-7
  ⋮
  [396, 408]  =  -1.46664e-7
  [397, 408]  =  -3.7289e-7
  [398, 408]  =  -8.78924e-7
  [399, 408]  =  -2.60621e-6
  [400, 408]  =  -1.12919e-5
  [401, 408]  =  -1.48435e-5
  [402, 408]  =  -0.000761631
  [403, 408]  =  -0.022875
  [404, 408]  =  0.0275435
  [405, 408]  =  0.245415
  [406, 408]  =  -0.76754
  [407, 408]  =  0.808076
  [408, 408]  =  6.95753

In [293]:
BF = cholesky(Symmetric(B))

SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     30549
success: true


In [294]:
@doc Symmetric

```
Symmetric(A, uplo=:U)
```

Construct a `Symmetric` view of the upper (if `uplo = :U`) or lower (if `uplo = :L`) triangle of the matrix `A`.

# Examples

```jldoctest
julia> A = [1 0 2 0 3; 0 4 0 5 0; 6 0 7 0 8; 0 9 0 1 0; 2 0 3 0 4]
5×5 Array{Int64,2}:
 1  0  2  0  3
 0  4  0  5  0
 6  0  7  0  8
 0  9  0  1  0
 2  0  3  0  4

julia> Supper = Symmetric(A)
5×5 Symmetric{Int64,Array{Int64,2}}:
 1  0  2  0  3
 0  4  0  5  0
 2  0  7  0  8
 0  5  0  1  0
 3  0  8  0  4

julia> Slower = Symmetric(A, :L)
5×5 Symmetric{Int64,Array{Int64,2}}:
 1  0  6  0  2
 0  4  0  9  0
 6  0  7  0  3
 0  9  0  1  0
 2  0  3  0  4
```

Note that `Supper` will not be equal to `Slower` unless `A` is itself symmetric (e.g. if `A == transpose(A)`).


In [295]:
(bλ,λ,gδ) = (zeros(λNp), zeros(λNp), zeros(λNp)) 

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [296]:
(Δ,u,g) = (zeros(VNp), zeros(VNp), zeros(VNp))

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [297]:
δ = zeros(δNp)

0-element Array{Float64,1}

In [298]:
for f = 1:nfaces
    if FToB[f] == BC_JUMP_INTERFACE
        (e1, e2) = FToE[:,f]
        (lf1, lf2) = FToLF[:,f]
        (xf, yf) = lop[e1].facecoord
        @views δ[FToδstarts[f]:(FToδstarts[f+1]-1)] = 
            vex(xf[lf1],yf[lf1],e2) - vex(xf[lf1],yf[lf1],e1)
    end
end

In [299]:
bc_Dirichlet = (lf, x, y, e, δ) -> vex(x,y,e)
bc_Neumann   = (lf, x, y, nx, ny, e, δ) -> (nx .* vex_x(x,y,e) + ny .* vex(x,y,e))

#865 (generic function with 1 method)

In [300]:
in_jump      = (lf, x, y, e, δ) -> begin
    f = EToF[lf, e]
    if EToS[lf, e] == 1
        if EToO[lf, e]
            return -δ[FToδstarts[f]:(FToδstarts[f+1]-1)]
        else
            error("shouldn't get here")
        end
    else
        if EToO[lf,e]
            return δ[FToδstarts[f]:(FToδstarts[f+1]-1)]
        else
            return δ[(FToδstarts[f+1]-1):-1:FToδstarts[f]]
        end
    end
end
    

#867 (generic function with 1 method)

In [301]:
@benchmark for e = 1:nelems
      gδe = ntuple(4) do lf
        f = EToF[lf, e]
        if EToO[lf, e]
          return @view gδ[FToλstarts[f]:(FToλstarts[f+1]-1)]
        else
          return  @view gδ[(FToλstarts[f+1]-1):-1:FToλstarts[f]]
        end
      end
      locbcarray!((@view g[vstarts[e]:vstarts[e+1]-1]), gδe, lop[e],
                  FToB[EToF[:,e]], bc_Dirichlet, bc_Neumann, in_jump, (e, δ))

      source = (x, y, e) -> (-vex_xx(x, y, e)  - vex_yy(x, y, e))
      locsourcearray!((@view g[vstarts[e]:vstarts[e+1]-1]), source, lop[e], e)
end

BenchmarkTools.Trial: 
  memory estimate:  11.24 MiB
  allocs estimate:  19945
  --------------
  minimum time:     5.919 ms (0.00% GC)
  median time:      6.422 ms (0.00% GC)
  mean time:        7.234 ms (8.07% GC)
  maximum time:     36.453 ms (0.00% GC)
  --------------
  samples:          691
  evals/sample:     1

In [302]:
@benchmark @threads for e = 1:nelems
      gδe = ntuple(4) do lf
        f = EToF[lf, e]
        if EToO[lf, e]
          return @view gδ[FToλstarts[f]:(FToλstarts[f+1]-1)]
        else
          return  @view gδ[(FToλstarts[f+1]-1):-1:FToλstarts[f]]
        end
      end
      locbcarray!((@view g[vstarts[e]:vstarts[e+1]-1]), gδe, lop[e],
                  FToB[EToF[:,e]], bc_Dirichlet, bc_Neumann, in_jump, (e, δ))

      source = (x, y, e) -> (-vex_xx(x, y, e)  - vex_yy(x, y, e))
      locsourcearray!((@view g[vstarts[e]:vstarts[e+1]-1]), source, lop[e], e)
end

BenchmarkTools.Trial: 
  memory estimate:  11.23 MiB
  allocs estimate:  19463
  --------------
  minimum time:     1.589 ms (0.00% GC)
  median time:      2.583 ms (0.00% GC)
  mean time:        4.062 ms (31.89% GC)
  maximum time:     85.767 ms (97.36% GC)
  --------------
  samples:          1229
  evals/sample:     1

In [303]:
LocalToGLobalRHS!(bλ, g, gδ,  u, locfactors, FbarT, vstarts)

8160-element Array{Float64,1}:
      -1.0389409911639342e6 
 -448635.738116863          
   86402.04556066295        
 -107582.01096304778        
  -16053.450446613082       
  -15914.863437247423       
   -8850.542573052297       
   -3837.788715803652       
    -347.1372940248865      
     833.4804468987234      
    1356.5196469489915      
    1602.376575113663       
    1465.8418576583038      
       ⋮                    
       0.2112277813018417   
       0.1762674168596875   
       0.128668873187123    
       0.06952838437158883  
       0.09518670724196997  
       0.11070764115420459  
       0.11536012214862185  
       0.09881313730423052  
       0.11936090364606834  
       0.04891479027127519  
       0.0692471576776887   
       0.0013823466628772494

In [304]:
@benchmark LocalToGLobalRHS!(bλ, g, gδ,  u, locfactors, FbarT, vstarts)

BenchmarkTools.Trial: 
  memory estimate:  2.54 MiB
  allocs estimate:  3072
  --------------
  minimum time:     6.998 ms (0.00% GC)
  median time:      7.230 ms (0.00% GC)
  mean time:        7.549 ms (0.83% GC)
  maximum time:     19.767 ms (0.00% GC)
  --------------
  samples:          661
  evals/sample:     1

In [305]:
@benchmark threaded_LocalToGLobalRHS!(bλ, g, gδ, u, locfactors, FbarT, vstarts)

BenchmarkTools.Trial: 
  memory estimate:  2.54 MiB
  allocs estimate:  3102
  --------------
  minimum time:     4.522 ms (0.00% GC)
  median time:      4.723 ms (0.00% GC)
  mean time:        5.111 ms (0.74% GC)
  maximum time:     121.750 ms (13.63% GC)
  --------------
  samples:          976
  evals/sample:     1

Conclusion: threaded_LocalToGLobalRHS! would help but not too much

In [306]:
@show bλ
@show g
@show gδ
@show u
@show locfactors # M
@show FbarT 
@show vstarts

bλ = [-1.0389409911639342e6, -448635.738116863, 86402.04556066295, -107582.01096304778, -16053.450446613082, -15914.863437247423, -8850.542573052297, -3837.788715803652, -347.1372940248865, 833.4804468987234, 1356.5196469489915, 1602.376575113663, 1465.8418576583038, 1840.8859484807483, 773.7997640783578, 1115.0768855251658, 23.20914750720026, -734642.2201046911, -317233.37270506594, 61095.47232433375, -76071.96948565662, -11351.503672242363, -11253.507858135536, -6258.278670585539, -2713.72642570599, -245.46313460774974, 589.3596759884795, 959.2042411704131, 1133.0513422773452, 1036.5067176972711, 1301.7029375617658, 547.1590604603575, 788.4784272992205, 16.41134558790014, 23.209090721864623, 946.1851411989511, 623.2457145563852, 1415.324388141524, 1091.5295140951443, 1183.2006039777464, 1048.3692005037983, 818.6164946612431, 498.2697997028187, 767.9577932477782, 949.4027624618892, 1039.070165245374, 930.5091175249927, 1171.7365714745124, 500.2896262514, 736.3393793614297, 16.40638834




Excessive output truncated after 1718049 bytes.

g = 

257-element Array{Int64,1}:
     1
   290
   579
   868
  1157
  1446
  1735
  2024
  2313
  2602
  2891
  3180
  3469
     ⋮
 70806
 71095
 71384
 71673
 71962
 72251
 72540
 72829
 73118
 73407
 73696
 73985

In [307]:
for e=1:length(locfactors)
    @views u[vstarts[e]:(vstarts[e+1]-1)] = (locfactors[e] \ g[vstarts[e]:(vstarts[e+1]-1)])
end

In [308]:
mul!(bλ,FbarT,u)

8160-element Array{Float64,1}:
      1.0389409911639342e6 
 448635.738116863          
 -86402.04556066295        
 107582.01096304778        
  16053.450446613082       
  15914.863437247423       
   8850.542573052297       
   3837.788715803652       
    347.1372940248865      
   -833.4804468987234      
  -1356.5196469489915      
  -1602.376575113663       
  -1465.8418576583038      
      ⋮                    
     -0.2112277813018417   
     -0.1762674168596875   
     -0.128668873187123    
     -0.06952838437158883  
     -0.09518670724196997  
     -0.11070764115420459  
     -0.11536012214862185  
     -0.09881313730423052  
     -0.11936090364606834  
     -0.04891479027127519  
     -0.0692471576776887   
     -0.0013823466628772494

In [309]:
@show typeof(bλ)
@show typeof(FbarT)
@show typeof(u)

typeof(bλ) = Array{Float64,1}
typeof(FbarT) = SparseMatrixCSC{Float64,Int64}
typeof(u) = Array{Float64,1}


Array{Float64,1}

In [310]:
@doc mul!

```
mul!(Y, A, B) -> Y
```

Calculates the matrix-matrix or matrix-vector product $AB$ and stores the result in `Y`, overwriting the existing value of `Y`. Note that `Y` must not be aliased with either `A` or `B`.

# Examples

```jldoctest
julia> A=[1.0 2.0; 3.0 4.0]; B=[1.0 1.0; 1.0 1.0]; Y = similar(B); mul!(Y, A, B);

julia> Y
2×2 Array{Float64,2}:
 3.0  3.0
 7.0  7.0
```

# Implementation

For custom matrix and vector types, it is recommended to implement 5-argument `mul!` rather than implementing 3-argument `mul!` directly if possible.

```
mul!(C, A, B, α, β) -> C
```

Combined inplace matrix-matrix or matrix-vector multiply-add $A B α + C β$. The result is stored in `C` by overwriting it.  Note that `C` must not be aliased with either `A` or `B`.

!!! compat "Julia 1.3"
    Five-argument `mul!` requires at least Julia 1.3.


# Examples

```jldoctest
julia> A=[1.0 2.0; 3.0 4.0]; B=[1.0 1.0; 1.0 1.0]; C=[1.0 2.0; 3.0 4.0];

julia> mul!(C, A, B, 100.0, 10.0) === C
true

julia> C
2×2 Array{Float64,2}:
 310.0  320.0
 730.0  740.0
```


In [317]:
@benchmark λ[:] = BF \ bλ

DimensionMismatch: DimensionMismatch("LHS and RHS should have the same number of rows. LHS has 408 rows, but RHS has 8160 rows.")

In [312]:
u[:] = -FbarT' * λ

73984-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮  
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [313]:
u[:] .= g .+ u

73984-element view(::Array{Float64,1}, :) with eltype Float64:
 -870013.4890578167         
      -3.82404673904831e6   
      -1.7197224950795262e6 
      -3.3789573032148853e6 
      -2.462237501073863e6  
      -2.708288371974637e6  
      -2.6350797284354335e6 
      -2.5926837723838678e6 
      -2.544041814032352e6  
      -2.4892710361190867e6 
      -2.4285033862454193e6 
      -2.394743894202986e6  
      -2.087384161773896e6  
       ⋮                    
       0.011654276122675607 
       0.010430744824881058 
       0.009467738845050656 
       0.008596058952608539 
       0.007807296897672173 
       0.0070938442582585874
       0.006538535258680068 
       0.0053479718505510185
       0.00662402572880479  
       0.0030530344661299875
       0.00616950181930277  
       0.0012803001080033934

In [314]:
@benchmark for e = 1:nelems
      F = locfactors[e]
      (x, y) = lop[e].coord
      JH = lop[e].JH
      start = time()
      @views u[vstarts[e]:(vstarts[e+1]-1)] = F \ u[vstarts[e]:(vstarts[e+1]-1)]
      #=
      ldiv!((@view u[vstarts[e]:(vstarts[e+1]-1)]), F,
            (@view u[vstarts[e]:(vstarts[e+1]-1)]))
      =#

      @views Δ[vstarts[e]:(vstarts[e+1]-1)] = (u[vstarts[e]:(vstarts[e+1]-1)] -
                                               vex(x[:], y[:], e))
      ϵ[lvl] += Δ[vstarts[e]:(vstarts[e+1]-1)]' * JH * Δ[vstarts[e]:(vstarts[e+1]-1)]
 end
   

BenchmarkTools.Trial: 
  memory estimate:  7.06 MiB
  allocs estimate:  16361
  --------------
  minimum time:     10.577 ms (0.00% GC)
  median time:      11.419 ms (0.00% GC)
  mean time:        11.794 ms (2.68% GC)
  maximum time:     26.125 ms (24.02% GC)
  --------------
  samples:          424
  evals/sample:     1

In [315]:
@benchmark @threads for e = 1:nelems
      F = locfactors[e]
      (x, y) = lop[e].coord
      JH = lop[e].JH
      start = time()
      @views u[vstarts[e]:(vstarts[e+1]-1)] = F \ u[vstarts[e]:(vstarts[e+1]-1)]
      #=
      ldiv!((@view u[vstarts[e]:(vstarts[e+1]-1)]), F,
            (@view u[vstarts[e]:(vstarts[e+1]-1)]))
      =#

      @views Δ[vstarts[e]:(vstarts[e+1]-1)] = (u[vstarts[e]:(vstarts[e+1]-1)] -
                                               vex(x[:], y[:], e))
      ϵ[lvl] += Δ[vstarts[e]:(vstarts[e+1]-1)]' * JH * Δ[vstarts[e]:(vstarts[e+1]-1)]
 end

BenchmarkTools.Trial: 
  memory estimate:  7.06 MiB
  allocs estimate:  16135
  --------------
  minimum time:     4.644 ms (0.00% GC)
  median time:      5.117 ms (0.00% GC)
  mean time:        6.112 ms (9.60% GC)
  maximum time:     99.624 ms (64.38% GC)
  --------------
  samples:          817
  evals/sample:     1

Conclusion: using Multithreading would significantly boost the performance of parallel solve

In [316]:
nthreads()

4