In [3]:
if (typeof(Base.find_package("UnicodePlots")) == Nothing)
    println("Package Unicode not installed")
    using Pkg;
    Pkg.add("UnicodePlots")
end
using UnicodePlots

## Using Packages

In [4]:
using .Threads
using BenchmarkTools
using SparseArrays
using LinearAlgebra

## Including External Files|

In [163]:
include("global_curved_multithreading.jl")

plot_blocks (generic function with 1 method)

## Setting SBP operator Orders

In [6]:
SBPp = 6

6

## Reading .inp files and initializing boundary conditions

In [7]:
bc_map = [BC_DIRICHLET, BC_DIRICHLET, BC_NEUMANN, BC_NEUMANN, BC_JUMP_INTERFACE]
# 1 refers to Dirichlet boundary condition
# 2 refers to Neumann boundary condition
# 7 refers to Jump interface condition

5-element Array{Int64,1}:
 1
 1
 2
 2
 7

In [193]:
(verts, EToV, EToF, FToB, EToDomain) = read_inp_2d("../meshes/8_8_block.inp", bc_map=bc_map)

([0.5 -0.5 … 0.125 0.125; 0.5 0.5 … -0.5 -0.375], [24 4 … 22 16; 73 70 … 65 47; 72 73 … 58 65; 74 74 … 69 69], [1 5 … 122 136; 2 6 … 144 141; 3 7 … 137 93; 4 2 … 143 144], [0, 0, 1, 0, 1, 0, 1, 1, 0, 0  …  0, 0, 0, 0, 0, 1, 0, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [194]:
verts'

81×2 Array{Float64,2}:
  0.5     0.5  
 -0.5     0.5  
 -0.5    -0.5  
  0.5    -0.5  
  0.0     0.5  
 -0.5     0.0  
  0.0    -0.5  
  0.5     0.0  
  0.0     0.0  
  0.25    0.5  
  0.0     0.25 
  0.25    0.0  
  0.5     0.25 
  ⋮            
  0.5    -0.375
  0.375  -0.25 
  0.25   -0.375
  0.375  -0.5  
  0.375  -0.375
  0.25   -0.125
  0.5    -0.125
  0.375  -0.125
  0.125  -0.25 
  0.125  -0.125
  0.125  -0.5  
  0.125  -0.375

In [195]:
EToV

4×64 Array{Int64,2}:
 24   4  23  25  20  25  24   7  12  …  19   9  16  22  20   6  21  22  16
 73  70  71  72  61  78  72  80  35     63  52  65  62  66  68  58  65  47
 72  73  70  71  78  72  80  61  75     57  66  52  65  62  47  68  58  65
 74  74  74  74  81  81  81  81  79     64  67  67  67  67  69  69  69  69

In [196]:
FToB

144-element Array{Int64,1}:
 0
 0
 1
 0
 1
 0
 1
 1
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 1
 0
 1
 0
 0

In [197]:
EToDomain

64-element Array{Int64,1}:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1

In [198]:
(nelems, nfaces) = (size(EToV, 2), size(FToB, 1))
@show (nelems, nfaces)

(nelems, nfaces) = (64, 144)


(64, 144)

In [199]:
# This is needed to fix up points that should be on the boundary of the
# circle, but the mesh didn't quite put them there
for v in 1:size(verts, 2)
    x,y = verts[1,v], verts[2,v]
    if abs(hypot(x,y) - 1) < 1e-5
        Q = atan(y,x)
        verts[1,v], verts[2,v] = cos(Q), sin(Q)
    end
end


In [200]:
plot_connectivity(verts, EToV)

[1m                                        connectivity[22m
[90m      ┌────────────────────────────────────────────────────────────────────────────────┐[39m 
    [90m1[39m[90m │[39m[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[90m│[39m 
     [90m │[39m[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀[0m⠀

## Setting base mesh sizes in each direction

In [201]:
N1 = N0 = 16

16

In [202]:
EToN0 = zeros(Int64, 2, nelems)
EToN0[1,:] .= N0;
@show EToN0
EToN0[2,:] .= N1;
@show EToN0

EToN0 = [16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16; 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
EToN0 = [16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16; 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16]


2×64 Array{Int64,2}:
 16  16  16  16  16  16  16  16  16  …  16  16  16  16  16  16  16  16  16
 16  16  16  16  16  16  16  16  16     16  16  16  16  16  16  16  16  16

In [203]:
# Checking types and sizes 
@assert typeof(EToV) == Array{Int, 2} && size(EToV) == (4, nelems)
@assert typeof(EToF) == Array{Int, 2} && size(EToF) == (4, nelems)
@assert maximum(maximum(EToF)) == nfaces   # The maximum number of EToF element should equal the number of faces 

# Determine secondary arrays
# FToE : Unique Global Face to Element Number
#        (the i'th column of this stores the element numbers that share the
#        global face number i)
# FToLF: Unique Global Face to Element local face number
#        (the i'th column of this stores the element local face numbers that
#        shares the global face number i)
# EToO : Element to Unique Global Faces Orientation
#        (the i'th column of this stores the whether the element and global
#        face are oriented in the same way in physical memory or need to be
#        rotated)
# EToS : Element to Unique Global Face Side
#        (the i'th column of this stores whether an element is on the
#        plus side or minus side of the global face)


In [204]:
# connectivity arrays
(FToE, FToLF, EToO, EToS) = connectivityarrays(EToV, EToF)

([1 1 … 62 63; 7 2 … 63 64], [1 2 … 2 2; 3 4 … 4 4], Bool[1 1 … 1 1; 1 1 … 1 1; 1 1 … 1 1; 1 1 … 1 1], [1 1 … 2 2; 1 1 … 1 2; 1 1 … 2 2; 1 2 … 2 2])

## Forming exact solutions

In [205]:
Lx = maximum(verts[1,:])
@show Lx
Ly = maximum(abs.(verts[2,:]))
@show Ly

Lx = 0.5
Ly = 0.5


0.5

In [206]:
(kx, ky) = (2*π / Lx, 4*π / Ly)

(12.566370614359172, 25.132741228718345)

In [207]:
vex(x,y,e) = begin
    if EToDomain[e] == 1
        return cos.(kx * x) .* cosh.(ky * y)
    elseif EToDomain[e] == 2
        return 10 .+ cos.(kx * x) .* cosh.(ky * y)
    else
        error("invalid block")
    end
end

vex (generic function with 1 method)

In [208]:
vex_x(x,y,e) = begin
  if EToDomain[e] == 1
    return -kx * sin.(kx * x) .* cosh.(ky * y)
  elseif EToDomain[e] == 2
    return -kx * sin.(kx * x) .* cosh.(ky * y)
  else
    error("invalid block")
  end
end

vex_x (generic function with 1 method)

In [209]:
vex_y(x,y,e) = begin
  if EToDomain[e] == 1
    return ky * cos.(kx * x) .* sinh.(ky * y)
  elseif EToDomain[e] == 2
    return ky * cos.(kx * x) .* sinh.(ky * y)
  else
    error("invalid block")
  end
end

vex_y (generic function with 1 method)

In [210]:
vex_xx(x,y,e) = begin
  if EToDomain[e] == 1
    return -kx^2 * cos.(kx * x) .* cosh.(ky * y)
  elseif EToDomain[e] == 2
    return -kx^2 * cos.(kx * x) .* cosh.(ky * y)
  else
    error("invalid block")
  end
end

vex_xx (generic function with 1 method)

In [211]:
vex_xy(x,y,e) = begin
  if EToDomain[e] == 1
    return -kx * ky * sin.(kx * x) .* sinh.(ky * y)
  elseif EToDomain[e] == 2
    return -kx * ky * sin.(kx * x) .* sinh.(ky * y)
  else
    error("invalid block")
  end
end

vex_xy (generic function with 1 method)

In [212]:
vex_yy(x,y,e) = begin
  if EToDomain[e] == 1
    return ky^2 * cos.(kx * x) .* cosh.(ky * y)
  elseif EToDomain[e] == 2
    return ky^2 * cos.(kx * x) .* cosh.(ky * y)
  else
    error("invalid block")
  end
end

vex_yy (generic function with 1 method)

In [213]:
ϵ = zeros(4)

4-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0

In [214]:
lvl = 1

1

In [215]:
# generate base mesh size for each element
Nr = EToN0[1,:] * (2^(lvl - 1))
Ns = EToN0[2,:] * (2^(lvl - 1))

64-element Array{Int64,1}:
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
  ⋮
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16
 16

In [216]:
OPTYPE = typeof(locoperator(2,8,8))

NamedTuple{(:M̃, :F, :coord, :facecoord, :JH, :sJ, :nx, :ny, :Hf, :HfI, :τ, :bctype),Tuple{SparseMatrixCSC{Float64,Int64},NTuple{4,SparseMatrixCSC{Float64,Int64}},Tuple{Array{Float64,2},Array{Float64,2}},Tuple{Tuple{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true}},Tuple{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true}}},SparseMatrixCSC{Float64,Int64},NTuple{4,Array{Float64,1}},NTuple{4,Array{Float64,1}},NTuple{4,A

In [217]:
lop = Dict{Int64, OPTYPE}()

Dict{Int64,NamedTuple{(:M̃, :F, :coord, :facecoord, :JH, :sJ, :nx, :ny, :Hf, :HfI, :τ, :bctype),Tuple{SparseMatrixCSC{Float64,Int64},NTuple{4,SparseMatrixCSC{Float64,Int64}},Tuple{Array{Float64,2},Array{Float64,2}},Tuple{Tuple{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true}},Tuple{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true},SubArray{Float64,1,Array{Float64,2},Tuple{Base.Slice{Base.OneTo{Int64}},Int64},true}}},SparseMatrixCSC{Float64,Int64},NTuple{4,Array{Float64,1}},NTuple{4,Array{Float64,1}}

In [218]:
@benchmark for e = 1:nelems
      # Get the element corners
      (x1, x2, x3, x4) = verts[1, EToV[:, e]]
      (y1, y2, y3, y4) = verts[2, EToV[:, e]]

      # Initialize the block transformations as transfinite between the corners
      ex = [(α) -> x1 * (1 .- α) / 2 + x3 * (1 .+ α) / 2,
            (α) -> x2 * (1 .- α) / 2 + x4 * (1 .+ α) / 2,
            (α) -> x1 * (1 .- α) / 2 + x2 * (1 .+ α) / 2,
            (α) -> x3 * (1 .- α) / 2 + x4 * (1 .+ α) / 2]
      exα = [(α) -> -x1 / 2 + x3 / 2,
             (α) -> -x2 / 2 + x4 / 2,
             (α) -> -x1 / 2 + x2 / 2,
             (α) -> -x3 / 2 + x4 / 2]
      ey = [(α) -> y1 * (1 .- α) / 2 + y3 * (1 .+ α) / 2,
            (α) -> y2 * (1 .- α) / 2 + y4 * (1 .+ α) / 2,
            (α) -> y1 * (1 .- α) / 2 + y2 * (1 .+ α) / 2,
            (α) -> y3 * (1 .- α) / 2 + y4 * (1 .+ α) / 2]
      eyα = [(α) -> -y1 / 2 + y3 / 2,
             (α) -> -y2 / 2 + y4 / 2,
             (α) -> -y1 / 2 + y2 / 2,
             (α) -> -y3 / 2 + y4 / 2]

      # For blocks on the circle, put in the curved edge transform
      if FToB[EToF[1, e]] == BC_JUMP_INTERFACE
        error("curved face 1 not implemented yet")
      end
      if FToB[EToF[2, e]] == BC_JUMP_INTERFACE
        error("curved face 2 not implemented yet")
      end
      if FToB[EToF[3, e]] == BC_JUMP_INTERFACE
        Q1 = atan(y1, x1)
        Q2 = atan(y2, x2)
        if !(-π/2 < Q1 - Q2 < π/2)
          Q2 -= sign(Q2) * 2 * π
        end
        ex[3] = (α) -> cos.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        ey[3] = (α) -> sin.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        β3 = (Q2 - Q1) / 2
        exα[3] = (α) -> -β3 .* sin.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        eyα[3] = (α) -> +β3 .* cos.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
      end
      if FToB[EToF[4, e]] == BC_JUMP_INTERFACE
        Q3 = atan(y3, x3)
        Q4 = atan(y4, x4)
        if !(-π/2 < Q3 - Q4 < π/2)
          error("curved face 4 angle correction not implemented yet")
        end
        ex[4] = (α) -> cos.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        ey[4] = (α) -> sin.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        β4 = (Q4 - Q3) / 2
        exα[4] = (α) -> -β4 .* sin.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        eyα[4] = (α) -> +β4 .* cos.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
      end

      # Create the volume transform as the transfinite blending of the edge
      # transformations
      xt(r,s) = transfinite_blend(ex[1], ex[2], ex[3], ex[4],
                                  exα[1], exα[2], exα[3], exα[4],
                                  r, s)
      yt(r,s) = transfinite_blend(ey[1], ey[2], ey[3], ey[4],
                                  eyα[1], eyα[2], eyα[3], eyα[4],
                                  r, s)


      metrics = create_metrics(SBPp, Nr[e], Ns[e], xt, yt)

      # Build local operators
      lop[e] = locoperator(SBPp, Nr[e], Ns[e], metrics, FToB[EToF[:, e]])
    end

BenchmarkTools.Trial: 
  memory estimate:  466.80 MiB
  allocs estimate:  904641
  --------------
  minimum time:     166.211 ms (12.78% GC)
  median time:      173.839 ms (13.23% GC)
  mean time:        186.241 ms (16.32% GC)
  maximum time:     400.632 ms (54.31% GC)
  --------------
  samples:          27
  evals/sample:     1

In [219]:
@benchmark @threads for e = 1:nelems
      # Get the element corners
      (x1, x2, x3, x4) = verts[1, EToV[:, e]]
      (y1, y2, y3, y4) = verts[2, EToV[:, e]]

      # Initialize the block transformations as transfinite between the corners
      ex = [(α) -> x1 * (1 .- α) / 2 + x3 * (1 .+ α) / 2,
            (α) -> x2 * (1 .- α) / 2 + x4 * (1 .+ α) / 2,
            (α) -> x1 * (1 .- α) / 2 + x2 * (1 .+ α) / 2,
            (α) -> x3 * (1 .- α) / 2 + x4 * (1 .+ α) / 2]
      exα = [(α) -> -x1 / 2 + x3 / 2,
             (α) -> -x2 / 2 + x4 / 2,
             (α) -> -x1 / 2 + x2 / 2,
             (α) -> -x3 / 2 + x4 / 2]
      ey = [(α) -> y1 * (1 .- α) / 2 + y3 * (1 .+ α) / 2,
            (α) -> y2 * (1 .- α) / 2 + y4 * (1 .+ α) / 2,
            (α) -> y1 * (1 .- α) / 2 + y2 * (1 .+ α) / 2,
            (α) -> y3 * (1 .- α) / 2 + y4 * (1 .+ α) / 2]
      eyα = [(α) -> -y1 / 2 + y3 / 2,
             (α) -> -y2 / 2 + y4 / 2,
             (α) -> -y1 / 2 + y2 / 2,
             (α) -> -y3 / 2 + y4 / 2]

      # For blocks on the circle, put in the curved edge transform
      if FToB[EToF[1, e]] == BC_JUMP_INTERFACE
        error("curved face 1 not implemented yet")
      end
      if FToB[EToF[2, e]] == BC_JUMP_INTERFACE
        error("curved face 2 not implemented yet")
      end
      if FToB[EToF[3, e]] == BC_JUMP_INTERFACE
        Q1 = atan(y1, x1)
        Q2 = atan(y2, x2)
        if !(-π/2 < Q1 - Q2 < π/2)
          Q2 -= sign(Q2) * 2 * π
        end
        ex[3] = (α) -> cos.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        ey[3] = (α) -> sin.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        β3 = (Q2 - Q1) / 2
        exα[3] = (α) -> -β3 .* sin.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
        eyα[3] = (α) -> +β3 .* cos.(Q1 * (1 .- α) / 2 + Q2 * (1 .+ α) / 2)
      end
      if FToB[EToF[4, e]] == BC_JUMP_INTERFACE
        Q3 = atan(y3, x3)
        Q4 = atan(y4, x4)
        if !(-π/2 < Q3 - Q4 < π/2)
          error("curved face 4 angle correction not implemented yet")
        end
        ex[4] = (α) -> cos.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        ey[4] = (α) -> sin.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        β4 = (Q4 - Q3) / 2
        exα[4] = (α) -> -β4 .* sin.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
        eyα[4] = (α) -> +β4 .* cos.(Q3 * (1 .- α) / 2 + Q4 * (1 .+ α) / 2)
      end

      # Create the volume transform as the transfinite blending of the edge
      # transformations
      xt(r,s) = transfinite_blend(ex[1], ex[2], ex[3], ex[4],
                                  exα[1], exα[2], exα[3], exα[4],
                                  r, s)
      yt(r,s) = transfinite_blend(ey[1], ey[2], ey[3], ey[4],
                                  eyα[1], eyα[2], eyα[3], eyα[4],
                                  r, s)


      metrics = create_metrics(SBPp, Nr[e], Ns[e], xt, yt)

      # Build local operators
      lop[e] = locoperator(SBPp, Nr[e], Ns[e], metrics, FToB[EToF[:, e]])
    end

BenchmarkTools.Trial: 
  memory estimate:  466.80 MiB
  allocs estimate:  904609
  --------------
  minimum time:     71.012 ms (22.80% GC)
  median time:      92.369 ms (39.01% GC)
  mean time:        92.782 ms (37.61% GC)
  maximum time:     118.904 ms (49.15% GC)
  --------------
  samples:          54
  evals/sample:     1

Conclusing 1: Using @threads macro could improve performance for building local operators

In [220]:
(M, FbarT, D, vstarts, FToλstarts) = threaded_LocalGlobalOperators(lop, Nr, Ns, FToB, FToE, FToLF, EToO, EToS, (x) -> cholesky(Symmetric(x)))

(SBPLocalOperator1{Float64,SuiteSparse.CHOLMOD.Factor{Float64}}([1, 290, 579, 868, 1157, 1446, 1735, 2024, 2313, 2602  …  15896, 16185, 16474, 16763, 17052, 17341, 17630, 17919, 18208, 18497], [6.092762292984909e-6, 2.6812350144929195e-5, 1.2101603470057945e-5, 2.3921981924028226e-5, 1.7581027394623096e-5, 1.9552280840723275e-5, 1.9284001103153935e-5, 1.9284001103153935e-5, 1.9284001103153935e-5, 1.9284001103153935e-5  …  1.9284001103153935e-5, 1.9284001103153935e-5, 1.9284001103153935e-5, 1.9284001103153935e-5, 1.9552280840723275e-5, 1.7581027394623096e-5, 2.3921981924028226e-5, 1.2101603470057945e-5, 2.6812350144929195e-5, 6.092762292984909e-6], [0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125  …  -0.3046875, -0.3125, -0.3203125, -0.328125, -0.3359375, -0.34375, -0.3515625, -0.359375, -0.3671875, -0.375], [-0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5  …  -0.125, -0.125, -0.125, -0.125, -0.125, -0.125, -0.125, -0.125, -0.1

In [221]:
@show lvl

lvl = 1


1

In [222]:
M

SBPLocalOperator1{Float64,SuiteSparse.CHOLMOD.Factor{Float64}}([1, 290, 579, 868, 1157, 1446, 1735, 2024, 2313, 2602  …  15896, 16185, 16474, 16763, 17052, 17341, 17630, 17919, 18208, 18497], [6.092762292984909e-6, 2.6812350144929195e-5, 1.2101603470057945e-5, 2.3921981924028226e-5, 1.7581027394623096e-5, 1.9552280840723275e-5, 1.9284001103153935e-5, 1.9284001103153935e-5, 1.9284001103153935e-5, 1.9284001103153935e-5  …  1.9284001103153935e-5, 1.9284001103153935e-5, 1.9284001103153935e-5, 1.9284001103153935e-5, 1.9552280840723275e-5, 1.7581027394623096e-5, 2.3921981924028226e-5, 1.2101603470057945e-5, 2.6812350144929195e-5, 6.092762292984909e-6], [0.25, 0.2578125, 0.265625, 0.2734375, 0.28125, 0.2890625, 0.296875, 0.3046875, 0.3125, 0.3203125  …  -0.3046875, -0.3125, -0.3203125, -0.328125, -0.3359375, -0.34375, -0.3515625, -0.359375, -0.3671875, -0.375], [-0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5, -0.5  …  -0.125, -0.125, -0.125, -0.125, -0.125, -0.125, -0.125, -0.125, -0.12

In [223]:
locfactors = M.F

64-element Array{SuiteSparse.CHOLMOD.Factor{Float64},1}:
 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     15033
success: true

 SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
metho

In [224]:
lop[1]

(M̃ = 
  [1  ,   1]  =  11.5809
  [2  ,   1]  =  0.801039
  [3  ,   1]  =  -0.820124
  [4  ,   1]  =  0.337643
  [5  ,   1]  =  -0.0274042
  [6  ,   1]  =  -0.0128202
  [7  ,   1]  =  -1.09617e-18
  [9  ,   1]  =  3.42553e-20
  [18 ,   1]  =  0.801039
  [35 ,   1]  =  -0.820124
  [52 ,   1]  =  0.337643
  [69 ,   1]  =  -0.0274042
  ⋮
  [204, 289]  =  -0.0128202
  [221, 289]  =  -0.0274042
  [238, 289]  =  0.337643
  [255, 289]  =  -0.820124
  [272, 289]  =  0.801039
  [281, 289]  =  3.42553e-20
  [283, 289]  =  -1.09617e-18
  [284, 289]  =  -0.0128202
  [285, 289]  =  -0.0274042
  [286, 289]  =  0.337643
  [287, 289]  =  -0.820124
  [288, 289]  =  0.801039
  [289, 289]  =  11.5809, F = (
  [1  ,  1]  =  -6.06879
  [2  ,  1]  =  -1.2638
  [3  ,  1]  =  0.947847
  [4  ,  1]  =  -0.421265
  [5  ,  1]  =  0.0789873
  [18 ,  2]  =  -26.7069
  [19 ,  2]  =  -5.56157
  [20 ,  2]  =  4.17118
  [21 ,  2]  =  -1.85386
  [22 ,  2]  =  0.347598
  [35 ,  3]  =  -12.054
  [36 ,  3]  =  -2.51019
  ⋮

In [225]:
FToδstarts = bcstarts(FToB, FToE, FToLF, BC_JUMP_INTERFACE, Nr, Ns);

In [226]:
VNp = vstarts[nelems+1] - 1
λNp = FToλstarts[nfaces+1] - 1
δNp = FToδstarts[nfaces+1] - 1

0

In [227]:
vstarts

65-element Array{Int64,1}:
     1
   290
   579
   868
  1157
  1446
  1735
  2024
  2313
  2602
  2891
  3180
  3469
     ⋮
 15318
 15607
 15896
 16185
 16474
 16763
 17052
 17341
 17630
 17919
 18208
 18497

In [228]:
@benchmark B = assembleλmatrix(FToλstarts,vstarts,EToF,FToB,locfactors,D,FbarT)

BenchmarkTools.Trial: 
  memory estimate:  276.49 MiB
  allocs estimate:  2234993
  --------------
  minimum time:     243.223 ms (4.06% GC)
  median time:      253.429 ms (5.52% GC)
  mean time:        253.853 ms (5.30% GC)
  maximum time:     271.803 ms (5.01% GC)
  --------------
  samples:          20
  evals/sample:     1

In [229]:
@benchmark B = threaded_assembleλmatrix(FToλstarts,vstarts,EToF,FToB,locfactors,D,FbarT)

BenchmarkTools.Trial: 
  memory estimate:  276.49 MiB
  allocs estimate:  2234993
  --------------
  minimum time:     241.096 ms (4.34% GC)
  median time:      252.090 ms (5.49% GC)
  mean time:        253.176 ms (5.35% GC)
  maximum time:     269.086 ms (5.44% GC)
  --------------
  samples:          20
  evals/sample:     1

 assembleλmatrix can not be parallelized easily with @threads despite it has for loop over elements

In [230]:
B

408×408 SparseMatrixCSC{Float64,Int64} with 36992 stored entries:
  [1  ,   1]  =  6.95753
  [2  ,   1]  =  0.808076
  [3  ,   1]  =  -0.76754
  [4  ,   1]  =  0.245415
  [5  ,   1]  =  0.0275435
  [6  ,   1]  =  -0.022875
  [7  ,   1]  =  -0.000761631
  [8  ,   1]  =  -1.48435e-5
  [9  ,   1]  =  -1.12919e-5
  [10 ,   1]  =  -2.60621e-6
  [11 ,   1]  =  -8.78924e-7
  [12 ,   1]  =  -3.7289e-7
  ⋮
  [396, 408]  =  -1.46664e-7
  [397, 408]  =  -3.7289e-7
  [398, 408]  =  -8.78924e-7
  [399, 408]  =  -2.60621e-6
  [400, 408]  =  -1.12919e-5
  [401, 408]  =  -1.48435e-5
  [402, 408]  =  -0.000761631
  [403, 408]  =  -0.022875
  [404, 408]  =  0.0275435
  [405, 408]  =  0.245415
  [406, 408]  =  -0.76754
  [407, 408]  =  0.808076
  [408, 408]  =  6.95753

In [231]:
BF = cholesky(Symmetric(B))

SuiteSparse.CHOLMOD.Factor{Float64}
type:    LLt
method:  supernodal
maxnnz:  0
nnz:     30549
success: true


In [232]:
@doc Symmetric

```
Symmetric(A, uplo=:U)
```

Construct a `Symmetric` view of the upper (if `uplo = :U`) or lower (if `uplo = :L`) triangle of the matrix `A`.

# Examples

```jldoctest
julia> A = [1 0 2 0 3; 0 4 0 5 0; 6 0 7 0 8; 0 9 0 1 0; 2 0 3 0 4]
5×5 Array{Int64,2}:
 1  0  2  0  3
 0  4  0  5  0
 6  0  7  0  8
 0  9  0  1  0
 2  0  3  0  4

julia> Supper = Symmetric(A)
5×5 Symmetric{Int64,Array{Int64,2}}:
 1  0  2  0  3
 0  4  0  5  0
 2  0  7  0  8
 0  5  0  1  0
 3  0  8  0  4

julia> Slower = Symmetric(A, :L)
5×5 Symmetric{Int64,Array{Int64,2}}:
 1  0  6  0  2
 0  4  0  9  0
 6  0  7  0  3
 0  9  0  1  0
 2  0  3  0  4
```

Note that `Supper` will not be equal to `Slower` unless `A` is itself symmetric (e.g. if `A == transpose(A)`).


In [233]:
(bλ,λ,gδ) = (zeros(λNp), zeros(λNp), zeros(λNp)) 

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [234]:
(Δ,u,g) = (zeros(VNp), zeros(VNp), zeros(VNp))

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [235]:
δ = zeros(δNp)

0-element Array{Float64,1}

In [236]:
for f = 1:nfaces
    if FToB[f] == BC_JUMP_INTERFACE
        (e1, e2) = FToE[:,f]
        (lf1, lf2) = FToLF[:,f]
        (xf, yf) = lop[e1].facecoord
        @views δ[FToδstarts[f]:(FToδstarts[f+1]-1)] = 
            vex(xf[lf1],yf[lf1],e2) - vex(xf[lf1],yf[lf1],e1)
    end
end

In [237]:
bc_Dirichlet = (lf, x, y, e, δ) -> vex(x,y,e)
bc_Neumann   = (lf, x, y, nx, ny, e, δ) -> (nx .* vex_x(x,y,e) + ny .* vex(x,y,e))

#735 (generic function with 1 method)

In [238]:
in_jump      = (lf, x, y, e, δ) -> begin
    f = EToF[lf, e]
    if EToS[lf, e] == 1
        if EToO[lf, e]
            return -δ[FToδstarts[f]:(FToδstarts[f+1]-1)]
        else
            error("shouldn't get here")
        end
    else
        if EToO[lf,e]
            return δ[FToδstarts[f]:(FToδstarts[f+1]-1)]
        else
            return δ[(FToδstarts[f+1]-1):-1:FToδstarts[f]]
        end
    end
end
    

#737 (generic function with 1 method)

In [239]:
@benchmark for e = 1:nelems
      gδe = ntuple(4) do lf
        f = EToF[lf, e]
        if EToO[lf, e]
          return @view gδ[FToλstarts[f]:(FToλstarts[f+1]-1)]
        else
          return  @view gδ[(FToλstarts[f+1]-1):-1:FToλstarts[f]]
        end
      end
      locbcarray!((@view g[vstarts[e]:vstarts[e+1]-1]), gδe, lop[e],
                  FToB[EToF[:,e]], bc_Dirichlet, bc_Neumann, in_jump, (e, δ))

      source = (x, y, e) -> (-vex_xx(x, y, e)  - vex_yy(x, y, e))
      locsourcearray!((@view g[vstarts[e]:vstarts[e+1]-1]), source, lop[e], e)
end

BenchmarkTools.Trial: 
  memory estimate:  2.93 MiB
  allocs estimate:  5006
  --------------
  minimum time:     1.469 ms (0.00% GC)
  median time:      1.513 ms (0.00% GC)
  mean time:        1.718 ms (9.11% GC)
  maximum time:     6.597 ms (69.18% GC)
  --------------
  samples:          2906
  evals/sample:     1

In [240]:
@benchmark @threads for e = 1:nelems
      gδe = ntuple(4) do lf
        f = EToF[lf, e]
        if EToO[lf, e]
          return @view gδ[FToλstarts[f]:(FToλstarts[f+1]-1)]
        else
          return  @view gδ[(FToλstarts[f+1]-1):-1:FToλstarts[f]]
        end
      end
      locbcarray!((@view g[vstarts[e]:vstarts[e+1]-1]), gδe, lop[e],
                  FToB[EToF[:,e]], bc_Dirichlet, bc_Neumann, in_jump, (e, δ))

      source = (x, y, e) -> (-vex_xx(x, y, e)  - vex_yy(x, y, e))
      locsourcearray!((@view g[vstarts[e]:vstarts[e+1]-1]), source, lop[e], e)
end

BenchmarkTools.Trial: 
  memory estimate:  2.93 MiB
  allocs estimate:  4908
  --------------
  minimum time:     390.000 μs (0.00% GC)
  median time:      528.099 μs (0.00% GC)
  mean time:        759.889 μs (28.80% GC)
  maximum time:     42.023 ms (98.32% GC)
  --------------
  samples:          6557
  evals/sample:     1

In [241]:
LocalToGLobalRHS!(bλ, g, gδ,  u, locfactors, FbarT, vstarts)

1904-element Array{Float64,1}:
      -1.0384926660911115e6 
 -437321.36751612386        
   92600.20403835559        
  -95970.03956722301        
   -8583.310842436855       
   -9191.509798786115       
   -3940.3542585642995      
    -773.1793794357795      
     851.7504226176612      
    1910.2509426550675      
    2302.2409371227036      
    2352.1533158622096      
    1939.9463408069475      
       ⋮                    
       1.9982866587130674   
       1.6995353199901202   
       1.2434974889861212   
       0.6566410278266643   
       0.894602293323984    
       1.0233744922016184   
       1.0266407345194069   
       0.8248944531135468   
       0.9057309638832659   
       0.32004879785094087  
       0.36420679490708957  
      -1.6860322099368314e-5

In [242]:
@benchmark LocalToGLobalRHS!(bλ, g, gδ,  u, locfactors, FbarT, vstarts)

BenchmarkTools.Trial: 
  memory estimate:  649.50 KiB
  allocs estimate:  768
  --------------
  minimum time:     1.570 ms (0.00% GC)
  median time:      1.698 ms (0.00% GC)
  mean time:        1.771 ms (0.96% GC)
  maximum time:     8.322 ms (31.00% GC)
  --------------
  samples:          2811
  evals/sample:     1

In [243]:
@benchmark threaded_LocalToGLobalRHS!(bλ, g, gδ, u, locfactors, FbarT, vstarts)

BenchmarkTools.Trial: 
  memory estimate:  652.98 KiB
  allocs estimate:  798
  --------------
  minimum time:     1.065 ms (0.00% GC)
  median time:      1.125 ms (0.00% GC)
  mean time:        1.196 ms (1.33% GC)
  maximum time:     105.080 ms (34.27% GC)
  --------------
  samples:          4157
  evals/sample:     1

Conclusion: threaded_LocalToGLobalRHS! would help but not too much

In [244]:
@show bλ
@show g
@show gδ
@show u
@show locfactors # M
@show FbarT 
@show vstarts

bλ = [-1.0384926660911115e6, -437321.36751612386, 92600.20403835559, -95970.03956722301, -8583.310842436855, -9191.509798786115, -3940.3542585642995, -773.1793794357795, 851.7504226176612, 1910.2509426550675, 2302.2409371227036, 2352.1533158622096, 1939.9463408069475, 2238.642758883456, 871.1601099617116, 1169.1697182357598, 19.292570996050152, 0.001669667594449309, 14.145496089902565, 11.880498814891638, 30.996397888208328, 25.094566418091638, 26.578669388677312, 21.25901928712366, 13.396691443789342, 4.812158595151071, 6.3250217901819195, 8.171060089502822, 9.198312099894919, 8.129900526337192, 9.608842451079715, 3.5782981934260008, 4.19322838228049, 0.00032292720868428684, 19.262385188499035, 841.4175810594948, 558.2114733458033, 1263.673604798965, 960.5574428376549, 1015.1450475296508, 865.6231782471215, 637.1306033784753, 339.9453957672934, 460.5161417762606, 522.476778258924, 520.0402538520642, 414.909920352992, 452.89234277755935, 159.32053559966812, 180.82980607121112, -0.01030


g = [-869703.110364473, -3.808863185715654e6, -1.6942349888694114e6, -3.2676746378652183e6, -2.3185495688859234e6, -2.4614103898557606e6, -2.288760333013442e6, -2.1278416390491016e6, -1.9464306669041584e6, -1.7462745034096926e6, -1.529300761836642e6, -1.3156512635228843e6, -960374.6764668435, -991238.2645042374, -337004.29253346386, -375140.2803565735, -1.5976186954745234e-10, -179701.12016370843, -787000.7280261117, -350068.8012864421, -675178.4439506438, -479066.8789367405, -508585.2849880057, -472911.7220891784, -439662.13471153384, -402178.3606328464, -360821.3890882093, -315989.5103216172, -271844.4984067962, -198435.9985393478, -204813.1418155937, -69633.01400964541, -77512.80614640158, -3.301056024175191e-11, 136453.24247066924, 597596.7265429876, 265819.283596221, 512686.2194260965, 363771.96164202684, 386186.30282471696, 359098.13930302166, 333850.58378217573, 305387.8646382142, 273984.09341579844, 239941.70559884762, 206420.879411476, 150679.28012319302, 155521.66439409263, 

Excessive output truncated after 812256 bytes.


gδ = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

65-element Array{Int64,1}:
     1
   290
   579
   868
  1157
  1446
  1735
  2024
  2313
  2602
  2891
  3180
  3469
     ⋮
 15318
 15607
 15896
 16185
 16474
 16763
 17052
 17341
 17630
 17919
 18208
 18497

In [245]:
for e=1:length(locfactors)
    @views u[vstarts[e]:(vstarts[e+1]-1)] = (locfactors[e] \ g[vstarts[e]:(vstarts[e+1]-1)])
end

In [246]:
mul!(bλ,FbarT,u)

1904-element Array{Float64,1}:
      1.0384926660911115e6 
 437321.36751612386        
 -92600.20403835559        
  95970.03956722301        
   8583.310842436855       
   9191.509798786115       
   3940.3542585642995      
    773.1793794357795      
   -851.7504226176612      
  -1910.2509426550675      
  -2302.2409371227036      
  -2352.1533158622096      
  -1939.9463408069475      
      ⋮                    
     -1.9982866587130674   
     -1.6995353199901202   
     -1.2434974889861212   
     -0.6566410278266643   
     -0.894602293323984    
     -1.0233744922016184   
     -1.0266407345194069   
     -0.8248944531135468   
     -0.9057309638832659   
     -0.32004879785094087  
     -0.36420679490708957  
      1.6860322099368314e-5

In [247]:
@show typeof(bλ)
@show typeof(FbarT)
@show typeof(u)

typeof(bλ) = Array{Float64,1}
typeof(FbarT) = SparseMatrixCSC{Float64,Int64}
typeof(u) = Array{Float64,1}


Array{Float64,1}

In [248]:
@doc mul!

```
mul!(Y, A, B) -> Y
```

Calculates the matrix-matrix or matrix-vector product $AB$ and stores the result in `Y`, overwriting the existing value of `Y`. Note that `Y` must not be aliased with either `A` or `B`.

# Examples

```jldoctest
julia> A=[1.0 2.0; 3.0 4.0]; B=[1.0 1.0; 1.0 1.0]; Y = similar(B); mul!(Y, A, B);

julia> Y
2×2 Array{Float64,2}:
 3.0  3.0
 7.0  7.0
```

# Implementation

For custom matrix and vector types, it is recommended to implement 5-argument `mul!` rather than implementing 3-argument `mul!` directly if possible.

```
mul!(C, A, B, α, β) -> C
```

Combined inplace matrix-matrix or matrix-vector multiply-add $A B α + C β$. The result is stored in `C` by overwriting it.  Note that `C` must not be aliased with either `A` or `B`.

!!! compat "Julia 1.3"
    Five-argument `mul!` requires at least Julia 1.3.


# Examples

```jldoctest
julia> A=[1.0 2.0; 3.0 4.0]; B=[1.0 1.0; 1.0 1.0]; C=[1.0 2.0; 3.0 4.0];

julia> mul!(C, A, B, 100.0, 10.0) === C
true

julia> C
2×2 Array{Float64,2}:
 310.0  320.0
 730.0  740.0
```


In [249]:
@benchmark λ[:] = BF \ bλ

DimensionMismatch: DimensionMismatch("LHS and RHS should have the same number of rows. LHS has 408 rows, but RHS has 1904 rows.")

In [250]:
u[:] = -FbarT' * λ

18496-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮  
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [251]:
u[:] .= g .+ u

18496-element view(::Array{Float64,1}, :) with eltype Float64:
 -869703.110364473          
      -3.808863185715654e6  
      -1.6942349888694114e6 
      -3.2676746378652183e6 
      -2.3185495688859234e6 
      -2.4614103898557606e6 
      -2.288760333013442e6  
      -2.1278416390491016e6 
      -1.9464306669041584e6 
      -1.7462745034096926e6 
      -1.529300761836642e6  
      -1.3156512635228843e6 
 -960374.6764668435         
       ⋮                    
       0.09469451967594071  
       0.08805238707095672  
       0.08186157935574032  
       0.07488239988123685  
       0.06718206196099342  
       0.05883472405861472  
       0.050615275280301686 
       0.03694719867591601  
       0.038134571840798284 
       0.012965111279983118 
       0.014432265666005075 
       6.146302770832472e-18

In [252]:
@benchmark for e = 1:nelems
      F = locfactors[e]
      (x, y) = lop[e].coord
      JH = lop[e].JH
      start = time()
      @views u[vstarts[e]:(vstarts[e+1]-1)] = F \ u[vstarts[e]:(vstarts[e+1]-1)]
      #=
      ldiv!((@view u[vstarts[e]:(vstarts[e+1]-1)]), F,
            (@view u[vstarts[e]:(vstarts[e+1]-1)]))
      =#

      @views Δ[vstarts[e]:(vstarts[e+1]-1)] = (u[vstarts[e]:(vstarts[e+1]-1)] -
                                               vex(x[:], y[:], e))
      ϵ[lvl] += Δ[vstarts[e]:(vstarts[e+1]-1)]' * JH * Δ[vstarts[e]:(vstarts[e+1]-1)]
 end
   

BenchmarkTools.Trial: 
  memory estimate:  1.77 MiB
  allocs estimate:  4073
  --------------
  minimum time:     2.484 ms (0.00% GC)
  median time:      2.617 ms (0.00% GC)
  mean time:        2.772 ms (2.96% GC)
  maximum time:     7.773 ms (48.75% GC)
  --------------
  samples:          1801
  evals/sample:     1

In [253]:
@benchmark @threads for e = 1:nelems
      F = locfactors[e]
      (x, y) = lop[e].coord
      JH = lop[e].JH
      start = time()
      @views u[vstarts[e]:(vstarts[e+1]-1)] = F \ u[vstarts[e]:(vstarts[e+1]-1)]
      #=
      ldiv!((@view u[vstarts[e]:(vstarts[e+1]-1)]), F,
            (@view u[vstarts[e]:(vstarts[e+1]-1)]))
      =#

      @views Δ[vstarts[e]:(vstarts[e+1]-1)] = (u[vstarts[e]:(vstarts[e+1]-1)] -
                                               vex(x[:], y[:], e))
      ϵ[lvl] += Δ[vstarts[e]:(vstarts[e+1]-1)]' * JH * Δ[vstarts[e]:(vstarts[e+1]-1)]
 end

BenchmarkTools.Trial: 
  memory estimate:  1.77 MiB
  allocs estimate:  4039
  --------------
  minimum time:     1.106 ms (0.00% GC)
  median time:      1.229 ms (0.00% GC)
  mean time:        1.403 ms (7.23% GC)
  maximum time:     40.355 ms (62.02% GC)
  --------------
  samples:          3552
  evals/sample:     1

Conclusion: using Multithreading would significantly boost the performance of parallel solve