# Which findmax implementation is the fastest? 

Julia has its own findmax function. However during the testing it seems a bit slow. So this notebook is tries to find the fastest implementation. 

I am using JULIA_NUM_THREADS=16 to see if multithreading can help speeding it up. 

In [2]:
using Base.Threads 

In [3]:
# This function uses purely julia findmax. It does not take advantage of multithreading.
function findmax_v1(lod::AbstractArray{<:Real,2}, dims::Int)
    res = findmax(lod, dims=dims)
    # get the first element, which is the max of the first dimension, and turn it into a column
    max = res[1]
    # get the second element, which is the cartisian index, and only get the first index of the tuple(cartisian index), and turn it into column
    maxidx = getindex.(res[2], 2)
    return hcat(maxidx, max)
end


findmax_v1 (generic function with 1 method)

In [4]:
# This function uses fused approach. Multhreading implemented at top level. 
function findmax_v2(lod::AbstractArray{<:Real,2}, dims::Int)
    max_array = Array{typeof(lod[1,1]),2}(undef, size(lod, dims), 2)
    Threads.@threads for i in 1:size(lod, dims)
        if dims == 1 
            (max, index) = findmax(lod[:, i])
        elseif dims == 2
            (max, index) = findmax(lod[i, :])
        else 
            error("dims must be 1 or 2")
        end
        max_array[i,1] = convert(typeof(lod[1,1]), index)
        max_array[i,2] = max
    end
end

findmax_v2 (generic function with 1 method)

In [5]:
# This function is purely hand written, a good old nested for loop. It only findmax across dim=2
function findmax_v3(lod::AbstractArray{<:Real,2})
    max_array = Array{typeof(lod[1,1]),2}(undef, size(lod)[1], 2)
    Threads.@threads for i in 1:size(lod)[1]
        # for i in 1:size(lod)[1]
        temp = lod[i, 1]
        idx = 1
        for j in 2:size(lod)[2]
            if temp < lod[i,j]
                temp = lod[i,j]
                idx = j
            end
        end
        max_array[i,1] = idx
        max_array[i,2] = temp
    end
    return max_array
end

findmax_v3 (generic function with 1 method)

In [34]:
a = rand(20000, 10000);

In [39]:
@time v1 = findmax_v1(a,2)
@time v2 = findmax_v2(a,2)
@time v3 = findmax_v3(a);

  0.890251 seconds (13 allocations: 937.906 KiB)
  0.179299 seconds (20.20 k allocations: 763.876 MiB)
  0.099519 seconds (196 allocations: 333.609 KiB)


In [37]:
using Distributed
using SharedArrays
# addprocs(16)
res = SharedArray{Float64}(size(a,1), 2)
@everywhere using Distributed   
nprocs()

17

In [40]:
s = time_ns()
@sync @distributed for row = 1:size(a,1)
    res[row,1] = findmax(a[row, :])[2]# Index
    res[row,2] = findmax(a[row, :])[1]# Max
end
e = time_ns()
println("distributed takes $((e-s)*0.000000001) seconds")

distributed takes 0.656867482 seconds


In [28]:
res

10000×2 SharedArray{Float64,2}:
 7096.0  0.999984
  632.0  0.999862
 4020.0  0.99978
 7533.0  0.999974
 8254.0  0.99984
 7085.0  0.999555
 4669.0  0.999912
 2967.0  0.999976
 9637.0  0.999906
 1118.0  0.999717
 4106.0  0.999408
 2865.0  0.999976
 8906.0  0.999803
    ⋮    
 7991.0  0.999848
 2624.0  0.999709
 7006.0  0.999985
 3026.0  0.999982
 5001.0  0.999988
 8674.0  0.999993
 2178.0  0.999991
 5425.0  0.999738
 9324.0  0.999971
 8314.0  0.999955
 7622.0  0.99975
 3042.0  0.999859

In [22]:
nprocs()

1