In [1]:
using Distributed
using Base.Threads
@everywhere begin
    using BenchmarkTools
    using ParallelTemperingMonteCarlo
end
#addprocs(nthreads())

  ** incremental compilation may be fatally broken for this module **



In order to use a function of several variables in the pmap environment we require a curry function

In [2]:
@everywhere curry(f,y) = x -> f(x,y)
@everywhere add_xy(x,y) = x + y 


In [3]:
pmap(curry(add_xy,10), 1:5)


5-element Vector{Int64}:
 11
 12
 13
 14
 15

NB spawning processes is expensive and _not recommended at all_ for simple loops and functions.

# Sync macros

sync requires all tasks inside to complete before moving on, async moves right on along without waiting, 

In [4]:
@sync begin
    sleep(2)
    println("slept for two")

    @async begin 
        sleep(5)
        println("nice and rested")
    end
#the async wrapper skips straight to done
    println("done")
end

slept for two
done


nice and rested


We'll simulate a complex process with a 2 second sleep

In [5]:
function simtest(x)
    sleep(2)
    return x
end

simtest (generic function with 1 method)

In [6]:
@time begin 
    veccy = []
    for i = 1:10

        y = simtest(i)
        push!(veccy,y)
        
    end
    println(veccy)
end

@time begin 
    @sync for i = 1:10
        veccy = []
        @async begin 
            y = simtest(i)
            push!(veccy,y)
        end
    end
    println(veccy)
end

Any[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 20.272400 seconds (353.32 k allocations: 20.348 MiB, 1.28% compilation time)


Any[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  2.008591 seconds (4.32 k allocations: 232.928 KiB, 0.54% compilation time)


NB: Asynchronous tasks are not parallel, demonstrated above they do boost performance for large operations. Use @spawn not @async for parallel operations

Let's test a data unsafe operation

In [7]:
function printtest(x::Bool = true)
    println("begin")
    if x == true
        @sync for i=1:10
            @spawn println("$i $i $i $i $i ")
        end
    else
        for i=1:10
            println("$i $i $i $i $i ")
        end
    end
    println("end")
end

printtest (generic function with 2 methods)

In [9]:
@time printtest()
println("done")

@time printtest(false)
println("done")
# @time begin 
#     println("begin")
#     @threads for i=1:10
#         println("$i $i $i $i $i ")
#     end
#     println("end")
# end

begin


1 1 1 1 1 
2 2 2 2 2 
3 3 3 3 3 
4 4 4 4 4 
5 5 5 5 5 
6 6 6 6 6 
7 7 7 7 7 
8 8 8 8 8 
9 9 9 9 9 
10 10 10 10 10 
end
  0.229576 seconds (133.82 k allocations: 7.858 MiB, 99.36% compilation time)
done
begin
1 1 1 1 1 
2 2 2 2 2 
3 3 3 3 3 
4 4 4 4 4 
5 5 5 5 5 
6 6 6 6 6 
7 7 7 7 7 
8 8 8 8 8 
9 9 9 9 9 
10 10 10 10 10 
end


  0.000160 seconds (467 allocations: 13.656 KiB)
done


# NB 
Categorically do not use @spawn where data ordering is relevant. Additionally, it is considered bad practice to parallelise any operation faster than 100 $\mu m$ for threads or 100ms for spawning as it does not increase speed enough to compensate for the time taken to spawn the operation.

Below we test specific functions for this purpose.

## Onwards, time-testing some functions

In [10]:
n_atoms = 13

# temperature grid
ti = 5.
tf = 16.
n_traj = 32

temp = TempGrid{n_traj}(ti,tf) 

# MC simulation details
mc_cycles = 300000 #default 20% equilibration cycles on top
mc_sample = 1  #sample every mc_sample MC cycles

#move_atom=AtomMove(n_atoms) #move strategy (here only atom moves, n_atoms per MC cycle)
displ_atom = 0.1 # Angstrom
n_adjust = 100

max_displ_atom = [0.1*sqrt(displ_atom*temp.t_grid[i]) for i in 1:n_traj]

mc_params = MCParams(mc_cycles, n_traj, n_atoms, mc_sample = mc_sample, n_adjust = n_adjust)

#moves - allowed at present: atom, volume and rotation moves (volume,rotation not yet implemented)
move_strat = MoveStrategy(atom_moves = n_atoms)  

#ensemble
ensemble = NVT(n_atoms)

#ELJpotential for neon
#c1=[-10.5097942564988, 0., 989.725135614556, 0., -101383.865938807, 0., 3918846.12841668, 0., -56234083.4334278, 0., 288738837.441765]
#elj_ne1 = ELJPotential{11}(c1)

c=[-10.5097942564988, 989.725135614556, -101383.865938807, 3918846.12841668, -56234083.4334278, 288738837.441765]
pot = ELJPotentialEven{6}(c)

#starting configurations
#icosahedral ground state of Ne13 (from Cambridge cluster database) in Angstrom
pos_ne13 = [[2.825384495892464, 0.928562467914040, 0.505520149314310],
[2.023342172678102,	-2.136126268595355, 0.666071287554958],
[2.033761811732818,	-0.643989413759464, -2.133000349161121],
[0.979777205108572,	2.312002562803556, -1.671909307631893],
[0.962914279874254,	-0.102326586625353, 2.857083360096907],
[0.317957619634043,	2.646768968413408, 1.412132053672896],
[-2.825388342924982, -0.928563755928189, -0.505520471387560],
[-0.317955944853142, -2.646769840660271, -1.412131825293682],
[-0.979776174195320, -2.312003751825495, 1.671909138648006],
[-0.962916072888105, 0.102326392265998,	-2.857083272537599],
[-2.023340541398004, 2.136128558801072,	-0.666071089291685],
[-2.033762834001679, 0.643989905095452, 2.132999911364582],
[0.000002325340981,	0.000000762100600, 0.000000414930733]]

#convert to Bohr
AtoBohr = 1.8897259886
pos_ne13 = pos_ne13 * AtoBohr

length(pos_ne13) == n_atoms || error("number of atoms and positions not the same - check starting config")

#boundary conditions 
bc_ne13 = SphericalBC(radius=5.32*AtoBohr)   #5.32 Angstrom

#starting configuration
start_config = Config(pos_ne13, bc_ne13)

#histogram information
n_bin = 100
#en_min = -0.006    #might want to update after equilibration run if generated on the fly
#en_max = -0.001    #otherwise will be determined after run as min/max of sampled energies (ham vector)

#construct array of MCState (for each temperature)
mc_states = [MCState(temp.t_grid[i], temp.beta_grid[i], start_config, pot; max_displ=[max_displ_atom[i],0.01,1.]) for i in 1:n_traj]

#results = Output(n_bin, max_displ_vec)
results = Output{Float64}(n_bin; en_min = mc_states[1].en_tot)


Output{Float64}(100, 0.0, 0.0, Float64[], Float64[], Float64[], Vector{Float64}[], Float64[], Float64[], Float64[], Float64[], Float64[])

above we just define the 13 atom system, below we show that threading halves the time taken to complete one mc_step per trajectory.

In [11]:
@benchmark begin 
    for i in 1:mc_params.n_traj
        mc_step!(mc_states[i],pot,ensemble,1,0,0);
    end
end



BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m15.680 μs[22m[39m … [35m 4.818 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 98.05%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m21.117 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m23.495 μs[22m[39m ± [32m56.969 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m3.79% ±  1.69%

  [39m▇[39m█[39m▂[39m [39m [39m [39m▁[39m▁[39m [34m [39m[39m [39m [39m [39m [32m [39m[39m [39m▆[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[39m▆[39m▄[39m▄

In [12]:
@benchmark begin 
    @threads for i in 1:mc_params.n_traj
        mc_step!(mc_states[i],pot,ensemble,1,0,0);
    end
end


BenchmarkTools.Trial: 10000 samples with 3 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 7.436 μs[22m[39m … [35m  8.439 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 99.42%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m12.423 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m26.228 μs[22m[39m ± [32m134.776 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m8.36% ±  1.72%

  [39m█[39m█[34m█[39m[39m▅[39m▄[39m▃[39m▂[39m▂[32m▂[39m[39m▂[39m▁[39m▁[39m▁[39m [39m▁[39m▃[39m▄[39m▅[39m▄[39m▃[39m▂[39m▂[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[34m█[39m[39

In [13]:
@benchmark begin 
    for i in 1:mc_params.n_traj
        x = MCRun.atom_displacement(mc_states[i].config.pos[2],mc_states[i].max_displ[1],mc_states[i].config.bc);
    end
end


BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m11.124 μs[22m[39m … [35m 4.813 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 98.22%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m13.325 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m15.110 μs[22m[39m ± [32m48.199 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m3.13% ±  0.98%

  [39m█[39m█[39m▆[39m▆[39m▅[39m▅[39m▆[34m▆[39m[39m▆[39m▅[39m▅[39m▅[32m▄[39m[39m▃[39m▃[39m▃[39m▂[39m▃[39m▁[39m▂[39m▂[39m▂[39m▃[39m▂[39m▁[39m▂[39m▂[39m▁[39m▂[39m▁[39m▁[39m▁[39m▁[39m [39m▁[39m▁[39m▁[39m▁[39m▁[39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[39m█[39m█[39m█

In [14]:
@benchmark begin 
    @threads for i in 1:mc_params.n_traj
        x = MCRun.atom_displacement(mc_states[i].config.pos[2],mc_states[i].max_displ[1],mc_states[i].config.bc);
    end
end

BenchmarkTools.Trial: 10000 samples with 4 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 7.340 μs[22m[39m … [35m 4.463 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 99.28%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m10.937 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m20.207 μs[22m[39m ± [32m88.047 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m8.10% ±  1.97%

  [39m█[39m█[34m▇[39m[39m▄[39m▃[39m▃[39m▂[32m▂[39m[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m▁[39m▂[39m▃[39m▂[39m▂[39m▂[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[34m█[39m[39m█[39m█

There is a speed-up in calculation for the atom_displacement step, but the greatest increase is to complete the metropolis condition as well, so threading works perfectly for the dimer by simply threading the loops. 

# Time to test the writing step

Let's test printing 32 (traj) iterations of 55(atoms) and time this with the @sync @sync vs without

In [15]:
t1 = @benchmark begin
    filetest = open("test.dat", "w+")
    
    @sync begin
    @async for i=1:32
         @async for j=1:55
            write(filetest,"$i $j \n")
        end
    end
end

close(filetest)
end

BenchmarkTools.Trial: 6012 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m636.238 μs[22m[39m … [35m 10.302 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m685.524 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m817.720 μs[22m[39m ± [32m635.968 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m8.38% ± 9.99%

  [39m█[34m▆[39m[39m▄[32m▃[39m[39m▂[39m▂[39m▁[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[39m

In [16]:
t2 = @benchmark begin
    filetest = open("test2.dat", "w+")
    
    begin
    for i=1:32
        for j=1:55
            write(filetest,"$i $j \n")
        end
    end
end

    close(filetest)
end

BenchmarkTools.Trial: 6217 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m615.488 μs[22m[39m … [35m 11.069 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m673.038 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m789.215 μs[22m[39m ± [32m598.373 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m8.20% ± 9.86%

  [39m█[34m▆[39m[32m▅[39m[39m▄[39m▃[39m▂[39m▂[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[34m█[39m[32m

It's slower to use the sync async functionality, but the atom-invariance within trajectories may make @spawn or @threads more useful

In [17]:
t3 = @benchmark begin
    filetest = open("test3.dat", "w+")
   
   @sync begin

   @async for i=1:32
        @threads for j=1:55
           write(filetest,"$i $j \n")
       end
   end
end

close(filetest)
end

BenchmarkTools.Trial: 1738 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.814 ms[22m[39m … [35m44.202 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 76.58%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.311 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.844 ms[22m[39m ± [32m 2.476 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m4.14% ±  5.27%

  [39m▅[39m█[39m█[39m▇[34m▆[39m[39m▆[39m▅[39m▅[32m▄[39m[39m▄[39m▂[39m▃[39m▂[39m▂[39m▁[39m [39m▁[39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m█[39m█[39m█[39m█[34m█[39m[39m█[39m█

In [18]:
t4 = @benchmark begin
    filetest = open("test4.dat", "w+")
   
   @sync begin

   @async for i=1:32
        for j=1:55
        @spawn write(filetest,"$i $j \n")
       end
   end
end

close(filetest)
end

BenchmarkTools.Trial: 374 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m 5.674 ms[22m[39m … [35m206.947 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 73.95%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m 6.925 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m13.449 ms[22m[39m ± [32m 23.893 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m28.89% ± 14.41%

  [39m█[34m▆[39m[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[34m█[39m[39m█[39

Nope. Writing shouldn't be parallelised. This isn't advantageous. Atom moves on the otherhand benefit from this. Operations, and in particular for-loops for the storage steps are next on the agenda

Testing some Runner functionality

In [20]:
function test_things(x,mc_states)
    file = RuNNer.writeinit(pwd())
    if x == true
        @sync begin
        @threads for mc_state in mc_states
            writeconfig(file,mc_state.config,"Cu")
        end
        end
    elseif x == false
        for mc_state in mc_states
            writeconfig(file,mc_state.config,"Cu")
        end
    elseif x == 1
        for mc_state in mc_states
           @spawn writeconfig(file,mc_state.config,"Cu")
        end
    end
    close(file)
end


test_things (generic function with 1 method)

In [21]:
@benchmark test_things(true,mc_states)

@benchmark test_things(1,mc_states)

@benchmark test_things(false,mc_states)

BenchmarkTools.Trial: 5115 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m421.198 μs[22m[39m … [35m75.135 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 94.18%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m703.488 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m951.503 μs[22m[39m ± [32m 2.796 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m12.12% ±  4.29%

  [39m [39m [39m▁[39m█[39m█[34m▅[39m[39m▃[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▇[39m█[39m█[

As expected, significantly faster to not thread the writing step, what we can try is writing several files. Next the pmap formalism.

In [25]:
 @everywhere curry(f,y1,y2,y3,y4,y5,y6,y7,y8) = x -> f(x,y1,y2,y3,y4,y5,y6,y7,y8)
 

In [29]:
pmap(curry(mc_cycle!,move_strat, mc_params, pot, ensemble, 10, 1,0,0),mc_states)

MethodError: MethodError: no method matching getindex(::MCState{Float64, 13, SphericalBC{Float64}}, ::Int64)

getindex (generic function with 237 methods)