In [1]:
using LinearAlgebra, Statistics, Random, ProgressMeter, ScikitLearn
using ACE: alloc_B, alloc_temp, evaluate!,alloc_dB, alloc_temp_d, evaluate_d!, rpi_basis, rnn
using NeighbourLists: maxneigs
using JuLIP: sites, neighbourlist, cutoff, JVec, AbstractAtoms, fltype, AtomicNumber
using JuLIP.Potentials: neigsz!
using IPFitting.Data: read_xyz
@sk_import linear_model: Ridge
@sk_import linear_model: RidgeCV
using ScikitLearn.CrossValidation: train_test_split

┌ Info: Precompiling IPFitting [3002bd4c-79e4-52ce-b924-91256dde4e52]
└ @ Base loading.jl:1317
┌ Info: Skipping precompilation since __precompile__(false). Importing IPFitting [3002bd4c-79e4-52ce-b924-91256dde4e52].
└ @ Base loading.jl:1025
┌ Info: Precompiling ASE [51974c44-a7ed-5088-b8be-3e78c8ba416c]
└ @ Base loading.jl:1317
┌ Info: Skipping precompilation since __precompile__(false). Importing ASE [51974c44-a7ed-5088-b8be-3e78c8ba416c].
└ @ Base loading.jl:1025


In [2]:
"""
    sum_descriptor(shipB, at)

Evaluate the basis `shipB` on every site of `at` and return the element-wise
sum of the per-site basis vectors (a vector of length `length(shipB)` with
element type `fltype(shipB)`).
"""
function sum_descriptor(shipB, at::AbstractAtoms{T}) where {T}
    nlist = neighbourlist(at, cutoff(shipB))
    nmax = maxneigs(nlist)
    # scratch storage, sized once from the largest neighbourhood and reused
    # for every site
    Bsite = alloc_B(shipB)
    scratch = alloc_temp(shipB, nmax)
    nbrbuf = (R = zeros(JVec{T}, nmax), Z = zeros(AtomicNumber, nmax))
    total = zeros(fltype(shipB), length(shipB))
    for isite = 1:length(at)
        _, Rs, Zs = neigsz!(nbrbuf, nlist, at, isite)
        fill!(Bsite, 0)
        evaluate!(Bsite, scratch, shipB, Rs, Zs, at.Z[isite])
        total .+= Bsite
    end
    return total
end

"""
    sum_descriptor_traj(basis, traj)

Build a `length(traj) × length(basis)` `Float64` matrix whose `k`-th row is
`sum_descriptor(basis, traj[k].at)`. A progress bar is shown while iterating.
"""
function sum_descriptor_traj(basis, traj)
    nstruct, nfeat = length(traj), length(basis)
    A_all = Matrix{Float64}(undef, nstruct, nfeat)
    @showprogress "Computing descriptor for structure " for k in 1:nstruct
        A_all[k, :] = sum_descriptor(basis, traj[k].at)
    end
    return A_all
end;


"""
    sum_d_descriptor(shipB, at)

Allocate the work arrays needed to evaluate the basis `shipB` and its
derivatives on `at`, then hand everything to `sum_d_descriptor_inner!` and
return its result.
"""
function sum_d_descriptor(shipB, at::AbstractAtoms{T}) where {T}
    # the neighbour list comes first: its largest neighbour count fixes the
    # size of every buffer allocated below
    nlist = neighbourlist(at, cutoff(shipB); storelist=false)
    nmax = maxneigs(nlist)
    nbrbuf = (R = zeros(JVec{T}, nmax), Z = zeros(AtomicNumber, nmax))
    Bsite = alloc_B(shipB, nmax)
    dBsite = alloc_dB(shipB, nmax)
    scratch = alloc_temp_d(shipB, nmax)
    # accumulator: one JVec per (atom, basis function) pair
    acc = zeros(JVec{T}, length(at), length(shipB))
    return sum_d_descriptor_inner!(shipB, at, nlist, acc, Bsite, dBsite, scratch, nbrbuf)
end

# Kept as a separate function as a small hack to remove a type instability.
# It probably makes no difference in practice...
#
# Accumulates the site gradients of every basis function into `F` (modified
# in place) and returns one column copy of `F` per basis function.
function sum_d_descriptor_inner!(shipB, at::AbstractAtoms{T},
                       nlist, F, B, dB, tmp, tmpRZ) where {T}
    nbasis = length(shipB)
    for i in 1:length(at)
        nbrs, Rs, Zs = neigsz!(tmpRZ, nlist, at, i)
        # reset the reusable output buffers before evaluating this site
        fill!(B, 0)
        fill!(dB, zero(JVec{T}))
        evaluate_d!(B, dB, tmp, shipB, Rs, Zs, at.Z[i])
        for a in 1:length(Rs)
            g = dB[:, a]
            # the gradient w.r.t. neighbour `a` is subtracted from the
            # neighbour's row and added to the centre atom's row
            F[nbrs[a], :] .-= g
            F[i, :] .+= g
        end
    end
    return map(iB -> F[:, iB], 1:nbasis)
end

"""
    sum_d_descriptor_traj(basis, traj)

For every structure in `traj`, compute `sum_d_descriptor(basis, traj[i].at)`
and flatten it into a matrix with one column per basis function (each column
is the concatenation of that basis function's per-atom vectors).

Returns a `Vector{Matrix{Float64}}` with one matrix per structure. The element
type is fixed to `Float64`, matching `sum_descriptor_traj`; a concretely typed
container avoids handing a `Vector{Any}` to downstream code.
"""
function sum_d_descriptor_traj(basis, traj)
    dA_all = Matrix{Float64}[]
    sizehint!(dA_all, length(traj))
    @showprogress "Computing desc. der. for structure " for i in 1:length(traj)
        dB = sum_d_descriptor(basis, traj[i].at)
        # one flat column per basis function
        cols = [collect(Iterators.flatten(col)) for col in dB]
        # reduce(hcat, ...) avoids splatting one argument per basis function
        push!(dA_all, reduce(hcat, cols))
    end
    return dA_all
end;

"""
    fit_potential(XE_tr, YE_tr, XF_tr, YF_tr, alpha=1.0)

Fit a `Ridge(alpha)` model on the energy rows (`XE_tr`, `YE_tr`) stacked on
top of the force rows. `XF_tr` and `YF_tr` are collections with one entry per
structure (presumably as returned by `extract_info` — confirm against the
caller); their entries are concatenated row-wise before stacking.

Returns the fitted model.
"""
function fit_potential(XE_tr, YE_tr, XF_tr, YF_tr, alpha=1.0)
    # concatenate the per-structure blocks along the row dimension
    force_features = adjoint(hcat(adjoint(XF_tr)...))
    force_targets = adjoint(hcat(adjoint(YF_tr)...))
    design = vcat(XE_tr, force_features)
    targets = vcat(YE_tr, force_targets)
    return fit!(Ridge(alpha), design, targets)
end

"""
    fit_potential_with_contributions(XE_tr, YE_tr, XF_tr, YF_tr,
                                     atomic_energies, nats_tr, alpha=1.0)

Like `fit_potential`, but a per-structure baseline energy is subtracted from
the energy targets before fitting.

# Arguments
- `atomic_energies`: reference energy per species (a scalar or a vector).
- `nats_tr`: atom counts per training structure — either a vector (one count
  per structure, single species) or a matrix with one row per structure and
  one column per species.

The baseline of structure `k` is `sum(nats_tr[k, s] * atomic_energies[s])`.
A `DimensionMismatch` is thrown if the number of count columns differs from
the number of reference energies.

Returns the fitted `Ridge(alpha)` model.
"""
function fit_potential_with_contributions(XE_tr, YE_tr, XF_tr, YF_tr,
                                          atomic_energies, nats_tr, alpha=1.0)
    # One baseline value per structure. The previous `dot(...)` collapsed this
    # to a single scalar, and `vector - scalar` is a MethodError in Julia.
    counts = reshape(nats_tr, size(nats_tr, 1), :)
    baseline = counts * vec(collect(atomic_energies))
    nYE_tr = YE_tr .- baseline
    # concatenate the per-structure force blocks along the row dimension
    YF_tr = hcat(YF_tr'...)'
    XF_tr = hcat(XF_tr'...)'
    X_tr = vcat(XE_tr, XF_tr)
    Y_tr = vcat(nYE_tr, YF_tr)
    ridge_pred = fit!(Ridge(alpha), X_tr, Y_tr)
    return ridge_pred
end

"""
    predict_potential(ridge_pred, XE_tst, XF_tst)

Run `predict` on the energy rows `XE_tst` stacked on top of the row-wise
concatenation of the entries of `XF_tst`, and split the result back into
`(energy_predictions, force_predictions)`: the first `size(XE_tst, 1)` values
and the remaining ones, respectively.
"""
function predict_potential(ridge_pred, XE_tst, XF_tst)
    n_energy_rows = size(XE_tst, 1)
    force_features = adjoint(hcat(adjoint(XF_tst)...))
    prediction = predict(ridge_pred, vcat(XE_tst, force_features))
    energies = prediction[1:n_energy_rows]
    forces = prediction[n_energy_rows+1:end]
    return energies, forces
end


"""
    predict_potential_with_contributions(ridge_pred, XE_tst, XF_tst,
                                         atomic_energies, nats_ts)

Like `predict_potential`, but a per-structure baseline energy is added back to
the predicted energies.

# Arguments
- `atomic_energies`: reference energy per species (a scalar or a vector).
- `nats_ts`: atom counts per test structure — either a vector (one count per
  structure, single species) or a matrix with one row per structure and one
  column per species.

Returns `(energy_predictions, force_predictions)`. A `DimensionMismatch` is
thrown if the counts and reference energies do not line up.
"""
function predict_potential_with_contributions(ridge_pred, XE_tst, XF_tst,
                                              atomic_energies, nats_ts)
    n_struc = size(XE_tst, 1)
    XF_tst = hcat(XF_tst'...)'
    X_tst = vcat(XE_tst, XF_tst)
    result = predict(ridge_pred, X_tst)
    # One baseline value per structure. The previous `dot(...)` produced a
    # single scalar, and `vector + scalar` is a MethodError in Julia.
    counts = reshape(nats_ts, size(nats_ts, 1), :)
    baseline = counts * vec(collect(atomic_energies))
    return result[1:n_struc] .+ baseline, result[n_struc+1:end]
end

"""
    extract_info(B, traj)

Collect everything needed for fitting from the configurations in `traj`:

- `XE`: `sum_descriptor_traj(B, traj)`
- `YE`: first entry of each configuration's `D["E"]`, as `Float64`
- `XF`: `sum_d_descriptor_traj(B, traj)`
- `YF`: each configuration's `D["F"]`, as `Vector{Float64}`
- `nat`: `length` of each configuration, stored as `Float64`

Returns the tuple `(XE, YE, XF, YF, nat)`.
"""
function extract_info(B, traj)
    XE = sum_descriptor_traj(B, traj)
    XF = sum_d_descriptor_traj(B, traj)

    YE = Float64[cfg.D["E"][1] for cfg in traj]
    YF = Vector{Float64}[cfg.D["F"] for cfg in traj]
    nat = Float64[length(cfg) for cfg in traj]

    return XE, YE, XF, YF, nat
end;

In [127]:
# Data loading
all_traj = read_xyz("/home/users/anellia/Desktop/datasets/Cat-s_minima.xyz";
    energy_key="energy", force_key="forces", verbose=false);

siliconset = false
# Data parsing: when enabled, keep only the configurations whose `configtype`
# is one of the listed types
if siliconset
    types = ["bcc","fcc","dia","liq","amorph","bt","st12"]
    traj = Any[cfg for cfg in all_traj if cfg.configtype in types]
end

┌ Info: Keys used: E => "energy", F => "forces", V => "dft_virial"
└ @ IPFitting.Data /home/users/anellia/.julia/packages/IPFitting/Ypo4v/src/data.jl:153
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:04:21[39m


┌─────────────┬───────┬────────┬───────┬────────┬───────┐
│[1m config_type [0m│[1m #cfgs [0m│[1m  #envs [0m│[1m    #E [0m│[1m     #F [0m│[1m    #V [0m│
│[90m      String [0m│[90m Int64 [0m│[90m  Int64 [0m│[90m Int64 [0m│[90m  Int64 [0m│[90m Int64 [0m│
├─────────────┼───────┼────────┼───────┼────────┼───────┤
│     nothing │  1405 │ 302016 │  1405 │ 906048 │     0 │
├─────────────┼───────┼────────┼───────┼────────┼───────┤
│       total │  1405 │ 302016 │  1405 │ 906048 │     0 │
│     missing │     0 │      0 │     0 │      0 │ 12645 │
└─────────────┴───────┴────────┴───────┴────────┴───────┘


In [112]:
# Descriptor parameters (all forwarded to `rpi_basis` below)

N = 4       # Body order
maxdeg = 6        # presumably the maximum degree of the basis (controls its size) — TODO confirm against the rpi_basis docs
rcut = 4.5         # Radial cutoff
species = [:H,:];      # Species. NOTE(review): the second entry is a bare `:` — it looks like a species symbol was lost; confirm the intended list
r0 = 0.2           # passed as `r0`; originally described as "lowest radius for basis" — TODO confirm its exact meaning

# Build the basis (with `rin` set to 0.5 and `constants = false`) and report its size
B = rpi_basis(; species=species, N = N, r0 = r0,
   maxdeg = maxdeg, rcut = rcut,
   rin = 1.0* 0.5,
   constants = false);
println("Feature size :",length(B))

Feature size :213


In [113]:
# Generate descriptors and extract data for every configuration in `all_traj`:
# energy features/targets (XE, YE), force features/targets (XF, YF) and the
# per-configuration sizes (nat)

XE, YE, XF, YF, nat = extract_info(B, all_traj);

[32mComputing descriptor for structure 100%|████████████████| Time: 0:00:12[39m
[32mComputing desc. der. for structure 100%|████████████████| Time: 0:06:34[39m


In [124]:
# Split all five arrays with the same train/test partition
# (test_size=0.1, fixed random_state=10 for reproducibility).
# NOTE(review): the error recorded under this cell refers to
# `fit(ZScoreTransform, ...)`, which this cell does not call — presumably a
# stale output; re-run to confirm.

XE_tr, XE_tst, YE_tr, YE_tst, XF_tr, XF_tst, YF_tr, YF_tst, nat_tr, nat_tst = train_test_split(
    XE, YE, XF, YF, nat, test_size=0.1, random_state=10);

LoadError: MethodError: no method matching fit(::Type{ZScoreTransform}, ::Vector{Any}; dims=1)
[0mClosest candidates are:
[0m  fit([91m::StatisticalModel[39m, ::Any...) at /home/users/anellia/.julia/packages/StatsBase/DU1bT/src/statmodels.jl:178[91m got unsupported keyword argument "dims"[39m
[0m  fit([91m::Type{D}[39m, ::Any) where D<:Distributions.Distribution at /home/users/anellia/.julia/packages/Distributions/bawf4/src/genericfit.jl:33[91m got unsupported keyword argument "dims"[39m
[0m  fit([91m::Type{D}[39m, ::Any...) where D<:Distributions.Distribution at /home/users/anellia/.julia/packages/Distributions/bawf4/src/genericfit.jl:34[91m got unsupported keyword argument "dims"[39m
[0m  ...

In [117]:
# Fit and Predict
reg = 1e-12;         # Ridge regularizer

# Fit on the training split, then predict energies and forces for the test split
rr = fit_potential(XE_tr, YE_tr, XF_tr, YF_tr, reg)
e_hat, f_hat = predict_potential(rr, XE_tst, XF_tst)

# Energy RMSE on per-atom energies (each energy is divided by its entry of nat_tst);
# the factor 1000 matches the meV label, assuming energies are stored in eV — TODO confirm
rmse_e = mean((YE_tst./nat_tst .-  e_hat./nat_tst).^2).^0.5
print("RMSE Energy [meV/atom]: ", 1000*rmse_e)
# Force RMSE over all concatenated reference force entries of the test split
rmse_f = mean((hcat(YF_tst'...)' .-  f_hat).^2).^0.5
print("\nRMSE Forces [eV/A]:     ", rmse_f)

# Optional correlation plots (disabled by default).
# NOTE(review): `scatter`/`plot` are not imported in the visible cells —
# presumably a plotting package must be loaded before enabling this.
plottingall = false
if plottingall
    pe = scatter( YE_tst./nat_tst, e_hat./nat_tst,title="Energies correlation")
    pf = scatter(hcat(YF_tst'...)' ,  f_hat,title="Forces correlation")
    plot(pe, pf, layout=(1,2), legend = false,size = (700, 350))
end

RMSE Energy [meV/atom]: 4.600640345439763
RMSE Forces [eV/A]:     0.0385025063253788