-
Notifications
You must be signed in to change notification settings - Fork 10
/
dataset_analysis.jl
118 lines (101 loc) · 3.31 KB
/
dataset_analysis.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
using JuLIP: Atoms, energy, cutoff
using JuLIP.Potentials: AbstractZList
function copy_zz_sym!(D::Dict)
_zz = collect(keys(D))
for z12 in _zz
sym12 = chemical_symbol.(z12)
D[sym12] = D[z12]
end
end
"""
`function get_rdf(data::AbstractVector{<: Atoms}, r_cut; kwargs...)` :
Produce a list of r values that occur in the dataset, restricted to the cutoff
radius `r_cut`. Keyword arguments:
* `rescale = true` : resample the data to account for volume scaling, i.e. a distance r will be kept with probability `min(1, (r0/r)^2)`.
* `r0 = :min` : parameter for resampling. If `:min` then the minimum r occuring in the dataset is taken.
* `maxsamples = 100_000` : maximum number of samples to return.
"""
function get_rdf(data::AbstractVector{<: Atoms}, r_cut;
rescale = true,
r0 = :min,
maxsamples = 100_000)
zz = AtomicNumber[]
for at in data
zz = unique(append!(zz, unique(at.Z)))
end
zz_pairs = [ (z0, z) for z0 in zz for z in zz ]
R0 = Dict{Any, Vector{Float64}}([ z12 => Float64[] for z12 in zz_pairs ]...)
R = deepcopy(R0)
for at in data
nlist = JuLIP.neighbourlist(at, r_cut; recompute=true)
for (i, j, rr) in pairs(nlist)
z12 = (at.Z[i], at.Z[j])
push!(R[z12], norm(rr))
end
end
for z12 in zz_pairs
sort!(R[z12])
end
# drop random samples with probability selected to adjust for
# volume scaling.
if rescale
R1 = deepcopy(R0)
for z12 in keys(R)
rr = R[z12]
# choose a minimum r value relative to which we resample.
_r0 = (r0 == :min) ? rr[1] : r0
for r in rr
if rand() < min(1, (_r0/r)^2)
push!(R1[z12], r)
end
end
end
else
R1 = R
end
# sub-select uniformly to get only #maxsamples samples for each
# pair of atomic numbers.
for z12 in keys(R1)
rr = R1[z12]
if length(rr) > maxsamples
Ikeep = floor.(Int, range(1, length(rr), length = maxsamples))
R1[z12] = rr[Ikeep]
end
end
## allow access to the rdf via z or via symbols
copy_zz_sym!(R1)
return R1
end
"""
`function get_adf(data::AbstractVector{<: Atoms}, r_cut; kwargs...)` :
Angular distribution, i.e. list of angles in [0, π] between all pairs of bonds
of length at most `r_cut`. Keyword arguments:
* `skip = 3` : only consider every `skip`th atom in the dataset.
* `maxsamples = 100_000` : maximum number of samples to return.
"""
function get_adf(data::AbstractVector{<: Atoms}, r_cut;
skip = 3,
maxsamples = 100_000)
skip = max(skip, 1)
A = Float64[]
ctr = -1
for at in data
nlist = JuLIP.neighbourlist(at, r_cut; recompute=true)
for i = 1:length(at)
ctr = mod(ctr + 1, skip); if ctr != 0; continue; end
Js, Rs = neigs(nlist, i)
for a1 = 1:length(Js)-1, a2 = a1+1:length(Js)
r̂1 = Rs[a1] / norm(Rs[a1])
r̂2 = Rs[a2] / norm(Rs[a2])
d = min(max(dot(r̂1, r̂2), -1.0), 1.0)
push!(A, acos(d))
end
end
end
sort!(A)
if length(A) > maxsamples
Ikeep = floor.(Int, range(1, length(A), length = maxsamples))
A = A[Ikeep]
end
return A
end