forked from Jo0o0Hyung/Dual-Attention-for-VAD
-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_noise.py
130 lines (109 loc) · 3.79 KB
/
add_noise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import numpy as np
from numba import autojit
"""
https://github.com/jfsantos/maracas/blob/master/maracas/maracas.py
"""
@autojit
def asl_meter(x, fs, nbits=16):
'''Measure the Active Speech Level (ASL) of x following ITU-T P.56.
If x is integer, it will be scaled to (-1, 1) according to nbits.
'''
if np.issubdtype(x.dtype, np.integer):
x = x / 2**(nbits-1)
# Constants
MIN_LOG_OFFSET = 1e-20
T = 0.03 # Time constant of smoothing in seconds
g = np.exp(-1/(T*fs))
H = 0.20 # Time of handover in seconds
I = int(np.ceil(H*fs))
M = 15.9 # Margin between threshold and ASL in dB
a = np.zeros(nbits-1) # Activity count
c = 0.5**np.arange(nbits-1, 0, step=-1) # Threshold level
h = np.ones(nbits)*I # Hangover count
s = 0
sq = 0
p = 0
q = 0
asl = -100
L = len(x)
s = sum(abs(x))
sq = sum(x**2)
dclevel = s/np.arange(1, L+1)
lond_term_level = 10*np.log10(sq/np.arange(1, L+1) + MIN_LOG_OFFSET)
c_dB = 20*np.log10(c)
for i in range(L):
p = g * p + (1-g) * abs(x[i])
q = g * q + (1-g) * p
for j in range(nbits-1):
if q >= c[j]:
a[j] += 1
h[j] = 0
elif h[j] < I:
a[j] += 1;
h[j] += 1
a_dB = -100 * np.ones(nbits-1)
for i in range(nbits-1):
if a[i] != 0:
a_dB[i] = 10*np.log10(sq/a[i])
delta = a_dB - c_dB
idx = np.where(delta <= M)[0]
if len(idx) != 0:
idx = idx[0]
if idx > 1:
asl = bin_interp(a_dB[idx], a_dB[idx-1], c_dB[idx], c_dB[idx-1], M)
else:
asl = a_dB[idx]
return asl
def rms_energy(x):
return 10*np.log10((1e-12 + x.dot(x))/len(x))
@autojit
def bin_interp(upcount, lwcount, upthr, lwthr, margin, tol=0.1):
n_iter = 1
if abs(upcount - upthr - margin) < tol:
midcount = upcount
elif abs(lwcount - lwthr - margin) < tol:
midcount = lwcount
else:
midcount = (upcount + lwcount)/2
midthr = (upthr + lwthr)/2
diff = midcount - midthr - margin
while abs(diff) > tol:
n_iter += 1
if n_iter > 20:
tol *= 1.1
if diff > tol:
midcount = (upcount + midcount)/2
midthr = (upthr + midthr)/2
elif diff < -tol:
midcount = (lwcount + midcount)/2
midthr = (lwthr + midthr)/2
diff = midcount - midthr - margin
return midcount
def add_noise(speech, noise, fs, snr, speech_energy='rms', asl_level=-26.0):
'''Adds noise to a speech signal at a given SNR.
The speech level is computed as the P.56 active speech level, and
the noise level is computed as the RMS level. The speech level is considered
as the reference.
https://github.com/jfsantos/maracas/blob/master/maracas/maracas.py
'''
# Ensure masker is at least as long as signal
if len(noise) < len(speech):
raise ValueError('len(noise) needs to be at least equal to len(speech)!')
# Generate random section of masker
if len(noise) != len(speech):
idx = np.random.randint(0, len(noise)-len(speech))
noise = noise[idx:idx+len(speech)]
# Scale noise wrt speech at target SNR
N_dB = rms_energy(noise)
if speech_energy == 'rms':
S_dB = rms_energy(speech)
elif speech_energy == 'P.56':
S_dB = asl_meter(speech, fs)
else:
raise ValueError('speech_energy has to be either "rms" or "P.56"')
# Rescale N
N_new = S_dB - snr
noise_scaled = 10**(N_new/20) * noise / 10**(N_dB/20)
y = speech + noise_scaled
y = y/10**(asl_meter(y, fs)/20) * 10**(asl_level/20)
return y, noise_scaled