// detectors.cpp
#include "detectors.h"
#include <iostream>
#include <numeric>
#include <algorithm>
#include <cmath>
#include <limits>
using namespace std;
#include "poptemplate.h"
// Compile-time switch for the delayed-match branches in processChunk()
// (the double-"tss" handling); those branches are disabled while this is false.
static const bool kDelayMatch = false;
// Samples consumed by each process() call; also the length of one FFT window.
static const int kBlockSize = DETECTORS_BLOCK_SIZE;
// log2 of the FFT length handed to vDSP.
// NOTE(review): the hard-coded 9 presumes DETECTORS_BLOCK_SIZE == 512 — confirm.
static const int kLogBlockSize = 9;
// Number of spectrum bins kept per analysed window.
static const int kSpectrumSize = kBlockSize/2;
// Length of the Hann window applied before the FFT.
static const int kWindowSize = kBlockSize;
// Each process() call analyses kNumSteps windows spaced kStepSize samples apart.
static const int kNumSteps = 4;
static const int kStepSize = kBlockSize / kNumSteps;
// Spectrum bin ranges used by the "tss" detector. avgBand() treats the low
// index as inclusive and the high index as exclusive.
static const size_t kMainBandLow = 40;
static const size_t kMainBandHi = 100;
// Within this file only used to define where the upper band starts.
static const size_t kOptionalBandHi = 180;
static const size_t kLowerBandLow = 3;
static const size_t kLowerBandHi = kMainBandLow;
static const size_t kUpperBandLo = kOptionalBandHi;
static const size_t kUpperBandHi = kSpectrumSize;
// Default weight of the newest frame in the per-bin low-pass of the spectrum.
static const float kDefaultLowPassWeight = 0.6;
// Number of frames after the lower band last exceeded kSpeechThresh during
// which "tss" matches are suppressed.
static const int kSpeechShadowTime = 100;
// Lower-band average above which a frame resets the speech shadow counter.
static const float kSpeechThresh = 0.5;
/**
 * Allocates the audio/FFT working buffers and sets the detector tuning
 * parameters to their defaults.
 *
 * The spectra and frame counters are set up by initialise(), not here, so
 * initialise() must be called before the first process() call.
 */
Detectors::Detectors() {
    // process() slides the second half of this buffer into the first half and
    // then analyses windows that begin inside the first half. On the very first
    // call those windows cover samples nobody has written yet, so start from
    // silence instead of indeterminate memory (which can decode as NaN/inf).
    m_overlapBuffer = new float[kBlockSize * 2];
    std::fill(m_overlapBuffer, m_overlapBuffer + kBlockSize * 2, 0.0f);

    // === Tss Detection
    m_sensitivity = 5.0;
    m_hysterisisFactor = 0.4;
    m_minFrames = 20;
    m_minFramesLong = 100;
    m_lowPassWeight = kDefaultLowPassWeight;

    // === Pop detection
    m_startBin = 2;
    m_maxShiftDown = 4;
    m_maxShiftUp = 2;
    m_popSensitivity = 8.5;
    m_framesSincePop = 0;
    // debugLog = new std::ofstream("/Users/tristan/misc/popclick.log");

    // === FFT
    // These scratch buffers are fully written by doFFT() before being read.
    m_inReal = new float[kBlockSize];
    m_splitData.realp = new float[kSpectrumSize];
    m_splitData.imagp = new float[kSpectrumSize];

    // Zero the window storage, then let vDSP generate the Hann window in it.
    m_window = new float[kWindowSize];
    std::fill(m_window, m_window + kWindowSize, 0.0f);
    vDSP_hann_window(m_window, kWindowSize, vDSP_HANN_NORM);

    // NOTE(review): the returned setup handle is not checked for failure here.
    m_fftSetup = vDSP_create_fftsetup(kLogBlockSize, FFT_RADIX2);
}
/** Releases the vDSP FFT setup and every buffer the constructor allocated. */
Detectors::~Detectors() {
    vDSP_destroy_fftsetup(m_fftSetup);

    // Buffers are independent of each other, so the release order is free.
    delete[] m_window;
    delete[] m_splitData.imagp;
    delete[] m_splitData.realp;
    delete[] m_inReal;
    delete[] m_overlapBuffer;
    // delete debugLog;
}
/**
 * Resets the detection state: frame counters, the smoothed and raw spectra,
 * and the pop-detection history buffer.
 *
 * Safe to call more than once; every call returns the state to the same
 * starting point.
 *
 * @return always true.
 */
bool Detectors::initialise() {
    m_savedOtherBands = 0.0002;
    m_consecutiveMatches = 0;

    // Start the "frames since" counters well past the thresholds they are
    // compared against (kSpeechShadowTime / m_minFramesLong in processChunk()).
    m_framesSinceSpeech = 1000;
    m_framesSinceMatch = 1000;

    // assign() instead of resize(): resize() keeps the existing elements when
    // the container already has the requested size, which would leave stale
    // values behind on a second initialise() call.
    m_lowPassBuffer.assign(kSpectrumSize, 0.0f);
    m_spectrum.assign(kSpectrumSize, 0.0f);

    // Pop history: kBufferSize zeroed entries.
    m_popBuffer.assign(kBufferSize, 0.0f);
    return true;
}
/**
 * Feeds one block of audio to the detectors.
 *
 * The new block is appended behind the previous one and kNumSteps windows,
 * spaced kStepSize samples apart and ending on the newest sample, are analysed.
 *
 * @param buffer  kBlockSize input samples.
 * @return bitwise OR of the event codes reported by processChunk() for each
 *         analysed window.
 */
int Detectors::process(const float *buffer) {
    float *const previousHalf = m_overlapBuffer;
    float *const currentHalf = m_overlapBuffer + kBlockSize;

    // The block received last time becomes the history...
    std::copy(currentHalf, currentHalf + kBlockSize, previousHalf);
    // ...and the fresh input takes its place in the second half.
    std::copy(buffer, buffer + kBlockSize, currentHalf);

    int events = 0;
    for (int step = 1; step <= kNumSteps; ++step) {
        float *window = m_overlapBuffer + (step * kStepSize);
        events |= processChunk(window);
    }
    return events;
}
void Detectors::doFFT(const float *buffer) {
vDSP_vmul(buffer, 1, m_window, 1, m_inReal, 1, kBlockSize);
vDSP_ctoz(reinterpret_cast<DSPComplex*>(m_inReal), 2, &m_splitData, 1, kSpectrumSize);
vDSP_fft_zrip(m_fftSetup, &m_splitData, 1, kLogBlockSize, FFT_FORWARD);
m_splitData.imagp[0] = 0.0f;
float scale = 1.0f / static_cast<float>(2 * kBlockSize);
vDSP_vsmul(m_splitData.realp, 1, &scale, m_splitData.realp, 1, kSpectrumSize);
vDSP_vsmul(m_splitData.imagp, 1, &scale, m_splitData.imagp, 1, kSpectrumSize);
}
int Detectors::processChunk(const float *buffer) {
doFFT(buffer);
int result = 0;
size_t n = kSpectrumSize;
for (size_t i = 0; i < n; ++i) {
float real = m_splitData.realp[i];
float imag = m_splitData.imagp[i];
float newVal = real * real + imag * imag;
m_spectrum[i] = newVal;
m_lowPassBuffer[i] = m_lowPassBuffer[i]*(1.0f-m_lowPassWeight) + newVal*m_lowPassWeight;
// infinite values happen non-deterministically, probably due to glitchy audio input at start of recording
// but inifinities it could mess up things forever
if(m_lowPassBuffer[i] >= numeric_limits<float>::infinity()) {
std::fill(m_lowPassBuffer.begin(), m_lowPassBuffer.end(), 0.0f);
return 0; // discard the frame, it's probably garbage
}
}
float lowerBand = avgBand(m_lowPassBuffer, kLowerBandLow, kLowerBandHi);
float mainBand = avgBand(m_lowPassBuffer, kMainBandLow, kMainBandHi);
float upperBand = avgBand(m_lowPassBuffer, kUpperBandLo, kUpperBandHi);
m_framesSinceSpeech += 1;
if(lowerBand > kSpeechThresh) {
m_framesSinceSpeech = 0;
}
float debugMarker = 0.0002;
float matchiness = mainBand / ((lowerBand+upperBand)/2.0f);
bool outOfShadow = m_framesSinceSpeech > kSpeechShadowTime;
int immediateMatchFrame = kDelayMatch ? m_minFramesLong : m_minFrames;
m_framesSinceMatch += 1;
if(((matchiness >= m_sensitivity) ||
(m_consecutiveMatches > 0 && matchiness >= m_sensitivity*m_hysterisisFactor) ||
(m_consecutiveMatches > immediateMatchFrame && (mainBand/m_savedOtherBands) >= m_sensitivity*m_hysterisisFactor*0.5f))
&& outOfShadow) {
debugMarker = 0.01;
// second one in double "tss" came earlier than trigger timer
if(kDelayMatch && m_consecutiveMatches == 0 && m_framesSinceMatch <= m_minFramesLong) {
result |= TSS_START_CODE;
result |= TSS_STOP_CODE;
m_framesSinceMatch = 1000;
}
m_consecutiveMatches += 1;
if(kDelayMatch && m_consecutiveMatches == m_minFrames) {
m_framesSinceMatch = m_consecutiveMatches;
} else if(m_consecutiveMatches == immediateMatchFrame) {
debugMarker = 1.0;
result |= TSS_START_CODE;
m_savedOtherBands = ((lowerBand+upperBand)/2.0f);
}
} else {
bool delayedMatch = kDelayMatch && (m_framesSinceMatch == m_minFramesLong && outOfShadow);
if(delayedMatch) {
result |= TSS_START_CODE;
}
if(m_consecutiveMatches >= immediateMatchFrame || delayedMatch) {
debugMarker = 2.0;
result |= TSS_STOP_CODE;
}
m_consecutiveMatches = 0;
}
// ===================== Pop Detection =================================
// update buffer forward one time step
for(unsigned i = 0; i < kBufferPrimaryHeight; ++i) {
m_popBuffer.pop_front();
m_popBuffer.push_back(m_spectrum[i]);
}
// high frequencies aren't useful so we bin them all together
m_popBuffer.pop_front();
float highSum = accumulate(m_spectrum.begin()+kBufferPrimaryHeight,m_spectrum.end(),0.0);
m_popBuffer.push_back(highSum);
std::deque<float>::iterator maxIt = max_element(m_popBuffer.begin(), m_popBuffer.end());
float minDiff = 10000000.0;
for(int i = -m_maxShiftUp; i < m_maxShiftDown; ++i) {
float diff = templateDiff(*maxIt, i);
if(diff < minDiff) minDiff = diff;
}
m_framesSincePop += 1;
if(minDiff < m_popSensitivity && m_framesSincePop > 15) {
result |= POP_CODE; // Detected pop
m_framesSincePop = 0;
}
// *debugLog << lowerBand << ' ' << mainBand << ' ' << optionalBand << ' ' << upperBand << '-' << matchiness << ' ' << debugMarker << std::endl;
return result;
}
/**
 * Mean of the values in frame[low] .. frame[hi - 1].
 *
 * @param frame  values to average over; not modified.
 * @param low    first index included.
 * @param hi     one past the last index included.
 * @return the sum of the selected values divided by (hi - low).
 */
float Detectors::avgBand(std::vector<float> &frame, size_t low, size_t hi) {
    float total = 0;
    size_t bin = low;
    while (bin < hi) {
        total += frame[bin];
        ++bin;
    }
    const size_t width = hi - low;
    return total / width;
}
/**
 * Normalised pop-template value at flat index i, optionally shifted by
 * `shift` rows within its column.
 *
 * Rows at or beyond kBufferPrimaryHeight are read unshifted. For the other
 * rows, a shift that leaves the [0, kBufferPrimaryHeight) range yields 0.
 *
 * @param i      flat index into kPopTemplate (columns of kBufferHeight rows).
 * @param shift  row offset to apply.
 * @return kPopTemplate value divided by kPopTemplateMax, or 0 as described.
 */
float Detectors::templateAt(int i, int shift) {
    const int row = i % kBufferHeight;

    // Rows past the primary height are never shifted.
    if (row >= kBufferPrimaryHeight) {
        return kPopTemplate[i]/kPopTemplateMax;
    }

    const int shiftedRow = row + shift;
    const bool insidePrimary = shiftedRow >= 0 && shiftedRow < kBufferPrimaryHeight;
    if (!insidePrimary) {
        return 0.0;
    }
    return kPopTemplate[i+shift]/kPopTemplateMax;
}
/**
 * Sum of absolute differences between one template column and one column of
 * the pop history, covering rows m_startBin .. kBufferHeight - 1.
 *
 * @param templStart  flat index of the column's first row in the template.
 * @param bufStart    flat index of the column's first row in m_popBuffer.
 * @param maxVal      value the history entries are divided by before comparing.
 * @param shift       row shift forwarded to templateAt().
 * @return accumulated absolute difference for the column.
 */
float Detectors::diffCol(int templStart, int bufStart, float maxVal, int shift) {
    float total = 0;
    unsigned row = m_startBin;
    while (row < kBufferHeight) {
        const float delta = templateAt(templStart+row, shift) - m_popBuffer[bufStart+row]/maxVal;
        total += std::fabs(delta);
        ++row;
    }
    return total;
}
float Detectors::templateDiff(float maxVal, int shift) {
float diff = 0;
for(unsigned i = 0; i < kBufferSize; i += kBufferHeight) {
diff += diffCol(i,i, maxVal,shift);
}
return diff;
}
extern "C" {
/* C entry point: creates a Detectors instance, initialises it and returns it
 * as an opaque handle. Release it with detectors_free(). */
detectors_t *detectors_new() {
    Detectors *instance = new Detectors();
    instance->initialise();
    detectors_t *handle = reinterpret_cast<detectors_t*>(instance);
    return handle;
}
/* C entry point: destroys the Detectors instance behind the handle. */
void detectors_free(detectors_t *detectors) {
    delete reinterpret_cast<Detectors*>(detectors);
}
/* C entry point: forwards one block of samples to Detectors::process() and
 * returns its event bit mask. */
int detectors_process(detectors_t *detectors, const float *buffer) {
    Detectors *instance = reinterpret_cast<Detectors*>(detectors);
    const int events = instance->process(buffer);
    return events;
}
}