|
| 1 | +from pydub import AudioSegment |
| 2 | +from pydub.silence import split_on_silence |
| 3 | +import os |
| 4 | +import collections |
| 5 | +import contextlib |
| 6 | +import sys |
| 7 | +import wave |
| 8 | +import os |
| 9 | +import webrtcvad |
| 10 | + |
def read_wave(path):
    """Load a WAV file and return its raw PCM bytes and sample rate.

    The file must be mono, use 2-byte samples, and have a sample rate of
    8000, 16000, 32000 or 48000 Hz; otherwise an ``AssertionError`` is
    raised by the checks below.

    Args:
        path: Location of the WAV file to read.

    Returns:
        A ``(pcm_data, sample_rate)`` tuple, where ``pcm_data`` holds every
        frame in the file as bytes.
    """
    supported_rates = (8000, 16000, 32000, 48000)
    with wave.open(path, 'rb') as reader:
        # Checks run in the same order as the properties are read:
        # channel count, then sample width, then sample rate.
        assert reader.getnchannels() == 1
        assert reader.getsampwidth() == 2
        rate = reader.getframerate()
        assert rate in supported_rates
        return reader.readframes(reader.getnframes()), rate
| 22 | + |
| 23 | + |
def write_wave(path, audio, sample_rate):
    """Write raw PCM bytes to ``path`` as a mono WAV file with 2-byte samples.

    Args:
        path: Destination file; opened for writing in binary mode.
        audio: PCM data to store, as bytes.
        sample_rate: Frame rate recorded in the WAV header.

    Returns:
        The number of frames written divided by ``sample_rate``, as a float.
    """
    writer = wave.open(path, 'wb')
    with contextlib.closing(writer):
        writer.setnchannels(1)
        writer.setsampwidth(2)
        writer.setframerate(sample_rate)
        writer.writeframes(audio)
        frame_count = writer.getnframes()
    return frame_count / float(sample_rate)
| 33 | + |
| 34 | + |
class Frame(object):
    """One fixed-size slice of PCM audio plus its position in the stream."""

    def __init__(self, bytes, timestamp, duration):
        # The parameter name shadows the ``bytes`` builtin; it is kept so
        # that callers passing it by keyword continue to work.
        self.bytes = bytes          # raw PCM data for this frame
        self.timestamp = timestamp  # start offset of the frame, in seconds
        self.duration = duration    # length of the frame, in seconds


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield consecutive ``Frame`` objects of ``frame_duration_ms`` each.

    The audio is treated as 2 bytes per sample, so every frame holds
    ``sample_rate * frame_duration_ms / 1000 * 2`` bytes. Trailing data
    shorter than one full frame is discarded.

    Args:
        frame_duration_ms: Length of each frame in milliseconds.
        audio: PCM data as bytes.
        sample_rate: Samples per second of ``audio``.

    Yields:
        ``Frame`` instances carrying the frame bytes, the running start
        time in seconds, and the frame duration in seconds.

    Raises:
        ValueError: If the computed frame size is not at least one byte
            (raised when iteration starts, since this is a generator).
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    if n <= 0:
        # A zero-byte frame would never advance through the audio.
        raise ValueError(
            "frame_duration_ms and sample_rate must yield a positive "
            "frame size, got %d bytes" % n)
    duration = (float(n) / sample_rate) / 2.0
    timestamp = 0.0
    # The upper bound len(audio) - n + 1 includes the last complete frame
    # even when the audio length is an exact multiple of the frame size.
    for offset in range(0, len(audio) - n + 1, n):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
| 53 | + |
| 54 | + |
def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
    """Group frames into voiced segments using a sliding window of frames.

    The window holds ``int(padding_duration_ms / frame_duration_ms)``
    recent frames. While idle, a segment starts once more than 90% of the
    window capacity is classified as speech; the buffered frames become
    the start of the segment. While inside a segment, every frame is
    collected, and the segment ends once more than 90% of the window
    capacity is classified as non-speech.

    Args:
        sample_rate: Sample rate passed through to ``vad.is_speech``.
        frame_duration_ms: Duration of each frame in milliseconds.
        padding_duration_ms: Duration covered by the sliding window.
        vad: Object exposing ``is_speech(frame_bytes, sample_rate)``.
        frames: Iterable of objects with a ``bytes`` attribute.

    Yields:
        The concatenated bytes of each detected segment. A segment still
        open when ``frames`` is exhausted is yielded last.
    """
    window = collections.deque(
        maxlen=int(padding_duration_ms / frame_duration_ms))
    in_segment = False
    collected = []

    for frame in frames:
        flagged = vad.is_speech(frame.bytes, sample_rate)

        if in_segment:
            collected.append(frame)
            window.append((frame, flagged))
            silent = sum(1 for _, voiced in window if not voiced)
            if silent > 0.9 * window.maxlen:
                # Enough trailing silence: close and emit the segment.
                in_segment = False
                yield b''.join(item.bytes for item in collected)
                window.clear()
                collected = []
            continue

        window.append((frame, flagged))
        spoken = sum(1 for _, voiced in window if voiced)
        if spoken > 0.9 * window.maxlen:
            # Enough speech: start a segment seeded with the window.
            in_segment = True
            collected.extend(item for item, _ in window)
            window.clear()

    # Emit whatever is left once the input runs out.
    if collected:
        yield b''.join(item.bytes for item in collected)
| 95 | + |
# Create the output folder on first run. If it already exists the script
# stops here and nothing below this point is executed.
# NOTE(review): because of this exit, the later folder() call on the same
# path can only ever find the folder present - confirm the exit is intended.
path = "./frontend/speech-transcription-app/public/Original data"
if os.path.exists(path):
    print("Output folder already present")
    sys.exit()
os.makedirs(path)
print("Output folder created")
| 103 | + |
def folder(path):
    """Create ``path`` (including parents) unless it already exists.

    Prints a one-line status message saying which of the two happened.
    """
    if os.path.exists(path):
        print("Output folder already present")
        return
    os.makedirs(path)
    print("Output folder created")
| 110 | + |
# Create each of these directories if it is missing (see folder()).
for path in (
    "./frontend/speech-transcription-app/public/Original data",
    "./main/save",
    "./main/discard",
):
    folder(path)

# Input recording and the directory that receives the split chunks.
file_name = "./main/mod_1.wav"
op_path = "./frontend/speech-transcription-app/public/Original data/audio_chunks"
| 120 | + |
| 121 | + |
| 122 | + |
def main(file_name, op_path):
    """Split the WAV file ``file_name`` into voiced chunks under ``op_path``.

    The output directory is created when missing. The audio is cut into
    30 ms frames, grouped into segments by ``vad_collector`` with a 300 ms
    window, and each segment is written as ``chunkNNNN.wav`` (numbered
    from 1) inside ``op_path``.

    Args:
        file_name: Path of the WAV file to split; must satisfy the checks
            in ``read_wave``.
        op_path: Directory that receives the chunk files. Its parent must
            already exist because ``os.mkdir`` is used.
    """
    if not os.path.isdir(op_path):
        os.mkdir(op_path)
        print("Output folder created")
    else:
        print("Output folder already present")

    pcm, rate = read_wave(file_name)
    # 2 is the mode handed to webrtcvad.Vad - presumably its aggressiveness
    # level; confirm against the webrtcvad documentation.
    detector = webrtcvad.Vad(2)
    segments = vad_collector(
        rate, 30, 300, detector, frame_generator(30, pcm, rate))

    for number, segment in enumerate(segments, 1):
        path = op_path + '/' + 'chunk%004d.wav' % (number,)
        print(' Writing %s' % (path,))
        write_wave(path, segment, rate)
| 140 | + |
| 141 | + |
| 142 | + |
# Input file and output directory are fixed here. The earlier notes on
# this spot mention sys.argv[1] and sys.argv[2], presumably as the
# intended command-line replacements for these two values.
file_name = "./main/mod_1.wav"
op_path = "./frontend/speech-transcription-app/public/Original data/audio_chunks"

main(file_name, op_path)

print("Audio Splitting Done")
| 151 | + |
| 152 | + |
| 153 | + |
| 154 | + |
| 155 | + |
| 156 | + |
| 157 | + |
| 158 | + |
| 159 | + |
0 commit comments