In [1]:
import pandas as pd
import sys
import math
from keras.models import load_model
import tensorflow as tf
from Code.Misc.helper import *

import numpy as np
from Code.Misc.split_segments import *
import pyaudio
import wave
import errno
import time
import calendar
import os
import copy
import concurrent.futures
from collections import deque

from tkinter import TclError
import scipy.signal as sps
import deepspeech as ds
from textblob import TextBlob

from tqdm import tqdm
import matplotlib.pyplot as plt

from graph_formating import *

from Code.Misc.models import audio_only_model, audio_text_model

# use this backend to display in separate Tk window
%matplotlib tk
#%matplotlib inline

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# Iterating (rather than indexing physical_devices[0]) keeps this cell from raising
# IndexError on a machine where no GPU is visible; with exactly one GPU the effect
# is the same as before.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    # TensorFlow's GPU guide applies the same growth setting to every visible GPU.
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)



## Clean up environment

In [2]:
# Clean all old wave files out of the thread output directory.
# rem_flag records whether at least one .wav file was found; a later cell reads it
# to decide how the wave file handle is opened.
thread_files_dir = '../../Data/Thread_files/'
rem_flag = False
r = os.listdir(thread_files_dir)
for i in r:
    if not i.endswith('wav'):
        continue
    rem_flag = True
    try:
        # os.listdir returns bare file names, so the directory has to be joined
        # back on; removing the bare name looked in the current working directory
        # and therefore never deleted the old recordings.
        os.remove(os.path.join(thread_files_dir, i))
    except OSError as e:
        if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
            # Best effort: carry on with the remaining files, but report the
            # failure instead of hiding it.
            print("could not remove " + i + ": " + str(e))
print(rem_flag)

True


In [3]:
# Handle to the PyAudio bindings used for microphone capture
p = pyaudio.PyAudio()

# Capture parameters
RATE = 44100                 # sampling rate requested from the input stream
CHANNELS = 1                 # single channel for microphone
FORMAT = pyaudio.paInt16
DEVICE = 5                   # input device index handed to p.open
file_len = 5                 # seconds of audio per written file
CHUNK = RATE // 2            # frames per hardware buffer (half a second)

# Fast I/O buffer: holds exactly the number of hardware buffers that make up one
# file_len-second file
d = deque(maxlen=int(RATE / CHUNK * file_len))
# p_name = int(calendar.timegm(time.gmtime()))


## Import Models

### Custom Model Implementation

In [4]:
# Pick the trained network to load: the audio + text weights when multi_modal is
# set, otherwise the audio-only weights
multi_modal = True
model = load_model('../Misc/Saved_data/Trained_models' +
                   ('/Weights/Audio_Text/model' if multi_modal
                    else '/Weights/Audio_only/model'))

### DeepSpeech Model

In [5]:
# Directory holding the DeepSpeech release files.
# Written as a raw string: the Windows path contains backslashes, and in a normal
# literal sequences such as "\C", "\M" or "\d" are invalid escapes that Python only
# tolerates with a deprecation warning. The resulting string value is unchanged.
# NOTE(review): this is an absolute, machine-specific path.
version_dir = r"F:\Capstone Project\Capstone---RTSD-System/Code\Misc\Saved_data\Trained_models\deep_speech_models/"
model_file_8 = version_dir + "0.8.2/deepspeech-0.8.2-models.pbmm"
scorer_file_8 = version_dir + "0.8.2/deepspeech-0.8.2-models.scorer"

# Load the acoustic model, then attach the external scorer to it
deepspeech_model_8 = ds.Model(model_file_8)
deepspeech_model_8.enableExternalScorer(scorer_file_8)

# Report the sample rate the loaded model declares
print("Sample_rate = " + str(deepspeech_model_8.sampleRate()))

Sample_rate = 16000


## Initialize Plot

In [6]:
# Emotion class labels; the processing loop pairs cols[k] with element k of the
# model's output vector when it builds each prediction row
cols = ['ang', 'exc', 'neu', 'sad']

# Prediction history, seeded with a single all-zero row
df_pred_wav = pd.DataFrame([np.zeros(4)], columns=cols)
# NOTE(review): data_prev is not referenced anywhere else in the visible cells
data_prev = []
# Number of most recent prediction rows shown in the line graph
graph_window = 30

# variable for plotting
# NOTE(review): x is not referenced anywhere else in the visible cells
x = np.arange(0, 2 * CHUNK, 2)
# Three stacked axes; the processing loop redraws ax[0] and ax[1] on every
# iteration and assigns the result of plot_text to ax[2]
fig , ax = plt.subplots(3)

# basic formatting for the axes
ax[0].set_title('Emotion Prediction')
ax[0].set_xlabel('Time')
ax[0].set_ylabel('Confidence')

# Draw the initial (all-zero) history; the helper's return value replaces ax[0]
ax[0] = plot_line_graph(ax[0], df_pred_wav)

# show the plot
# block=False returns immediately so the following cells can keep running
plt.show(block=False)

# for measuring frame rate
# frame_count is incremented after each successful redraw in the processing loop;
# start_time is the reference point for the average frame rate
frame_count = 0
start_time = time.time()

## Set Up Buffer

In [7]:
WINDOW_SIZE = 0.5   # window length in seconds used when slicing a file
WINDOW_N = 10       # number of windows kept in the sliding process buffer

# Holding area for the windows of the file currently being consumed
feeder_buffer = deque()

# Sliding buffer of the WINDOW_N most recent windows, pre-filled with empty
# placeholders so it is already at full length on the first iteration
process_buffer = deque(([] for _ in range(WINDOW_N)), maxlen=WINDOW_N)

## Set up Passive Audio Capture

In [8]:
# Shared state of the file writer: finished recordings keyed by sequence number,
# and the sequence number the next finished recording will receive
file_index = dict()
idx = 0
# Thread called file I/O function
def write_to_file(arg):
    """Append buffered audio to the open temp wave file and rotate it when full.

    Every element of ``arg`` is written to the module-level wave writer ``wf``.
    Once the writer holds at least ``RATE * file_len`` frames the temp file is
    closed, renamed after the UTC second at which the recording started (current
    time minus ``file_len``), registered in ``file_index`` under the current
    ``idx``, and a fresh temp file is opened for the next recording.

    Args:
        arg: Iterable of raw audio byte strings, in playback order.

    Returns:
        ``(idx, recording_name)`` for the file that was just completed, or
        ``None`` when the temp file has not reached the target length yet.
    """
    global wf, idx, file_index

    # Write 5s of audio to file
    for elem in arg:
        wf.writeframes(elem)

    # ">=" instead of "==": if the frame count ever overshoots the target (for
    # example after a hardware buffer of unexpected size) an exact comparison
    # would never match again and the temp file would grow without being rotated.
    if wf.tell() < RATE * file_len:
        return None

    # Close the finished file and rename it to its utc start time (whole seconds)
    wf.close()
    recording_name = str(calendar.timegm(time.gmtime()) - file_len) + '.wav'
    os.rename('../../Data/Thread_files/temp.wav', '../../Data/Thread_files/'+ recording_name)

    # Open the next temp file with the same format settings
    wf = wave.open('../../Data/Thread_files/temp.wav', 'wb')
    wf.setnchannels(1)
    wf.setframerate(RATE)
    wf.setsampwidth(2)
    return_set = (idx, recording_name)

    # Publish the file only after the rename, so a reader that sees the index
    # entry finds the file under its final name
    file_index[idx] = recording_name

    idx += 1
    return return_set

# Pyaudio callback: collects hardware buffers and hands full batches to the writer pool
def callback(in_data, frame_count, time_info, status):
    """Store one hardware buffer and queue a file write once enough are collected.

    A non-zero ``status`` prints a message and calls ``exit()``. Otherwise
    ``in_data`` is appended to the module-level deque ``d``; when ``d`` holds
    ``RATE / CHUNK * file_len`` buffers, a shallow copy of it is submitted to
    ``executor`` for ``write_to_file``, the returned future is recorded in
    ``thread_list``, and ``d`` is emptied.

    Returns:
        ``(in_data, pyaudio.paContinue)``.
    """
    if status != 0:
        print("Non zero status")
        exit()

    d.append(in_data)

    # A full buffer is copied before clearing so the worker thread keeps its own
    # snapshot of the collected audio
    buffer_is_full = len(d) == RATE / CHUNK * file_len
    if buffer_is_full:
        snapshot = copy.copy(d)
        pending_write = executor.submit(write_to_file, snapshot)
        thread_list.append(pending_write)
        d.clear()

    return in_data, pyaudio.paContinue

In [9]:
# Futures returned for the submitted file writes, in submission order
thread_list = []
# Worker pool that runs the file writes off the audio callback
executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)

In [10]:
# Location of the sample clip used when no old recordings were found.
# The first assignment is immediately overridden by the second one.
audio_path = "Code/Notebook tests/audio_export/"
# Raw string: in a normal literal the backslashes of this Windows path form invalid
# escape sequences ("\C", "\D", "\W", ...) that Python only tolerates with a
# deprecation warning. The resulting string value is unchanged.
# NOTE(review): this is an absolute, machine-specific path.
audio_path = r"F:\Capstone Project\Capstone---RTSD-System\Data\CMU_MOSI\Raw\Audio\WAV_16000\Full/"
file_name = "0h-zjBukYpk.wav"

if rem_flag:
    # Open initial temp file and setup (mono, RATE Hz, 2 bytes per sample)
    wf = wave.open('../../Data/Thread_files/temp.wav', 'wb')
    wf.setnchannels(1)
    wf.setframerate(RATE)
    wf.setsampwidth(2)

else:
    # Open the sample clip for reading and split it into 5 second pieces
    wf = wave.open(audio_path + file_name, 'r')
    split_wav = SplitWavAudio(audio_path,file_name)
    split_wav.multiple_split(sec_per_split=5)

#os.remove('../Misc/Saved_data/temp.wav')

In [11]:
# Open the microphone as a callback-driven input stream
stream = p.open(input=True,
                input_device_index=DEVICE,
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                frames_per_buffer=CHUNK,
                stream_callback=callback)

# Begin capturing
stream.start_stream()

# Bookkeeping used by the processing loop
thread_path = '../../Data/Thread_files/'
all_result = []
data_int = []
prev_result = None

## Start the Real Time System

In [12]:
def load_file_into_buffer(file_path, file_name, window_size = 0.5):
    """Read a wav file and split its first channel into consecutive windows.

    Args:
        file_path: Location argument passed through to ``get_audio``.
        file_name: File name passed through to ``get_audio``.
        window_size: Window length in seconds; must be positive.

    Returns:
        A deque of int16 numpy arrays in playback order. Every window covers
        ``window_size`` seconds except the last one, which holds whatever is
        left. A zero-length file yields a single empty array.

    Raises:
        ValueError: If ``window_size`` is not positive; the windowing loop could
            never advance and would not terminate.
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive, got " + str(window_size))

    audio_data = get_audio(file_path, file_name)
    (nchannels, sampwidth, framerate, nframes, comptype, compname), sample_wav = audio_data

    audio_length = nframes / framerate # length of wav in seconds
    load_buffer = deque()

    # keep every nchannels-th sample starting at 0
    # (assumes get_audio returns channel-interleaved samples — TODO confirm)
    left = sample_wav[0::nchannels]

    # walk the clip in window_size steps, clamping the final window to the clip end
    start = 0
    while True:
        end = min(start + window_size, audio_length)

        sample_left = left[int(start * framerate):int(end * framerate)]
        load_buffer.append(np.array(sample_left).astype('int16'))

        if end == audio_length:
            break
        start = end
    return load_buffer

def wait_for_file_idx(file_index_, file_idx_, audio_clip_length):
    """Block until ``file_index_`` has a truthy entry for ``file_idx_``.

    The mapping is polled with ``.get``; between lookups the function sleeps
    for ``audio_clip_length / 2`` seconds. There is no timeout.

    Returns:
        The value stored under ``file_idx_``.
    """
    found_name = file_index_.get(file_idx_)
    while not found_name:
        time.sleep(audio_clip_length / 2)
        found_name = file_index_.get(file_idx_)
    return found_name

def get_deepspeech_predictions(buffer, deepspeech_model):
    """Transcribe the buffered audio and score the polarity of the transcript.

    Args:
        buffer: Iterable of sample sequences, concatenated in order.
        deepspeech_model: Object providing ``createStream()``.

    Returns:
        ``(text, sentiment)`` where ``text`` is the transcript returned by the
        stream and ``sentiment`` is a one-element numpy array holding the
        TextBlob polarity of that transcript.
    """
    # flatten all windows into a single sample list
    samples = [value for window in buffer for value in window]

    # resample from the capture rate (module-level RATE) down to 16K
    target_len = round(len(samples) * float(16000) / RATE)
    samples_16k = sps.resample(samples, target_len).astype('int16')

    # feed the whole clip through a stream and collect the transcript
    ds_stream = deepspeech_model.createStream()
    ds_stream.feedAudioContent(samples_16k)
    text = ds_stream.finishStream()

    # polarity of the transcript
    sentiment = np.array([TextBlob(text).polarity])

    return text, sentiment

In [13]:
# Real-time loop: consume the recorded files one window at a time, run the model on
# the sliding process buffer and redraw the figure after every prediction.
recording = True
# Index of the file currently being consumed; -1 means nothing has been loaded yet.
# Starting below zero makes the loop always begin with file 0, even when that file
# was already written before this cell runs (it used to be skipped in that case).
file_idx = -1
while recording:

    # Step 1: (re)fill the feeder buffer from the next recorded file
    if file_idx < 0 or len(feeder_buffer) == 0:
        file_idx += 1
        # the first file is polled for at file_len, later ones at WINDOW_SIZE
        poll_seconds = file_len if file_idx == 0 else WINDOW_SIZE
        file_name = wait_for_file_idx(file_index, file_idx, poll_seconds)
        feeder_buffer = load_file_into_buffer(thread_path, file_name, WINDOW_SIZE)

        if multi_modal:
            # When we load in a new file, run deepspeech
            print("trigger deepspeech"); start_ds_time = time.time()
            text, sentiment = get_deepspeech_predictions(feeder_buffer, deepspeech_model_8)
            print("deepspeech runtime: " + str(time.time() - start_ds_time))

    # Step 2: Move 1 window of the Feeder Buffer into the Process Buffer
    process_buffer.append(feeder_buffer.popleft())

    # join the sliding window into one int16 signal (empty placeholder windows
    # contribute no samples)
    data_int = np.concatenate([np.asarray(window, dtype='int16')
                               for window in process_buffer])

    # Step 3: Calculate Features
    st_features = calculate_features(data_int, RATE, None)
    st_features, _ = pad_sequence_into_array(st_features, maxlen=100)
    st_features = np.array([st_features.T])

    # Step 4: predict on model
    with tf.device('/gpu:0'):
        if multi_modal:
            wav_test_results = model.predict([st_features, sentiment])
        else:
            wav_test_results = model.predict(st_features)

    # Step 5: Graph output
    last_res = wav_test_results[0]
    predicted_values = pd.DataFrame({col: last_res[k] for k, col in enumerate(cols)},
                                    index=[1])

    # pass previous values to filter function
    predicted_values = noise_filter(df_pred_wav.tail(1),
                                    predicted_values)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # Assumes noise_filter returns a DataFrame — TODO confirm.
    df_pred_wav = pd.concat([df_pred_wav, predicted_values],
                            ignore_index=True)

    # new frame instead of an in-place reset on a slice of df_pred_wav
    df_pred_wav_view = df_pred_wav.tail(graph_window).reset_index(drop=True)
    ax[0].cla()
    ax[1].cla()

    ax[0] = plot_line_graph(ax[0], df_pred_wav_view)
    ax[1] = plot_bar_graph (ax[1], last_res)

    if multi_modal:
        ax[2] = plot_text([text], sentiment)

    try:
        fig.canvas.draw()
        fig.canvas.flush_events()
        frame_count += 1

    except TclError:
        # Raised by the Tk backend once the figure can no longer be drawn
        # (typically because the window was closed).
        # calculate average frame rate
        frame_rate = frame_count / (time.time() - start_time)
        print('stream stopped')
        # Leave the loop: without this every following iteration failed the same
        # way and processing carried on with no display.
        break

    if file_idx > 15:
        recording = False

trigger deepspeech
deepspeech runtime: 10.714773654937744
trigger deepspeech
deepspeech runtime: 2.439002275466919
trigger deepspeech
deepspeech runtime: 2.1010115146636963
trigger deepspeech
deepspeech runtime: 2.4699885845184326
trigger deepspeech
deepspeech runtime: 13.28990912437439
trigger deepspeech
deepspeech runtime: 7.9573073387146
trigger deepspeech
deepspeech runtime: 11.243627309799194
trigger deepspeech
deepspeech runtime: 4.762020111083984
trigger deepspeech
deepspeech runtime: 2.021017074584961
trigger deepspeech
deepspeech runtime: 7.460062265396118
trigger deepspeech
deepspeech runtime: 3.1319997310638428
trigger deepspeech
deepspeech runtime: 2.6110074520111084
trigger deepspeech
deepspeech runtime: 5.623059988021851
trigger deepspeech
deepspeech runtime: 5.428000450134277
trigger deepspeech
deepspeech runtime: 3.1059987545013428
trigger deepspeech
deepspeech runtime: 4.4939985275268555
trigger deepspeech
deepspeech runtime: 3.6309995651245117


In [14]:
# Stop capturing first so the callback queues no further file writes
stream.stop_stream()
stream.close()

# Wait for file writes that are still queued or running before closing the wave
# file they write to, and release the worker threads (the pool was never shut
# down before)
executor.shutdown(wait=True)
wf.close()

# Release the PyAudio instance
p.terminate()