In [1]:
import pandas as pd

import os
import sys
import wave
import copy
import math
from keras.models import load_model
import tensorflow as tf
import keras


from Code.Misc.helper import *
from Code.Misc.features import *
from Code.Misc.models import audio_only_model, audio_text_model

import pyaudio
import struct
import numpy as np
import matplotlib.pyplot as plt
import time
from tkinter import TclError

# use this backend to display in separate Tk window
%matplotlib tk
#%matplotlib inline

# constants

TIME_SEC = 0.5               # seconds of audio per chunk (CHUNK = RATE * TIME_SEC)
RATE = 16000                 # samples per second
CHUNK = int(RATE * TIME_SEC) # samples per chunk; used as frames_per_buffer and as the read size
FORMAT = pyaudio.paInt16     # PyAudio sample format; the loop decodes the bytes as np.int16
CHANNELS = 1                 # single channel for microphone
DEVICE = 5                   # passed as input_device_index to p.open(); NOTE(review): looks machine-specific — confirm

# Uncomment to print the info of every audio device PyAudio reports
# (useful for choosing DEVICE):
# p = pyaudio.PyAudio()
# for i in range(p.get_device_count()):
#     print(p.get_device_info_by_index(i))

# Enable memory growth on the first GPU TensorFlow reports.
# The list is empty on a machine without a visible GPU, in which case
# indexing physical_devices[0] would raise IndexError, so only configure
# the device when one is actually present.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0],
                                             enable=True)

In [2]:
def plot_line_graph(graph, data):
    """Draw the four emotion traces plus a dashed 0.5 reference line.

    Parameters
    ----------
    graph : object exposing ``axhline``, ``plot`` and ``legend``
        The axes-like object to draw on.
    data : mapping indexable by 'ang', 'exc', 'neu' and 'sad'
        The series to plot, one per emotion.

    Returns
    -------
    The same ``graph`` object that was passed in.
    """
    # (column key, legend label, line colour) for each trace, in draw order
    traces = (
        ('ang', "Anger", 'r'),
        ('exc', "Excited", 'y'),
        ('neu', "Neutral", 'g'),
        ('sad', "Sad", 'b'),
    )

    graph.axhline(y=0.5, color='r', linestyle='--')

    # Resolve every column before drawing anything, so a missing key fails
    # before any trace has been plotted.
    resolved = [(data[key], label, colour) for key, label, colour in traces]
    for values, label, colour in resolved:
        graph.plot(values, label=label, color=colour)

    graph.legend(loc="upper left")
    return graph

def plot_bar_graph(graph, data):
    """Draw one bar per emotion plus a dashed 0.5 reference line.

    Parameters
    ----------
    graph : object exposing ``axhline``, ``bar`` and ``legend``
        The axes-like object to draw on.
    data : sequence indexable by 0..3
        Bar heights, in the order Anger, Excited, Neutral, Sad.

    Returns
    -------
    The same ``graph`` object that was passed in.
    """
    # (legend label, bar colour); the position in this tuple is both the
    # bar's x coordinate and the index read from ``data``.
    bars = (("Anger", 'r'), ("Excited", 'y'), ("Neutral", 'g'), ("Sad", 'b'))

    graph.axhline(y=0.5, color='r', linestyle='--')
    for position, (label, colour) in enumerate(bars):
        graph.bar(x=position, height=data[position],
                  label=label, color=colour)
    graph.legend(loc="upper left")

    return graph

def plot_text(text_array, polarity):
    """Write the given strings side by side through the pyplot interface.

    ``plt.cla()`` is called first, then each string (with a trailing space
    appended) is drawn at y = 0.5 inside a semi-transparent box. The box
    colour follows ``polarity``: green when positive, red when negative,
    gray otherwise. Finally the x-limits are set to span the drawn text.

    Note that this updates the global ``font.size`` rcParam on every call.

    Parameters
    ----------
    text_array : iterable of str
        Strings to draw, left to right.
    polarity : number
        Sentiment score used only to choose the box colour.

    Returns
    -------
    The ``plt`` module itself (not an Axes object).
    """
    char_size = 12       # font size pushed into rcParams
    char_width = 0.012   # x-axis units reserved per character

    plt.cla()

    plt.rcParams.update({'font.size': char_size})

    if polarity > 0:
        colour = "green"
    elif polarity < 0:
        colour = "red"
    else:
        colour = "gray"

    # Start one character in from the left edge, then advance by the width
    # of each string that has been drawn.
    word_index = char_width
    for text in text_array:
        text = text + " "
        plt.text(word_index, 0.5, text, bbox=dict(facecolor=colour,
                                                  alpha=0.5))
        word_index += len(text) * char_width

    plt.xlim([0, word_index])

    return plt

def noise_filter(prev_val: pd.DataFrame, cur_val: pd.DataFrame, a = 0.8):
    """Blend the previous and current values column by column.

    For every column of ``prev_val`` the result is
    ``a * previous + (1 - a) * current``, using the last row of each frame.
    The row's index label is irrelevant, so frames produced by ``.tail(1)``
    work directly.

    Parameters
    ----------
    prev_val : pd.DataFrame
        Previous values; its columns define the columns of the result.
    cur_val : pd.DataFrame
        Current values; must contain every column of ``prev_val``.
    a : float, default 0.8
        Weight given to the previous value.

    Returns
    -------
    pd.DataFrame
        A single-row frame (index 0) with the same columns as ``prev_val``.
    """
    # Select the scalar explicitly with .iloc[-1]: calling float() on a
    # one-element Series is deprecated in recent pandas, and it raises
    # outright when the frame holds more than one row.
    blended = {
        col: [a * float(prev_val[col].iloc[-1]) +
              (1 - a) * float(cur_val[col].iloc[-1])]
        for col in prev_val.columns
    }

    return pd.DataFrame(blended)

In [3]:
# pyaudio class instance
p = pyaudio.PyAudio()

print("starting stram decleration")

# Open the stream the notebook later reads microphone chunks from
# (stream.read(CHUNK)). Format, channel count, sample rate, buffer size and
# input device all come from the constants defined in the first cell.
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    output=True,  # NOTE(review): nothing in the visible notebook writes to the stream — confirm output is needed
    frames_per_buffer=CHUNK,
    input_device_index = DEVICE
)

starting stram decleration


In [4]:
# set up model

# Toggle between the two saved models: False loads the audio-only weights
# directory, True loads the file saved as 2_layer_LSTM.pickle.
use_old_model = False
if not use_old_model:
    model = keras.models.load_model('../Misc/Saved_data/Trained_models' +
                                    '/Weights/Audio_only/model')
else:
    model = load_model("../Misc/Saved_data/Trained_models/2_layer_LSTM.pickle")

# Column names used for every prediction frame.
cols = ['ang', 'exc', 'neu', 'sad']

# Prediction history, seeded with a single all-zero row.
df_pred_wav = pd.DataFrame(np.zeros((1, len(cols))), columns=cols)
data_prev = []
# Number of most recent predictions shown on the line graph.
graph_window = 30

In [5]:
# variable for plotting
# NOTE(review): `x` is not read anywhere else in the visible notebook — confirm it is still needed
x = np.arange(0, 2 * CHUNK, 2)
# One figure with three subplots: ax[0] receives the prediction line graph,
# and the streaming loop later draws the bar graph on ax[1] and assigns the
# result of plot_text to ax[2].
fig , ax = plt.subplots(3)

# basic formatting for the axes
ax[0].set_title('Emotion Prediction')
ax[0].set_xlabel('Time')
ax[0].set_ylabel('Confidence')

# Draw the initial (all-zero) prediction history.
ax[0] = plot_line_graph(ax[0], df_pred_wav)

# show the plot (non-blocking so the following cells can keep running)
plt.show(block=False)

# for measuring frame rate: frame_count is incremented after each successful
# canvas redraw in the loop, start_time marks when counting began
frame_count = 0
start_time = time.time()

In [6]:
import deepspeech as ds
from textblob import TextBlob
import scipy.signal as sps

# Directory holding the DeepSpeech model files.
# Written as a raw string: the path mixes backslashes with characters such as
# "\C", "\M", "\S", "\T" and "\d", which are invalid escape sequences in a
# normal string literal and trigger warnings on current Python versions.
# The resulting value is unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider making
# it configurable.
version_dir = r"F:\Capstone Project\Capstone---RTSD-System/Code\Misc\Saved_data\Trained_models\deep_speech_models/"
model_file_8 = version_dir + "0.8.2/deepspeech-0.8.2-models.pbmm"
scorer_file_8 = version_dir + "0.8.2/deepspeech-0.8.2-models.scorer"

# Load the acoustic model and attach the external scorer to it.
deepspeech_model_8 = ds.Model(model_file_8)
deepspeech_model_8.enableExternalScorer(scorer_file_8)

# Print the sample rate reported by the model, to compare against RATE.
print("Sample_rate = " + str(deepspeech_model_8.sampleRate()))

Sample_rate = 16000


In [7]:
from collections import deque
from scipy.io.wavfile import write as wav_write
import re

# --- rolling audio buffer -------------------------------------------------
WINDOW_SIZE = 0.5   # seconds of audio read per loop iteration
WINDOW_N = 20       # chunks kept in the buffer; also the number of
                    # iterations between two transcriptions
buffer = deque(maxlen=WINDOW_N)

CHUNK = int(RATE * WINDOW_SIZE)

# fill buffer with empty placeholders so it always holds WINDOW_N entries
for i in range(WINDOW_N):
    buffer.append([])

# The WAV export inside the loop fails if its target directory is missing.
os.makedirs("audio_export", exist_ok=True)

# set default text and sentiment
text = ""
sentiment = 0

delay = 0

# number of chunks read since the last transcription
buffer_fill = 0
print("starting loop")
while True:
    time_collect = time.time()

    # raw bytes of one chunk from the microphone stream
    data_new = stream.read(CHUNK)

    time_collect = time.time() - time_collect

    predict_time_start = time.time()

    # interpret the raw bytes as 16-bit samples and push them on the buffer
    # (the deque drops its oldest entry once WINDOW_N entries are held)
    data_from_buffer = np.frombuffer(data_new, dtype=np.int16)
    buffer.append(data_from_buffer)
    buffer_fill += 1

    # Flatten the whole buffer into a single int16 array. Concatenating the
    # arrays directly avoids routing every sample through a Python list on
    # each iteration; empty placeholders become zero-length int16 arrays.
    data_int = np.concatenate(
        [np.asarray(chunk, dtype=np.int16) for chunk in buffer])

    # Every WINDOW_N chunks: transcribe the buffered audio and score the
    # sentiment of the transcript.
    if buffer_fill >= WINDOW_N:
        print("trigger deepspeech")
        stream_context = deepspeech_model_8.createStream()

        # keep a copy of exactly what is sent to the speech model
        wav_write("audio_export/______output.wav",
                  rate=RATE, data=data_int)

        # feed audio array to model (data_int is already int16)
        stream_context.feedAudioContent(data_int)

        # print output text
        text = stream_context.finishStream()

        print(text)
        sentiment = TextBlob(text).polarity

        # insert a newline after every 64 characters so the text panel wraps
        text = re.sub("(.{64})", "\\1\n", text, count=0, flags=re.DOTALL)

        buffer_fill = 0

    # Generate features from data
    st_features = calculate_features(data_int, RATE, None)
    st_features, _ = pad_sequence_into_array(st_features, maxlen=100)

    # add a leading batch axis after transposing
    # (per the original note: (34, 100) -> (1, 100, 34))
    st_features = np.array([st_features.T])

    # predict on model
    with tf.device('/gpu:0'):
        wav_test_results = model.predict(st_features)

    predict_time_total = time.time() - predict_time_start

    graphing_time_start = time.time()

    # raw scores of the single sample in the batch
    last_res = wav_test_results[0]

    # one-row frame holding the first len(cols) scores
    predicted_values = pd.DataFrame(dict(zip(cols, last_res)), index=[0])

    # pass previous values to filter function
    predicted_values = noise_filter(df_pred_wav.tail(1),
                                    predicted_values)

    # DataFrame.append no longer exists in pandas 2.x; pd.concat is the
    # equivalent that works on old and new versions alike.
    df_pred_wav = pd.concat([df_pred_wav, predicted_values],
                            ignore_index=True)

    # only the newest graph_window rows are drawn; build a fresh frame rather
    # than resetting the index of a tail() slice in place
    df_pred_wav_view = df_pred_wav.tail(graph_window).reset_index(drop=True)
    ax[0].cla()
    ax[1].cla()

    ax[0] = plot_line_graph(ax[0], df_pred_wav_view)

    # the bar graph shows the unfiltered scores of the latest prediction
    ax[1] = plot_bar_graph(ax[1], last_res)

    # plot_text draws through the pyplot interface and returns the plt
    # module, so its result must not be stored in ax[2] (that would replace
    # the Axes object with the module).
    plot_text([text, str(buffer_fill)], sentiment)

    graphing_time_total = time.time() - graphing_time_start

    data_prev = data_new

    # update figure canvas
    try:
        fig.canvas.draw()
        fig.canvas.flush_events()
        frame_count += 1

    except TclError:
        # Raised once the Tk window is gone: report statistics and leave the
        # loop instead of reading audio forever.

        # calculate average frame rate
        frame_rate = frame_count / (time.time() - start_time)

        print('stream stopped')
        print('average frame rate = {:.0f} FPS'.format(frame_rate))
        # '{f}' was a named field and raised KeyError with a positional
        # argument; '{:f}' formats the value as intended.
        print('prediction_time = {:f} seconds'.format(predict_time_total))
        break

starting loop
trigger deepspeech
each one hypothesis havana aguadiente reasoning basically the number here this is just eugenia shows my temples we
trigger deepspeech
once you know in there says he have i succeed this alsatian might be as a treatise what was just and the horse going to chase and he sent so i working niggards like ad with those before low damianus to be re
trigger deepspeech
manslaughter also didn't in the seminal behave to do mike fight on that if we can the teuton is the issue running into
trigger deepspeech
if the indelicate on in heaven don't remember is my debenham of much which is to be a full forty four one cares steam car must they wish i were getting condition the buff am riding of her


KeyboardInterrupt: 

In [None]:
# Scratch cell: read CHUNK * 20 frames directly from the stream, decode them
# as int16 samples and write them to a separate WAV file
# (presumably for checking the raw microphone capture by ear).
TEST = np.frombuffer(stream.read(CHUNK * 20), dtype=np.int16)
wav_write("audio_export/______output_1.wav", rate=16000, data=TEST)

In [None]:
# Scratch cell: write whatever data_int currently holds (the flattened
# buffer left over from the streaming loop) to the same WAV file the loop
# exports to.
wav_write("audio_export/______output.wav",
          rate=16000, data=data_int)

In [None]:
# Scratch cell: number of samples currently held in data_int.
len(data_int)