<a href="https://colab.research.google.com/github/2hoyeong/SoundClassification/blob/master/SoundClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Google drive 연결
from google.colab import drive
drive.mount('/content/gdrive')

from google.colab import auth
auth.authenticate_user()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import tensorflow as tf

print(tf.__version__)
!nvidia-smi

1.15.0
Mon Dec 16 01:34:24 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|

In [0]:
!pip install numpy scipy
!pip install resampy six
!pip install pysoundfile



In [0]:
# Architectural constants.
#NUM_FRAMES = 96  # Frames in input mel-spectrogram patch.
#NUM_BANDS = 64  # Frequency bands in input mel-spectrogram patch.
NUM_FRAMES = 40
NUM_BANDS = 313
EMBEDDING_SIZE = 128  # Size of embedding layer.

# Hyperparameters used in feature and example generation.
SAMPLE_RATE = 16000
STFT_WINDOW_LENGTH_SECONDS = 0.025
STFT_HOP_LENGTH_SECONDS = 0.010
NUM_MEL_BINS = NUM_BANDS
MEL_MIN_HZ = 125
MEL_MAX_HZ = 7500
LOG_OFFSET = 0.001  # Offset used for stabilized log of input mel-spectrogram.
EXAMPLE_WINDOW_SECONDS = 0.96  # Each example contains 96 10ms frames
EXAMPLE_HOP_SECONDS = 0.48     # with zero overlap.

# Parameters used for embedding postprocessing.
PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
PCA_MEANS_NAME = 'pca_means'
QUANTIZE_MIN_VAL = -2.0
QUANTIZE_MAX_VAL = +2.0

# Hyperparameters used in training.
INIT_STDDEV = 0.01  # Standard deviation used to initialize weights.
LEARNING_RATE = 1e-4  # Learning rate for the Adam optimizer.
ADAM_EPSILON = 1e-8  # Epsilon for the Adam optimizer.

# Names of ops, tensors, and features.
INPUT_OP_NAME = 'vggish/input_features'
INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'
OUTPUT_OP_NAME = 'embedding'
OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'
AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'

In [0]:
import numpy as np
import soundfile as sf
from os import listdir
from os.path import isfile, join
import random
import librosa
slim = tf.contrib.slim

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [0]:
def define_vggish_slim(training=False):
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, NUM_FRAMES, NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, NUM_FRAMES, NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

In [0]:
def extract_features(audio, sr):
  try:
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)

  except Exception as e:
    print("Error encountered while parsing file: ", audio)
    return None 
  return mfccs

In [0]:
NUM_EPOCH = 2000
_NUM_CLASSES = 3

sr = 16000
sampling_r = 16000

In [0]:
sess = tf.Session()

embeddings = define_vggish_slim(True)

num_units = 128
fc = slim.fully_connected(embeddings, num_units)

logits = slim.fully_connected(
    fc, _NUM_CLASSES, activation_fn=None, scope='logits')

predict = tf.sigmoid(logits, name='prediction')

# Add training ops.
global_step = tf.Variable(
    0, name='global_step', trainable=False,
    collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                    tf.GraphKeys.GLOBAL_STEP])

# Labels are assumed to be fed as a batch multi-hot vectors, with
# a 1 in the position of each positive class label, and 0 elsewhere.
labels = tf.placeholder(
    tf.float32, shape=(None, _NUM_CLASSES), name='labels')

# Cross-entropy label loss.
xent = tf.nn.sigmoid_cross_entropy_with_logits(
    logits=logits, labels=labels, name='xent')
loss = tf.reduce_mean(xent, name='loss_op')
loss_summary = tf.summary.scalar('loss', loss)

# We use the same optimizer and hyperparameters as used to train VGGish.
optimizer = tf.train.AdamOptimizer(
    learning_rate=LEARNING_RATE,
    epsilon=ADAM_EPSILON)
optimizer.minimize(loss, global_step=global_step, name='train_op')

# Initialize all variables in the model, and then load the pre-trained
# VGGish checkpoint.
sess.run(tf.global_variables_initializer())
#vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

# Locate all the tensors and ops we need for the training loop.
features_tensor = sess.graph.get_tensor_by_name(INPUT_TENSOR_NAME)
labels_tensor = sess.graph.get_tensor_by_name('labels:0')
global_step_tensor = sess.graph.get_tensor_by_name(
'global_step:0')

loss_tensor = sess.graph.get_tensor_by_name('loss_op:0')
train_op = sess.graph.get_operation_by_name('train_op')

correct_predictions = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
accuracy_tensor = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
accuracy_summary = tf.summary.scalar('accuracy', accuracy_tensor)

Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
_LABEL_CLASSES = ['car', 'siren', 'shout']
data_dir = '/content/gdrive/My Drive/colab/data/SoundClassification'
exportdir = "/content/gdrive/My Drive/colab/code/vggish"

In [0]:
total_parameters = 0
for variable in tf.trainable_variables():
    # shape is an array of tf.Dimension
    shape = variable.get_shape()
    variable_parameters = 1
    for dim in shape:
        variable_parameters *= dim.value
    total_parameters += variable_parameters
print(total_parameters)

147655555


In [0]:
TRAIN_NUM_BATCH = 200
EVAL_NUM_BATCH = 20
acc_for_graph = [0]
saver = tf.train.Saver()
try:
  for steps in range(1, NUM_EPOCH + 1):
      for CLASS in range(len(_LABEL_CLASSES)):
          all_files = [data_dir + '/train/' + _LABEL_CLASSES[CLASS] + '/' + f for f in listdir(data_dir + '/train/' + _LABEL_CLASSES[CLASS] + '/') if isfile(join(data_dir + '/train/' + _LABEL_CLASSES[CLASS] + '/', f))]
          files = random.sample(all_files, TRAIN_NUM_BATCH)
          y, sr = librosa.load(files[0], sr=sampling_r)
          feature = [extract_features(y, sr)]
          for file in files[1:]:
            y, sr = librosa.load(file, sr=sampling_r)
            bytes = [extract_features(y, sr)]
            feature = np.concatenate((feature, bytes))

          label = np.array([[int(i == CLASS) for i in range(3)]] * feature.shape[0])
          [num_steps, loss, _, acc, predction] = sess.run(
                [global_step_tensor, loss_tensor, train_op, accuracy_tensor, predict], feed_dict={features_tensor:feature, labels_tensor: label})
          
          print("[{:d} steps Training] loss {:g}, acc {:g}%".format(steps, loss, (acc * 100)))
          if steps % 10 == 0:
            print(np.shape(predction), predction[0])
      if steps % 5 == 0:
        for i in range(len(_LABEL_CLASSES)):
          eval_files = [data_dir + '/eval/' + _LABEL_CLASSES[i] + '/' + f for f in listdir(data_dir + '/eval/' + _LABEL_CLASSES[i] + '/') if isfile(join(data_dir + '/eval/' + _LABEL_CLASSES[i] + '/', f))]
          random.shuffle(eval_files)
          y, sr = librosa.load(eval_files[0], sr=sampling_r)
          feature = [extract_features(y, sr)]
          for file in eval_files[1:]:
            y, sr = librosa.load(file, sr=sampling_r)
            bytes = [extract_features(y, sr)]
            feature = np.concatenate((feature, bytes))
          label = np.array([[int(a == i) for a in range(3)]] * feature.shape[0])
          [num_steps, loss, _, acc, predction] = sess.run(
                  [global_step_tensor, loss_tensor, train_op, accuracy_tensor, predict],
                  feed_dict={features_tensor: feature, labels_tensor: label})

          print("[{:d} steps {:s} Evaluation] loss {:g}, acc {:g}%".format(i, _LABEL_CLASSES[i], loss, (acc * 100)))
        ckpt_path = saver.save(sess, exportdir + "/trained")
        print("SAVED :", ckpt_path)
except KeyboardInterrupt:
  print("EXIT")

#tf.train.Saver().save(sess, exportdir + '/trained.ckpt')
#tf.train.write_graph(sess.graph_def, ".", exportdir + '/trained.pb', as_text=False)

converter = tf.lite.TFLiteConverter.from_session(sess, [features_tensor], [predict])
#converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
tflite_model = converter.convert()
open(exportdir + "/new_converted_model.tflite", "wb").write(tflite_model)
ckpt_path = saver.save(sess, exportdir + "/trained")

[1 steps Training] loss 0.693125, acc 100%
[1 steps Training] loss 0.693511, acc 0%
[1 steps Training] loss 0.694585, acc 0%
[0 steps car Evaluation] loss 0.692378, acc 100%
[1 steps siren Evaluation] loss 0.692786, acc 0%
[2 steps shout Evaluation] loss 0.693231, acc 0%
SAVED : /content/gdrive/My Drive/colab/code/vggish/trained
[2 steps Training] loss 0.690671, acc 100%
[2 steps Training] loss 0.690983, acc 0%
[2 steps Training] loss 0.692566, acc 0%
EXIT
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 22 variables.


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-56df8ed5f575>", line 48, in <module>
    converter = tf.lite.TFLiteConverter.from_session(sess, [features_tensor], [predict])
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/lite/python/lite.py", line 628, in from_session
    graph_def = _freeze_graph(sess, input_tensors, output_tensors)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/lite/python/util.py", line 249, in freeze_graph
    output_arrays)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/util/deprecation.py", line 324, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/graph_util_impl.py", line 344, in convert_variables_to_constants
    data, data.shape)
  File "/usr/local/lib/python3.6

KeyboardInterrupt: ignored

In [0]:
import matplotlib.pyplot as plt

plt.plot(acc_for_graph)
plt.show()