In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os

In [2]:
# Hyperparameters and paths for the CNN + LSTM classifier; read by Model.__init__
# and by the graph-building / training cells.
params = {
    'num_classes': 12,
    
    # CNN parameters (one list entry per convolutional layer)
    'use_cnn': True,
    'num_filters': [16, 32, 64, 64, 64, 64],
    'filter_size': [7, 3, 3, 3, 3, 3],
    'cnn_batch_norm' : [True, True, True, True, True, True],
    # A pool size of 1 means "no pooling after this layer" (see Model.convolutional_layers).
    'pool_sizes': [2, 2, 1, 1, 1, 2],
    # NOTE(review): despite the "keep_prob" name, Model forwards these values to
    # tf.layers.dropout as the drop `rate`; a falsy value (0) skips dropout for that layer.
    'cnn_dropout_keep_prob': [0, 0.2, 0.3, 0.25, 0.3, 0.3],
    # Dense (fully connected) parameters (one list entry per dense layer)
    'use_fc': False,
    'fc_hidden_units': [1028, 512, 256],
    'fc_batch_norm': [True, True, True],
    # NOTE(review): also forwarded to tf.layers.dropout as `rate`, not as a keep probability.
    'fc_dropout_keep_prob': [0.2, 0.3, 0.35],
    
    # RNN (LSTM) parameters (one list entry per stacked LSTM layer)
    'use_rnn': True,
    'rnn_n_hiddens': [1028, 512],
    # Passed to DropoutWrapper as output_keep_prob, so these ARE keep probabilities.
    'rnn_dropout_keep_prob': [0.6, 0.7],
    
    # Global Average Pooling (author's note: cannot be used together with the RNN)
    'use_GAP': False,
    
    'learning_rate': 0.001,
    'activation': tf.nn.relu,
    'batch_size': 128,
    'epochs': 5,
    # Input spectrogram size; the placeholder X is built as [None, height, width, 1].
    'height': 128,
    'width': 100,
    # Stored on the Model instance; the save calls in this notebook use literal paths instead.
    'model_path': './model/6conv_lstm_e5/' 
}

In [3]:

class Model:
    """Configurable CNN -> (global average pooling) -> LSTM -> dense classifier.

    The network is assembled from a ``params`` dict with the TF 1.x
    ``tf.layers`` / ``tf.nn.rnn_cell`` APIs. Every stage is optional and is
    switched on with the ``use_cnn`` / ``use_GAP`` / ``use_rnn`` / ``use_fc``
    flags. All variables live under the variable scope ``name`` so that the
    same weights can be shared between a training graph and evaluation graphs
    (``reuse=True``).
    """

    def __init__(self, params,name):
        """Copy the hyperparameters out of ``params``.

        Args:
            params: dict with the keys read below (see the ``params`` cell).
            name: variable-scope name under which all layers are created.
        """
        # Hyperparameters
        self.num_classes = params['num_classes']

        # Convolutional stack: one list entry per layer.
        self.use_cnn = params['use_cnn']
        self.num_filters = params['num_filters']
        self.filter_sizes = params['filter_size']
        self.cnn_batch_norm  = params['cnn_batch_norm']
        self.pool_sizes = params['pool_sizes']
        # NOTE: forwarded to tf.layers.dropout as the drop *rate* (see convolutional_layers).
        self.cnn_dropout_keep_prob = params['cnn_dropout_keep_prob']

        # Fully connected stack: one list entry per layer.
        self.use_fc = params['use_fc']
        self.fc_hidden_units = params['fc_hidden_units']
        self.fc_batch_norm = params['fc_batch_norm']
        # NOTE: forwarded to tf.layers.dropout as the drop *rate* (see fc_layers).
        self.fc_dropout_keep_prob = params['fc_dropout_keep_prob']

        # LSTM stack: one list entry per layer; these are real keep probabilities.
        self.use_rnn = params['use_rnn']
        self.rnn_n_hiddens = params['rnn_n_hiddens']
        self.rnn_dropout_keep_prob = params['rnn_dropout_keep_prob']

        self.use_GAP = params['use_GAP']

        self.learning_rate = params['learning_rate']
        self.activation = params['activation']

        self.height = params['height']
        self.width = params['width']
        self.model_path = params['model_path']

        # 1-based layer indices, used to build the layer names (CONV1, BN1, FC1, ...).
        self.idx_convolutional_layers = range(1, len(self.filter_sizes) + 1)
        self.idx_fc_layers = range(1, len(self.fc_hidden_units) + 1)
        self.idx_rnn_layers = range(1, len(self.rnn_n_hiddens) + 1)
        self.name = name

    def convolutional_layers(self, X, is_training = True, reuse = False):
        """Build the convolutional stack described by the CNN hyperparameters.

        Each layer is conv2d (stride 1, SAME) -> optional batch norm ->
        activation -> optional dropout -> optional max pooling.

        Args:
            X: input tensor fed to the first convolution.
            is_training: forwarded to batch normalization and dropout.
            reuse: whether to reuse variables created by an earlier call.

        Returns:
            The output tensor of the last convolutional layer.
        """
        net = X
        layer_specs = zip(self.idx_convolutional_layers,
                          self.num_filters,
                          self.filter_sizes,
                          self.cnn_batch_norm,
                          self.pool_sizes,
                          self.cnn_dropout_keep_prob)
        for i, num_filter, filter_size, use_bn, pool_size, drop_rate in layer_specs:
            net = tf.layers.conv2d(net,
                                   filters=num_filter,
                                   kernel_size=filter_size,
                                   strides=1,
                                   padding='SAME',
                                   name='CONV' + str(i),
                                   reuse=reuse)
            if use_bn:
                net = tf.layers.batch_normalization(net, training=is_training, name='BN' + str(i), reuse=reuse)
            net = self.activation(net)

            # The second positional argument of tf.layers.dropout is the drop rate;
            # a falsy value (0) disables dropout for this layer.
            if drop_rate:
                net = tf.layers.dropout(net, drop_rate, training=is_training)
            # A pool size of 1 means "no pooling".
            if pool_size != 1:
                net = tf.layers.max_pooling2d(net, pool_size=pool_size, strides=pool_size, padding='SAME')
        return net

    def fc_layers(self, X, is_training = True, reuse = False):
        """Build the dense stack described by the FC hyperparameters.

        Each layer is dense -> optional batch norm -> activation -> optional dropout.

        Args:
            X: input tensor fed to the first dense layer.
            is_training: forwarded to batch normalization and dropout.
            reuse: whether to reuse variables created by an earlier call.

        Returns:
            The output tensor of the last dense layer.
        """
        net = X
        layer_specs = zip(self.idx_fc_layers,
                          self.fc_hidden_units,
                          self.fc_batch_norm,
                          self.fc_dropout_keep_prob)
        for i, units, use_bn, drop_rate in layer_specs:
            net = tf.layers.dense(net,
                                  units=units,
                                  reuse=reuse,
                                  name='FC' + str(i))
            if use_bn:
                net = tf.layers.batch_normalization(net, training=is_training, name='fc_BN' + str(i), reuse=reuse)
            net = self.activation(net)
            # Passed as `rate`, i.e. the fraction of units to drop.
            if drop_rate:
                net = tf.layers.dropout(net, rate=drop_rate, training=is_training, name='fc_dropout' + str(i))
        return net

    def rnn_layers(self, inputs, is_training = True, reuse = False):
        """Run the (stacked) LSTM over ``inputs`` and return the last time step.

        Args:
            inputs: sequence tensor passed to ``tf.nn.dynamic_rnn``
                (batch-major, as produced by ``get_reshaped_cnn_to_rnn``).
            is_training: when False, every output keep probability is set to 1
                so that dropout is disabled.
            reuse: whether to reuse LSTM variables created by an earlier call.

        Returns:
            The LSTM output at the final time step.
        """
        if is_training:
            keep_probs = self.rnn_dropout_keep_prob
        else:
            # Same length as the configured list, but all ones (no dropout).
            keep_probs = np.ones_like(self.rnn_dropout_keep_prob)

        if len(self.idx_rnn_layers) == 1:
            # Single layer: a bare cell (not wrapped in MultiRNNCell).
            cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_n_hiddens[0], reuse=reuse)
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_probs[0])
        else:
            # Multiple layers: one dropout-wrapped cell per entry, stacked.
            cell_list = []
            for i, n_hidden, keep_prob in zip(self.idx_rnn_layers, self.rnn_n_hiddens, keep_probs):
                cell_ = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, reuse=reuse)
                cell_ = tf.nn.rnn_cell.DropoutWrapper(cell_, output_keep_prob=keep_prob)
                cell_list.append(cell_)
            cell = tf.nn.rnn_cell.MultiRNNCell(cell_list)

        # outputs: [batch_size, n_steps (width), n_hidden of the last layer]
        outputs, states = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
        print(outputs.get_shape().as_list())
        # Move time to the leading axis so that [-1] selects the last time step.
        outputs = tf.transpose(outputs, [1, 0, 2])
        outputs = outputs[-1]
        return outputs

    def get_reshaped_cnn_to_rnn(self, inputs):
        """Turn a CNN feature map into a sequence for the LSTM.

        Args:
            inputs: tensor laid out as [batch, height, width, n_feature_maps].

        Returns:
            Tensor of shape [batch, width, height * n_feature_maps], i.e. the
            width axis becomes the time axis.
        """
        # [batch, height, width, n_feature_maps]
        shape = inputs.get_shape().as_list()
        # Target layout: [batch, width, height x n_feature_maps]
        inputs = tf.transpose(inputs, [0, 2, 1, 3])
        reshaped_inputs = tf.reshape(inputs, [-1, shape[2], shape[1] * shape[3]])
        return reshaped_inputs

    def get_logits(self, X, is_training = True, reuse = False):
        """Assemble the enabled stages and return the class logits.

        Args:
            X: input tensor.
            is_training: forwarded to batch norm / dropout / LSTM dropout.
            reuse: set True to share variables with a previously built graph.

        Returns:
            Logits tensor with ``num_classes`` units in the last dimension.
        """
        with tf.variable_scope(self.name):
            net = X
            if self.use_cnn:
                net = self.convolutional_layers(net, is_training, reuse)

            if self.use_GAP:
                shape = net.get_shape().as_list()
                # Global pooling window is (height, width).
                if self.use_rnn:
                    # Pool over height only so the width axis survives as the
                    # time axis for the LSTM.
                    pool_size = (shape[1], 1)
                else:
                    # The original code lacked this `else`, so the RNN-specific
                    # window above was always overwritten.
                    pool_size = (shape[1], shape[2])
                net = tf.layers.average_pooling2d(net, pool_size=pool_size, strides=1, padding='VALID')
                if not self.use_rnn:
                    # Flatten for the dense layers. The RNN path must stay 4-D
                    # because get_reshaped_cnn_to_rnn indexes four dimensions.
                    net = tf.layers.flatten(net)

            if self.use_rnn:
                reshaped_fp = self.get_reshaped_cnn_to_rnn(net)
                net = self.rnn_layers(reshaped_fp, is_training, reuse)

            if self.use_fc:
                if not self.use_GAP:
                    net = tf.layers.flatten(net)
                net = self.fc_layers(net, is_training, reuse)

            output = tf.layers.dense(net, units=self.num_classes, reuse=reuse, name='out')
            return output
    


In [4]:
def train_parser(serialized_example):
    """Decode one serialized training record into a ``(spectrum, label)`` pair.

    Args:
        serialized_example: a single serialized example, as yielded by the
            TFRecord dataset this function is mapped over.

    Returns:
        Tuple ``(spectrum, label)``: ``spectrum`` holds 128 * 100 float32
        values (flattened) and ``label`` holds 12 int64 values
        (presumably a one-hot class vector -- confirm against the writer).
    """
    feature_spec = {
        "spectrum": tf.FixedLenFeature([128 * 100], tf.float32),
        "label": tf.FixedLenFeature([12], tf.int64),
    }
    example = tf.parse_single_example(serialized_example, feature_spec)
    return example['spectrum'], example['label']
        
    
def test_parser(serialized_example):
    features = {
        "spectrum": tf.FixedLenFeature([128 * 100], tf.float32),
    }

    parsed_feature = tf.parse_single_example(serialized_example, features)

    spec = parsed_feature['spectrum']

    return spec

### Training

In [5]:
# Start from an empty default graph so this cell can be re-run safely.
tf.reset_default_graph()

test_data_dir = "../data/tfrecords/test_final.tfrecord"
train_data_dir = "../data/tfrecords/train_final.tfrecord"

# Training input pipeline: parse -> shuffle (fixed seed) -> batch.
train_dataset = tf.data.TFRecordDataset(train_data_dir).map(train_parser)
train_dataset = train_dataset.shuffle(700000, seed = 1)
train_dataset = train_dataset.batch(params['batch_size'])

# Test input pipeline: parse -> batch (order is preserved, no shuffling).
test_dataset = tf.data.TFRecordDataset(test_data_dir).map(test_parser)
test_dataset = test_dataset.batch(params['batch_size'])

# Re-initializable iterators. tf.data.Iterator is used instead of the
# deprecated tf.contrib.data.Iterator alias; the datasets above already come
# from tf.data, so the same API family is used throughout.
train_itr = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
test_itr = tf.data.Iterator.from_structure(test_dataset.output_types, test_dataset.output_shapes)

spec, label = train_itr.get_next()
test_spec = test_itr.get_next()

# Flattened 128 * 100 spectra -> [batch, 128, 100, 1] images.
spec = tf.reshape(spec, [-1, 128, 100, 1])
spec = tf.cast(spec, tf.float32)

test_spec = tf.reshape(test_spec, [-1, 128, 100, 1])
test_spec = tf.cast(test_spec, tf.float32)

# Running one of these ops (re)starts iteration over the corresponding dataset.
train_init_op = train_itr.make_initializer(train_dataset)
test_init_op = test_itr.make_initializer(test_dataset)

# Variable-scope name of the model; also used below to collect its update ops.
name = 'model'
model = Model(params, name)

with tf.device('/gpu:0'):
    # Batches are fetched as numpy arrays from the iterators and fed back in
    # through these placeholders by the training / prediction cells.
    X = tf.placeholder(tf.float32, [None, params['height'], params['width'], 1])
    Y = tf.placeholder(tf.float32, [None, params['num_classes']])
    global_step = tf.Variable(0, trainable = False, name = 'global_step')

    # Training graph (creates the variables).
    logits_train = model.get_logits(X)
    loss = tf.losses.softmax_cross_entropy(Y, logits_train)

    # Batch-norm moving statistics are updated through UPDATE_OPS; they must
    # run together with the optimizer step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=name)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(params['learning_rate']).minimize(loss, global_step=global_step)

    # Evaluation graph: shares weights with the training graph (reuse=True).
    logits_eval = model.get_logits(X, is_training=False, reuse=True)
    predict_proba_ = tf.nn.softmax(logits_eval)
    prediction = tf.argmax(predict_proba_, 1)
    # tf.metrics.accuracy returns (value, update_op) and keeps *streaming*
    # counters in local variables: accuracy[1] updates them and yields the
    # running accuracy over every batch seen since the last reset.
    accuracy = tf.metrics.accuracy(tf.argmax(Y, 1), prediction)

    # Prediction graph: a second inference copy, again sharing the weights.
    logits_test = model.get_logits(X, is_training=False, reuse=True)
    test_predict_proba_ = tf.nn.softmax(logits_test)
    test_prediction = tf.argmax(test_predict_proba_, 1)

    # Print variables / create TensorBoard summaries
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', accuracy[1])

#     for v in tf.trainable_variables():
#         tf.summary.histogram('Var_{}'.format(v.name), v)
#         print(v)

    merged = tf.summary.merge_all()
    


[None, 13, 512]
[None, 13, 512]
[None, 13, 512]


In [None]:
# 모델 저장
saver = tf.train.Saver(tf.global_variables())

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options))

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

# model restore (writer 이하는 주석처리)
#new_saver = tf.train.import_meta_graph('./model/6conv_lstm_e5/6conv_lstm_e5.ckpt-21070.meta')
#new_saver.restore(sess, tf.train.latest_checkpoint('./model/6conv_lstm_e5/'))

writer = tf.summary.FileWriter('./logs_new/6conv_lstm_e5_2/', sess.graph)
        
for epoch in range(params['epochs']):
    sess.run(train_init_op)
    acc = []
    train_acc = []
    for i in range(2186):
        try:
            step = sess.run(global_step)
            
            _spec, _label = sess.run([spec, label])
            _, c, _summ = sess.run([optimizer, loss, merged], feed_dict = {X: _spec, Y: _label})
            acc_train = sess.run(accuracy, feed_dict = {X: _spec, Y: _label})
            
            train_acc.append(acc_train[1])
            
            writer.add_summary(_summ, step)
            
            if step % 500 == 0:
                print('step: {}, cost: {}'.format(step, c))
                
        except tf.errors.OutOfRangeError:
            break 
    while True:
        try:
            _spec, _label = sess.run([spec, label])
            val_acc = sess.run(accuracy, feed_dict = {X: _spec, Y: _label})
            
            acc.append(val_acc[1]) 
        except tf.errors.OutOfRangeError:
            break
    print('epoch: {}, cost : {}, train_acc: {}, acc; {}'.format(epoch, c, np.mean(train_acc), np.mean(acc)))

saver.save(sess, './model/6conv_lstm_e5_2/6conv_lstm_e5.ckpt', global_step=sess.run(global_step))

step: 0, cost: 2.7028439044952393
step: 500, cost: 0.4938085675239563
step: 1000, cost: 0.38135242462158203
step: 1500, cost: 0.21590806543827057
step: 2000, cost: 0.15291017293930054
epoch: 0, cost : 0.1843324452638626, train_acc: 0.7470812201499939, acc; 0.8612012267112732
step: 2500, cost: 0.23617693781852722
step: 3000, cost: 0.12155231833457947
step: 3500, cost: 0.06286904960870743
step: 4000, cost: 0.06124712899327278
epoch: 1, cost : 0.19882212579250336, train_acc: 0.8917738795280457, acc; 0.9108495116233826
step: 4500, cost: 0.06294667720794678
step: 5000, cost: 0.24623721837997437
step: 5500, cost: 0.0916653648018837
step: 6000, cost: 0.0851302444934845


- Total number of batches: 3122 (70% of them, about 2186, are used for training; the rest for validation)

In [23]:
# 70% of the 3122 total batches; rounded up, this is the 2186 used as the
# number of training batches per epoch in the training loop.
3122*0.7

2185.3999999999996

In [7]:
# Write a checkpoint of the current session; the current global_step value is
# appended to the checkpoint file name.
saver.save(sess, './model/6conv_2lstm_/6conv_2lstm_.ckpt', global_step=sess.run(global_step))

'./model/6conv_2lstm_/6conv_2lstm_.ckpt-21070'

### Predict

In [8]:
# (Re)start iteration over the test set.
sess.run(test_init_op)

# Collect per-batch predictions and concatenate once at the end; growing the
# array with np.hstack inside the loop copies all previous results every batch.
batch_predictions = []
while True:
    try:
        test_spec_ = sess.run(test_spec)
    except tf.errors.OutOfRangeError:
        # The iterator is exhausted: every test batch has been processed.
        break
    batch_predictions.append(sess.run(test_prediction, feed_dict={X: test_spec_}))

if batch_predictions:
    predict = np.concatenate(batch_predictions)
else:
    # Empty test set: keep an integer dtype so np.bincount still works.
    predict = np.array([], dtype=np.int64)

# Number of predictions per class index, then the total number of predictions.
print(np.bincount(predict))
print(len(predict))

[ 5443  5430  5234  5810  6157  6546  5254  7299  5494 94569  5952  5350]
158538


In [14]:
# (Re)start iteration over the test set.
sess.run(test_init_op)

# Collect per-batch class probabilities and concatenate once at the end;
# growing the array with np.vstack inside the loop copies all previous rows
# on every batch.
batch_probabilities = []
while True:
    try:
        test_spec_ = sess.run(test_spec)
    except tf.errors.OutOfRangeError:
        # The iterator is exhausted: every test batch has been processed.
        break
    batch_probabilities.append(sess.run(test_predict_proba_, feed_dict={X: test_spec_}))

if batch_probabilities:
    predict_proba = np.concatenate(batch_probabilities, axis=0)
else:
    predict_proba = np.empty((0, params['num_classes']), dtype=np.float32)

print(predict_proba.shape)

# The probabilities are written to CSV in a later cell, after `files` has been
# defined; doing it here referenced `files` before its definition and broke a
# top-to-bottom run of the notebook.

(158538, 12)


In [9]:
# One-hot label layout (position of the 1 -> class):
#   index  0: down      [1 0 0 0 0 0 0 0 0 0 0 0]
#   index  1: go        [0 1 0 0 0 0 0 0 0 0 0 0]
#   index  2: left      [0 0 1 0 0 0 0 0 0 0 0 0]
#   index  3: no        [0 0 0 1 0 0 0 0 0 0 0 0]
#   index  4: off       [0 0 0 0 1 0 0 0 0 0 0 0]
#   index  5: on        [0 0 0 0 0 1 0 0 0 0 0 0]
#   index  6: right     [0 0 0 0 0 0 1 0 0 0 0 0]
#   index  7: silence   [0 0 0 0 0 0 0 1 0 0 0 0]
#   index  8: stop      [0 0 0 0 0 0 0 0 1 0 0 0]
#   index  9: unknown   [0 0 0 0 0 0 0 0 0 1 0 0]
#   index 10: up        [0 0 0 0 0 0 0 0 0 0 1 0]
#   index 11: yes       [0 0 0 0 0 0 0 0 0 0 0 1]

# class_names[i] is the label string for predicted class index i.
class_names = [
    'down', 'go', 'left', 'no',
    'off', 'on', 'right', 'silence',
    'stop', 'unknown', 'up', 'yes',
]

audio_path = '../data/test/audio/'

# Test file names in sorted order.
# NOTE(review): assumes the test TFRecord was written in this same sorted
# order, so files[i] matches predict[i] -- confirm against the writer.
files = sorted(os.listdir(audio_path))

In [15]:
# Sanity check: the number of test files should equal the number of predictions.
len(files)

158538

In [16]:
# Save the per-class probabilities, one row per test file.
# NOTE(review): the frame is indexed by file name, but index=False drops that
# index on write, so the CSV contains only the probability columns -- confirm
# this is what downstream consumers expect.
pp = pd.DataFrame(predict_proba, index = files)
pp.to_csv('6conv_2lstm_predict_proba.csv', index = False)

In [19]:
# Rewrites the same CSV as the previous cell (same frame, same path, same options).
pp.to_csv('6conv_2lstm_predict_proba.csv', index = False)

In [None]:
# Spot check: file name and predicted class index of one test sample.
print(files[1500], '/', predict[1500])

In [None]:
# Play one test clip inline in the notebook (sample rate passed as 16000).
# librosa / librosa.display are imported here but not used by this cell.
import librosa
import librosa.display
import IPython.display as ipd

ipd.Audio(audio_path + 'clip_0000adecb.wav', rate=16000)

### submission.csv 파일 생성

In [11]:
import csv

# Fail loudly instead of writing a truncated or misaligned submission.
if len(files) != len(predict):
    raise ValueError('number of files ({}) does not match number of predictions ({})'
                     .format(len(files), len(predict)))

# Write the submission file: one row per test clip with its predicted label.
# NOTE(review): on Python 3 the csv module documentation recommends opening
# the file with newline='' -- confirm the interpreter version before adding it.
with open('./sub/6conv_2lstm_.csv', 'w') as f:
    fieldnames=['fname', 'label']
    # Named csv_writer (not `writer`) so the TensorBoard FileWriter created in
    # the training cell is not overwritten.
    csv_writer = csv.DictWriter(f, fieldnames=fieldnames)

    csv_writer.writeheader()

    # Map each predicted class index to its label string.
    for fname, class_idx in zip(files, predict):
        csv_writer.writerow({'fname': fname, 'label': class_names[class_idx]})

In [None]:
# NOTE(review): despite the "_predict_proba" file name, this cell writes the
# same fname/label rows as the submission cell above (predicted labels, not
# probabilities). It also relies on `csv` having been imported by that cell.
# Confirm the intent before running it.
with open('./sub/6conv_2lstm_.csv_predict_proba', 'w') as f:
    fieldnames=['fname', 'label']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    
    writer.writeheader()
    
    # One row per prediction: file name and the label string for its class index.
    for i in range(len(predict)):
        writer.writerow({'fname': files[i], 'label': class_names[predict[i]]})

In [31]:
# Load a submission CSV for post-processing.
# NOTE(review): this reads './6conv_2lstm_.csv', while the submission cell
# writes to './sub/6conv_2lstm_.csv' -- confirm which file is intended.
temp = pd.read_csv('./6conv_2lstm_.csv')
# Disabled experiment: relabel every 'silence' prediction as 'unknown'.
# for i in range(len(temp["label"])):
#     if temp['label'][i] == 'silence':
#         temp['label'][i] = 'unknown'

In [29]:
# Write the (possibly relabelled) submission frame back to disk without the index.
temp.to_csv("./6conv_2lstm_temp.csv", index=False)

In [30]:
# Read the saved file back and display it to verify what was written.
ttt = pd.read_csv('./6conv_2lstm_temp.csv')
ttt

Unnamed: 0,fname,label
0,clip_000044442.wav,no
1,clip_0000adecb.wav,unknown
2,clip_0000d4322.wav,unknown
3,clip_0000fb6fe.wav,unknown
4,clip_0001d1559.wav,unknown
5,clip_0002256ed.wav,unknown
6,clip_0002a4a1f.wav,unknown
7,clip_0002d9b83.wav,unknown
8,clip_000373a5b.wav,go
9,clip_0003c7122.wav,unknown


In [34]:
# Label distribution of the re-read file. In the recorded output there is no
# 'silence' row and 'unknown' equals the sum of the 'unknown' and 'silence'
# counts of `temp` (presumably the relabelling experiment was active when the
# file was written -- the execution counts are out of order).
ttt.label.value_counts()

unknown    101868
on           6546
off          6157
up           5952
no           5810
stop         5494
down         5443
go           5430
yes          5350
right        5254
left         5234
Name: label, dtype: int64

In [33]:
# Label distribution of the loaded submission, for comparison with `ttt`.
temp.label.value_counts()

unknown    94569
silence     7299
on          6546
off         6157
up          5952
no          5810
stop        5494
down        5443
go          5430
yes         5350
right       5254
left        5234
Name: label, dtype: int64

In [22]:
# Shell escape: show current GPU utilisation and memory usage.
!nvidia-smi

Thu Jan 11 04:18:46 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81                 Driver Version: 384.81                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1E.0 Off |                    0 |
| N/A   34C    P0    34W / 300W |  13508MiB / 16152MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [21]:
# Close the TensorFlow session created in the training cell.
sess.close()