In [4]:
import os
import sys
import glob
import time
import librosa
import librosa.display
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
%matplotlib inline
plt.style.use('ggplot')

In [3]:
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield start, start + window_size
        start += (window_size // 2)

def extract_features(parent_dir, sub_dirs, file_ext="*.mp3", bands = 60, frames = 41):
    window_size = 512 * (frames - 1)
    log_specgrams = []
    labels = []
    for l, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            sound, sr = librosa.load(fn)
            label = fn.split('/')[-1].split('-')[-2]
            for (start, end) in windows(sound, window_size):
                if(len(sound[start:end]) == window_size):
                    signal = sound[start:end]
                    melspec = librosa.feature.melspectrogram(signal, n_mels = bands)
                    logspec = librosa.logamplitude(melspec)
                    logspec = logspec.T.flatten()[:, np.newaxis].T
                    log_specgrams.append(logspec)
                    labels.append(label)
            
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    
    return np.array(features), np.array(labels, dtype = np.int)


def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels, n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

In [11]:
parent_dir = 'dataset/train'
tr_sub_dirs = ['fold1', 'fold2', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8', 'fold9']
train_dataset, train_labels = extract_features(parent_dir, tr_sub_dirs)
train_labels = one_hot_encode(train_labels)

valid_sub_dirs = ['fold4']
valid_dataset, valid_labels = extract_features(parent_dir, valid_sub_dirs)
valid_labels = one_hot_encode(valid_labels)

ts_sub_dirs = ['fold3']
test_dataset, test_labels = extract_features(parent_dir, ts_sub_dirs)
test_labels = one_hot_encode(test_labels)

In [14]:
import pickle

with open('train_dataset.pickle', 'wb') as f:
    pickle.dump(train_dataset, f)

with open('train_labels.pickle', 'wb') as f:
    pickle.dump(train_labels, f)

with open('valid_dataset.pickle', 'wb') as f:
    pickle.dump(valid_dataset, f)

with open('valid_labels.pickle', 'wb') as f:
    pickle.dump(valid_labels, f)

with open('test_dataset.pickle', 'wb') as f:
    pickle.dump(test_dataset, f)

with open('test_labels.pickle', 'wb') as f:
    pickle.dump(test_labels, f)

OSError: [Errno 22] Invalid argument

In [7]:
import pickle

# with open('train_dataset.pickle', 'rb') as f:
#     train_dataset = pickle.load(f)

# with open('train_lables.pickle', 'rb') as f:
#     train_labels = pickle.load(f)

with open('dataset_features/valid_dataset.pickle', 'rb') as f:
    valid_dataset = pickle.load(f)

with open('dataset_features/valid_labels.pickle', 'rb') as f:
    valid_labels = pickle.load(f)

with open('dataset_features/test_dataset.pickle', 'rb') as f:
    test_dataset = pickle.load(f)

with open('dataset_features/test_labels.pickle', 'rb') as f:
    test_labels = pickle.load(f)

In [10]:
test_dataset.shape

(8260, 60, 41, 2)

# now nn

In [33]:
from sklearn.utils import shuffle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, TimeDistributed, Bidirectional, Conv2D
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout
from keras.utils import to_categorical

In [23]:
tr_ds = valid_dataset[:, :, :, 0]

In [27]:
model = Sequential()
model.add(LSTM(128, return_sequences = True, input_shape=(60, 41)))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences = False))
model.add(Dense(10))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
print('Fitting...')
for iteration in range(1, 100):
    sys.stdout.write('{}..'.format(iteration))
    tr_ds_sh, tr_lb_sh = shuffle(tr_ds, valid_labels)
    model.fit(tr_ds_sh, tr_lb_sh, batch_size=len(tr_ds), epochs=1, verbose=0)
print('fitted!')
print('Predicting...')
preds = model.predict(test_dataset[:, :, :, 0], verbose=0)
print('done!')
preds = np.array(list(map(lambda x: list(map(lambda i: 1 if i==max(x) else 0, x)), preds)))

from sklearn.metrics import classification_report, accuracy_score
print(classification_report(test_labels, preds))
print(accuracy_score(test_labels, preds))

Fitting...
1..2..3..4..5..6..7..8..9..10..11..12..13..14..15..16..17..18..19..20..21..22..23..24..25..26..27..28..29..30..31..32..33..34..35..36..37..38..39..40..41..42..43..44..45..46..47..48..49..50..51..52..53..54..55..56..57..58..59..60..61..62..63..64..65..66..67..68..69..70..71..72..73..74..75..76..77..78..79..80..81..82..83..84..85..86..87..88..89..90..91..92..93..94..95..96..97..98..99..fitted!
Predicting...
done!
             precision    recall  f1-score   support

          0       0.37      0.40      0.39       938
          1       0.48      0.44      0.46       892
          2       0.19      0.13      0.15       644
          3       0.15      0.09      0.11       770
          4       0.30      0.17      0.21       811
          5       0.19      0.45      0.26       755
          6       0.10      0.06      0.08       878
          7       0.28      0.20      0.23       796
          8       0.21      0.24      0.22       948
          9       0.21      0.27      0.23 

In [41]:
model = Sequential()
model.add(Conv2D(128, (57, 6), input_shape=(60, 41, 2)))
model.add(Dropout(0.5))
model.add(Conv2D(128, (1, 3)))
model.add(Flatten())
model.add(Dense(10))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [42]:
model.output_shape

(None, 10)

In [43]:
model.fit(valid_dataset, valid_labels, epochs=33, batch_size=100,  validation_split=0.1)

Train on 7303 samples, validate on 812 samples
Epoch 1/33
Epoch 2/33
Epoch 3/33
Epoch 4/33
Epoch 5/33
Epoch 6/33
Epoch 7/33
Epoch 8/33
Epoch 9/33
1500/7303 [=====>........................] - ETA: 20s - loss: 14.3559 - acc: 0.1093

KeyboardInterrupt: 

## почему могло не получиться

- мел-кепстральные коэффициенты ничего не говорят
- грязные данные
- слишком большой learning rate
- нельзя верить тому, как ты год назад делал датасет

# окей, а что ещё у нас есть

Есть большая куча англоязычных субтитров с отметками звуков. Может, можно как-то скластеризовать на условные жанры по звукам?

In [60]:
# распаковываем субтитры
folder = '/Users/Basilis/Documents/maga/ML/diplom/'
if not os.path.exists(os.path.join(folder, 'txtfiles')):
    os.mkdir(os.path.join(folder, 'txtfiles'))
for root, dirs, files in os.walk('/Users/Basilis/Downloads/files/'):
    for fn in files:
        if fn.endswith('.gz'):
            if not fn.endswith('.txt.gz'):
                with gzip.open(os.path.join(root, fn), 'rb') as f_in, open(os.path.join(folder, 'txtfiles', fn[:-3]), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

In [5]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics import *
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import re
import random
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, classification_report, f1_score, make_scorer, accuracy_score

In [64]:
# делаем выборку и засовываем в массив
sample = random.sample(os.listdir('../txtfiles'), 2000)

texts = []
for fil in sample:
    with open(os.path.join('../txtfiles', fil), 'rb') as f:
        try:
            text = f.read().decode('utf-8')
            texts.append(text)
        except:
            continue
print(len(texts))
print(texts[300][:300])

1680
1
00:00:24,991 --> 00:00:27,494
[ Door Buzzer Buzzing ]
DADDY.

2
00:00:30,497 --> 00:00:32,499
DADDY, ARE YOU HOME ?
MAYBE HE ISN'T.

3
00:00:32,999 --> 00:00:35,001
OF COURSE HE IS.
DADDY !

4
00:00:35,001 --> 00:00:38,004
[ Knocking ]
FIND HIM ?

5
00:00:38,004 --> 00:00:42,6


In [65]:
# чистим и засовываем в датафрейм
texts = pd.DataFrame({'text': texts})
texts['clear'] = texts['text'].map(lambda x: re.sub('((\r)?\n)|(</?[^>]+>)', ' ', re.sub('[0-9\[\]#"\':\-<>,]+', '', x)))
texts['clear'] = texts['clear'].map(lambda x: re.sub('[A-Z][a-z]+', ' ', x).lower())
texts.head()

Unnamed: 0,text,clear
0,"1\r\n00:00:02,240 --> 00:00:04,470\r\nAnd it w...",it was at that moment that realised th...
1,"1\r\n00:00:40,065 --> 00:00:47,784\r\n(unearth...",(unearthly howling) ? is it? ...
2,"﻿1\r\n00:00:01,099 --> 00:00:04,801\r\n<i>In 2...",﻿ i an immortal tyrant named /i i...
3,"﻿1\r\n00:00:01,367 --> 00:00:03,184\r\nOliver:...",﻿ name is . five years on a h...
4,"1\r\n00:00:54,444 --> 00:00:56,522\r\nHello, J...",. . taking this cat over to gr...


In [52]:
del texts['text']

In [66]:
stopwds = stopwords.words('english') + ['like', 'oh', 'know', 'right', 'get', 'let', 'go', 'well', 
                                        'come', 'gonna', 'yeah', 'good', 'yes', 'got', 'one', 'hey', 'think', 'want', 
                                       'us', 'man', 'men', 'okay', 'need', 'see', 'back', 'would', 'oh', 'going', 
                                       'take']

In [67]:
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwds)),
    ('tfidf', TfidfTransformer()),
#     ('svd', TruncatedSVD(n_components=100, random_state=27)),
    ('norm', Normalizer() ),
    ('clust', KMeans(n_clusters=10, random_state=75))
])
print('fitting...')
pipeline.fit(texts['clear'])
print('fitted')


# explained_variance = pipeline.named_steps['svd'].explained_variance_ratio_.sum()
# print("Explained variance of the SVD step: {:.0f}%".format(explained_variance * 100))

clust_labels = pipeline.named_steps['clust'].labels_
# labels = texts['event_id']

# print("Homogeneity:", homogeneity_score(labels, clust_labels))
# print("Completeness:", completeness_score(labels, clust_labels))
# print("V-measure",  v_measure_score(labels, clust_labels))
# print("Adjusted Rand-Index:",  adjusted_rand_score(labels, clust_labels))
# print()

fitting...
fitted


In [68]:
print("Top terms per cluster:")
order_centroids = pipeline.named_steps['clust'].cluster_centers_.argsort()[:, ::-1]
terms = pipeline.named_steps['vect'].get_feature_names()
for i in range(10):
    print("Cluster {}".format(i)),
    for ind in order_centroids[i, :10]:
        print(' {}'.format(terms[ind])),

Top terms per cluster:
Cluster 0
 dont
 grunting
 grunts
 growling
 screaming
 gasps
 snarling
 youre
 panting
 sighs
Cluster 1
 music
 dont
 instrumental
 youre
 dramatic
 time
 cant
 shit
 continues
 could
Cluster 2
 fucking
 dont
 fuck
 shit
 youre
 fuckin
 time
 mean
 really
 cant
Cluster 3
 dont
 youre
 time
 cant
 really
 didnt
 mean
 could
 say
 something
Cluster 4
 im
 ill
 dont
 ive
 rangers
 youre
 god
 ha
 thats
 mr
Cluster 5
 lm
 lts
 lll
 lt
 lve
 lf
 dont
 ng
 ls
 ld
Cluster 6
 dont
 youre
 didnt
 font
 time
 tell
 cant
 could
 way
 something
Cluster 7
 font
 color
 dd
 ffff
 aaffi
 ff
 dont
 fontfont
 white
 ffi
Cluster 8
 dont
 time
 youre
 cant
 way
 could
 say
 something
 make
 little
Cluster 9
 ok
 dont
 youre
 cant
 time
 really
 something
 didnt
 could
 love


Если не удалять имена, стабильно выделяется кластер Стартрека, также иногда кластеры ТБВ, Супернатуралов, Симпсонов

Если удалять, получается странно, но звуки действительно кластеризуются

## зачем это может быть нужно

- не знаю, но это весело

In [16]:
bands = 60
frames = 41
num_labels = 10
num_channels = 2

batch_size = 1000
patch1_h = 57
patch1_w = 6
patch2_h = 1
patch2_w = 3
depth = 20
num_hidden = 80
learning_rate = 0.002 # 0.05
dropout = 0.5

# nn.dropout() только на трейне

graph = tf.Graph()

with graph.as_default():

    # input
    tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, bands, frames, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset, dtype='float32_ref')
    tf_test_dataset = tf.constant(test_dataset, dtype='float32_ref')
  
    # layers
    layer1_weights = tf.Variable(tf.truncated_normal(
      [patch1_h, patch1_w, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    
    layer2_weights = tf.Variable(tf.truncated_normal(
      [patch2_h, patch2_w, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    
    layer3_weights = tf.Variable(tf.truncated_normal(
      [frames // 8 * bands * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    
    layer4_weights = tf.Variable(tf.truncated_normal(
      [frames // 10 * depth, num_hidden], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    
    layer5_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
    layer5_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
    # model
    def model(data, train=True):
        # первый свёрточный
        print('data ', data)
        print('l1 filters ', layer1_weights)
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)
        print('first layer ', hidden.shape)
        hidden = tf.nn.max_pool(hidden, ksize=[1, 4, 3, 1], strides=[1, 1, 3, 1],
                          padding='SAME')
        if train:
            hidden = tf.nn.dropout(hidden, dropout)
            print('here')
        print('max pool ', hidden.shape)
        # второй свёрточный
        print('filters ', layer2_weights)
        conv = tf.nn.conv2d(hidden, layer2_weights, [1, 1, 1, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)
        print('second layer ', hidden.shape)
        hidden = tf.nn.max_pool(hidden, ksize=[1, 1, 3, 1], strides=[1, 1, 3, 1],
                          padding='SAME')
        print('max pool ', hidden.shape)
        # FC 1
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        print('reshape', reshape.shape)
        print(layer3_weights)
        print(layer3_biases)
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        print('third layer ', hidden.shape)
        if train:
            hidden = tf.nn.dropout(hidden, dropout)
        # FC 2
        shape = hidden.get_shape().as_list()
        print('fourth shape', shape)
#         reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(hidden, layer4_weights) + layer4_biases)
        if train:
            hidden = tf.nn.dropout(hidden, dropout)
        return tf.matmul(hidden, layer5_weights) + layer5_biases
  
    # training
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
    # optimizer
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
  
    # predictions
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset, train=False))
    test_prediction = tf.nn.softmax(model(tf_test_dataset, train=False))

data  Tensor("Placeholder:0", shape=(1000, 60, 41, 2), dtype=float32)
l1 filters  <tf.Variable 'Variable:0' shape=(57, 6, 2, 20) dtype=float32_ref>
first layer  (1000, 60, 41, 20)
here
max pool  (1000, 60, 14, 20)
filters  <tf.Variable 'Variable_2:0' shape=(1, 3, 20, 20) dtype=float32_ref>
second layer  (1000, 60, 14, 20)
max pool  (1000, 60, 5, 20)
reshape (1000, 6000)
<tf.Variable 'Variable_4:0' shape=(6000, 80) dtype=float32_ref>
<tf.Variable 'Variable_5:0' shape=(80,) dtype=float32_ref>
third layer  (1000, 80)
fourth shape [1000, 80]
data  Tensor("Const:0", shape=(8115, 60, 41, 2), dtype=float32)
l1 filters  <tf.Variable 'Variable:0' shape=(57, 6, 2, 20) dtype=float32_ref>
first layer  (8115, 60, 41, 20)
max pool  (8115, 60, 14, 20)
filters  <tf.Variable 'Variable_2:0' shape=(1, 3, 20, 20) dtype=float32_ref>
second layer  (8115, 60, 14, 20)
max pool  (8115, 60, 5, 20)
reshape (8115, 6000)
<tf.Variable 'Variable_4:0' shape=(6000, 80) dtype=float32_ref>
<tf.Variable 'Variable_5:0' sh

In [17]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

In [18]:
num_steps = 2000

saver = tf.train.Saver([layer1_weights])

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in tqdm(range(num_steps)):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 50 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
#             valpred = session.run([valid_prediction], feed_dict={tf_valid_dataset: valid_dataset})
#             print('Validation accuracy: %.1f%%' % accuracy(val_pred, valid_labels))
            save_path = saver.save(session, "model.ckpt")
#             print("Model saved in file: %s" % save_path)
#     print('Test accuracy: %.1f%%' % accuracy(session.run([test_prediction], feed_dict={tf_test_dataset: test_dataset}), test_labels))


  0%|          | 0/2000 [00:00<?, ?it/s]

Initialized


[A


Minibatch loss at step 0: 270.738434
Minibatch accuracy: 8.2%


  2%|▎         | 50/2000 [20:47<12:57:33, 23.93s/it]

Minibatch loss at step 50: 2.764158
Minibatch accuracy: 9.2%


  3%|▎         | 60/2000 [25:13<15:00:31, 27.85s/it]

KeyboardInterrupt: 