In [1]:
import librosa, librosa.display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
import pathlib
import csv
import glob
import warnings
import IPython.display as ipd
import shutil
warnings.filterwarnings('ignore')

In [2]:
# Extracting the Spectrogram for every Audio
flac_fns = glob.glob('ASMR collection/*/*.flac')
flac_num = len(flac_fns)

def feature_extraction(file_name):
    y, sr = librosa.load(file_name)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    feats = np.concatenate([chroma_stft, spec_cent, spec_bw, rolloff, zcr, mfcc], axis=0)
    return feats

In [3]:
feat_matrices = []
for i in range(flac_num):
    feat_mat = feature_extraction(flac_fns[i])
    feat_matrices.append(feat_mat)

In [4]:
feat_vectors = []
for i in range(flac_num):
    feat_mat = feat_matrices[i]
    feat_vectors.append(np.mean(feat_mat, axis=1))

In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import euclidean

scaler = StandardScaler()
feat_vectors = scaler.fit_transform(feat_vectors)

kmeans = KMeans(n_clusters=3)
kmeans.fit(feat_vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [6]:
# labels
print('Labels')
print(kmeans.labels_)
# centors
print('cluster centor')
print(kmeans.cluster_centers_)

Labels
[1 2 2 1 0 0 0 1 2 0 0 0 2 0 0 1 0 0 2 2 0 2 1 0 0 1 1 0 2 1 1 2 2 2 2 0 0
 0 0 1 0 0 0 2 1 1 0 2 1 1 2 2 1 0 1 1 1 1 2 2 0 1 1 0 0 1 1 1 1 2 1 2 1 1
 0 0 1 1 0 2 0 1 0 1 2 0 0 1 1 1 1 0 0 1 1 0 0 1 0 1 0 1 2 1 0 1 1]
cluster centor
[[-3.84399052e-03  7.71034310e-02  1.77644932e-01  3.66937498e-01
   6.33527845e-01  7.17737574e-01  7.49366654e-01  6.59006423e-01
   5.17844914e-01  3.60944786e-01  3.90543006e-02 -1.54261009e-01
  -6.17183417e-01 -2.64442337e-01 -5.89160320e-01 -7.08251420e-01
  -1.82150262e-01  3.90653936e-01  1.11275086e-01  4.77333910e-02
   4.74480135e-02  2.20930529e-02  2.27065952e-01  4.53672390e-01
   3.24498415e-01  7.48011250e-01  7.72816426e-01  6.35532931e-01
   6.79955724e-01  5.13401442e-01  6.83967469e-01  4.44786070e-01
   6.30165797e-01  4.20747360e-01  6.90460395e-01  5.69147846e-01]
 [ 3.47692157e-01  3.08229092e-01  1.79929010e-01  7.17730724e-03
  -2.98993780e-01 -3.96473043e-01 -3.02060935e-01 -2.43683828e-01
  -1.06762149e-01  6.60518439e-02

In [7]:
# calculate the euclidean distrance of each sample to their centor
labels = np.unique(kmeans.labels_)
dist = np.zeros(flac_num)
for label_ in labels:
    inds = np.where(kmeans.labels_==label_)[0]
    for ind_ in inds:
        feat_ = feat_vectors[ind_]
        centor_ = kmeans.cluster_centers_[label_]
        dist[ind_] = euclidean(feat_, centor_)

print('Distance to centor')
print(dist)

# label_ = 0
# shutil.copyfile(flac_fns[0], 'result/cluster_%d/'%label_+flac_fns[0].split('/')[-1])
# flac_fns[0].split('/')[-1]

f = open('cluster_result.txt', 'w')

for label_ in labels:
    f.write('Cluster %d\n'%label_)
    
    folder_name = 'result/cluster_%d/'%label_
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    inds = np.where(kmeans.labels_==label_)[0]
    distance_to_centor = dist[inds]
    print('Label ', label_)
    print('The audio that closest to the cluster centor (close -> far)')
    print(inds[distance_to_centor.argsort()])
    print(distance_to_centor[distance_to_centor.argsort()])
    for ind in inds[distance_to_centor.argsort()]:
        save_to = 'result/cluster_%d/'%label_+flac_fns[ind].split('/')[-1]
        shutil.copyfile(flac_fns[ind], save_to)
        f.write('index %d %s\n'%(ind, save_to))
        
    f.write('\n\n\n')
f.close()

Distance to centor
[ 4.38376326  6.77965017  4.55418355  4.71400446  6.11348956  5.69087512
  3.42652492  4.09731041  4.77250939  2.96067629  3.49698226  5.79285052
  4.76452219  3.62591913  7.5939235   5.63590063  4.54181983  4.86336488
  4.91407665  3.97741268  4.84570083  9.35312101  6.07438823  6.89277803
  2.69021476  3.89144892  3.83534915  3.52261162  4.3006161   6.46939671
  3.54442566  4.90043924  9.52863577  4.27379569 10.25116124  3.91492187
  5.42873794  4.84911819  4.33700556  4.20554437  3.86391015  3.22272758
  2.85279099  8.81379557  4.67200779  3.7928481   5.7016607   3.74686483
  3.85109773  4.42494907  6.80262137  3.71114658  4.66002915  3.13192916
  5.85362265  4.032942    6.87327004  5.76616284  8.96745535  2.92701332
  5.69557697  4.23953946  5.4646375   3.80834532  6.72788061  5.39833855
  4.60633658  3.19515028  6.04909595  5.8296563   3.8104382   5.50796353
  2.58588934  6.36843338  3.24843736  3.55325302  4.31697213  4.93903444
  4.42505767  6.81312214  2.7977

In [8]:
with open('index_to_filename.txt', 'w') as f:
    for ind, fn in enumerate(flac_fns):
        f.write('%d %s\n'%(ind, fn))

# Class 1

In [9]:
ipd.Audio(flac_fns[72])

In [10]:
ipd.Audio(flac_fns[67])

In [11]:
ipd.Audio(flac_fns[30])

# Class 2

In [12]:
ipd.Audio(flac_fns[59])

In [13]:
ipd.Audio(flac_fns[51])

In [14]:
ipd.Audio(flac_fns[47])

# Class 0

In [15]:
ipd.Audio(flac_fns[86])

In [16]:
ipd.Audio(flac_fns[92])

In [17]:
ipd.Audio(flac_fns[24])