In [1]:
from google.colab import drive
drive.mount('/content/gdrive') 

Mounted at /content/gdrive


In [2]:
!pip install pydub
import numpy as np, scipy, matplotlib.pyplot as plt, IPython.display as ipd
import librosa, librosa.display
import os
import copy
from pydub import AudioSegment, effects
from pydub.silence import split_on_silence

Collecting pydub
  Downloading https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [3]:
origin = AudioSegment.from_file('/content/gdrive/MyDrive/Audio/J&T.wav', 'wav')
normalizedSound = effects.normalize(origin)
normalizedSound.export('/content/gdrive/MyDrive/Audio/output.wav', format='wav')

<_io.BufferedRandom name='/content/gdrive/MyDrive/Audio/output.wav'>

In [4]:
def dtw_table(x, y, distance = None):
    if distance is None:
        distance = scipy.spatial.distance.euclidean
    nx = len(x)
    ny = len(y)
    table = np.zeros((nx+1, ny+1))
    
    table[1:, 0] = np.inf    
    table[0, 1:] = np.inf
        
    for i in range(1, nx+1):
        for j in range(1, ny+1):
            d = distance(x[i-1], y[j-1])
            table[i, j] = d + min(table[i-1, j], table[i, j-1], table[i-1, j-1])
    return table

In [5]:
def dtw(x, y, table):
    i = len(x)
    j = len(y)
    path = [(i, j)]
    while i > 0 or j > 0:
        minval = np.inf
        if table[i-1, j] < minval:
            minval = table[i-1, j]
            step = (i-1, j)
        if table[i][j-1] < minval:
            minval = table[i, j-1]
            step = (i, j-1)
        if table[i-1][j-1] < minval:
            minval = table[i-1, j-1]
            step = (i-1, j-1)
        path.insert(0, step)
        i, j = step
    return np.array(path)

In [6]:
def preprocess_mfcc(mfcc):
    mfcc_cp = copy.deepcopy(mfcc)
    for i in range(mfcc.shape[1]):
        mfcc_cp[:,i] = mfcc[:,i] - np.mean(mfcc[:,i])
        mfcc_cp[:,i] = mfcc_cp[:,i]/np.max(np.abs(mfcc_cp[:,i]))
    return mfcc_cp

In [7]:
def audio_preprocess(audio):
  y, sr = librosa.load(audio)
  y_filt = librosa.effects.preemphasis(y)
  y_mfcc = librosa.feature.mfcc(y = y_filt, sr = sr)
  y_mfcc = preprocess_mfcc(y_mfcc)
  return y_mfcc

In [8]:
sound = AudioSegment.from_wav('/content/gdrive/MyDrive/Audio/output.wav')
chunks = split_on_silence(sound, min_silence_len=500, silence_thresh=-34)

output_chunks = [chunks[0]]
for chunk in chunks[1:]:
  output_chunks.append(chunk)

combined = AudioSegment.empty()
for chunk in output_chunks:
  combined += chunk
combined.export('/content/gdrive/MyDrive/Audio/output_chunk.wav', format = 'wav')

<_io.BufferedRandom name='/content/gdrive/MyDrive/Audio/output_chunk.wav'>

In [9]:
audio = AudioSegment.from_wav('/content/gdrive/MyDrive/Audio/output_chunk.wav')
n = len(audio)
counter = 1
interval = 1 * 1000
overlap = 0.5 * 1000
start = 0 
end = 0
flag = 0
for i in range(0, 2 * n, interval):
  if i == 0:
    start = 0
    end = interval
  else:
    start = end - overlap
    end = start + interval 
  if end >= n:
    end = n
    flag = 1
  chunk = audio[start:end]
  filename = '/content/gdrive/MyDrive/Audio/Result/chunk' + str(counter) + '.wav'
  chunk.export(filename, format = 'wav')
  print("Processing chunk "+str(counter)+". Start = "
                        +str(start)+" end = "+str(end))
  counter = counter + 1

Processing chunk 1. Start = 0 end = 1000
Processing chunk 2. Start = 500.0 end = 1500.0
Processing chunk 3. Start = 1000.0 end = 2000.0
Processing chunk 4. Start = 1500.0 end = 2500.0
Processing chunk 5. Start = 2000.0 end = 3000.0
Processing chunk 6. Start = 2500.0 end = 3500.0
Processing chunk 7. Start = 3000.0 end = 4000.0
Processing chunk 8. Start = 3500.0 end = 4500.0
Processing chunk 9. Start = 4000.0 end = 5000.0
Processing chunk 10. Start = 4500.0 end = 5500.0
Processing chunk 11. Start = 5000.0 end = 6000.0
Processing chunk 12. Start = 5500.0 end = 6500.0
Processing chunk 13. Start = 6000.0 end = 7000.0
Processing chunk 14. Start = 6500.0 end = 7500.0
Processing chunk 15. Start = 7000.0 end = 8000.0
Processing chunk 16. Start = 7500.0 end = 8500.0
Processing chunk 17. Start = 8000.0 end = 9000.0
Processing chunk 18. Start = 8500.0 end = 9500.0
Processing chunk 19. Start = 9000.0 end = 10000.0
Processing chunk 20. Start = 9500.0 end = 10500.0
Processing chunk 21. Start = 10000.

In [10]:
def recognize(directory, sample):
  x = audio_preprocess(sample)
  chunk_similarity = {}
  for file in os.listdir(directory):   
    filename, file_extension = os.path.splitext(file)
    if file_extension in ['.wav', '.mp3']:
      y = audio_preprocess(os.path.join(directory, file))
      D = dtw_table(x.transpose(), y.transpose(), distance=scipy.spatial.distance.cosine)
      path = dtw(x.transpose(), y.transpose(), D)
      cost = D[-1, -1]
      chunk_similarity[file] = cost 
      print(file, end = ' : ')
      print(cost)
  print(chunk_similarity)
  print({k: v for k, v in sorted(chunk_similarity.items(), key=lambda item: item[1])})

In [11]:
recognize('/content/gdrive/MyDrive/Audio/Result', '/content/gdrive/MyDrive/Audio/Test/JT - M.wav')

chunk1.wav : 2.6701258420944214
chunk2.wav : 3.248384416103363
chunk3.wav : 3.117986500263214
chunk4.wav : 3.5412585139274597
chunk5.wav : 4.4120893478393555
chunk6.wav : 4.417039215564728
chunk7.wav : 3.887979805469513
chunk8.wav : 4.651258051395416
chunk9.wav : 5.479804754257202
chunk10.wav : 4.153113842010498
chunk11.wav : 3.368163764476776
chunk12.wav : 3.3665757179260254
chunk13.wav : 3.6273345947265625
chunk14.wav : 4.74640291929245
chunk15.wav : 4.392243027687073
chunk16.wav : 3.5443586111068726
chunk17.wav : 3.321632504463196
chunk18.wav : 2.7233332991600037
chunk19.wav : 2.907004415988922
chunk20.wav : 4.982979238033295
chunk21.wav : 5.907788038253784
chunk22.wav : 4.143589377403259
chunk23.wav : 2.522533893585205
chunk24.wav : 4.002331078052521
chunk25.wav : 4.165939748287201
chunk26.wav : 3.627963423728943
chunk27.wav : 6.50015914440155
chunk28.wav : 5.575501024723053
chunk29.wav : 4.221861302852631
chunk30.wav : 5.9869484305381775
chunk31.wav : 6.453392803668976
chunk32.wav

In [12]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Test/JT - M.wav')
ipd.Audio(y, rate = sr)

In [13]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk162.wav') #1
ipd.Audio(y, rate = sr)

In [14]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk112.wav') #2
ipd.Audio(y, rate = sr)

In [15]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk23.wav') #3
ipd.Audio(y, rate = sr)

In [16]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk1.wav') #4
ipd.Audio(y, rate = sr)

In [17]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk18.wav') #5
ipd.Audio(y, rate = sr) 

In [18]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk96.wav') #6
ipd.Audio(y, rate = sr)

In [19]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk156.wav') #7
ipd.Audio(y, rate = sr)

In [20]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk58.wav') #8
ipd.Audio(y, rate = sr)

In [21]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk19.wav') #9
ipd.Audio(y, rate = sr)

In [22]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk92.wav') #10
ipd.Audio(y, rate = sr)

In [23]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk45.wav') #11
ipd.Audio(y, rate = sr)

In [24]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk3.wav') 
ipd.Audio(y, rate = sr)

In [25]:
recognize('/content/gdrive/MyDrive/Audio/Result', '/content/gdrive/MyDrive/Audio/Result/chunk3.wav')

chunk1.wav : 1.3955523371696472
chunk2.wav : 1.2071359753608704
chunk3.wav : 0.0
chunk4.wav : 1.688228189945221
chunk5.wav : 1.7155492901802063
chunk6.wav : 2.0982924103736877
chunk7.wav : 1.4251068830490112
chunk8.wav : 1.9442610144615173
chunk9.wav : 1.8332681059837341
chunk10.wav : 1.9319944381713867
chunk11.wav : 1.2376497983932495
chunk12.wav : 1.2886908054351807
chunk13.wav : 1.1317663192749023
chunk14.wav : 2.0622753500938416
chunk15.wav : 1.3003483414649963
chunk16.wav : 1.3295046091079712
chunk17.wav : 1.3467528223991394
chunk18.wav : 1.361737072467804
chunk19.wav : 1.3514559268951416
chunk20.wav : 2.218677580356598
chunk21.wav : 2.4455546140670776
chunk22.wav : 1.567433476448059
chunk23.wav : 1.3337507247924805
chunk24.wav : 2.0157001614570618
chunk25.wav : 1.0826303362846375
chunk26.wav : 1.8904433250427246
chunk27.wav : 2.7823336720466614
chunk28.wav : 2.707747757434845
chunk29.wav : 2.0275837182998657
chunk30.wav : 2.601884186267853
chunk31.wav : 1.776453673839569
chunk32.

In [26]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk61.wav') 
ipd.Audio(y, rate = sr)

In [27]:
y, sr = librosa.load('/content/gdrive/MyDrive/Audio/Result/chunk45.wav') 
ipd.Audio(y, rate = sr)