In [32]:
from IPython.display import Audio
import librosa
import librosa.display
import numpy as np
import os
from matplotlib import pyplot as plt
import random
import pandas as pd
import tensorflow as tf

In [33]:
model = tf.keras.models.load_model("my_model.h5", custom_objects=None, compile=True)

In [34]:
def long_boundaries(times):
    
    result = []
    for i in range(len(times)-1):
        result.append((times[i],times[i+1]))
    return sorted(result,key=lambda x:x[1]-x[0],reverse=True)

In [35]:
def get_cqt_and_peaks(directory):
    y,sr = librosa.load(directory)

    onset_env = librosa.onset.onset_strength(y=y, sr=sr,
                                         hop_length=512,
                                         aggregate=np.median)
    peaks = librosa.util.peak_pick(onset_env, 3, 3, 3, 5, 0.5, 10)
    peaks = np.append(np.insert(peaks,0,0.0),librosa.time_to_frames(y.shape[0]/sr))
    C = np.abs(librosa.cqt(y, sr=sr))
    bounds = np.round(np.array(long_boundaries(librosa.frames_to_time(peaks)))*sr)
    return C,librosa.time_to_frames(bounds/sr)

In [36]:
def grouping(intervals, size_of_column=10):  

    # Fill intervals with numbers
    groups = [] 
    for interval in intervals:
        group = []
        for i in range(interval[0], interval[1]):
            group.append(i)
        groups.append(group)    
        
    # Make clusters of intervals
    clusters = []
    for group in groups:
        cluster = []
        for i in range(0, round(len(group)/size_of_column) ):
            cluster.append(group[i*size_of_column:size_of_column*(i+1):])
        clusters.append(list(filter(lambda x: len(x) >= size_of_column, cluster)))  
        
    return clusters

In [37]:
def get_input(song):
    X_train = []
    song_reduced = song[8:]

    cqt, bounds = get_cqt_and_peaks(song)

    groups = grouping(list(bounds))
#     print(len(bounds),len(groups))


    for k in range(len(groups)):
        for j in range(len(groups[k])):
            X_train.append([cqt[:,i] for i in groups[k][j]])

#     print(len(X_train))
    return X_train

In [38]:
def get_shifts(song):
    X_train = np.array(get_input(song))
    print(X_train.shape)
    return model.predict([X_train.reshape(X_train.shape[0],10,84,1),X_train.reshape(X_train.shape[0],10,84)])

In [39]:
def shift(song,shifts,new_path):
    _, bounds = get_cqt_and_peaks(song)
    if len(shifts)!=len(bounds):
        print(len(bounds),len(shifts))
        print(shifts)
        
    y,sr = librosa.load(song)
    for i in range(len(bounds)):
        bounds[i] = librosa.frames_to_time(bounds[i]/sr)
        if int(bounds[i][1])-int(bounds[i][0])>0:
            y_third = librosa.effects.pitch_shift(y[int(bounds[i][0])+1:int(bounds[i][1])-1], sr, n_steps=-shifts[i])
            y[int(bounds[i][0])+1:int(bounds[i][1])-1]=y_third
    librosa.output.write_wav(new_path,y,sr)

In [40]:
def correct_song(song,new_path):
    shift(song,get_shifts(song),new_path)

In [41]:
correct_song('test2_real.wav','new2.wav')

(35, 10, 84)
22 35
[[-19.395876  ]
 [ -5.311887  ]
 [ -6.8816466 ]
 [ -8.311442  ]
 [ -0.32012853]
 [ -0.67480886]
 [  0.30232286]
 [ -8.206475  ]
 [ -9.895735  ]
 [  2.0562737 ]
 [ -0.8203241 ]
 [ -1.2406154 ]
 [ -3.9383307 ]
 [  6.2595525 ]
 [ -5.511987  ]
 [ -3.305151  ]
 [-15.089432  ]
 [  1.0225666 ]
 [  0.4240605 ]
 [  5.7863984 ]
 [  8.91118   ]
 [ -8.012244  ]
 [ -5.9830003 ]
 [  1.174616  ]
 [  0.25712886]
 [ -9.644134  ]
 [ -5.281691  ]
 [ -0.4830012 ]
 [ -0.17655832]
 [  0.4569963 ]
 [ -4.553592  ]
 [  1.2836503 ]
 [  0.5254557 ]
 [  1.2217233 ]
 [ -2.961696  ]]
