# Song similarity analyser

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import librosa.display
import sklearn

## Settings

In [2]:
import shutil
import os
from pathlib import Path


In [3]:
# Root folder that is searched recursively for audio files.
# Path.home() resolves the home directory on every platform, whereas
# os.environ["HOME"] raises KeyError wherever HOME is not set (e.g. Windows).
source_path = Path.home() / 'Music'
# Presumably the chunk length in seconds for split_audio_track (same name as its
# parameter); it is not referenced by the cells visible here - TODO confirm.
second_to_split_into = 100000

## Functions

In [4]:
def split_audio_track(sample, sample_rate, second_to_split_into):
    """Split an audio signal into consecutive chunks of a fixed duration.

    Parameters
    ----------
    sample : numpy.ndarray
        Signal to split; it is split along its first axis.
    sample_rate : int or float
        Number of samples per second.
    second_to_split_into : int or float
        Duration of each chunk in seconds.

    Returns
    -------
    list of numpy.ndarray
        Chunks of ``round(sample_rate * second_to_split_into)`` samples each.
        The last chunk holds the remainder and may be shorter. When the chunk
        size is at least the length of ``sample`` a single chunk is returned.

    Raises
    ------
    ValueError
        If the chunk size rounds to less than one sample.
    """
    # np.round keeps float input as float; np.split cannot slice with float
    # indices, so the chunk size has to be converted to a real integer.
    samples_per_split = int(np.round(sample_rate * second_to_split_into))
    if samples_per_split < 1:
        raise ValueError(
            "sample_rate * second_to_split_into must be at least one sample, got %r"
            % (sample_rate * second_to_split_into,)
        )
    split_points = np.arange(samples_per_split, sample.shape[0], samples_per_split)
    return np.split(sample, split_points)

In [5]:
def dont_split_audio_track(sample, sample_rate, samples_per_split):
    """Cut ``sample`` into chunks of ``samples_per_split`` samples.

    The chunk size is given directly in samples; ``sample_rate`` is accepted
    but not used. Split points are every multiple of ``samples_per_split``
    below the length of the first axis of ``sample``, so a chunk size equal to
    that length yields a list with the whole signal as its only element.

    Returns a list of arrays; the last one may be shorter than the others.
    """
    total_samples = sample.shape[0]
    # Multiples of the chunk size that lie strictly inside the signal.
    boundaries = np.arange(samples_per_split, total_samples, samples_per_split)
    chunks = np.split(sample, boundaries)
    return chunks

In [6]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    ``progress`` is the completed fraction. An int is converted to float, any
    other non-float value is treated as 0, values below 0 are shown as 0 and
    values of 1 or more are shown as 1. The previous cell output is cleared
    before the new bar is printed.
    """
    bar_length = 20

    # Normalise the argument to a number the bar can be computed from.
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    # Clamp to the displayable range.
    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True defers clearing until the new output is available.
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

# Import Data and Create Features 

In [7]:
import numpy as np
import pandas as pd
import librosa
import pickle

In [8]:
# Every regular file below source_path whose name contains a dot. Directories that
# happen to match the pattern (e.g. "Vol. 1") are filtered out: they cannot be
# loaded as audio and would only end up in the list of failed suffixes.
files = [f for f in source_path.glob("**/*.*") if f.is_file()]

In [9]:
# Suffixes of the files that could not be read as audio; filled by the loading loop.
error_formats = list()

In [15]:
# Column order of the resulting feature table (one row per analysed audio segment).
feature_columns = ['index', 'source', 'zero_crossing',
                   'spectral_centroid_mean', 'spectral_centroid_median', 'spectral_centroid_max', 'spectral_centroid_min',
                   'mfccs_mean', 'mfccs_median', 'mfccs_max', 'mfccs_min']

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
feature_rows = []

for file_index, file in enumerate(files):
    update_progress((file_index + 1) / len(files))

    try:
        # sr=None keeps the native sample rate of the file.
        sample, sample_rate = librosa.load(file, sr = None)
    except Exception as e:
        # Best effort: files that cannot be decoded are reported, their suffix is
        # remembered, and processing continues with the next file.
        print("Reading of sample %s failed" % file)
        error_formats.append(file.suffix)
        print(e)
        continue

    if len(sample) == 0:
        # A chunk size of 0 cannot be used for splitting and there is nothing to analyse.
        print("Sample %s is empty, skipping" % file)
        continue

    # Chunk size == track length, so the whole track is analysed as one segment.
    split_audio = dont_split_audio_track(sample, sample_rate, len(sample))

    for segment_index, segment in enumerate(split_audio):
        # The signal is passed as y=...: librosa >= 0.10 made it keyword-only,
        # and the keyword form is accepted by older releases as well.
        spectral_centroid = librosa.feature.spectral_centroid(y=segment, sr=sample_rate)[0]
        mfccs = librosa.feature.mfcc(y=segment, sr=sample_rate)

        feature_rows.append({
            'index': segment_index,
            'source': file,
            # Fraction of samples at which the signal changes sign.
            'zero_crossing': np.sum(librosa.zero_crossings(segment) / len(segment)),
            'spectral_centroid_mean': np.mean(spectral_centroid),
            'spectral_centroid_median': np.median(spectral_centroid),
            'spectral_centroid_max': np.max(spectral_centroid),
            'spectral_centroid_min': np.min(spectral_centroid),
            # MFCC statistics are taken over all coefficients and frames at once.
            'mfccs_mean': np.mean(mfccs),
            'mfccs_median': np.median(mfccs),
            'mfccs_max': np.max(mfccs),
            'mfccs_min': np.min(mfccs),
        })

df = pd.DataFrame(feature_rows, columns=feature_columns)

Progress: [####################] 99.9%


In [16]:
# Distinct suffixes of the files that failed to load; sorted so the listing is
# stable from run to run (set iteration order is not).
sorted(set(error_formats))

['.jpg', '.png']

In [17]:
# Renumber the rows 0..n-1 and discard the previous index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [68]:
# Show the feature table as the cell output.
df

Unnamed: 0,index,source,zero_crossing,spectral_centroid_mean,spectral_centroid_median,spectral_centroid_max,spectral_centroid_min,mfccs_mean,mfccs_median,mfccs_max,mfccs_min
0,0,/home/echo/Music/The Killers/The Killers/02 Mr...,0.080173,3478.486893,3699.486893,9221.642040,0.000000,0.034836,0.000000,217.820206,-618.331726
1,0,/home/echo/Music/The Killers/Hot Fuss/02 Mr. B...,0.085226,4237.165629,4413.760841,9394.739657,634.324126,7.089046,2.213427,222.148407,-482.561829
2,0,/home/echo/Music/The Killers/Hot Fuss/04 Someb...,0.065370,3023.802147,2890.480863,8679.240765,0.000000,7.832765,2.546913,210.143951,-475.126678
3,0,/home/echo/Music/The Killers/Day & Age/08 Neon...,0.062050,2791.438689,2731.611321,10860.375440,0.000000,2.705099,0.656901,205.322357,-480.103882
4,0,/home/echo/Music/The Killers/Day & Age/09 The ...,0.063964,2898.526246,2749.655822,9439.706424,0.000000,3.738940,0.488139,202.427124,-500.842682
...,...,...,...,...,...,...,...,...,...,...,...
1671,0,/home/echo/Music/Alter Bridge/Blackbird/06 Bef...,0.065014,2912.522619,2891.357103,9073.641645,407.068312,7.308403,1.928936,224.799011,-464.524597
1672,0,/home/echo/Music/Alter Bridge/Blackbird/10 Wat...,0.046191,2196.631047,2110.213404,10870.742082,291.552918,0.827663,0.041015,230.467102,-531.482788
1673,0,/home/echo/Music/Alter Bridge/One Day Remains/...,0.047607,2201.405678,2132.858269,10142.083556,0.000000,2.255421,-0.646723,230.088089,-489.989685
1674,0,/home/echo/Music/Alter Bridge/One Day Remains/...,0.047392,2309.839081,2379.670617,10002.543869,0.000000,5.294450,1.895164,228.150818,-492.467834


In [69]:
# Back up the feature table to disk. The context manager closes (and thereby
# flushes) the file handle even if pickling fails; the bare open() never closed it.
with open("music_backup.p", "wb") as backup_file:
    pickle.dump(df, backup_file)