In [1]:
import soundfile as sf
import math
import numpy as np
import librosa as lb
from IPython.display import Audio
from librosa import feature, frames_to_time, autocorrelate, midi_to_hz
import pandas as pd


## Group 4 Final Project: Key Finder
Group members: David Wesley Jones, Karthik R Varadharajan, Yurui Wu

### Function 1: Key Finder for audio data array 
This function gets the most likely key of an array by taking a chromagram of the values and finding the major or minor key to which it is most correlated. 

**Parameters:**
1. audio - array: array of values to be analyzed
2. (optional) nfft - int: FFT size used by librosa's chromagram function. Defaults to 4096 (a large FFT size)
3. (optional) fs - int: sample rate. Defaults to 22050 

In [2]:
def getKey(audio, nfft=4096, fs=22050):
    """Estimate the most likely musical key of an audio signal.

    The signal is peak-normalized, turned into a chromagram, averaged over
    time, and correlated against all 24 rotations of the Krumhansl and
    Kessler major/minor key profiles. The best-correlated profile wins.

    Parameters
    ----------
    audio : array-like
        Audio samples to analyze.
    nfft : int, optional
        FFT size for the chromagram; the hop length is nfft / 4.
    fs : int, optional
        Sample rate of ``audio``.

    Returns
    -------
    str
        Key name: upper case for major ('C', 'C#/Db', ...), lower case for
        minor ('c', 'c#/db', ...).

    Raises
    ------
    ValueError
        If ``audio`` is empty, silent or non-finite, or if no correlation
        with any key profile can be computed.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        raise ValueError("getKey: audio is empty")

    # Peak amplitude is the largest *absolute* sample; abs(max(audio)) would
    # miss a negative peak and is zero for signals that never go positive.
    peak = np.max(np.abs(audio))
    if not np.isfinite(peak) or peak == 0:
        raise ValueError("getKey: audio is silent or contains non-finite samples")
    normalized = audio / peak

    # The signal is passed as y= because recent librosa releases accept
    # keyword arguments only.
    chromagram = feature.chroma_stft(y=normalized, sr=fs, n_fft=nfft,
                                     hop_length=int(nfft / 4))
    chromagram = chromagram.mean(axis=1)  # average energy per pitch class

    # Key profiles for C major / C minor; values ordered C, C#, D, D#, ... B
    major_0 = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
    minor_0 = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])

    # Index = number of semitones above C, which is also how far the C
    # profile is rotated to obtain the profile for that tonic.
    pitchClassNames = ['C', 'C#/Db', 'D', 'D#/Eb', 'E', 'F',
                       'F#/Gb', 'G', 'G#/Ab', 'A', 'A#/Bb', 'B']

    guessKeyName = None
    bestCorrelation = -np.inf
    for shift, tonicName in enumerate(pitchClassNames):
        # Major is checked before minor for each tonic; with the strict ">"
        # below, the first profile to reach the maximum keeps it on a tie.
        for profile, keyName in ((major_0, tonicName), (minor_0, tonicName.lower())):
            correlation = np.corrcoef(chromagram, np.roll(profile, shift))[0, 1]
            if correlation > bestCorrelation:  # always False for NaN
                bestCorrelation = correlation
                guessKeyName = keyName

    if guessKeyName is None:
        # Every correlation was NaN (e.g. a perfectly flat chromagram).
        raise ValueError("getKey: could not correlate the chromagram with any key profile")
    return guessKeyName  # most correlated key

test with Radiohead_track3.wav

In [3]:
# Sanity check on one known track: load it at 22050 Hz and estimate its key
radiohead_path = '../uploaded_audio/Radiohead_track3.wav'
x, fsx = lb.load(radiohead_path, sr=22050)
getKey(x, fs=fsx)

'c'

### Function 2: Key Finder for audio files 
This function iterates through a directory of audio files and finds the most correlated key for each file. 

**Parameters:**
1. directory - String: directory of audio files
2. (optional) nfft - int: FFT size used by librosa's chromagram function. Defaults to 4096 (a large FFT size)
3. (optional) fs - int: sample rate. Defaults to 22050 
4. (optional) percentOfSong - float: fraction of each audio file, taken from the start, that is analyzed. Defaults to 0.25 (25%) to increase speed and to reduce the chance of including a key change

In [4]:
def getKeysFromFiles(directory, nfft=4096, fs=22050, percentOfSong = .25):
    """Estimate the key of every audio file found under a directory.

    The directory tree is walked with os.walk; each file is loaded at sample
    rate ``fs``, its first ``percentOfSong`` fraction is passed to getKey,
    and the result is stored under the file's name.

    Parameters
    ----------
    directory : str
        Directory containing the audio files (sub-directories are included).
    nfft : int, optional
        FFT size forwarded to getKey.
    fs : int, optional
        Sample rate used when loading each file.
    percentOfSong : float, optional
        Fraction of each file, counted from its start, that is analyzed.

    Returns
    -------
    dict
        Maps file name to the estimated key name. Files that could not be
        processed are reported on stdout and left out of the dictionary.
    """
    import os  # kept local: the notebook's import cell does not import os

    dictOfKeys = {}
    n = 0  # number of files processed successfully so far (whole tree)
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # visit sub-directories in a reproducible order
        for file in sorted(files):
            # Join with root, not directory: os.walk also yields files that
            # live in sub-directories.
            path = os.path.join(root, file)
            try:
                x, fsx = lb.load(path, sr=fs)  # load file
                clip = x[:int(x.size * percentOfSong)]  # leading part only
                dictOfKeys[file] = getKey(clip, nfft=nfft, fs=fsx)
                n += 1
            except Exception as err:
                # Best effort: report which file failed and why, then go on.
                print('Could not get key for {0}: {1!r}'.format(path, err))
            print(n)  # progress counter
    return dictOfKeys

Run the function with provided key finding data and create dataframe for the result

In [None]:
# Estimate the key of every file in the data set (runtime: about 15 minutes)
keys = getKeysFromFiles('../KeyFinding/KeyFinding_Audio/Audio', nfft=8192)

1
2
3
4
5
6
7




8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


In [6]:
# One row per file: column 0 holds the file name, column 1 the guessed key.
# Rows are sorted by file name and re-indexed from 0.
guessedKey = (
    pd.DataFrame(keys.items())
    .sort_values(0, ignore_index=True)
)
guessedKey

Unnamed: 0,0,1
0,0003_C.wav,a
1,0004_Ab.wav,A
2,0006_C.wav,C
3,0015_Eb.wav,D#/Eb
4,0016_a.wav,a
...,...,...
325,reggae.00095.au,C
326,reggae.00096.au,f
327,reggae.00097.au,a
328,reggae.00098.au,c


Read the given key results and create the dataframe

In [7]:
# Reference keys, sorted by title and re-indexed from 0.
# NOTE(review): this path spells the folder 'Keyfinding' while the audio
# directory is given as 'KeyFinding' -- confirm on a case-sensitive filesystem.
base = (
    pd.read_csv('../Keyfinding/keys_corrected.csv')
    .sort_values('Title', ignore_index=True)
)
base

Unnamed: 0,Title,Composer,Date,Type,Unnamed: 4,Key,Source,Genre,SubGenre,Instrumentation,.MID,mid to wav?,Audio
0,0003_C.wav,,,,,C,Billboard,Popular,,,False,,.wav
1,0004_Ab.wav,,,,,Ab,Billboard,Popular,,,False,,.wav
2,0006_C.wav,,,,,C,Billboard,Popular,,,False,,.wav
3,0015_Eb.wav,,,,,Eb,Billboard,Popular,,,False,,.wav
4,0016_a.wav,,,,,a,Billboard,Popular,,,False,,.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,reggae.00095.li.txt,,,,,C,GTZAN,Popular,,,,,
326,reggae.00096.li.txt,,,,,C,GTZAN,Popular,,,,,
327,reggae.00097.li.txt,,,,,F,GTZAN,Popular,,,,,
328,reggae.00098.li.txt,,,,,c,GTZAN,Popular,,,,,


Compare function 2's result and given result

In [8]:
# The two tables are matched purely by row position (both were sorted by
# name and re-indexed), so they must have the same number of rows; otherwise
# rows would be paired with the wrong file or filled with NaN.
if len(guessedKey) != len(base):
    raise ValueError(
        'Cannot compare keys: {0} guessed keys but {1} reference keys'.format(
            len(guessedKey), len(base)))

compare = pd.DataFrame({
    'File name': base['Title'],      # file names from the reference csv
    'Guessed Key': guessedKey[1],    # keys estimated by getKeysFromFiles
    'Base Key': base['Key'],         # reference keys
})
compare

Unnamed: 0,File name,Guessed Key,Base Key
0,0003_C.wav,a,C
1,0004_Ab.wav,A,Ab
2,0006_C.wav,C,C
3,0015_Eb.wav,D#/Eb,Eb
4,0016_a.wav,a,a
...,...,...,...
325,reggae.00095.li.txt,C,C
326,reggae.00096.li.txt,f,C
327,reggae.00097.li.txt,a,F
328,reggae.00098.li.txt,c,c


## Basic Analysis ##
Gives simple binary of whether the correct key or correct tonic were guessed. 1.0 stands for **exact same**. 

In [9]:
# A guessed key may carry two enharmonic spellings separated by '/'
# (e.g. 'D#/Eb'). Compare against each spelling exactly: a plain substring
# test would wrongly accept 'C' for 'C#/Db' or 'E' for 'D#/Eb'.
key_matches = []
tonic_matches = []
for base_key, guessed_key in zip(compare['Base Key'], compare['Guessed Key']):
    base_names = set(base_key.split('/'))
    guessed_names = set(guessed_key.split('/'))
    # exact key: same tonic and same major/minor case
    key_matches.append(bool(base_names & guessed_names))
    # tonic only: ignore case, i.e. ignore major/minor quality
    base_tonics = {name.lower() for name in base_names}
    guessed_tonics = {name.lower() for name in guessed_names}
    tonic_matches.append(bool(base_tonics & guessed_tonics))

# Stored as floats (1.0 = match, 0.0 = no match)
comparison_column = np.array(key_matches, dtype=float)
correct_tonic = np.array(tonic_matches, dtype=float)

compare['Correct Key'] = comparison_column
compare['Correct Tonic'] = correct_tonic
compare

Unnamed: 0,File name,Guessed Key,Base Key,Correct Key,Correct Tonic
0,0003_C.wav,a,C,0.0,0.0
1,0004_Ab.wav,A,Ab,0.0,0.0
2,0006_C.wav,C,C,1.0,1.0
3,0015_Eb.wav,D#/Eb,Eb,1.0,1.0
4,0016_a.wav,a,a,1.0,1.0
...,...,...,...,...,...
325,reggae.00095.li.txt,C,C,1.0,1.0
326,reggae.00096.li.txt,f,C,0.0,0.0
327,reggae.00097.li.txt,a,F,0.0,0.0
328,reggae.00098.li.txt,c,c,1.0,1.0


The result shows that, of the 330 audio files, we found the correct key for 118 and the correct tonic for 152. 

In [10]:
# Name each tally explicitly so the rows of the summary table are labelled
# by column; newer pandas versions name every value_counts() result 'count'.
counts = (
    compare['Correct Key'].value_counts().rename('Correct Key'),
    compare['Correct Tonic'].value_counts().rename('Correct Tonic'),
)
pd.DataFrame(counts)

Unnamed: 0,0.0,1.0
Correct Key,212,118
Correct Tonic,178,152


## In Depth Analysis ##
Gives score to the results according to their accuracy. 
Accuracy depends on the position difference of function result and given key in the **circle of fifths**. 

1. Correct key and tonic = 1pt
2. Relative major/minor = 0.8pt
3. key signature 5th away = 0.66pt
4. Correct tonic with incorrect major/minor = 0.5pt
5. Other different number = 0pt

In [11]:
# Position of every key on the circle of fifths (CoF). A major key and its
# relative minor share a position; enharmonic spellings are listed both
# combined ('F#/Gb') and separately ('F#', 'Gb').
letterToCircFifths = {
    "C":0, "a":0,
    "G":1, "e":1,
    "D":2, "b":2,
    "A":3, "f#/gb":3, "f#":3, "gb":3,
    "E":4, "c#/db":4, "c#":4, "db":4,
    "B":5, "g#/ab":5, "g#":5, "ab":5,
    "F#/Gb":6, "d#/eb":6, "F#":6, "Gb":6, "d#":6, "eb":6,
    "C#/Db":7, "a#/bb":7, "C#":7, "Db":7, "a#":7, "bb":7,
    "G#/Ab":8, "f":8, "G#":8, "Ab":8,
    "D#/Eb":9, "c":9, "D#":9, "Eb":9,
    "A#/Bb":10, "g":10, "A#":10, "Bb":10,
    "F":11, "d":11
}

scores = []
for base_key, guessed_key in zip(compare['Base Key'], compare['Guessed Key']):
    # distance between the two keys on the CoF, going the shorter way round
    steps = abs(letterToCircFifths[base_key] - letterToCircFifths[guessed_key])
    diff = min(steps, 12 - steps)

    # Same tonic regardless of major/minor. Each enharmonic spelling is
    # compared exactly; a substring test would match 'b' inside 'g#/ab'.
    base_tonics = {name.lower() for name in base_key.split('/')}
    guessed_tonics = {name.lower() for name in guessed_key.split('/')}
    same_tonic = bool(base_tonics & guessed_tonics)

    if diff == 0:
        # same CoF position: identical key, or relative major/minor
        score = 1 if same_tonic else 0.8
    elif diff == 1:
        score = 0.66  # key signatures differ by a single accidental
    elif same_tonic:
        score = 0.5   # correct tonic but wrong major/minor quality
    else:
        score = 0     # anything else earns no credit
    scores.append(score)

keysig_score = np.array(scores, dtype=float)  # array of key scores

compare['Key Signature Score'] = keysig_score
compare

Unnamed: 0,File name,Guessed Key,Base Key,Correct Key,Correct Tonic,Key Signature Score
0,0003_C.wav,a,C,0.0,0.0,0.80
1,0004_Ab.wav,A,Ab,0.0,0.0,0.00
2,0006_C.wav,C,C,1.0,1.0,1.00
3,0015_Eb.wav,D#/Eb,Eb,1.0,1.0,1.00
4,0016_a.wav,a,a,1.0,1.0,1.00
...,...,...,...,...,...,...
325,reggae.00095.li.txt,C,C,1.0,1.0,1.00
326,reggae.00096.li.txt,f,C,0.0,0.0,0.00
327,reggae.00097.li.txt,a,F,0.0,0.0,0.66
328,reggae.00098.li.txt,c,c,1.0,1.0,1.00


In [12]:
# Average key signature score over all files
mean_keysig_score = np.sum(keysig_score) / keysig_score.size
print("Key Signature Score Accuracy:", str(mean_keysig_score))

# How often each score value occurs, and its share of the total in percent
keySigScoreCounts = compare['Key Signature Score'].value_counts()
keySigScoreStats = pd.DataFrame()
keySigScoreStats['Counts'] = keySigScoreCounts
total_count = keySigScoreStats['Counts'].sum()
keySigScoreStats["% of total"] = (100 * keySigScoreStats['Counts'] / total_count).round(decimals=1)
keySigScoreStats.sort_index(ascending=False)


Key Signature Score Accuracy: 0.5566060606060607


Unnamed: 0,Counts,% of total
1.0,105,31.8
0.8,10,3.0
0.66,73,22.1
0.5,45,13.6
0.0,97,29.4


The analysis shows that our function has a 55.7% accuracy. 

1. Correct key and tonic: 31.8%
2. Relative major/minor: 3.0%
3. 1 difference in accidentals: 22.1%
4. Correct tonic with incorrect major/minor: 13.6%
5. Other different number: 29.4%

## Shortcoming Analysis

Using the scoring analysis that we devised, we reached an accuracy rating of around 56 percent and using a binary measure, we only reached an accuracy rating of around 36 percent. This is obviously not ideal, but could definitely have been worse. 

We assume that the major shortcoming of our function is that it cannot detect modulation within a song. Modulation makes the difference between the tonic, dominant, and subdominant notes and the other notes less obvious, which makes it harder to find the correct key. The fact that using a longer clip (half) of each audio file decreased the accuracy rating to around 53 percent supports this, because longer clips often contain more modulations. 

Using a longer clip also unsurprisingly took twice as long, which brings up another shortcoming of our function: long runtime. The getKeysFromFiles function takes around 15 minutes to run, but we suspect most of this time is spent loading the files, which is another reason why we chose to read in only a quarter of each file. Considering the number of files, however, this is not horrible as it averages out to be around 2.7 seconds per file. 

This score was achieved using an nfft size of 4096 and only a quarter of each audio file. Using an nfft size of 8192 yielded a very similar result, with an accuracy rating of .5564. Using a hop length of 1/2 nfft yielded an accuracy rating of .5487, and a hop length equal to nfft yielded .5399.

