In [2]:
#Our first function takes the training data (which we will load in a later function) and a query sample, and conceptually places them as points in a feature space (nothing is actually drawn). This method of predicting is called the k-nearest neighbors (KNN) algorithm, which uses the proximity of data points to make predictions. Through this first function we get the k training points closest to the query.
def getNeighbors(trainingset, instance, k):
    """Return the labels of the ``k`` training samples closest to ``instance``.

    Closeness is measured with ``distance`` (defined in a later cell), so that
    cell and the ``operator`` import must have been run before calling this.

    Parameters
    ----------
    trainingset : sequence
        Labelled samples; element ``[2]`` of each sample is used as its label.
    instance : object
        The query sample, in whatever form ``distance`` accepts.
    k : int
        Number of neighbours to return. It is also forwarded unchanged as the
        third argument of ``distance``.

    Returns
    -------
    list
        Labels of the nearest samples, closest first. Holds fewer than ``k``
        entries when ``trainingset`` has fewer than ``k`` samples, and is
        empty when ``k`` is zero or negative.
    """
    distances = []
    for sample in trainingset:
        # Sum both argument orders so the score does not depend on which
        # sample is passed first.
        dist = distance(sample, instance, k) + distance(instance, sample, k)
        distances.append((sample[2], dist))
    # list.sort is stable: samples at equal distance keep training-set order.
    distances.sort(key=operator.itemgetter(1))
    # A slice never raises IndexError, even when k exceeds the number of
    # training samples; max() keeps a negative k from slicing from the end.
    return [label for label, _ in distances[:max(k, 0)]]

In [None]:
#We will need these 2 packages: the first package will help us extract the features of the audio files and the second package will help us read the files that are in '.wav' format.
!pip install python_speech_features
!pip install scipy

#We will then import the following packages to help read the data and build the machine learning model.
import numpy as np
import pandas as pd
import scipy.io.wavfile as wav #Will help get the sample rate of the '.wav' file
from python_speech_features import mfcc #feature extraction for audio
from tempfile import TemporaryFile
import os
import math
import pickle
import random
import operator

In [3]:
#This next function builds upon the previous one: it counts how many of the neighbor points belong to each genre and returns the genre with the highest neighbor count.
def nearestclass(neighbors):
    """Return the label that occurs most often in ``neighbors``.

    When several labels share the highest count, the one that appears first
    in ``neighbors`` wins. An empty ``neighbors`` raises ``IndexError``.
    """
    tally = {}
    for label in neighbors:
        tally[label] = tally.get(label, 0) + 1

    # sorted() stays stable with reverse=True, so tied labels keep their
    # first-seen order and the earliest one ends up in front.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner, _votes = ranked[0]
    return winner

In [4]:
#Accuracy calculator to measure how accurate our model's predictions are.
def getAccuracy(testSet, prediction):
    """Return the fraction of test samples whose label was predicted correctly.

    Parameters
    ----------
    testSet : sequence
        Test samples; the last element (``sample[-1]``) of each is its true label.
    prediction : sequence
        Predicted labels, position-aligned with ``testSet``.

    Returns
    -------
    float
        Correct predictions divided by ``len(testSet)`` -- a fraction between
        0 and 1, not a percentage. ``0.0`` when ``testSet`` is empty.

    Raises
    ------
    IndexError
        If ``prediction`` has fewer entries than ``testSet``.
    """
    total = len(testSet)
    if total == 0:
        # Nothing to score (e.g. the random split left no test samples);
        # avoid dividing by zero.
        return 0.0
    correct = sum(
        1 for idx, sample in enumerate(testSet) if sample[-1] == prediction[idx]
    )
    return correct / total

In [5]:
# Root folder of the dataset; set it to wherever the dataset is held. Later
# cells call os.listdir(directory) and treat every entry as a folder of audio
# files, using the folder's position in that listing as the class label.
# NOTE(review): os.listdir works on filesystem paths; an 's3://' URI will only
# work if the bucket is mounted or otherwise exposed as a local path -- confirm.
# Local path used previously:
#directory = 'C:/Users/Brianrod/Downloads/music1/IA/data_out'
directory = 's3://music-data-br/data_out'

In [16]:
# Extract one feature record per audio file and write the records, one pickle
# after another, to "mydataset.dat". This might take a few minutes.
MAX_FOLDERS = 10  # only the first MAX_FOLDERS entries of `directory` are processed

# `with` guarantees the output file is flushed and closed even if the loop is
# interrupted (KeyboardInterrupt is not caught by `except Exception`).
with open("mydataset.dat", "wb") as f:
    i = 0  # 1-based folder counter; stored in every record as its class label
    for i, folder in enumerate(os.listdir(directory), start=1):
        if i > MAX_FOLDERS:
            break
        for file in os.listdir(directory+"/"+folder):  # every file in this folder
            try:
                (rate, sig) = wav.read(directory+"/"+folder+"/"+file)
                # winlen=0.020 asks for 20 ms analysis frames.
                mfcc_feat = mfcc(sig, rate, winlen = 0.020, appendEnergy=False)
                # Covariance between the columns of mfcc_feat (presumably one
                # column per MFCC coefficient and one row per frame -- confirm
                # against python_speech_features).
                covariance = np.cov(mfcc_feat.T)
                mean_matrix = mfcc_feat.mean(0)  # column-wise mean
                # Later cells read `feature` and `mfcc_feat`, so these names
                # must stay at module level.
                feature = (mean_matrix, covariance, i)
                pickle.dump(feature, f)
            except Exception as e:
                # Best effort: files that cannot be read or processed are
                # reported and skipped rather than aborting the whole run.
                print("Got an exception: ", e, 'in folder: ', folder, ' filename: ', file)

<_io.BufferedWriter name='mydataset.dat'>


In [7]:
# Show the MFCC matrix left over from the extraction loop, i.e. the one
# computed for the last file that was processed without an exception.
print(mfcc_feat)

[[ 75.3168253  -10.90220637 -17.78203734 ... -11.72426556   9.0288015
    5.31804317]
 [ 79.89728209  -9.96502152 -13.343992   ... -21.22692205  -1.87134395
    2.471097  ]
 [ 79.97786824  -9.33628398 -10.40629862 ... -21.94690184  -0.79591599
    3.97585499]
 ...
 [ 81.26569403 -11.91406521 -15.5590391  ...  11.31647654   3.13291442
   -5.93571387]
 [ 79.99956125  -8.796024   -19.54259644 ...  -0.24466639  -6.12410176
   -5.59057078]
 [ 78.57661837  -6.28589311 -21.92604164 ...  -4.65797261  -8.25431975
   -1.30158522]]


In [8]:
# Load the pickled feature records and split them at random into a training
# set and a test set.
dataset = []  # every record loaded so far; the final prediction cell reads it

def loadDataset(filename, split, trset, teset, seed=None):
    """Load pickled records from ``filename`` and split them into two lists.

    The file is read as a sequence of back-to-back pickles until end of file.
    Every loaded record is appended to the module-level ``dataset`` list and
    then assigned to exactly one of ``trset`` / ``teset``. Only the records
    read by this call are split, so calling the function again does not
    re-distribute records loaded earlier.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.
    split : float
        Probability that a record goes to ``trset``; otherwise it goes to
        ``teset``.
    trset, teset : list
        Output lists, extended in place.
    seed : optional
        When given, the split uses a private ``random.Random(seed)`` and is
        reproducible; when ``None`` (default) the global ``random`` module
        state is used.

    Raises
    ------
    OSError
        If ``filename`` cannot be opened.
    """
    records = []
    # NOTE: pickle.load can execute arbitrary code; only open files that this
    # notebook wrote itself.
    with open(filename, 'rb') as f:
        while True:
            try:
                records.append(pickle.load(f))
            except EOFError:
                # End of the stream of pickles; `with` closes the file.
                break
    dataset.extend(records)

    rng = random if seed is None else random.Random(seed)
    for record in records:
        if rng.random() < split:
            trset.append(record)
        else:
            teset.append(record)

trainingSet = []
testSet = []
# "mydataset.dat" is the file written by the feature-extraction cell.
loadDataset('mydataset.dat', 0.68, trainingSet, testSet)

In [9]:
#This function calculates the distance between two samples from their mean vectors and covariance matrices.
def distance(instance1, instance2, k):
    """Return a dissimilarity score between two (mean, covariance) samples.

    With ``mm``/``cm`` denoting element ``[0]``/``[1]`` of each instance, the
    value is::

        trace(inv(cm2) @ cm1)
        + (mm2 - mm1)^T @ inv(cm2) @ (mm2 - mm1)
        + log(det(cm2)) - log(det(cm1))
        - k

    The score is not symmetric in its two instances. ``k`` only shifts every
    score by the same constant, so it does not change how candidates rank
    against each other for a fixed ``k``.

    Parameters
    ----------
    instance1, instance2 : sequence
        ``[0]`` is a mean vector and ``[1]`` a square covariance matrix
        (NumPy arrays).
    k : number
        Constant subtracted from the result.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``instance2``'s covariance matrix is singular.
    """
    mm1, cm1 = instance1[0], instance1[1]
    mm2, cm2 = instance2[0], instance2[1]
    # Invert cm2 once; both the trace term and the quadratic form need it.
    inv_cm2 = np.linalg.inv(cm2)
    mean_diff = mm2 - mm1
    result = np.trace(np.dot(inv_cm2, cm1))
    result += np.dot(np.dot(mean_diff.transpose(), inv_cm2), mean_diff)
    result += np.log(np.linalg.det(cm2)) - np.log(np.linalg.det(cm1))
    result -= k
    return result

In [10]:
# Making the prediction using KNN: every function is already defined, so we
# only need to call them here.
K_NEIGHBORS = 5  # number of nearest training samples that vote on each prediction

length = len(testSet)
# For each test sample, getNeighbors finds the K_NEIGHBORS closest training
# samples and nearestclass returns their majority label.
predictions = [
    nearestclass(getNeighbors(trainingSet, sample, K_NEIGHBORS))
    for sample in testSet
]

# Fraction of test samples whose predicted label matches the true label.
accuracy1 = getAccuracy(testSet, predictions)
print(accuracy1)

0.6903440621531631


In [11]:
# Build `results`, a lookup from numeric class label to folder (genre) name,
# used to turn a prediction back into a readable name.
from collections import defaultdict

# NOTE: with defaultdict(int), looking up an unknown label yields 0 (and
# inserts it) instead of raising KeyError.
results = defaultdict(int)
# New audio is fed to the model by redefining `directory` to the location of
# the new data before running this cell.
# NOTE(review): labels are 1-based positions in os.listdir(directory); this
# matches the extraction loop only if both listings come back in the same
# order -- confirm.
for i, folder in enumerate(os.listdir(directory), start=1):
    results[i] = folder  # key: numeric label, value: genre folder name

In [12]:
# Display the label -> folder-name mapping built in the previous cell.
print(results)

defaultdict(<class 'int'>, {1: '0', 2: '1', 3: '10', 4: '11', 5: '12', 6: '13', 7: '14', 8: '15', 9: '16', 10: '17', 11: '2', 12: '3', 13: '4', 14: '5', 15: '6', 16: '7', 17: '8', 18: '9'})


In [13]:
# This is where the algorithm comes together: take the 8 records in `dataset`
# nearest to `feature` and keep their majority label as the prediction.
# NOTE(review): `feature` is whatever the extraction loop assigned last, and
# that record was also pickled into the file `dataset` is loaded from, so the
# query is probably one of its own neighbours -- confirm whether genuinely new
# audio was intended here.
pred = nearestclass(getNeighbors(dataset, feature, 8))

In [14]:
# Translate the predicted numeric label `pred` into its folder (genre) name
# through the `results` mapping.
print(results[pred])

17
