In [13]:
#!pip install librosa
#!pip install tensorflow
import IPython.display as ipd
import librosa
import librosa.display
import pandas as pd
from tqdm import tqdm
import numpy as np
import os

In [18]:
#Starting up Pyspark in Jupyter Notebook
import findspark
findspark.init('C:\spark-3.3.1-bin-hadoop2')
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import confusion_matrix
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [None]:
#Importing all necessary packages 

In [11]:
#Read in the data as a pandas dataframe
metadata = pd.read_csv('C:/Users/Brianrod/Downloads/music1/IA/train.csv')
#metadata.head(10)

In [8]:
#Defining function that will extract features from each wave file of each foler
file_name='C:/Users/Brianrod/Downloads/music1/IA/data_out'

def features_extractor(file):
    #load the file (audio)
    #We are able to load and read these files using the Librosa package
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    #we extract mfcc
    #mfcc is one of the 2 features we extarct as we slow down the rate of the audio to capture more information from each
    #frame of the audio file
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    #for our second set of features we take the mean of the array using the previous feature. 
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    return mfccs_scaled_features

In [20]:
#Now we use the features_extractor function and append that value into an array
extracted_features=[]
#tqdm will help us see the progress the loop has made and it can take up to 10 minutes.
for index_num,row in tqdm(metadata.iterrows()):
    try:
        #Through the file_name variable we are able to create a path between the .wav files and the information contained about
        #the .wav file in the csv file. This helps us know what genre each .wav file will be.
        file_name = os.path.join(os.path.abspath('C:/Users/Brianrod/Downloads/music1/IA/data_out'),str(row["genre_id"])+'/',str(row["filename"]))
        final_class_labels=row["genre"]
        data=features_extractor(file_name)
        extracted_features.append([data,final_class_labels])
    #I have place the below line so that the loop does not end when it faces a file that does not match as a 'wav' file.
    except OSError:
        pass

19905it [09:48, 33.82it/s]


In [21]:
### converting extracted_features to Pandas dataframe
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','genre'])
extracted_features_df

Unnamed: 0,feature,genre
0,"[-220.76562, 90.60733, 49.78978, 26.039112, 5....",Instrumental
1,"[18.86783, 80.36447, 0.5729405, 23.20239, -0.7...",Punk
2,"[-290.6577, 83.498055, -42.749413, 21.640993, ...",Folk
3,"[-262.08475, 144.38524, -108.97106, -26.358757...",Old-Time / Historic
4,"[15.768597, 72.60664, 4.0164847, 32.805832, 2....",Rock
...,...,...
19891,"[-64.83869, 46.36941, 8.45531, 45.98196, -0.35...",Electronic
19892,"[-170.57887, 106.53604, 42.701595, 28.565674, ...",Hip-Hop
19893,"[-12.067158, 46.283924, 0.009649797, 29.335272...",Hip-Hop
19894,"[-129.28217, 82.20686, 18.308973, 53.467686, 1...",Punk


In [22]:
#We now need to convert the pandas dataframe into a spark dataframe to being using pyspark.
spark = SparkSession.builder.appName("pandas to spark").getOrCreate()
col = [f'Value{i}' for i in range(len(extracted_features_df['feature'][0]))]
df3 = extracted_features_df.feature.apply(pd.Series)
df3.columns = col
df3['genre'] = extracted_features_df['genre']
df_sp = spark.createDataFrame(df3)
#type(df_sp)
#df_sp.show()
#df_sp.printSchema()

In [23]:
#The features column stores the data as an array and we will need to tranform it into a vector to be able to use it.
extracted_features_df[0:1].to_dict()

{'feature': {0: array([-220.76562   ,   90.60733   ,   49.78978   ,   26.039112  ,
            5.5850687 ,   19.75965   ,  -16.241432  ,   10.575141  ,
            0.7575977 ,   -5.1816783 ,    0.651591  ,   -3.3306482 ,
           -1.895248  ,    2.7754571 ,    1.3352169 ,   -6.0598707 ,
            4.5441833 ,   -5.583124  ,    3.8036451 ,    8.072292  ,
           -3.9814079 ,   10.748935  ,   -8.948521  ,   -4.6125026 ,
           -3.6112123 ,   -5.2807927 ,   -0.6133772 ,   -6.272189  ,
           -2.1429634 ,   -2.1390417 ,   -0.7451536 ,   -6.8895445 ,
           -2.2823555 ,   -7.6757054 ,   -3.4441836 ,   -3.561907  ,
           -2.9988928 ,    2.256279  ,   -0.57180977,   -1.6650013 ],
        dtype=float32)},
 'genre': {0: 'Instrumental'}}

In [24]:
#Specifying input columns and the output column where they will be combined which is 'vector'
assembler = VectorAssembler(inputCols=['Value0', 'Value1','Value2','Value3','Value4','Value5','Value6','Value7','Value8',
'Value9','Value10','Value11','Value12','Value13','Value14','Value15','Value16','Value17','Value18','Value19','Value20',
'Value21','Value22','Value23','Value24', 'Value25','Value26','Value27','Value28','Value29','Value30','Value31','Value32',
'Value33','Value34','Value35','Value36','Value37','Value38','Value39'],
                           outputCol='vector')

#Transforming the data into a vector
final_data = assembler.transform(df_sp)

#Viewing the vector
final_data.select('vector').show()

+--------------------+
|              vector|
+--------------------+
|[-220.765625,90.6...|
|[18.8678302764892...|
|[-290.65771484375...|
|[-262.08474731445...|
|[15.7685966491699...|
|[-304.86917114257...|
|[-120.08622741699...|
|[-137.44456481933...|
|[-70.225433349609...|
|[-241.76977539062...|
|[-308.45782470703...|
|[-151.09271240234...|
|[-88.015144348144...|
|[-206.65264892578...|
|[-323.08123779296...|
|[-19.575395584106...|
|[-242.74482727050...|
|[-172.43980407714...|
|[-154.71791076660...|
|[-41.144653320312...|
+--------------------+
only showing top 20 rows



In [25]:
#We now need to remove the 'genre' as it is a string and will give errors when building the pipeline as it will only take numeric data.
genre_df = df_sp.select('genre').distinct().toPandas()
genre_df["genre_label"] = genre_df.index + 1
genere_spark_df = spark.createDataFrame(genre_df)
genre_labelled_df = df_sp.join(
    genere_spark_df,
    "genre"
).drop("genre")

In [26]:
#Now we will build the pipeline to generate predictions of what the genre of music it will be.
#For this pipeline I have used RandomForestClassifier which works better with many target variable, in this case 'genres'
forestclassifier = RandomForestClassifier(
    featuresCol='vector',
    labelCol='genre_label'
)

forest_pipeline = Pipeline(
    stages= [assembler, forestclassifier]
)

In [27]:
# fit the pipeline for the trained data
model = forest_pipeline.fit(genre_labelled_df)
# transform the data
sample_data_train = model.transform(genre_labelled_df)

# view some of the columns generated
sample_data_train.select('vector', 'genre_label', 'rawPrediction', 'probability', 'prediction').show()

+--------------------+-----------+--------------------+--------------------+----------+
|              vector|genre_label|       rawPrediction|         probability|prediction|
+--------------------+-----------+--------------------+--------------------+----------+
|[-220.765625,90.6...|          3|[0.0,0.8219636575...|[0.0,0.0410981828...|      15.0|
|[-308.45782470703...|          3|[0.0,0.7811105867...|[0.0,0.0390555293...|      11.0|
|[-323.08123779296...|          3|[0.0,0.5351607894...|[0.0,0.0267580394...|       3.0|
|[-251.44256591796...|          3|[0.0,0.9394591462...|[0.0,0.0469729573...|      11.0|
|[-383.65261840820...|          3|[0.0,0.6563661210...|[0.0,0.0328183060...|      15.0|
|[-285.96405029296...|          3|[0.0,0.5798971386...|[0.0,0.0289948569...|      15.0|
|[-143.62411499023...|          3|[0.0,0.7904316344...|[0.0,0.0395215817...|      15.0|
|[-85.137138366699...|          3|[0.0,0.8070518119...|[0.0,0.0403525905...|       4.0|
|[-196.28053283691...|          