In [None]:
#from __future__ import print_function
import argparse
import random
import os
from shutil import copy as cp
import subprocess
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd


In [None]:
from google.colab import drive

# Mount Google Drive at an absolute path: the later cells read and write
# "/content/mnt/My Drive/...", and a relative "mnt" would be resolved against
# whatever the kernel's current working directory happens to be.
drive.mount("/content/mnt", force_remount=True)

In [None]:
# Create a spectrogram image for one audio file
def create_spectogram(file,output_folder):
  """Render the spectrogram of one audio file and save it as a PNG image.

  Args:
    file: Path of the audio file, loaded with ``librosa.load``.
    output_folder: Destination path *including the audio file name* (despite
      the parameter name). The file extension is replaced by ``.png`` and any
      remaining dots in the file name are dropped (``a.b.wav`` -> ``ab.png``);
      the directory part is used unchanged.
  """
  # Load first so an unreadable file never leaves a figure open.
  y, sr = librosa.load(file)

  # Only the file name is rewritten; dots in directory names must survive.
  out_dir, out_name = os.path.split(output_folder)
  stem = os.path.splitext(out_name)[0].replace(".", "")
  png_path = os.path.join(out_dir, stem + ".png")

  fig = plt.figure(figsize=(8,8))
  try:
    plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=plt.get_cmap('inferno'), sides='default', mode='default', scale='dB')
    plt.axis('off')
    plt.savefig(png_path)
  finally:
    # Close exactly the figure created above, even if plotting/saving failed.
    plt.close(fig)

In [None]:
def extract_features_csv(filename,label,ouput_path,input):
  """Append one row of averaged librosa features for an audio file to dataset.csv.

  Row layout: filename, mean chroma_stft, mean RMS energy, mean spectral
  centroid, mean spectral bandwidth, mean spectral rolloff, mean zero-crossing
  rate, one mean per row of the MFCC matrix, and finally the label.

  Args:
    filename: Value written to the first column.
    label: Value written to the last column.
    ouput_path: Prefix to which ``'dataset.csv'`` is appended verbatim, so it
      must end with a path separator.
    input: Path of the audio file to analyse.
  """
  y, sr = librosa.load(input)
  # Newer librosa releases expose RMS energy as feature.rms (feature.rmse was
  # removed); fall back to the old name on older installations.
  rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
  rmse = rms_fn(y=y)
  chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
  spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
  spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
  rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
  zcr = librosa.feature.zero_crossing_rate(y)
  mfcc = librosa.feature.mfcc(y=y, sr=sr)

  # Build the row as a list instead of splitting a space-joined string, so a
  # filename or label that contains spaces stays in a single column.
  row = [filename]
  row.extend(str(np.mean(feature)) for feature in (chroma_stft, rmse, spec_cent, spec_bw, rolloff, zcr))
  row.extend(str(np.mean(coefficient)) for coefficient in mfcc)
  row.append(label)

  with open(ouput_path+'dataset.csv', 'a', newline='') as file:
    csv.writer(file).writerow(row)


In [None]:
def splitFiles(input_path,output_path,file_list,training_percentage):
  """Split the file list into train/val/test and materialise each split.

  Steps: create ``output_path`` if needed, (re)write the header row of
  ``dataset.csv``, slice ``file_list`` into the three splits, then for every
  file render its spectrogram, append its feature row to ``dataset.csv`` and
  copy the audio file into the split folder.

  Args:
    input_path: Root folder of the audio, laid out as ``<input_path>/<class>/<file>``.
    output_path: Destination root. It must end with a path separator because
      ``dataset.csv`` is addressed as ``output_path + 'dataset.csv'``.
    file_list: Sequence of ``[class_folder, file_name]`` entries, in the order
      in which they should be assigned to the splits.
    training_percentage: Share (0-100) of ``file_list`` used for training; the
      remainder is halved between validation and test.
  """
  # The output root has to exist before dataset.csv can be created inside it.
  if os.path.exists(output_path):
    print("Warning: output dir {} already exists".format(output_path))
  else:
    os.makedirs(output_path)

  header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'
  for i in range(1, 21):
    header += f' mfcc{i}'
  header += ' label'
  print(header)
  with open(output_path+'dataset.csv', 'w', newline='') as file:
    csv.writer(file).writerow(header.split())

  train_end = int(training_percentage/100. * len(file_list))
  val_end = int((len(file_list)+train_end)/2)
  print("Count of Training",train_end)
  print("Count of Validation",val_end-train_end)
  print("Count of Test",len(file_list)-val_end)
  filenames = {'train': file_list[:train_end],
               'val'  : file_list[train_end:val_end],
               'test' : file_list[val_end:]}
  print(filenames)

  # Preprocess train, val and test
  for split_name in ('train', 'val', 'test'):
    output_dir_split = os.path.join(output_path, split_name)
    print(output_dir_split)
    if os.path.exists(output_dir_split):
      print("Warning: dir {} already exists".format(output_dir_split))
    else:
      os.mkdir(output_dir_split)

    print("Copying preprocessed data to {} ...".format(output_dir_split))
    for entry in filenames[split_name]:
      class_name, audio_name = entry[0], entry[1]
      # Spectrogram images are grouped per class inside the split folder.
      class_dir = os.path.join(output_dir_split, class_name)
      if not os.path.exists(class_dir):
        print("Warning: dir {} created ".format(class_dir))
        os.mkdir(class_dir)

      source = os.path.join(input_path, class_name, audio_name)
      create_spectogram(source, os.path.join(class_dir, audio_name))
      extract_features_csv(audio_name, class_name, output_path, source)
      # The audio copy lands directly in the split folder, not the class folder.
      cp(source, output_dir_split)
          


In [None]:
def build_dataset(output_path,input_path):
  """Index the audio files under ``input_path``, shuffle them and run the split.

  Every non-hidden sub-directory of ``input_path`` is treated as one class and
  every non-hidden regular file inside it as one sample. The resulting list is
  shuffled with a fixed seed and handed to ``splitFiles`` with an 80 %
  training share.

  Args:
    output_path: Forwarded to ``splitFiles`` as the destination root.
    input_path: Root folder containing one folder per class.

  Returns:
    The shuffled list of ``[class_folder, file_name]`` pairs.
  """
  def visible_entries(folder):
    # Sorted so the seeded shuffle below is reproducible; dot-files are
    # skipped. NOTE: sorting is by code point, which can differ from the
    # locale-dependent order of `ls`, so split membership may differ from
    # runs that used the shell listing.
    return sorted(name for name in os.listdir(folder) if not name.startswith('.'))

  # Only directories are classes; stray files next to them are ignored.
  classlist = [name for name in visible_entries(input_path)
               if os.path.isdir(os.path.join(input_path, name))]

  csv_files = []
  for folder in classlist:
    class_dir = os.path.join(input_path, folder)
    for file in visible_entries(class_dir):
      if os.path.isfile(os.path.join(class_dir, file)):
        csv_files.append([folder, file])

  random.seed(1240)
  random.shuffle(csv_files)
  print("File Count is:",len(csv_files))
  print("Total No of Classes",len(classlist))
  # We have shuffled all files
  splitFiles(input_path,output_path,csv_files,80)
  return csv_files

In [None]:
# Run the full pipeline: index the recordings on Drive, then let build_dataset
# shuffle them and hand them to the train/val/test split.
spectrogram_root = "/content/mnt/My Drive/Dataset_bk/Spectograms/"
audio_root = "/content/mnt/My Drive/Filtered Sound/"
csv_dataset = build_dataset(spectrogram_root, audio_root)
print(csv_dataset)


In [None]:
def datasplit(filename,data):
  """Write ``data`` to the CSV file ``filename``, one row per item.

  Args:
    filename: Path of the CSV file to create; an existing file is overwritten.
    data: Iterable of rows, each row being an iterable of cell values.
  """
  # Honour the caller's path: the former hard-coded target made every call
  # overwrite the same file regardless of ``filename``.
  with open(filename, 'w', newline='') as file:
    csv.writer(file).writerows(data)
  


In [None]:
# Load the feature table and drop the filename column, which is not a feature.
data = pd.read_csv('/content/mnt/My Drive/Dataset_bk/dataset_bk.csv').drop(['filename'],axis=1)

# Encode the labels (last column) as integer class ids.
labels = data.iloc[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(labels)

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before the split below, so
# validation/test rows influence the scaling statistics - confirm intended.
scaler = StandardScaler()
X = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))

# Re-assemble each row as [scaled features..., encoded label].
data = [features.tolist() + [y[idx]] for idx, features in enumerate(X)]

# First 80 % is training; the remainder is halved into validation and test.
split = int(80/100. * len(data))
val_end = int((len(data)+split)/2)
print("Count of Training",split)
print("Count of Validation",val_end-split)
print("Count of Test",len(data)-val_end)
traindata = data[:split]
valdata   = data[split:val_end]
testdata  = data[val_end:]

for target, rows in (
    ('/content/mnt/My Drive/Dataset_bk/traindataset.csv', traindata),
    ('/content/mnt/My Drive/Dataset_bk/valdataset.csv', valdata),
    ('/content/mnt/My Drive/Dataset_bk/testdataset.csv', testdata),
):
  datasplit(target, rows)


In [None]:
# Build a [first column, last column] index from the feature table; the
# filename column is kept here because it is the value being indexed.
data = pd.read_csv('/content/mnt/My Drive/Dataset_bk/dataset_bk.csv')
X = data.iloc[:,0].values
labels = data.iloc[:, -1]
y = labels
print(X)

data = [[name, y[idx]] for idx, name in enumerate(X)]

# First 80 % is training; the remainder is halved into validation and test.
split = int(80/100. * len(data))
val_end = int((len(data)+split)/2)
print("Count of Training",split)
print("Count of Validation",val_end-split)
print("Count of Test",len(data)-val_end)
traindata = data[:split]
valdata   = data[split:val_end]
testdata  = data[val_end:]

# NOTE(review): traindata is computed but never written; the complete list is
# what goes to dataset_img.csv - confirm this is intended.
for target, rows in (
    ('/content/mnt/My Drive/Dataset_bk/dataset_img.csv', data),
    ('/content/mnt/My Drive/Dataset_bk/valdataset.csv', valdata),
    ('/content/mnt/My Drive/Dataset_bk/testdataset.csv', testdata),
):
  datasplit(target, rows)
