In [1]:
!pip install mutagen
!pip install pydub
!pip install librosa
!apt install ffmpeg

Collecting mutagen
  Downloading mutagen-1.45.1-py3-none-any.whl (218 kB)
[?25l[K     |█▌                              | 10 kB 29.7 MB/s eta 0:00:01[K     |███                             | 20 kB 35.4 MB/s eta 0:00:01[K     |████▌                           | 30 kB 41.8 MB/s eta 0:00:01[K     |██████                          | 40 kB 35.2 MB/s eta 0:00:01[K     |███████▌                        | 51 kB 8.5 MB/s eta 0:00:01[K     |█████████                       | 61 kB 8.9 MB/s eta 0:00:01[K     |██████████▌                     | 71 kB 7.8 MB/s eta 0:00:01[K     |████████████                    | 81 kB 8.7 MB/s eta 0:00:01[K     |█████████████▌                  | 92 kB 9.0 MB/s eta 0:00:01[K     |███████████████                 | 102 kB 7.3 MB/s eta 0:00:01[K     |████████████████▌               | 112 kB 7.3 MB/s eta 0:00:01[K     |██████████████████              | 122 kB 7.3 MB/s eta 0:00:01[K     |███████████████████▌            | 133 kB 7.3 MB/s eta 0:00:01

In [2]:
import os
from os import listdir
from os.path import isfile, join
import shutil

import sklearn
import numpy as np
import json
import librosa
from numpy.lib.utils import source
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from mutagen.mp3 import MP3
from pydub import AudioSegment
from pydub.utils import mediainfo

In [None]:
def createDirectoryIfDoesNotExists(path):
    """Create the directory ``path`` (with any missing parents) if nothing exists there.

    Args:
        path: Directory path to create.

    If ``path`` already exists (as a directory or as a file) nothing is done,
    exactly as before.
    """
    if not os.path.exists(path):
        # exist_ok=True closes the check-then-create race: if the directory
        # appears between the existence check and this call, makedirs no
        # longer raises FileExistsError.
        os.makedirs(path, exist_ok=True)

def getAllMusicsTimeInSeconds(data_path, bad_files):
    """Collect the duration, in whole seconds, of every file under ``data_path``.

    The tree is walked recursively and each file is opened with mutagen's
    ``MP3`` reader.

    Args:
        data_path: Root directory to walk.
        bad_files: List that is mutated in place; the full path of every file
            that could not be read as an MP3 is appended to it.

    Returns:
        list[int]: One duration per readable file, truncated to whole seconds,
        in the order the files were visited.
    """
    result = []

    # Walk the directory that was passed in, not a module-level global.
    for subdir, _dirs, files in os.walk(data_path):
        for file in tqdm(files):
            # os.walk already yields the full directory path, so joining it
            # with the file name is correct at any nesting depth.
            filePath = os.path.join(subdir, file)

            try:
                audio = MP3(filePath)
                result.append(int(audio.info.length))
            except Exception:
                # Best effort: remember the unreadable file and keep scanning.
                # (Exception, not a bare except, so Ctrl-C still interrupts.)
                bad_files.append(filePath)
    return result

def splitMusics(data_path, final_path, category, lengthInMilliseconds, skip_files=None):
    """Cut every audio file of one category into consecutive WAV parts.

    Each file in ``data_path/category`` is loaded with pydub and split into
    ``len(sound) // lengthInMilliseconds`` parts. The leftover audio is split
    between the beginning and the end of the track, so the parts come from the
    middle. Parts are written to
    ``final_path/category/<original name>_<part number>.wav``.

    Args:
        data_path: Directory that contains one sub-directory per category.
        final_path: Directory under which the ``category`` output folder is created.
        category: Name of the sub-directory to process.
        lengthInMilliseconds: Nominal part length in milliseconds (must be > 0).
        skip_files: Optional iterable of full file paths to skip. When omitted,
            the module-level ``bad_files`` list is used if it exists (the
            previous behaviour); otherwise nothing is skipped.

    Raises:
        ValueError: If ``lengthInMilliseconds`` is not positive.
    """
    if lengthInMilliseconds <= 0:
        raise ValueError("lengthInMilliseconds must be a positive integer")

    if skip_files is None:
        # Backward compatible: earlier versions read the notebook global directly.
        skip_files = globals().get("bad_files", ())
    skip_files = set(skip_files)

    print(f"Splitting {category}")
    category_path = join(data_path, category)
    output_dir = join(final_path, category)

    # Create the output folder once, up front, so it exists even when no part
    # is produced (callers list and later remove this folder).
    createDirectoryIfDoesNotExists(output_dir)

    files = [f for f in listdir(category_path) if isfile(join(category_path, f))]

    for file in tqdm(files):
        filePath = join(category_path, file)

        if filePath in skip_files:
            continue

        # splitext keeps the whole name for files without an extension
        # (splitting on '.' produced an empty name for them).
        fileName = os.path.splitext(file)[0]

        sound = AudioSegment.from_file(filePath)

        soundLength = len(sound)
        numberOfParts = soundLength // lengthInMilliseconds
        loss = soundLength - numberOfParts * lengthInMilliseconds
        # The larger half of the leftover is skipped at the start.
        skipStart = loss - loss // 2

        # Each slice deliberately spans one extra millisecond, exactly as in
        # the original slicing arithmetic.
        step = lengthInMilliseconds + 1
        start = skipStart + 1
        for i in range(numberOfParts):
            partFilePath = join(output_dir, fileName + "_" + str(i + 1) + ".wav")
            part = sound[start: start + step]
            part.export(partFilePath, format="wav")
            start += step


def prepare_dataset(data_path, category, n_mfcc=20, hop_length=512, n_fft=2048):
    """Extract one feature row per audio clip of ``category``.

    Every file in ``data_path/category`` is loaded with ``librosa.load``. Clips
    shorter than ``SAMPLES_TO_CONSIDER`` samples are skipped; longer ones are
    truncated to that length. Each row holds, in order: the zero-crossing
    count; mean and variance of the spectral centroid; mean and variance of
    the spectral roll-off; per-bin mean then variance of the chromagram;
    per-coefficient mean then variance of the MFCCs; and the integer label.

    Rows are appended to the module-level list ``data``, which must exist
    before this function is called. Nothing is returned.

    Args:
        data_path: Directory that contains one sub-directory per category.
        category: Sub-directory to process; must be a key of the label map below.
        n_mfcc: Number of MFCCs to compute. This argument used to be accepted
            but never forwarded, so librosa's own default was always used. The
            default here is now 20 so that default calls still yield the 69
            feature columns the CSV header in this notebook expects.
        hop_length: Hop length forwarded to the chroma and MFCC computations.
        n_fft: FFT window size forwarded to the MFCC computation.

    Raises:
        KeyError: If ``category`` has no entry in the label map.
    """
    # 30 seconds of audio at 22050 samples per second.
    SAMPLES_TO_CONSIDER = 22050 * 30

    label = {"Bandari":0, "Gilaki":1, "Kordi":2, "Lori":3, "Torki":4}
    # Resolve the label once, before any expensive audio work is done.
    category_label = label[category]
    category_path = join(data_path, category)

    files = [f for f in listdir(category_path) if isfile(join(category_path, f))]

    print(f"Processing {category}")

    for file in tqdm(files):
        file_path = join(category_path, file)

        signal, sr = librosa.load(file_path)

        if len(signal) < SAMPLES_TO_CONSIDER:
            continue
        signal = signal[:SAMPLES_TO_CONSIDER]

        features = []

        # zero crossing
        zero_crossing = librosa.zero_crossings(signal, pad=False)
        features.append(zero_crossing.sum())

        # The audio is passed as y=... everywhere below: newer librosa
        # releases accept it only as a keyword argument.

        # spectral centroids
        spectral_centroids = librosa.feature.spectral_centroid(y=signal, sr=sr)[0]
        features.append(spectral_centroids.mean())
        features.append(spectral_centroids.var())

        # spectral rolloff (the +0.01 offset on the signal is kept from the original)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=signal + 0.01, sr=sr)[0]
        features.append(spectral_rolloff.mean())
        features.append(spectral_rolloff.var())

        # Chroma Frequencies
        chromagram = librosa.feature.chroma_stft(y=signal, sr=sr, hop_length=hop_length)
        features += chromagram.mean(axis=1).tolist()
        features += chromagram.var(axis=1).tolist()

        # Mel-Frequency Cepstral Coefficients
        MFCCs = librosa.feature.mfcc(
            y=signal, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft
        )
        features += MFCCs.mean(axis=1).tolist()
        features += MFCCs.var(axis=1).tolist()

        # add label
        features.append(category_label)

        data.append(features)

In [None]:
# Extract the dataset archive stored on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount
# step is visible in this part of the notebook. The cells below expect the
# extracted data at "/content/ML_Data G1"; presumably this cell is run from
# /content - confirm.
!unrar x "/content/drive/MyDrive/ML_course_data_gathering_Arash_Rasouli/ML_Data G1.rar"

In [None]:
# Scan the extracted dataset once and record every file that could not be read
# as an MP3; splitMusics later skips the paths collected in bad_files.
DATA_PATH = "/content/ML_Data G1"
bad_files = []
getAllMusicsTimeInSeconds(DATA_PATH, bad_files)

# Two blank lines to separate the progress bars from the list, then the list.
print("\n", bad_files, sep="\n")

0it [00:00, ?it/s]
100%|██████████| 267/267 [00:01<00:00, 137.16it/s]
100%|██████████| 276/276 [00:02<00:00, 98.55it/s]
100%|██████████| 289/289 [00:01<00:00, 162.81it/s]
100%|██████████| 280/280 [00:03<00:00, 70.99it/s] 
100%|██████████| 271/271 [00:01<00:00, 169.20it/s]



['/content/ML_Data G1/Bandari/22.mp3', '/content/ML_Data G1/Torki/104.mp3', '/content/ML_Data G1/Torki/231.mp3', '/content/ML_Data G1/Lori/156.mp3', '/content/ML_Data G1/Lori/89.mp3']





In [None]:
# Input: raw per-category audio. Output: a scratch folder for the temporary WAV parts.
data_path = "/content/ML_Data G1"
final_path = "/content/all"

# Header row: 69 feature columns (f0 .. f68) followed by the label column.
csv_header = ",".join([f"f{i}" for i in range(69)] + ["label"])

# add your categories
# categories = ["Bandari", "Gilaki", "Kordi", "Lori", "Torki"]

categories = ["Lori", "Torki"]

for category in categories:
    # prepare_dataset appends its rows to this module-level list, so it is
    # reset at the start of every category.
    data = []

    # Split into 30000 ms parts, extract features, then drop the temporary parts.
    splitMusics(data_path, final_path, category, 30000)
    prepare_dataset(final_path, category)
    shutil.rmtree(join(final_path, category))

    csv_path = f"/content/drive/MyDrive/ML_Data/{category}.csv"
    np.savetxt(csv_path, data, delimiter=",", header=csv_header, comments="")

In [24]:
# Load the per-category feature tables written by the export cell.
Bandari = pd.read_csv('/content/drive/MyDrive/ML_Data/Bandari.csv')
Gilaki = pd.read_csv('/content/drive/MyDrive/ML_Data/Gilaki.csv')
Kordi = pd.read_csv('/content/drive/MyDrive/ML_Data/Kordi.csv')
Lori = pd.read_csv('/content/drive/MyDrive/ML_Data/Lori.csv')
Torki = pd.read_csv('/content/drive/MyDrive/ML_Data/Torki.csv')

# ignore_index=True gives the merged frame a fresh 0..N-1 index. The previous
# `all.reset_index()` discarded its result, so every category's row numbers
# restarted at 0 and those duplicate index values were written to the CSV.
# NOTE(review): `all` shadows the builtin all(); the name is kept because cells
# outside this view may reference it.
all = pd.concat([Bandari, Gilaki, Kordi, Lori, Torki], ignore_index=True)

all.to_csv('/content/drive/MyDrive/ML_Data/ML_Project_Data.csv')