### Load dataset

- `paths:` list of paths to the chorus audio files.
- `titles:` list of song titles.
- `data:` a list that holds the extracted data for each chorus.

In [None]:
import pandas as pd
# Read the chorus metadata table and pull out the two columns that the
# extraction loop iterates over.
dataset = pd.read_csv('Data/chorusData.csv')
paths, titles = (list(dataset[column]) for column in ('choruspath', 'Title'))
# Filled by the extraction loop with one feature row per chorus.
data = []

## Two methods to extract features.
### First Method (statistics)
    Computes all the statistics we extract from the features (skew, min, max, std, mean, median, kurtosis).
*`parameters`*
- `list:` list of feature values we extracted from the chorus.
- `feature:` name of the feature, as used with librosa.
- `columns_name:` list of the names of all extracted features (518 in total), stored so it can be used as the columns of the dataframe.
- `data:` list that collects the 518 values, one for every statistic of every feature.

Returns `data`, so that the second method can collect all the features in it.

### Second Method (extract_features)
    The main feature-extraction method.
*`parameters`*
- `audio_path:` path of the chorus we want to extract features from.
- `title:` title of the chorus.

Returns `data` and `columns_name`.

In [None]:
import librosa
import numpy as np
from scipy.stats import skew, kurtosis

def statistics(list, feature, columns_name, data):
    """Append seven summary statistics for every element of a feature.

    For each element of ``list`` (at position ``i``) the skew, min, max,
    std, mean, median and kurtosis are computed. The values are appended to
    ``data`` in that order and the matching names
    ``{feature}_{statistic}_{i}`` are appended to ``columns_name``.

    Parameters
    ----------
    list : iterable of array-like
        Elements to summarise, one set of statistics per element. The name
        shadows the builtin; it is kept only so that existing keyword
        callers keep working.
    feature : str
        Prefix used when building the column names.
    columns_name : list
        Receives the generated column names (mutated in place).
    data : list
        Receives the computed statistics (mutated in place).

    Returns
    -------
    list
        The same ``data`` object that was passed in.
    """
    for i, row in enumerate(list):
        # Name/value pairs are kept together so that the column names and
        # the values can never get out of step.
        row_stats = (
            ('skew', skew(row)),
            ('min', np.min(row)),
            ('max', np.max(row)),
            ('std', np.std(row)),
            ('mean', np.mean(row)),
            ('median', np.median(row)),
            ('kurtosis', kurtosis(row)),
        )
        for stat_name, value in row_stats:
            columns_name.append(f'{feature}_{stat_name}_{i}')
            data.append(value)

    return data

def extract_features(audio_path, title):
    """Extract the statistics of every librosa feature for one audio file.

    The audio is loaded with ``librosa.load`` and each feature listed below
    is passed to ``statistics``, which appends its summary values to the
    row and the matching names to the column list.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to load.
    title : str
        Title stored as the first value of the returned row.

    Returns
    -------
    tuple
        ``(data, columns_name)``: the feature row, starting with ``title``,
        and the column names, starting with ``'title'``.
    """
    x, sr = librosa.load(audio_path)

    data = [title]
    columns_name = ['title']

    # (column prefix, extractor, whether the extractor takes a sample rate).
    # The order defines the column order of the resulting row.
    feats = librosa.feature
    extractors = [
        ('chroma_stft', feats.chroma_stft, True),
        ('chroma_cqt', feats.chroma_cqt, True),
        ('chroma_cens', feats.chroma_cens, True),
        ('mfcc', feats.mfcc, True),
        ('rms', feats.rms, False),
        ('spectral_centroid', feats.spectral_centroid, True),
        ('spectral_bandwidth', feats.spectral_bandwidth, True),
        ('spectral_contrast', feats.spectral_contrast, True),
        ('spectral_rolloff', feats.spectral_rolloff, True),
        ('tonnetz', feats.tonnetz, True),
        ('zero_crossing_rate', feats.zero_crossing_rate, False),
    ]

    for name, extractor, takes_sr in extractors:
        # Keyword arguments are required by librosa >= 0.10 and also work on
        # older releases. rms and zero_crossing_rate have no `sr` parameter:
        # passed positionally, the sample rate was bound to `S` /
        # `frame_length` instead.
        kwargs = {'y': x, 'sr': sr} if takes_sr else {'y': x}
        statistics(extractor(**kwargs), name, columns_name, data)

    return data, columns_name


### Here we extract features
*`parameters`*
- `i:` iterator used to loop over paths.
- `audio_path:` path of the chorus.
- `d:` data returned from extract_features.
- `cols:` columns_name returned from extract_features.

`NOTE` It took about 30 minutes

In [None]:
# Extract the feature row of every chorus and collect it in `data`.
# `cols` is overwritten on every iteration, so after the loop it holds the
# column names returned by the last call.
for i, (audio_path, title) in enumerate(zip(paths, titles)):
    d, cols = extract_features(audio_path, title)
    data.append(d)
    print(f'The {i} song Done...')

In [None]:
# First run: build the feature table from the extracted rows and save it.
# The index is not written, so reloading the file with pd.read_csv does not
# produce an extra "Unnamed: 0" column (the final save also omits it).
newData = pd.DataFrame(data, columns=cols)
newData.to_csv('Data/newData.csv', index=False)


# When the extraction is done over several runs, use the code below instead;
# the code above must still be used on the first run.
# (DataFrame.append was removed in pandas 2.0, hence pd.concat.)
# newData = pd.read_csv('Data/newData.csv')
# newData = pd.concat([newData, pd.DataFrame(data, columns=cols)], ignore_index=True)

### Now add newData to dataset and clean it.

In [None]:
# Put the metadata and the extracted features side by side, keeping only the
# row labels that exist in both frames.
# Author's note: the shape is (751, 523); without the (Artist, Title, Label,
# Path, choruspath) columns it would be (751, 518), as mentioned in the document.
finaldata = pd.concat([dataset, newData], join='inner', axis='columns')
print(finaldata.shape)
finaldata.head(5)

In [None]:
newtitles = list(newData['title'])
# Titles from the dataset whose extracted row is missing or carries a
# different title. A title with no extracted row at all (fewer rows in
# newData than in the dataset) counts as lost instead of raising IndexError.
lostdata = [
    title
    for idx, title in enumerate(titles)
    if idx >= len(newtitles) or title != newtitles[idx]
]

# Shows whether any data was lost: if the list is not empty, just delete
# those rows; otherwise the dataset is good.
lostdata

In [None]:
# delete NaN rows and the rows whose titles don't match
# dataset = dataset[dataset.Title.isin(lostdata) == False]
# dataset.dropna(inplace=True)

In [None]:
# If rows were deleted above, reset the index first (line below); after that,
# remove the `title` column.
# dataset.reset_index()
finaldata.drop(columns='title', inplace=True)

In [None]:
# Save the final data. A forward slash is used, like the other paths in this
# notebook: '\F' is an invalid escape sequence, and a backslash is only a
# path separator on Windows.
finaldata.to_csv('Data/Final Data.csv', index=False)