# Data Processing

This notebook extracts chroma, MFCC, spectral-contrast, and spectral-flatness features from each audio file and combines them into one large feature matrix, which we will use in our analyses.

Importantly, we split each 30-second audio clip into 20 segments, each 1.5 seconds long. By doing this, we hope to minimize the variation within each segment, so that the extracted features closely match the genre the clip is labelled with.

In [None]:
import os
import pandas as pd
import librosa
import numpy as np
from scipy.spatial.distance import euclidean
from collections import Counter
import soundfile as sf
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from scipy.stats import mode
import csv
from sklearn.preprocessing import StandardScaler

In [None]:
# Collect the file names of both splits in sorted order, leaving out the
# ".DS_Store" entry (macOS folder metadata) if the directory contains one.
train_filenames = [
    entry for entry in sorted(os.listdir("Dataset/train")) if entry != ".DS_Store"
]
test_filenames = [
    entry for entry in sorted(os.listdir("Dataset/test")) if entry != ".DS_Store"
]

In [None]:
# Settings read by the feature-extraction cells further down.
num_segments = 20  # number of pieces each loaded clip is split into
n_mfcc = 35        # passed as `n_mfcc` to librosa.feature.mfcc

## Extract features from training set

In [None]:
# Load the training metadata; only its 'Genre' column is used here.
# The frame gets its own name because a later cell builds a per-segment list
# called `train_labels`; sharing the name would make the variable's type depend
# on which cell ran last.
train_metadata = pd.read_csv("Dataset/train.csv")

# One genre per CSV row. The extraction loop pairs these with the sorted
# training file names by position.
# NOTE(review): presumably the CSV rows are in that same sorted order — verify.
genres = train_metadata['Genre'].tolist()

In [None]:
# Labels are matched to files purely by position, so fail fast on a count
# mismatch instead of raising IndexError mid-run or silently mislabelling.
if len(genres) != len(train_filenames):
    raise ValueError(
        f"Found {len(train_filenames)} training files but {len(genres)} genre labels"
    )

train_features = []  # one feature row per segment
train_labels = []    # genre of the clip each segment came from (same order)

for filename, genre in zip(train_filenames, genres):
    file_path = os.path.join('Dataset/train', filename)
    y, sr = librosa.load(file_path)

    # Split the waveform into `num_segments` near-equal pieces and describe
    # each piece by its per-feature mean along axis 1 of librosa's output.
    for segment in np.array_split(y, num_segments):
        chroma_features = librosa.feature.chroma_cqt(y=segment, sr=sr).mean(axis=1)
        mfcc_features = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc).mean(axis=1)
        contrast_features = librosa.feature.spectral_contrast(y=segment, sr=sr, n_fft=512).mean(axis=1)
        flatness_features = librosa.feature.spectral_flatness(y=segment, n_fft=512).mean(axis=1)

        # Row layout: chroma, then MFCC, then spectral contrast, then flatness.
        train_features.append(
            [*chroma_features, *mfcc_features, *contrast_features, *flatness_features]
        )
        train_labels.append(genre)

# Plain 2-D ndarray rather than np.matrix, which NumPy no longer recommends.
train_features = np.asarray(train_features)



## Extract features from testing set

In [None]:
test_features = []  # one feature row per segment, in sorted-file order

for filename in test_filenames:
    file_path = os.path.join('Dataset/test', filename)
    y, sr = librosa.load(file_path)

    # Split the waveform into `num_segments` near-equal pieces and describe
    # each piece by its per-feature mean along axis 1 of librosa's output.
    for segment in np.array_split(y, num_segments):
        chroma_features = librosa.feature.chroma_cqt(y=segment, sr=sr).mean(axis=1)
        mfcc_features = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc).mean(axis=1)
        contrast_features = librosa.feature.spectral_contrast(y=segment, sr=sr, n_fft=512).mean(axis=1)
        flatness_features = librosa.feature.spectral_flatness(y=segment, n_fft=512).mean(axis=1)

        # Row layout: chroma, then MFCC, then spectral contrast, then flatness.
        test_features.append(
            [*chroma_features, *mfcc_features, *contrast_features, *flatness_features]
        )

# Plain 2-D ndarray rather than np.matrix, which NumPy no longer recommends.
test_features = np.asarray(test_features)

## Normalize data

In [None]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(np.asarray(train_features))

train_features = scaler.transform(np.asarray(train_features))
test_features = scaler.transform(np.asarray(test_features))

In [None]:
# Create the output folder if it is missing. `exist_ok=True` makes the call
# safe to re-run and avoids the gap between a separate check and mkdir.
os.makedirs("Processed(7)", exist_ok=True)

In [None]:
# Write each artifact with np.save, in the order listed: the two feature
# arrays, the per-segment labels, and the original one-per-clip genre list.
outputs = (
    ("Processed(7)/train_features", train_features),
    ("Processed(7)/test_features", test_features),
    ("Processed(7)/train_labels", train_labels),
    ("Processed(7)/unshortened_train_labels", genres),
)
for target, payload in outputs:
    np.save(target, payload)

## Extra: make a submission folder

In [None]:
# Create the submissions folder if it is missing. `exist_ok=True` makes the
# call safe to re-run and avoids the gap between a separate check and mkdir.
os.makedirs("Submissions", exist_ok=True)