In [11]:
import os
import time
import json

import IPython.display as ipd
import numpy as np
import pandas as pd
import seaborn as sns

import utils

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [4]:
# Baseline tables read from CSV. `extra_audio_genre_df` supplies the
# `genre_top` column that the scraped tracks are merged with further down;
# `extra_audio_derived_genre_df` is not referenced in the cells visible here.
extra_audio_genre_df = pd.read_csv(filepath_or_buffer="extra_audio_genre.csv")
extra_audio_derived_genre_df = pd.read_csv(
    filepath_or_buffer="extra_audio_derived_genre.csv"
)

In [97]:
# Scraped per-genre track tables. Each is later passed to `clean_new_data`,
# which reads their `librosa_features` and `duration_ms` columns.
jazz_scrapped = pd.read_csv(filepath_or_buffer="jazz.csv")
blues_scrapped = pd.read_csv(filepath_or_buffer="blues.csv")

In [147]:
# Columns stripped from the scraped tables by `clean_new_data`.
# `librosa_features` is removed because it gets expanded into one column per
# feature, and `duration_ms` because it is re-added under the name `duration`.
spotify_features_to_drop = [
    'artist', 'key', 'librosa_features', 'loudness',
    'mode', 'name', 'time_signature', 'duration_ms',
]

def clean_new_data(data, genre, spotify_features_to_drop=None):
    """Expand the JSON-encoded librosa features of ``data`` into columns and label the rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``librosa_features`` column (one encoded document per
        row, parsed by ``clean_new_data_row``) and a ``duration_ms`` column.
    genre : str
        Value written to the ``genre_top`` column of every returned row.
    spotify_features_to_drop : list of str, optional
        Column names removed from ``data`` before the librosa columns are
        appended. ``None`` (the default) drops nothing.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``data``, one column per parsed librosa
        feature, a ``duration`` column copied from ``data.duration_ms`` and a
        constant ``genre_top`` column. The index of ``data`` is preserved.

    Raises
    ------
    ValueError
        If the rows do not all yield the same set of librosa features, the
        per-feature lists have different lengths and the frame cannot be built.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    columns_to_drop = [] if spotify_features_to_drop is None else spotify_features_to_drop

    librosa_info = {}
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for _, librosa_features in data.librosa_features.items():
        # The row parser appends to the accumulator it is given and returns it.
        librosa_info = clean_new_data_row(librosa_features, librosa_info)

    # Reuse the caller's index: a column-wise concat aligns on index labels, so
    # a fresh 0..n-1 index would misalign rows whenever `data` is not
    # default-indexed (e.g. after filtering).
    librosa_info_df = pd.DataFrame(librosa_info, index=data.index)

    cleaned_data = pd.concat([data.drop(columns_to_drop, axis=1), librosa_info_df], axis=1)
    cleaned_data['duration'] = data.duration_ms
    cleaned_data['genre_top'] = genre

    return cleaned_data

def clean_new_data_row(librosa_features, librosa_info=None):
    """Parse one row's encoded librosa features and append them to an accumulator.

    Parameters
    ----------
    librosa_features : str
        A JSON string whose decoded value is itself a JSON object string
        (the payload is decoded twice). The object's keys are string
        representations of tuples; its values are the feature values.
    librosa_info : dict, optional
        Accumulator mapping column name -> list of values. It is mutated in
        place and returned. When omitted, a fresh dict is used for this call.

    Returns
    -------
    dict
        The accumulator, with this row's values appended. Column names are
        built as ``<t[0]>_<first three characters of t[1]>_<t[2] without
        leading zeros>`` from each key tuple ``t``.

    Raises
    ------
    ValueError, SyntaxError
        If a key that is not filtered out is not a valid Python literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    # A None sentinel instead of a `{}` default: a mutable default is shared
    # between calls and would silently accumulate values across them.
    cleaned_librosa_features = {} if librosa_info is None else librosa_info

    for feature, value in json.loads(json.loads(librosa_features)).items():
        # Keys containing "ske" (presumably the skew statistics - confirm) or
        # "spectral_centroid" are deliberately left out.
        if "ske" in feature or "spectral_centroid" in feature:
            continue

        # literal_eval parses the tuple repr without executing arbitrary code,
        # unlike eval() on strings that come from a data file.
        feature_key = ast.literal_eval(feature)
        feature_name = f'{feature_key[0]}_{feature_key[1][0:3]}_{feature_key[2].lstrip("0")}'
        cleaned_librosa_features.setdefault(feature_name, []).append(value)

    return cleaned_librosa_features

In [155]:
# Expand the scraped Jazz and Blues tables and tag each with its genre label.
cleaned_jazz_scrapped = clean_new_data(jazz_scrapped, "Jazz", spotify_features_to_drop)
cleaned_blues_scrapped = clean_new_data(blues_scrapped, "Blues", spotify_features_to_drop)

# NOTE(review): every frame keeps its own index here, so index labels can
# repeat in `df` - confirm nothing downstream relies on a unique index.
df = pd.concat([cleaned_jazz_scrapped, cleaned_blues_scrapped, extra_audio_genre_df])

print("New added tracks:")
# fill_value=0 counts a genre missing from the baseline table as zero, so a
# brand-new genre shows its real count instead of NaN; the cast keeps the
# result integer even when such a fill happened.
df.genre_top.value_counts().sub(
    extra_audio_genre_df.genre_top.value_counts(), fill_value=0
).astype("int64")

New added tracks:


Blues                  241
Classical                0
Electronic               0
Experimental             0
Folk                     0
Hip-Hop                  0
Instrumental             0
International            0
Jazz                   201
Old-Time / Historic      0
Pop                      0
Rock                     0
Name: genre_top, dtype: int64

In [156]:
# Persist the merged dataset as CSV (the index is written as the first column).
df.to_csv(path_or_buf="final_dataset.df")