In [1]:
%load_ext jupyter_black

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import os
import sys
import pandas as pd
import numpy as np
import requests
from spotify_dl import spotify_dl
from pathlib import Path
import time
import os
from dotenv import load_dotenv  # changed magic command to explicit load
import librosa
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import pairwise
from sklearn.model_selection import train_test_split
from typing import List
from flask import Flask, redirect, request
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D  # new
from tensorflow.keras.layers import MaxPooling2D  # new
from tensorflow.keras.layers import Flatten  # new
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.callbacks import EarlyStopping
from skimage.transform import resize


# pandas display formatting: show every row and every column of a DataFrame
pd.options.display.max_rows = None
pd.options.display.max_columns = None
# pd.options.display.float_format = "{:,.2f}".format


# location of the file that holds the project's environment variables
custom_env_path = "../../brainstation_capstone_cfg.env"

## Feature Function Definitions

In [3]:
# Sampling-rate fix: the sampling rate must be passed explicitly to every feature
# function, and to the display function as well.
# The sampling rate is therefore taken from the output of the loading step and
# handed to each helper below as `sr`.
def get_mfcc(y, sr):
    """Return the MFCC matrix of an audio signal.

    Args:
        y: audio time series, passed unchanged to ``librosa.feature.mfcc``.
        sr: sampling rate of ``y``.

    Returns:
        The array produced by ``librosa.feature.mfcc`` with library defaults.
    """
    # MFCCs are signed cepstral coefficients that librosa already derives from a
    # log-power (dB) mel spectrogram. They are not a power spectrum, so they must
    # not go through power_to_db: that call clamps every non-positive coefficient
    # to its floor and discards the information those coefficients carry.
    # NOTE(review): vectors cached from the previous transform are on a different
    # scale and should be regenerated.
    return librosa.feature.mfcc(y=y, sr=sr)


def get_melspectrogram(y, sr):
    """Return the mel spectrogram of ``y`` expressed in decibels.

    Args:
        y: audio time series.
        sr: sampling rate of ``y``.

    Returns:
        ``librosa.feature.melspectrogram(y, sr)`` converted with
        ``librosa.power_to_db`` using ``ref=np.max``.
    """
    # the mel spectrogram is a power spectrum (amplitude squared)
    power_spectrum = librosa.feature.melspectrogram(y=y, sr=sr)
    return librosa.power_to_db(power_spectrum, ref=np.max)


def get_chroma_vector(y, sr):
    """Return the STFT-based chroma matrix of an audio signal.

    Args:
        y: audio time series.
        sr: sampling rate of ``y``.

    Returns:
        The result of ``librosa.feature.chroma_stft`` with library defaults.
    """
    return librosa.feature.chroma_stft(y=y, sr=sr)


def get_tonnetz(y, sr):
    """Return the tonnetz (tonal centroid) matrix of an audio signal.

    Args:
        y: audio time series.
        sr: sampling rate of ``y``.

    Returns:
        The result of ``librosa.feature.tonnetz`` with library defaults.
    """
    return librosa.feature.tonnetz(y=y, sr=sr)


def _summarize_rows(matrix):
    """Concatenate the per-row mean, minimum and maximum of a 2-D matrix."""
    return np.concatenate(
        (matrix.mean(axis=1), matrix.min(axis=1), matrix.max(axis=1))
    )


def _min_max_scale(layer):
    """Min-max scale ``layer`` and return it as float32.

    A layer whose values are all identical has zero spread; it is divided by
    1.0 instead, which yields all zeros rather than a division by zero.
    """
    low = layer.min()
    spread = layer.max() - low
    if spread == 0:
        # Guard ONLY the degenerate case. The previous `max(spread, 1.0)` also
        # clamped every spread below 1.0, so low-amplitude layers were never
        # stretched over the full [0, 1] interval.
        spread = 1.0
    return ((layer - low) / spread).astype(np.float32)


def get_feature(input_file_path, track_id):
    """Extract the pairwise vector and the CNN image of one audio file.

    The file is loaded with ``sr=None``, ``offset=10`` and ``duration=120``.
    MFCC, mel spectrogram, chroma and tonnetz matrices are computed, and each
    raw matrix is written to ``../data/vectorized_mp3s/raw/`` as
    ``<feature>_<track_id>.npy`` before it is reduced.

    Args:
        input_file_path: path of the audio file handed to ``librosa.load``.
        track_id: identifier used in the names of the saved ``.npy`` files.

    Returns:
        A tuple ``(pairwise_feature, cnn_feature)``:
        - ``pairwise_feature``: 1-D array holding, for chroma, mel spectrogram,
          MFCC and tonnetz (in that order), the per-row mean, min and max.
        - ``cnn_feature``: float32 array stacking the mel spectrogram, chroma
          and tonnetz matrices depth-wise, each resized to 224 x 224 and
          min-max scaled on its own.

    Raises:
        Whatever ``librosa.load`` or ``np.save`` raise, e.g. when the input
        file cannot be read or the raw output directory does not exist.
    """
    # Load data
    y, sr = librosa.load(input_file_path, sr=None, offset=10, duration=120)

    # Every raw matrix is saved BEFORE it is reduced to summary statistics.
    mfcc = get_mfcc(y, sr)
    np.save(f"../data/vectorized_mp3s/raw/mfcc_{track_id}.npy", mfcc)
    mfcc_feature = _summarize_rows(mfcc)

    melspectrogram = get_melspectrogram(y, sr)
    np.save(
        f"../data/vectorized_mp3s/raw/melspectrogram_{track_id}.npy",
        melspectrogram,
    )
    melspectrogram_feature = _summarize_rows(melspectrogram)

    chroma = get_chroma_vector(y, sr)
    np.save(f"../data/vectorized_mp3s/raw/chroma_{track_id}.npy", chroma)
    chroma_feature = _summarize_rows(chroma)

    tntz = get_tonnetz(y, sr)
    np.save(f"../data/vectorized_mp3s/raw/tonnetz_{track_id}.npy", tntz)
    tntz_feature = _summarize_rows(tntz)

    # Pairwise vector: summary statistics of all four feature families.
    pairwise_feature = np.concatenate(
        (chroma_feature, melspectrogram_feature, mfcc_feature, tntz_feature)
    )

    # CNN image: three feature matrices resized to a common size, scaled
    # independently and stacked as the channels of one image.
    height = 224
    width = 224
    image_stack = [
        _min_max_scale(resize(matrix, (height, width)))
        for matrix in (melspectrogram, chroma, tntz)
    ]
    cnn_feature = np.dstack(image_stack)
    return pairwise_feature, cnn_feature

## CNN Model Setup

In [4]:
def set_random_seed(seed):
    """Seed every random number generator in use, for reproducibility.

    Seeds Python's ``random`` module, NumPy's global generator and TensorFlow
    with the same value.

    Args:
        seed: integer seed handed to all three libraries.
    """
    random.seed(seed)
    # NumPy is used throughout the notebook and keeps its own generator state
    np.random.seed(seed)
    tf.random.set_seed(seed)

In [5]:
# Input geometry of the pretrained network: 224 x 224 images with 3 channels
height, width, channels = 224, 224, 3

# ResNet50V2 with ImageNet weights, built with its top included.
# NOTE(review): include_top=True presumably keeps the ImageNet classifier head,
# so the "embedding" taken later would be class scores — confirm that
# include_top=False was not intended.
res_model = ResNet50V2(
    include_top=True,
    weights="imagenet",
    input_shape=(height, width, channels),
)
res_model.summary()

Model: "resnet50v2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_1[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 pool1_pad (ZeroPadding2D)   (None, 114, 114, 64)         0         ['conv1_conv[0][0]']          
                                                                                         

In [6]:
# Freeze all the layers in the base model: the pretrained weights are left
# untouched and the network is only used for inference (model.predict) below.
for layer in res_model.layers:
    layer.trainable = False

In [7]:
# For image analysis we only need to flatten the base model's output to obtain
# the embedding; no new trainable layers are added.
# NOTE(review): res_model is built with include_top=True, so its output is
# presumably already flat and Flatten would be a no-op — confirm the intent.
# The seed is fixed before the head is built, for reproducibility.
set_random_seed(121)
res_out = res_model.output
output = Flatten()(res_out)

In [8]:
# Wrap the base model's input and the flattened output into the model that the
# vectorization loop calls (model.predict) to embed each track image.
model = Model(inputs=res_model.input, outputs=output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_1[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 pool1_pad (ZeroPadding2D)   (None, 114, 114, 64)         0         ['conv1_conv[0][0]']          
                                                                                              

## Vectorize Data

### Count MP3s

In [9]:
# rglob() hands back a one-shot generator; its paths are materialised into a
# list so they can be iterated repeatedly without re-running the glob.
data_dir = Path("../data/mp3s/")
path_glob = data_dir.rglob("*.mp3")
pairwise_file_paths = list(path_glob)
len(pairwise_file_paths)  # number of mp3s in directory

11578

### Generate Track Feature Files

#### Pairwise Parquet and CNN Matrices

In [None]:
# Generator objects need to be rerun each time to prevent exhaustion
data_dir = Path("../data/mp3s/")
output_pairwise_dir = Path("../data/vectorized_mp3s/pairwise_parquets/")
output_cnn_dir = Path("../data/vectorized_mp3s/cnn_parquets/")
# get_feature() writes its raw matrices here as "<feature>_<track_id>.npy"
raw_feature_dir = Path("../data/vectorized_mp3s/raw/")
raw_feature_names = ("mfcc", "melspectrogram", "chroma", "tonnetz")


def build_track_frame(vector, track_id):
    """Return a one-row DataFrame for a track's feature vector.

    Args:
        vector: 1-D feature vector, or an array that already has a single row.
        track_id: value used as the (only) index entry; the index is named
            ``track_id``.

    Returns:
        DataFrame with one row, indexed by ``track_id``, whose column labels
        are the positional indices converted to strings.
    """
    frame = pd.DataFrame(
        np.atleast_2d(vector), index=pd.Index([track_id], name="track_id")
    )
    frame.columns = frame.columns.astype(str)
    return frame


def raw_features_saved(track_id):
    """Return True when every raw feature matrix of ``track_id`` is on disk."""
    return all(
        (raw_feature_dir / f"{name}_{track_id}.npy").exists()
        for name in raw_feature_names
    )


file_paths = list(data_dir.rglob("*.mp3"))
print("Number of MP3 Files: ", len(file_paths), "\n")
# Sets give constant-time membership tests for the "already done" checks.
pairwise_track_ids = {path.stem for path in output_pairwise_dir.rglob("*.parquet")}
cnn_track_ids = {path.stem for path in output_cnn_dir.rglob("*.parquet")}
for count, file_path in enumerate(file_paths, start=1):
    print(f"{count}. MP3 FILE PATH: \n", f"{file_path}")
    # Files live under <data_dir>/<track_id>/...; the first path component
    # below data_dir is the track id.
    track_id = file_path.relative_to(data_dir).parts[0]
    pairwise_done = track_id in pairwise_track_ids
    cnn_done = track_id in cnn_track_ids

    # Feature extraction is the expensive step: skip it entirely when every
    # output for this track (both parquets and all raw matrices) already exists.
    if pairwise_done and cnn_done and raw_features_saved(track_id):
        print(f"{track_id} has already been vectorized for pairwise...skipping...")
        print(f"{track_id} has already been vectorized for CNN...skipping...\n")
        continue

    pairwise_track, cnn_track = get_feature(file_path, track_id)

    if pairwise_done:
        print(f"{track_id} has already been vectorized for pairwise...skipping...")
    else:
        build_track_frame(pairwise_track, track_id).to_parquet(
            f"../data/vectorized_mp3s/pairwise_parquets/{track_id}.parquet"
        )
        print(f"{track_id} has been pairwise vectorized!")

    if cnn_done:
        print(f"{track_id} has already been vectorized for CNN...skipping...\n")
    else:
        # The model expects a batch dimension in front of the image.
        # NOTE(review): the image is passed scaled as produced by get_feature;
        # confirm whether the ResNet V2 preprocess_input step should be applied.
        cnn_embedding = model.predict(np.expand_dims(cnn_track, axis=0))
        build_track_frame(cnn_embedding, track_id).to_parquet(
            f"../data/vectorized_mp3s/cnn_parquets/{track_id}.parquet"
        )
        print(f"{track_id} has been CNN vectorized!\n")

Number of MP3 Files:  11578 

1. MP3 FILE PATH: 
 ../data/mp3s/1ZB2qWsheGabSEYvBYxjKn/Take on Me/Weezer - Take on Me.mp3
1ZB2qWsheGabSEYvBYxjKn has already been vectorized for pairwise...skipping...
1ZB2qWsheGabSEYvBYxjKn has already been vectorized for CNN...skipping...

2. MP3 FILE PATH: 
 ../data/mp3s/5V9H9J5GcUGY5ig029g5OU/Shkleepy/Manwolves - Shkleepy.mp3
5V9H9J5GcUGY5ig029g5OU has already been vectorized for pairwise...skipping...
5V9H9J5GcUGY5ig029g5OU has already been vectorized for CNN...skipping...

3. MP3 FILE PATH: 
 ../data/mp3s/34FsCOAQ0U99vAh3uoiLmm/Bandana (feat. Young Buck)/Dirty Audio, BL3R, Young Buck - Bandana (feat. Young Buck).mp3
34FsCOAQ0U99vAh3uoiLmm has already been vectorized for pairwise...skipping...
34FsCOAQ0U99vAh3uoiLmm has already been vectorized for CNN...skipping...

4. MP3 FILE PATH: 
 ../data/mp3s/25mldAmMHYzXhDXCxTpTHy/Chloroform/Phoenix - Chloroform.mp3
25mldAmMHYzXhDXCxTpTHy has already been vectorized for pairwise...skipping...
25mldAmMHYzXhDXCx

#### Create Complete Pairwise Parquet

In [None]:
# Read every per-track pairwise parquet and stack them into a single frame
vectorized_data_path = Path("../data/vectorized_mp3s/pairwise_parquets/")
path_glob = vectorized_data_path.rglob("*.parquet")
dataframes = [pd.read_parquet(f"{parquet_path}") for parquet_path in path_glob]
vectorized_df = pd.concat(dataframes)

In [None]:
# Validate BEFORE writing, so that an incomplete dataset is never persisted
# under a "complete" file name: one row is expected per MP3 counted above.
expected_track_count = 11578
assert (
    vectorized_df.shape[0] == expected_track_count
), f"expected {expected_track_count} tracks, found {vectorized_df.shape[0]}"

# date stamp in year-month-day order, so file names sort chronologically
output_file_date = datetime.now().strftime("%Y%m%d")
vectorized_df.to_parquet(
    f"../data/vectorized_mp3s/pairwise_complete_parquets/{output_file_date}_complete_pairwise_data.parquet"
)

#### Create Complete CNN Parquet

In [None]:
# Read every per-track CNN parquet and stack them into a single frame
vectorized_data_path = Path("../data/vectorized_mp3s/cnn_parquets/")
path_glob = vectorized_data_path.rglob("*.parquet")
dataframes = [pd.read_parquet(f"{parquet_path}") for parquet_path in path_glob]
vectorized_df = pd.concat(dataframes)

In [None]:
# Validate BEFORE writing, so that an incomplete dataset is never persisted
# under a "complete" file name: one row is expected per MP3 counted above.
expected_track_count = 11578
assert (
    vectorized_df.shape[0] == expected_track_count
), f"expected {expected_track_count} tracks, found {vectorized_df.shape[0]}"

# date stamp in year-month-day order, so file names sort chronologically
output_file_date = datetime.now().strftime("%Y%m%d")
vectorized_df.to_parquet(
    f"../data/vectorized_mp3s/cnn_complete_parquets/{output_file_date}_complete_cnn_data.parquet"
)