# Spotify Recommendation Model
 This notebook file includes code to:
1. Import required toolboxes
2. Connect to a spotify dev account
3. Organize and pre-process data from provided playlists
4. Build and train a neural network regression model
5. Generate song recommendations and filter them through the model
6. Upload model-filtered song recommendations to a provided spotify playlist

To run this notebook you will need a spotify dev account.  To create one, go to this link and setup an account: https://developer.spotify.com/dashboard/

> Once you have a dev account, go to the dashboard tab and create an app.  Next, navigate to your app and locate your Client ID and Client Secret.  Copy these two client codes into the notebook code cell below labeled "Authorization".  Then go to your spotify dev project, click "Edit Settings", and under the "Redirect URL" section enter your desired redirect url (e.g. https://www.google.com/) for authorization and select save.  Place your redirect url into the notebook code cell below labeled "Authorization".

If you are running this code on google colab:
1. Use Chrome as the web browser
2. Change the Runtime type to use a GPU - select Runtime at the top, change runtime type,  select GPU under hardware accelerator



# Imports

In [None]:
!pip install -r requirements.txt

In [None]:
try:
    import spacy
    nlp_model = spacy.load("en_core_web_lg")
    print('spacy model already downloaded')
except:
    !python -m spacy download en_core_web_lg
    import spacy
    nlp_model = spacy.load("en_core_web_lg")

In [None]:
# General Toolboxes
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from numpy import argmax
import pandas as pd
from collections import Counter
from datetime import datetime
import os
import itertools
import math
import collections
from random import shuffle
import zipfile
import ast
import scipy
from scipy import stats
from tqdm import tqdm
import json
import math
import ast
from dotenv import load_dotenv
import shutil
from numpy2tfrecord import Numpy2TFRecordConverter, build_dataset_from_tfrecord

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ipywidgets import interact, interactive, fixed, interact_manual, Layout, FloatSlider
import ipywidgets as widgets

import keras
from keras.layers import Flatten
import tensorflow as tf

# Tensorflow imports
import gc
import gc
from keras.activations import relu, sigmoid, softmax, tanh, selu, elu, gelu, leaky_relu
from keras import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Activation, AlphaDropout, LSTM, RNN, GRU, SimpleRNN, LayerNormalization, InputLayer, TimeDistributed, Bidirectional
from keras.layers import ReLU, ELU, LeakyReLU, PReLU, MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Conv1D, Concatenate, Flatten
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.constraints import MaxNorm
from keras.regularizers import l2, l1, L1L2
from tensorboard.plugins.hparams import api as hp
import keras_tuner
import keras_tuner as kt

# Imports for custom tuner
# from keras_tuner.src.engine import tuner_utils
import copy
import random
try:
    import scipy
    import scipy.optimize
except ImportError:
    scipy = None


# Spotify
import spotipy
import spotipy.util as util

# SkLearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import class_weight

# NLP
import spacy

import joblib
pd.set_option('display.max_columns', None)

# Environment Variables

In [None]:
# Load environmental variables
# override=True lets values from the .env file replace variables that are
# already set in the process environment.
load_dotenv(override=True)
# Root data folder as written in the environment; it is None when DATA_DIR
# is not set.
DATA_DIR = os.environ.get('DATA_DIR')
def man_win_lin_encode(str_path):
    '''Translate a Windows drive path into its /mnt/<drive> equivalent.

    Paths without a ':' are returned unchanged.  Paths containing a ':' have
    their backslashes replaced by forward slashes, and a leading drive prefix
    such as 'C:' is rewritten to '/mnt/c'.

    The drive letter is taken from ``str_path`` itself rather than from the
    global DATA_DIR, so the function keeps working after DATA_DIR has been
    converted and for paths that live on a different drive.

    Args:
        str_path: Path string to convert.

    Returns:
        The converted path string.
    '''
    if ':' not in str_path:
        return str_path
    out = str_path.replace('\\', '/')
    # Only a leading "<letter>:" is a drive prefix; any later ':' is left alone.
    if len(out) >= 2 and out[0].isalpha() and out[1] == ':':
        out = f'/mnt/{out[0].lower()}{out[2:]}'
    return out
# Normalise the data directory so a Windows-style path also works on Linux/WSL
DATA_DIR = man_win_lin_encode(DATA_DIR)
# APIS, INPUT_PLAYLISTS and BLACKLIST_PLAYLISTS are stored in .env as Python
# literals, so they are parsed with ast.literal_eval.
API_KEYS = ast.literal_eval(os.environ.get('APIS'))
SCOPE = os.environ.get('API_SCOPE')
REDIRECT_URI = os.environ.get('REDIRECT_URI')
API_NAMES = list(API_KEYS.keys())
INPUT_PLAYLISTS = ast.literal_eval(os.environ.get('INPUT_PLAYLISTS'))
BLACKLIST_PLAYLISTS = ast.literal_eval(os.environ.get('BLACKLIST_PLAYLISTS'))
# Name of the feature currently being parsed; the TFRecord parse functions
# read it to pick the record shape.
FEATURE=None

# Check to make sure subfolders are created for environment
# makedirs(exist_ok=True) also creates DATA_DIR itself when it is missing and
# does nothing when the folder already exists.
dir_folders = ['Databases', 'Model_Tuning', 'Saved_Models']
for folder in dir_folders:
    dirname = os.path.join(DATA_DIR, folder)
    os.makedirs(dirname, exist_ok=True)
rec_dir = os.path.join(DATA_DIR, 'Databases', 'Recommend_Data')
os.makedirs(rec_dir, exist_ok=True)
# Check for config file and create an empty one when it is missing
if not os.path.isfile('config.json'):
  # The context manager guarantees the file is flushed and closed
  with open('config.json', 'w') as config_file:
    json.dump(dict(), config_file)

# Set environmental variables
BASIC_DATASET_API_COUNTER = 0
AUDIO_FEATURES_API_COUNTER = 0

In [None]:
# Widgets
# Shared description styling for the dropdowns below.
style = {'description_width': '40%'}

# Which set of Spotify API credentials to use.
api_wdg = widgets.Dropdown(
    description='API Key',
    options=API_NAMES,
    value=API_NAMES[0],
    style=style,
    layout=Layout(width='25%'),
    disabled=False,
)

# Which dataset-creation stage the notebook should run.
operation_wdg = widgets.Dropdown(
    description='Dataset Creation Operation:',
    options=[
        "create initial basic dataset",
        "generate raw td dataset",
        "create feature tables",
        "create final data table",
    ],
    value='create initial basic dataset',
    style=style,
    layout=Layout(width='25%'),
    disabled=False,
)

# How class weights are calculated.
weights_wdg = widgets.Dropdown(
    description='Select how you want to calculate class weights.  Balanced evens all ratings and custom will allow you to pick your own per playlist:',
    options=["balanced", "custom"],
    value='balanced',
    style=style,
    layout=Layout(width='25%'),
    disabled=False,
)

# Whether recommended / blacklist-only songs are added to the database.
use_neutral_songs_wdg = widgets.Checkbox(
    description='Add songs to the database that were recommended from the model and from songs only in blacklist playlists',
    value=True,
    indent=False,
    style={'description_width': '80%'},
    layout=Layout(width='25%'),
    disabled=False,
)

# Order in which the widgets are displayed.
made_widgets = [
    api_wdg,
    operation_wdg,
    weights_wdg,
    use_neutral_songs_wdg,
]

# Config

Steps:
1. Create Initial Basic Dataset
> a. Optionally check box for calculate initial_min_max_values and add to saved set <br>
> b. Save Dataset, rename, and upload to Drive <br>
> c. Update File directory in Generate Raw TD Dataset
2. Create Raw Time Domain Dataset
> a. Download, rename, and upload datasets to drive <br>
> b. Update file directory in Create Feature Tables
3. Calculate Values for Feature Tables by checking the Calculate_data_dict_values box for the Following:
> a. genre <br>
> b. general <br>
> c. sct_data <br>
> d. sgm_loud <br>
> e. sgm_timbre max and mean/sdv <br>

4. Create Feature Tables
> a. general <br>
> b. genre <br>
> c. sct_data <br>
> d. sgm_loud <br>
> e. sgm_timbre <br>
> f. sgm_pitch

5. Tune Feature Models and Save Best Model
6. Put Feature model directories into notebook
7. Create Final Data Table



In [None]:
# Render each configuration widget in the notebook output, in list order.
for widget in made_widgets:
  display(widget)

In [None]:
# Snapshot the widget selections into plain module-level settings
SELECTED_API = api_wdg.value
operation = operation_wdg.value
weight_mode = weights_wdg.value
prediction_type = 'Regression'
use_neutral_songs = use_neutral_songs_wdg.value
normalization_mode='minmax'

# Update API
CLIENT_ID, CLIENT_SECRET = API_KEYS[SELECTED_API]

# Update config file
# Context managers make sure config.json is closed (and flushed on write)
with open('config.json') as config_file:
  config = json.load(config_file)
config['prediction_type'] = prediction_type
config.setdefault('norms', {})
# NOTE: the loop variable must stay named `feature`; generate_feature_norms
# uses the module-level `feature` as a parameter default.
for feature in ["general", "genre", "sct_data", "sgm_loud", "sgm_timbre"]:
  config['norms'].setdefault(feature, {})
with open('config.json', 'w') as config_file:
  json.dump(config, config_file)

# Record shape per feature; the leading None is dropped by the TFRecord
# parse functions, which use the remaining dimensions as the shape of "x".
GEN_INPUT_SHAPES = {
  'genre': [None, 300],
  'general': [None, 49],
  'sct_data': [None, 300, 10],
  'sgm_loud': [None, 3000, 5],
  'sgm_pitch': [None, 3000, 13],
  'sgm_timbre': [None, 3000, 13],
  'big_data': [None, 3000, 39],
  'overall': [None, 6]
}

# Authorization
To authenticate your connection to Spotify, click on the link after running this code, and paste the redirected URL into the box.

In [None]:
if operation == 'generate raw td dataset' or operation == 'create initial basic dataset':
  def create_connection(hide_output=False):
    '''Create and verify a spotipy connection, rotating API keys on failure.

    Each API key is tried twice, starting with the currently selected one:
    once as-is and once after deleting the spotipy '.cache' token file.
    When a key fails both attempts the next key in API_KEYS is selected and
    tried the same way.  The globals SELECTED_API, CLIENT_ID and
    CLIENT_SECRET are updated to the key being tried, so after a successful
    call they describe the key in use.

    Args:
      hide_output: When True, suppress the success message.

    Returns:
      A verified spotipy.Spotify client, or None when every key failed.
    '''
    global CLIENT_ID, CLIENT_SECRET, REDIRECT_URI, SCOPE, SELECTED_API

    def _remove_cache():
      # A missing cache file is not an error: there is simply nothing to clear
      try:
        os.remove('.cache')
      except FileNotFoundError:
        pass

    def _connect():
      client = spotipy.Spotify(
          auth_manager=spotipy.SpotifyOAuth(
              client_id=CLIENT_ID,
              client_secret=CLIENT_SECRET,
              redirect_uri=REDIRECT_URI,
              scope=SCOPE, open_browser=False),
              requests_timeout=20, retries=3)
      # Building the client does not prove the credentials work, so probe
      # two endpoints and let any failure surface here.
      client.artist('spotify:artist:3jOstUTkEu2JkjvRdBA5Gu')
      client.audio_analysis('3a1lNhkSLSkpJE4MSHpDu9')
      return client

    key_list = list(API_KEYS.keys())
    try:
      start_idx = key_list.index(SELECTED_API)
    except ValueError:
      start_idx = 0
    # Start with the selected key, then wrap around through the others
    rotation = key_list[start_idx:] + key_list[:start_idx]

    for key_name in rotation:
      SELECTED_API = key_name
      CLIENT_ID, CLIENT_SECRET = API_KEYS[key_name]
      for attempt in range(2):
        try:
          sp = _connect()
        except Exception as err:  # best-effort boundary: report and retry
          if attempt == 0:
            print('Removing spotipy cache file since api failed the 1st try.')
          else:
            print(f'Authorization {SELECTED_API} Failed!')
            print(f'Reason: {err}')
          # The cached token belongs to the failed attempt; drop it so the
          # next attempt (or the next key) authenticates from scratch.
          _remove_cache()
          continue
        if hide_output==False:
          print(f'Authorization {SELECTED_API} Sucessfull!')
        return sp

    if len(key_list)==1:
      print('Need more spotify api keys to prepare data.  Please create more and add them to .env')
    print('API connection Failed!  Check your api client id, secret, scope and redirect ui.')
    return None
  sp = create_connection()
else:
  sp=None

# Functions

In [None]:
# New Feature Code
pd.set_option('display.max_columns', None)


def generate_big_data():
  '''Merge the time-domain feature tables into one 'big_data' TFRecord per partition.

  For every partition index i (one per entry in Databases/sgm_pitch) the
  sct_data, sgm_loud, sgm_pitch and sgm_timbre TFRecords are read, inner
  joined on (song_id, rating, weight) and stacked along the last axis into a
  single array, which is written to Databases/big_data/big_data_dataset_p{i}.tfrecord.

  Every batch of a TFRecord file is read and concatenated, so files holding
  more than 500 records are no longer truncated to their final batch.

  Raises:
    ValueError: If a feature TFRecord contains no records.
  '''
  global FEATURE
  DATA_FILE_DIR = os.path.join(DATA_DIR, 'Databases')
  num_files = len(os.listdir(os.path.join(DATA_FILE_DIR, 'sgm_pitch')))
  dirname = os.path.join(DATA_FILE_DIR, 'big_data')
  if not os.path.exists(dirname):
    os.mkdir(dirname)
  merge_keys = ['song_id', 'rating', 'weight']
  td_features = ['sct_data', 'sgm_loud', 'sgm_pitch', 'sgm_timbre']
  for i in tqdm(range(1, num_files+1)):
    df_pred = None
    for feature in td_features:
      # parse_feature_function_song_id picks the record shape from FEATURE
      FEATURE=feature
      feat_path = os.path.join(DATA_FILE_DIR, feature, f'{feature}_dataset_p{i}.tfrecord')
      dataset = tf.data.TFRecordDataset([feat_path])
      dataset = dataset.map(parse_feature_function_song_id)
      dataset = dataset.batch(500)
      # Collect every batch; keeping only the last one would drop records
      batches = [tuple(part.numpy() for part in raw_record) for raw_record in dataset]
      if not batches:
        raise ValueError(f'No records found in {feat_path}')
      x, y, weight, song_id = (np.concatenate(parts) for parts in zip(*batches))
      if feature=='sct_data':
        # Repeat each sct_data step 10 times along axis 1 (300 -> 3000 in
        # GEN_INPUT_SHAPES) so it lines up with the sgm_* features.
        x = np.repeat(x, 10, axis=1)
      df = pd.DataFrame(data=list(zip(song_id, y, weight, x)),
               columns=merge_keys + [feature])
      if df_pred is None:
        df_pred = df
      else:
        df_pred = pd.merge(df_pred, df, how='inner',
                          left_on=merge_keys,
                          right_on=merge_keys)
    x_sct_data = np.float32(np.array([np.array(row) for row in df_pred['sct_data']]))
    x_sgm_loud = np.float32(np.array([np.array(row) for row in df_pred['sgm_loud']]))
    x_sgm_pitch = np.float32(np.array([np.array(row) for row in df_pred['sgm_pitch']]))
    x_sgm_timbre = np.float32(np.array([np.array(row) for row in df_pred['sgm_timbre']]))
    # Stack along the last axis; the first channel of pitch and timbre is dropped
    x = np.dstack((x_sct_data, x_sgm_loud, x_sgm_pitch[:,:,1:], x_sgm_timbre[:,:,1:]))
    y = df_pred['rating'].to_numpy()
    weight_output = df_pred['weight'].to_numpy()
    song_id = df_pred['song_id'].to_numpy()
    # Column vectors with the dtypes the TFRecord parsers expect
    y = np.float32(y.astype(np.float64).reshape(y.shape[0],1))
    weight_output = np.float32(weight_output.astype(np.float64).reshape(weight_output.shape[0],1))
    song_id = np.int64(song_id.reshape(song_id.shape[0],1))
    save_name = os.path.join(dirname, f'big_data_dataset_p{i}.tfrecord')
    with Numpy2TFRecordConverter(save_name) as converter:
      sample={"x": x, "y": y, "weight": weight_output, 'song_id': song_id}
      converter.convert_batch(sample)
    # delete info files
    for file in os.listdir(dirname):
        if '.info' in file:
          os.remove(os.path.join(dirname, file))


def translate_ids(song_ids, return_id_only=True):
    '''Map song id(s) to their unique id (and rating) via song_id_lookup.json.

    The lookup file maps each song id to a ``[unique_id, rating]`` pair and is
    read from the current working directory on every call.

    Args:
        song_ids: A single song id string, or an iterable of song ids.
        return_id_only: When True return only the unique id(s); otherwise
            return a ``(unique_ids, ratings)`` tuple.

    Returns:
        For a single string: the unique id (or ``(unique_id, rating)``).
        For an iterable: a list of unique ids (or a tuple of two lists).

    Raises:
        KeyError: If a song id is not present in the lookup file.
    '''
    with open('song_id_lookup.json') as lookup_file:
        song_id_lookup = json.load(lookup_file)
    # isinstance also treats str subclasses (e.g. numpy.str_) as one id
    # instead of iterating over their characters.
    if isinstance(song_ids, str):
        unq_ids, ratings = song_id_lookup[song_ids]
    else:
        unq_ids, ratings = [], []
        for song_id in song_ids:
            unq_id, rating = song_id_lookup[song_id]
            unq_ids.append(unq_id)
            ratings.append(rating)
    if return_id_only:
        return unq_ids
    return unq_ids, ratings

def reverse_translate_ids(song_ids, return_song_id=True):
    song_id_lookup = json.load(open('song_id_lookup.json'))
    reverse_dict = {}
    for key, value in song_id_lookup.items():
        reverse_dict[value[0]] = (key, value[1])
    if type(song_ids)==str:
        unq_ids, ratings = reverse_dict[song_ids]
    else:
        unq_ids, ratings = [], []
        for song_id in song_ids:
            unq_id, rating = reverse_dict[song_id]
            unq_ids.append(unq_id)
            ratings.append(rating)
    if return_song_id:
        return unq_ids
    else:
        return unq_ids, ratings


def get_overall_data():
  '''Load every 'overall' TFRecord partition into a single DataFrame.

  One partition is read per entry in Databases/sgm_pitch.  Each row holds the
  six per-feature values stored in "x", followed by 'rating', 'weight' and
  the song id translated back through reverse_translate_ids.

  Every batch of a partition is read and concatenated, so partitions with
  more than 500 records are no longer truncated to their final batch.

  Returns:
    A DataFrame with columns genre, general, sct_data, sgm_loud, sgm_pitch,
    sgm_timbre, rating, weight and song_id.  Partition-local row indexes are
    kept, as before.

  Raises:
    ValueError: If there are no partitions, or a partition has no records.
  '''
  global FEATURE
  DATA_FILE_DIR = os.path.join(DATA_DIR, 'Databases')
  num_files = len(os.listdir(os.path.join(DATA_FILE_DIR, 'sgm_pitch')))
  dirname = os.path.join(DATA_FILE_DIR, 'overall')
  features = ["genre", "general", "sct_data", "sgm_loud", "sgm_pitch", "sgm_timbre"]
  # parse_feature_function_song_id picks the record shape from FEATURE
  FEATURE='overall'
  frames = []
  # Loop through files
  for i in range(1, num_files+1):
      feat_path = os.path.join(dirname, f'overall_dataset_p{i}.tfrecord')
      dataset = tf.data.TFRecordDataset([feat_path])
      dataset = dataset.map(parse_feature_function_song_id)
      dataset = dataset.batch(500)
      # Collect every batch; keeping only the last one would drop records
      batches = [tuple(part.numpy() for part in raw_record) for raw_record in dataset]
      if not batches:
          raise ValueError(f'No records found in {feat_path}')
      x, y, weight, song_id = (np.concatenate(parts) for parts in zip(*batches))
      y, weight = y.reshape(-1,1), weight.reshape(-1,1)
      song_id = np.array(reverse_translate_ids(song_id.reshape(-1))).reshape(-1,1)
      data = np.hstack((x,y,weight))
      df_pred = pd.DataFrame(data=data, columns=features + ['rating', 'weight'])
      df_pred['song_id'] = song_id
      frames.append(df_pred)
  if not frames:
      raise ValueError(f'No overall dataset partitions found for {dirname}')
  # One concat at the end avoids re-copying the accumulated frame every pass
  return pd.concat(frames, axis=0)


def parse_feature_function(example_proto, tune_shapes=GEN_INPUT_SHAPES):
    '''Parse one serialized example into an (x, y, weight) tuple.

    The shape of "x" is looked up in ``tune_shapes`` under the module-level
    FEATURE name, with the leading entry dropped.  "song_id" is part of the
    parsing schema but is not returned.
    '''
    x_shape = tune_shapes[FEATURE][1:]
    schema = {
        "x": tf.io.FixedLenFeature(x_shape, tf.float32),
        "y": tf.io.FixedLenFeature([], tf.float32),
        "weight": tf.io.FixedLenFeature([], tf.float32),
        "song_id": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example_proto, schema)
    return parsed['x'], parsed['y'], parsed['weight']


def parse_feature_function_song_id(example_proto, tune_shapes=GEN_INPUT_SHAPES):
    '''Parse one serialized example into an (x, y, weight, song_id) tuple.

    The shape of "x" is looked up in ``tune_shapes`` under the module-level
    FEATURE name, with the leading entry dropped.
    '''
    x_shape = tune_shapes[FEATURE][1:]
    schema = {
        "x": tf.io.FixedLenFeature(x_shape, tf.float32),
        "y": tf.io.FixedLenFeature([], tf.float32),
        "weight": tf.io.FixedLenFeature([], tf.float32),
        "song_id": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example_proto, schema)
    return parsed['x'], parsed['y'], parsed['weight'], parsed['song_id']


def generate_final_dataset(run_normalization=False):
    '''Build the 'overall' dataset from per-feature model predictions.

    For every partition index i (one per entry in Databases/sgm_pitch) the
    per-feature TFRecords are read, each feature's model in ``model_dict``
    predicts a value per song, and the predictions are inner joined on
    (song_id, rating, weight).  The widened min/max of every prediction
    column is merged into config['norms']['overall'] and written back to
    config.json.

    The TFRecord output (Databases/overall/overall_dataset_p{i}.tfrecord) is
    only written when ``run_normalization`` is True; with the default False
    the function only updates the normalization values in config.json.

    NOTE(review): ``model_dict`` and ``minmax`` are not defined in the
    visible part of this file - presumably they are globals created by
    later cells; confirm before calling.

    Args:
        run_normalization: When True, min-max normalize the prediction
            columns with the stored norms and save the partition.
    '''
    global FEATURE
    DATA_FILE_DIR = os.path.join(DATA_DIR, 'Databases')
    # The number of entries in the sgm_pitch folder is used as the partition count
    num_files = len(os.listdir(os.path.join(DATA_DIR, 'Databases', 'sgm_pitch')))
    dirname = os.path.join(DATA_FILE_DIR, 'overall')
    if os.path.exists(dirname) == False:
      os.mkdir(dirname)
    # Loop through files
    # Each entry is a one-element list; the inner loop below iterates over it
    features = [["genre"], ["general"], ["sct_data"], ["sgm_loud"], ["sgm_pitch"], ['sgm_timbre']]
    for i in range(1, num_files+1):
        # Reload the config for every partition so earlier updates are included
        config = json.load(open('config.json'))
        for feature in features:
          for sub_feature in feature:
            # parse_feature_function_song_id picks the record shape from FEATURE
            FEATURE=sub_feature
            feat_path = os.path.join(DATA_FILE_DIR, sub_feature, f'{sub_feature}_dataset_p{i}.tfrecord')
            dataset = tf.data.TFRecordDataset([feat_path])
            dataset = dataset.map(parse_feature_function_song_id)
            dataset = dataset.batch(500)
            # NOTE(review): only the last batch yielded is kept, so a file
            # with more than 500 records would be truncated - confirm the
            # partitions never exceed one batch.
            for raw_record in dataset:
                x, y, weight, song_id = raw_record
            x, y, weight, song_id = x.numpy(), y.numpy(), weight.numpy(), song_id.numpy()
            song_id = reverse_translate_ids(song_id)
            # Combine time series data points
            if sub_feature not in ('genre', 'general'):
              # if sub_feature=='sct_data':
              #   x = np.repeat(x, 10, axis=1)
              df = pd.DataFrame(data=list(zip(song_id, y, weight, x)),
                       columns=['song_id', 'rating', 'weight', sub_feature])
              # df_pred is started on sct_data and grows by one column per
              # later time-domain feature; it relies on sct_data coming first
              if sub_feature=='sct_data':
                  df_pred = df
              else:
                  df_pred = pd.merge(df_pred, df, how='inner',
                                    left_on=['song_id', 'rating', 'weight'],
                                    right_on=['song_id', 'rating', 'weight'])
          if feature not in (['genre'], ['general']):
            ## non big feature setup
            # Take inputs and labels from the merged frame so the rows are
            # the ones that survived the inner joins
            feature_name=feature[0]
            x = np.float32(np.array([np.array(row) for row in df_pred[feature_name]]))
            ## big feature setup
            # feature_name='big_data'
            # x_sct_data = np.float32(np.array([np.array(row) for row in df_pred['sct_data']]))
            # x_sgm_loud = np.float32(np.array([np.array(row) for row in df_pred['sgm_loud']]))
            # x_sgm_pitch = np.float32(np.array([np.array(row) for row in df_pred['sgm_pitch']]))
            # x_sgm_timbre = np.float32(np.array([np.array(row) for row in df_pred['sgm_timbre']]))
            # x = np.array(np.dstack((np.dstack((np.dstack((x_sct_data, x_sgm_loud)), x_sgm_pitch[:,:,1:])), x_sgm_timbre[:,:,1:])))
            y = df_pred['rating'].to_numpy()
            weight = df_pred['weight'].to_numpy()
            song_id = df_pred['song_id'].to_numpy()
          else:
            # genre / general: x, y, weight and song_id loaded above are used directly
            feature_name=feature[0]
          # make prediction
          preds = model_dict[feature_name].predict(x, verbose=0)
          df = pd.DataFrame(data=list(zip(song_id, preds.reshape(-1,), y, weight)),
               columns=['song_id', feature_name, 'rating', 'weight'])
          # df_final is started on genre (the first feature) and gains one
          # prediction column per later feature
          if feature_name=='genre':
              df_final = df
          else:
              df_final = pd.merge(df_final, df, how='inner',
                                left_on=['song_id', 'rating', 'weight'],
                                right_on=['song_id', 'rating', 'weight'])

        # Calculate mins and maxes
        temp_norm_dict = {}
        # for feature in ['genre', 'general', 'big_data']:
        for feature in ["genre", "general", "sct_data", "sgm_loud", "sgm_pitch", "sgm_timbre"]:
          # The observed min is scaled by 0.7 and the max by 1.3
          # NOTE(review): for a negative min, *0.7 moves the bound toward
          # zero rather than away from it - confirm predictions are non-negative.
          temp_norm_dict[f'{feature}_min'] = float(df_final[feature].min())*0.7
          temp_norm_dict[f'{feature}_max'] = float(df_final[feature].max())*1.3
          for norm in ['min', 'max']:
            calculated_value = temp_norm_dict[f'{feature}_{norm}']
            # print(calculated_value)
            if 'overall' not in config['norms'].keys():
                config['norms']['overall'] = {}
            # Stored norms only ever widen: keep the smallest min and largest max seen
            if f'{feature}_{norm}' not in config['norms']['overall'].keys():
              config['norms']['overall'][f'{feature}_{norm}'] = calculated_value
            elif 'min' in norm:
              if calculated_value < config['norms']['overall'][f'{feature}_{norm}']:
                config['norms']['overall'][f'{feature}_{norm}'] = calculated_value
            elif 'max' in norm:
              if calculated_value > config['norms']['overall'][f'{feature}_{norm}']:
                config['norms']['overall'][f'{feature}_{norm}'] = calculated_value
          # config.json is rewritten after every feature
          json.dump(config, open('config.json', 'w'))
        if run_normalization:
          #normalize
          # for feature in ['genre', 'general', 'big_data']:
          for feature in ["genre", "general", "sct_data", "sgm_loud", "sgm_pitch", "sgm_timbre"]:
            df_final[feature] = minmax(df_final[feature], config['norms']['overall'][f'{feature}_min'],
                                     config['norms']['overall'][f'{feature}_max'])
          # x = df_final[['genre', 'general', 'big_data']].to_numpy()
          x = df_final[["genre", "general", "sct_data", "sgm_loud", "sgm_pitch", "sgm_timbre"]].to_numpy()
          y = df_final['rating'].to_numpy()
          weight_output = df_final['weight'].to_numpy()
          song_id = df_final['song_id'].to_numpy()
          # Convert the song ids back to the ids stored in song_id_lookup.json
          song_id = np.array(translate_ids(song_id))
          save_name = os.path.join(dirname, f'overall_dataset_p{i}.tfrecord')
          with Numpy2TFRecordConverter(save_name) as converter:
            sample={"x": x, "y": y, "weight": weight_output, 'song_id': song_id}
            converter.convert_batch(sample)
            # delete info files
          for file in os.listdir(dirname):
              if '.info' in file:
                os.remove(os.path.join(dirname, file))


def calc_td_norm_values(data, feature, normalization_mode='minmax'):
  '''Calculate normalization values by feature.

  Args:
    data: Table holding the raw columns for ``feature`` (indexed by column
      name, e.g. a pandas DataFrame).
    feature: One of 'sct_data', 'sgm_loud', 'sgm_timbre', 'genre' or
      'general'.
    normalization_mode: 'minmax' (values keyed '<name>_min'/'<name>_max') or
      'zscore' (values keyed '<name>_mean'/'<name>_sdv').

  Returns:
    A dict of normalization values, or None for an unknown ``feature``.
    For 'genre' and 'general' in 'minmax' mode the observed min is scaled by
    0.7 and the observed max by 1.3.  For the time-domain features the dict
    is whatever transform_to_td returns.

  Raises:
    ValueError: If ``normalization_mode`` is not 'minmax' or 'zscore'.
  '''
  # Get normalization mode
  if normalization_mode=='minmax':
    scope=['min', 'max']
  elif normalization_mode=='zscore':
    scope=['mean', 'sdv']
  else:
    # An unknown mode used to surface as a NameError or an empty result
    raise ValueError(f"normalization_mode must be 'minmax' or 'zscore', got {normalization_mode!r}")

  def _column_norms(cols):
    # Per-column statistics for plain tabular features (genre / general)
    col_out = {}
    for col in cols:
      if normalization_mode=='minmax':
        col_out[f'{col}_min'] = float(data[col].min())*0.7
        col_out[f'{col}_max'] = float(data[col].max())*1.3
      else:
        # Population standard deviation (np.std default, ddof=0)
        values = data[col].to_numpy()
        col_out[f'{col}_mean'] = float(np.mean(values))
        col_out[f'{col}_sdv'] = float(np.std(values))
    return col_out

  out={}
  if feature=='sct_data':
    # sct loud
    dict_out = transform_to_td(feature_list=list(data['sct_loud']),
                               transform_feat_names=['sct_conf', 'sct_loud'],
                               transform_idxs=[2,4],
                               loud_idxs=[4], scope=scope)
    out.update(dict_out)
    # sct tempo
    dict_out = transform_to_td(feature_list=list(data['sct_tempo']),
                          transform_feat_names=['sct_tempo', 'sct_tempo_conf'],
                          transform_idxs=[4,5],
                          loud_idxs=[], loud_add=None, scope=scope)
    out.update(dict_out)
    # sct key
    dict_out = transform_to_td(feature_list=list(data['sct_key']),
                          transform_feat_names=['sct_key', 'sct_key_conf'],
                          transform_idxs=[4,5],
                          loud_idxs=[4], loud_add=1, max_val=12, scope=scope)
    out.update(dict_out)
    # sct mode
    dict_out = transform_to_td(feature_list=list(data['sct_mode']),
                          transform_feat_names=['sct_mode', 'sct_mode_conf'],
                          transform_idxs=[4,5],
                          loud_idxs=[4], loud_add=0, max_val=1, scope=scope)
    out.update(dict_out)
    # sct time sig
    dict_out = transform_to_td(feature_list=list(data['sct_time_sig']),
                          transform_feat_names=['sct_time_sig', 'sct_time_sig_conf'],
                          transform_idxs=[4,5],
                          loud_idxs=[4], loud_add=0, max_val=7, scope=scope)
    out.update(dict_out)

  elif feature=='sgm_loud':
    out = transform_to_td(feature_list=list(data['sgm_loud']),
                          transform_feat_names=['sgm_conf', 'sgm_loud_start',
                                                'sgm_loud_max', 'sgm_loud_max_time',
                                                'sgm_loud_end'],
                          transform_idxs=[2, 4, 5, 6, 7],
                          loud_idxs=[4, 5, 7], loud_add=100, max_val=1, scope=scope)

  elif feature=='sgm_timbre':
    out = transform_to_td(feature_list=list(data['sgm_timbre']),
                          transform_feat_names=[f'sgm_timbre_{i}' for i in range(1,13)],
                          transform_idxs=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                          loud_idxs=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                          loud_add=1000, max_val=None, scope=scope)

  elif feature=='genre':
    # Genre columns are addressed by integer position 0..299
    out = _column_norms(range(300))

  elif feature=='general':
    # Obtain dictionary values
    cols = ['pop', 'artist_pop', 'acoust', 'dance', 'energy', 'instrument', 'live',
            'loud', 'speech', 'valence', 'release_year', 'song_dur', 'tempo',
            'key', 'fade_in_dur', 'fade_out_dur', 'num_bars',
            'num_beats', 'num_sections', 'num_segments', 'num_tatums',
            'bar_dur_avg', 'beat_dur_avg', 'tatum_dur_avg',
            'bar_dur_sdv', 'beat_dur_sdv', 'tatum_dur_sdv',
            'bar_conf_avg', 'beat_conf_avg', 'tatum_conf_avg',
            'bar_conf_sdv', 'beat_conf_sdv', 'tatum_conf_sdv',
            'section_dur_avg', 'section_dur_sdv', 'section_conf_avg', 'section_conf_sdv',
            'segment_dur_avg', 'segment_dur_sdv', 'segment_conf_avg', 'segment_conf_sdv'
            ]
    # The zscore branch used to reference an undefined `df_data` and only
    # printed its results; it now returns them like the minmax branch.
    out = _column_norms(cols)

  else:
    out=None
  return out

def generate_feature_norms(raw_td_path, raw_td_file_list, basic_dataset_path,
                           feature=feature, normalization_mode=normalization_mode,
                          run_normalization=False, save_dir=None, null_tracker=None,
                          output_format='npy'):
  '''Compute normalization bounds for a feature, or apply them and save datasets.

  For every chunk number (one per entry of raw_td_file_list) the function reads
  basic_dataset_p{n}.csv from basic_dataset_path and, unless feature is 'genre',
  the columns needed for the feature from raw_td_feature_p{n}.csv in raw_td_path.

  Two modes, selected by run_normalization:
    * False: rows containing nulls are recorded in null_tracker, and the values
      returned by calc_td_norm_values are merged into config.json under
      config['norms'][feature] (mins only ever decrease, maxes only ever
      increase).  Nothing is merged for 'sgm_pitch'.
    * True: the bounds stored in config.json are applied to the chunk and the
      result is written to save_dir.

  Args:
    raw_td_path: directory holding raw_td_feature_p{n}.csv files.
    raw_td_file_list: only its length is used, as the number of chunks.
    basic_dataset_path: directory holding basic_dataset_p{n}.csv files.
    feature: one of 'genre', 'general', 'sct_data', 'sgm_loud', 'sgm_timbre',
      'sgm_pitch'; any other value is treated as a raw column name.
    normalization_mode: accepted but not referenced inside this function.
    run_normalization: selects the mode described above.
    save_dir: output directory, used only when run_normalization is True.
    null_tracker: mapping from chunk number (str key, int key as fallback) to
      a list of row positions to drop.  It is subscripted for every chunk, so
      the default of None is not usable.
    output_format: 'npy' for np.save, 'trf' for TFRecord, anything else for CSV.

  Returns:
    null_tracker (possibly extended).  In the 'general' normalization branch a
    shape mismatch returns the tuple (df_basic, df_td_data, df_output) instead.
  '''
  # loop through basic and raw td files and extract feature contents
  if run_normalization:
    print(f'Applying normalization and weights for {feature}')
  else:
    print(f'Calculating Normalization values for {feature}')
  for i in tqdm(range(len(raw_td_file_list))):
    df_basic = pd.read_csv(os.path.join(basic_dataset_path, f'basic_dataset_p{i+1}.csv'),
                          index_col='song_id')
    # genres is stored as the string form of a list; parse it back to floats
    df_basic.genres = df_basic.genres.apply(process_str_list)
    if feature!='genre':
      # Only load the columns this feature needs from the raw time-domain file
      if feature == 'sct_data':
        load_cols = ['song_id', 'rating', 'weight', 'sct_loud', 'sct_tempo', 'sct_key', 'sct_mode', 'sct_time_sig']
      elif feature == 'general':
        load_cols = ['song_id', 'rating', 'weight', 'song_dur', 'tempo', 'tempo_confidence', 'time_sig',
                     'time_sig_confidence', 'key', 'key_confidence',
                     'mode', 'mode_confidence', 'fade_in_dur', 'fade_out_dur',
                     'num_bars', 'num_beats', 'num_sections', 'num_segments', 'num_tatums',
                     'bar_dur_avg', 'beat_dur_avg', 'tatum_dur_avg',
                     'bar_dur_sdv', 'beat_dur_sdv', 'tatum_dur_sdv',
                     'bar_conf_avg', 'beat_conf_avg', 'tatum_conf_avg',
                     'bar_conf_sdv', 'beat_conf_sdv', 'tatum_conf_sdv',
                     'section_dur_avg', 'section_dur_sdv', 'section_conf_avg', 'section_conf_sdv',
                     'segment_dur_avg', 'segment_dur_sdv', 'segment_conf_avg', 'segment_conf_sdv']
      else:
        load_cols= ['song_id', 'rating', 'weight', feature]
      df_td_data = pd.read_csv(os.path.join(raw_td_path, f'raw_td_feature_p{i+1}.csv'),
                           index_col='song_id', usecols=load_cols)

    # Look up the row positions previously flagged as null for this chunk.
    # The tracker may be keyed by str (e.g. after a JSON round trip) or by int.
    try:
      null_idxs = null_tracker[f'{i+1}']
    except:
      null_idxs = null_tracker[i+1]
    if (feature=='sgm_loud') or (feature=='sgm_timbre') or (feature=='sgm_pitch'):
        # Segment features are stored as JSON strings; decode them into lists
        try:
            df_td_data[feature] = df_td_data[feature].apply(lambda x: json.loads(x))
        except:
            # missing data in row so check null tracker to remove
            df_td_data.drop(df_td_data.index[null_idxs], inplace=True)
            df_td_data[feature] = df_td_data[feature].apply(lambda x: json.loads(x))
    if feature=='genre':
      drop_ids=[]
      # Expand each genre vector into one column per component
      data_list = list(df_basic['genres'])
      df_output = pd.DataFrame(data_list)
      # Convert to dataframe and add index back in
      df_output = pd.DataFrame(df_output)
      test = df_output.to_numpy()
      # Flag every row that holds a non-float or NaN cell
      for j, row in enumerate(test):
        for k, col in enumerate(row):
          if type(col)!=np.float64 or np.isnan(col):
            drop_ids = list(set(drop_ids + [j]))
            null_tracker[f'{i+1}'] = list(set(null_tracker[f'{i+1}'] + [j]))
      df_output.index = df_basic.index
      # song_id, rating and weight become the last three columns, in this order
      df_output['song_id'] = df_basic.index
      df_output['rating'] = df_basic['rating']
      df_output['weight'] = df_basic['weight']
      if run_normalization == False:
          df_output.drop(df_output.index[drop_ids], inplace=True)
    elif (feature=='sgm_pitch') or (feature=='sgm_timbre'):
      feature_list = df_td_data[feature].tolist()
      out_new = []
      for song in feature_list:
        song_data = []
        # Each segment occupies 5 consecutive entries: four scalars followed by
        # a list, which is flattened onto the end of the segment row.
        for j in range(0, len(song),5):
          segment_data = [song[j], song[j+1], song[j+2], song[j+3]]
          segment_data.extend([item for item in song[j+4]])
          song_data.append(segment_data)
        out_new.append(song_data)
      df_td_data[feature] = out_new
      df_output = df_td_data
    elif feature=='general':
      drop_ids=[]
      # Combine dataframes
      df_output = pd.merge(df_basic, df_td_data, how='right',
                                  left_on=['song_id', 'rating', 'weight'],
                                  right_on=['song_id', 'rating', 'weight'], suffixes=('_x', ''))
      # Columns present in both frames keep the raw-td version; drop the basic copies
      df_output.drop(columns=['genres', 'mode_x', 'time_sig_x', 'key_x', 'tempo_x'], inplace=True)
      if df_output.shape[0]!=df_td_data.shape[0]:
          print('Non Norm Step')
          print(f'Different basic and td data shapes for feature {feature} and index {i+1}')
          print(df_basic.shape, df_td_data.shape, df_output.shape)
      # Check for nulls
      test = df_output.to_numpy()
      if run_normalization == False:
          for j, row in enumerate(test):
            for k, col in enumerate(row):
              if type(col)!=np.float64 or np.isnan(col):
                drop_ids = list(set(drop_ids + [j]))
                null_tracker[f'{i+1}'] = list(set(null_tracker[f'{i+1}'] + [j]))
          df_output.reset_index(inplace=True)
          df_output.drop(df_output.index[drop_ids], inplace=True)
    elif feature=='sct_data':
      # convert list in string to list
      try:
        for col in df_td_data.columns:
          if type(df_td_data[col][0])==str:
            df_td_data[col] = df_td_data[col].apply(ast.literal_eval)
      except:
        # nulls detected, grab ids and drop rows
        null_idxs = df_td_data.reset_index()[df_td_data.reset_index().isnull().any(axis=1)].index.tolist()
        null_tracker[f'{i+1}'] = list(set(null_tracker[f'{i+1}'] + null_idxs))
        if run_normalization == False:
            df_td_data.drop(df_td_data.index[null_idxs], inplace=True)
            df_basic.drop(df_basic.index[null_idxs], inplace=True)
            for col in df_td_data.columns:
              if type(df_td_data[col][0])==str:
                df_td_data[col] = df_td_data[col].apply(ast.literal_eval)
      df_output = df_td_data
    else:
      df_output = df_td_data

    # config.json is re-read on every chunk because it is rewritten below
    config = json.load(open('config.json'))

    if run_normalization==False and feature!='sgm_pitch':
      # Get norm values
      out = calc_td_norm_values(df_output, feature)
      # Update config norm values if they are less than mins or greater than maxes
      # NOTE(review): config['norms'][feature] must already exist — confirm it is seeded elsewhere
      for key in out.keys():
        if key not in config['norms'][feature].keys():
          config['norms'][feature][key] = out[key]
        elif 'min' in key:
          if out[key] < config['norms'][feature][key]:
            config['norms'][feature][key] = out[key]
        elif 'max' in key:
          if out[key] > config['norms'][feature][key]:
            config['norms'][feature][key] = out[key]
      json.dump(config, open('config.json', 'w'))

    ### RUN NORMALIZATION###
    elif run_normalization:
      # apply normalization
      # Drop null rows
      if len(null_idxs) > 0:
          df_basic.drop(df_basic.index[null_idxs], inplace=True)
          if feature!='genre':
              df_td_data.drop(df_td_data.index[null_idxs], inplace=True)
          if feature=='general':
              df_output.drop(df_output.index[null_idxs], inplace=True)
      # sgm_pitch uses fixed 0/1 bounds further down, so it has no stored norms
      if feature != 'sgm_pitch':
        min_max_dict = config['norms'][feature]
      if feature=='sct_data':
        for col in df_output.columns:
          if type(df_td_data[col][0])==str:
            df_output[col] = df_output[col].apply(ast.literal_eval)
        # Each section sub-feature is converted separately by create_td_data
        # (defined elsewhere) and the results are concatenated afterwards.
        ## sct_loud
        sub_feature='sct_loud'
        mins = [min_max_dict['sct_conf_min'], min_max_dict[f'{sub_feature}_min']]
        maxs = [min_max_dict['sct_conf_max'], min_max_dict[f'{sub_feature}_max']]
        out_loud = np.array(create_td_data(feature_list=list(df_output[sub_feature]),
                                           time_segments=list(range(1,301)), feature_idxs=[2,4],
                                           mins=mins, maxs=maxs, modify_idxs=[4], add_val=100))
        ## sct_tempo
        sub_feature='sct_tempo'
        mins = [min_max_dict['sct_conf_min'], min_max_dict[f'{sub_feature}_min'], min_max_dict[f'{sub_feature}_conf_min']]
        maxs = [min_max_dict['sct_conf_max'], min_max_dict[f'{sub_feature}_max'], min_max_dict[f'{sub_feature}_conf_max']]
        out_tempo = np.array(create_td_data(feature_list=list(df_output[sub_feature]), time_segments=list(range(1,301)),
                                            feature_idxs=[2, 4, 5], mins=mins, maxs=maxs,
                                            modify_idxs=[4]))
        # sct_key
        sub_feature='sct_key'
        mins = [min_max_dict['sct_conf_min'], min_max_dict[f'{sub_feature}_min'], min_max_dict[f'{sub_feature}_conf_min']]
        maxs = [min_max_dict['sct_conf_max'], min_max_dict[f'{sub_feature}_max'], min_max_dict[f'{sub_feature}_conf_max']]
        out_key = np.array(create_td_data(feature_list=list(df_output[sub_feature]), time_segments=list(range(1,301)),
                                          feature_idxs=[2, 4, 5], mins=mins, maxs=maxs,
                                          modify_idxs=[4], add_val=1, divide_val=12))
        # sct_mode
        sub_feature='sct_mode'
        mins = [min_max_dict['sct_conf_min'], min_max_dict[f'{sub_feature}_min'], min_max_dict[f'{sub_feature}_conf_min']]
        maxs = [min_max_dict['sct_conf_max'], min_max_dict[f'{sub_feature}_max'], min_max_dict[f'{sub_feature}_conf_max']]
        out_mode = np.array(create_td_data(feature_list=list(df_output[sub_feature]), time_segments=list(range(1,301)),
                                          feature_idxs=[2, 4, 5], mins=mins, maxs=maxs,
                                          modify_idxs=[4], add_val=0, divide_val=1))
        # sct_time_sig
        sub_feature='sct_time_sig'
        mins = [min_max_dict['sct_conf_min'], min_max_dict[f'{sub_feature}_min'], min_max_dict[f'{sub_feature}_conf_min']]
        maxs = [min_max_dict['sct_conf_max'], min_max_dict[f'{sub_feature}_max'], min_max_dict[f'{sub_feature}_conf_max']]
        out_time_sig = np.array(create_td_data(feature_list=list(df_output[sub_feature]), time_segments=list(range(1,301)),
                                          feature_idxs=[2, 4, 5], mins=mins, maxs=maxs,
                                          modify_idxs=[4], add_val=0, divide_val=7))
        # Concat section features
        # Slot 0 of the last axis is kept only from out_loud; it is sliced off the others
        a,b,c = df_output.index.values, df_output.rating.values, df_output.weight.values
        df_output = np.concatenate([out_loud, out_tempo[:, :, 1:], out_key[:, :, 1:], out_mode[:, :, 1:], out_time_sig[:, :, 1:]], -1)
        df_output = pd.DataFrame(data=zip(df_output.tolist()), columns=['sct_data'])
        df_output['song_id']= a
        df_output['rating'] = b
        df_output['weight'] = c

      elif feature=='sgm_loud':
        mins = [min_max_dict[f'sgm_conf_min'], min_max_dict['sgm_loud_start_min'], min_max_dict['sgm_loud_max_min'],
                min_max_dict['sgm_loud_max_time_min'], min_max_dict['sgm_loud_end_min']]
        maxs = [min_max_dict[f'sgm_conf_max'], min_max_dict['sgm_loud_start_max'], min_max_dict['sgm_loud_max_max'],
                  min_max_dict['sgm_loud_max_time_max'], min_max_dict['sgm_loud_end_max']]
        out_sgm_loud = np.array(create_td_data(feature_list=list(df_output[feature]),
                                               time_segments=list(np.arange(0.1,300.1, 0.1)),
                                               feature_idxs=[2, 4, 5, 6, 7], mins=mins, maxs=maxs,
                                               modify_idxs=[4, 5, 7], add_val=100, divide_val=1))
        # Rebuild the frame: nested data first, then song_id, rating, weight
        a,b,c = df_output.index.values, df_output.rating.values, df_output.weight.values
        df_output = pd.DataFrame(data=zip(out_sgm_loud.tolist()), columns=[feature])
        df_output['song_id']= a
        df_output['rating'] = b
        df_output['weight'] = c


      elif feature=='sgm_timbre':
        # The confidence bounds are taken from the sgm_loud norms
        mins = [config['norms']['sgm_loud']['sgm_conf_min']] + [min_max_dict[f'sgm_timbre_{k}_min'] for k in range(1,13)]
        maxs = [config['norms']['sgm_loud']['sgm_conf_max']] + [min_max_dict[f'sgm_timbre_{k}_max'] for k in range(1,13)]
        out_sgm_timbre = np.array(create_td_data(feature_list=list(df_output[feature]),
                                                 time_segments=list(np.arange(0.1,300.1, 0.1)),
                                                 feature_idxs=[2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                                                 mins=mins, maxs=maxs,
                                                 modify_idxs=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                                                 add_val=1000))
        a,b,c = df_output.index.values, df_output.rating.values, df_output.weight.values
        df_output = pd.DataFrame(data=zip(out_sgm_timbre.tolist()), columns=[feature])
        df_output['song_id']= a
        df_output['rating'] = b
        df_output['weight'] = c


      elif feature=='sgm_pitch':
        # The 12 pitch slots use fixed bounds of 0 and 1; confidence reuses the sgm_loud norms
        mins = [config['norms']['sgm_loud']['sgm_conf_min']] + [0 for k in range(1,13)]
        maxs = [config['norms']['sgm_loud']['sgm_conf_max']] + [1 for k in range(1,13)]
        out_sgm_pitch = np.array(create_td_data(feature_list=list(df_output[feature]),
                                                time_segments=list(np.arange(0.1,300.1, 0.1)),
                                                feature_idxs=[2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                                                mins=mins, maxs=maxs,
                                                modify_idxs=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                                                add_val=0.000001, divide_val=1))
        a,b,c = df_output.index.values, df_output.rating.values, df_output.weight.values
        df_output = pd.DataFrame(data=zip(out_sgm_pitch.tolist()), columns=[feature])
        df_output['song_id']= a
        df_output['rating'] = b
        df_output['weight'] = c


      elif feature=='genre':
        # Scale each of the 300 genre columns with minmax() and clip to [0, 1]
        for col in range(300):
          df_output[col] = minmax(df_output[col], min_max_dict[f'{col}_min'], min_max_dict[f'{col}_max'])
          df_output.loc[df_output[col] > 1, col] = 1
          df_output.loc[df_output[col] < 0, col] = 0

      elif feature=='general':
        # Abort and hand the three frames back to the caller when the merge
        # changed the row count, so the mismatch can be inspected.
        if df_output.shape[0]!=df_td_data.shape[0]:
            print('norm data')
            print(f'Different basic and td data shapes for feature {feature} and index {i+1}')
            print(df_basic.shape, df_td_data.shape, df_output.shape)
            return df_basic, df_td_data, df_output
        try:
          df_output = df_output.drop(columns=['dur'])
        except:
          pass

        # Every remaining column except the ones listed here gets scaled
        cols = list(df_output.columns)
        for bad_id in ['explicit', 'album_type', 'rating', 'mode', 'weight', 'time_sig',
                       'mode_confidence', 'key_confidence', 'time_sig_confidence',
                       'tempo_confidence']:
          try:
              cols.remove(bad_id)
          except:
              print('fail removing non-norm, general columns')
              print(bad_id, cols)
              pass
        for col in cols:
            df_output[col] = minmax(df_output[col], min_max_dict[f'{col}_min'], min_max_dict[f'{col}_max'])
            df_output.loc[df_output[col] > 1, col] = 1
            df_output.loc[df_output[col] < 0, col] = 0
        df_output.reset_index(inplace=True)
        # Shuffle the rows
        df_output = df_output.sample(frac=1)
      if output_format=='npy':
        np.save(os.path.join(save_dir, f'{feature}_dataset_p{i+1}'), np.array(df_output))
      elif output_format=='trf':
          # NOTE(review): song_id_lookup is loaded but not referenced again in this
          # function; translate_ids is defined elsewhere and may rely on it — confirm
          song_id_lookup = json.load(open('song_id_lookup.json'))
          df_output['song_id'] = df_output['song_id'].map(translate_ids)
          frame=np.array(df_output)
          if feature=='sgm_loud' or feature=='sgm_timbre' or feature=='sgm_pitch' or feature=='sct_data':
              # Column 0 holds the nested data; the last three are song_id, rating, weight
              x = np.float32(np.array([np.array(row) for row in frame[:,0]]))
              output = np.float32(frame[:, -2].astype(np.float64).reshape(frame[:,-2].shape[0],1))
              weight_output = np.float32(frame[:, -1].astype(np.float64).reshape(frame[:,-1].shape[0],1))
              song_id = np.int64(frame[:,-3].reshape(frame[:,-3].shape[0],1))
          # New Feature Code
          elif feature=='general':
              # assumes column 0 is song_id and columns 4 and 5 are rating and weight — TODO confirm
              x=np.float32(np.delete(frame, (0,4,5), 1))
              output=np.float32(frame[:, 4].reshape(frame[:,4].shape[0],1))
              weight_output=np.float32(frame[:, 5].reshape(frame[:,5].shape[0],1))
              song_id = np.int64(frame[:,0].reshape(frame[:,0].shape[0],1))
          elif feature=='genre':
              # The last three columns are song_id, rating, weight; the rest are genre components
              x=np.float32(frame[:,:-3])
              output=np.float32(frame[:, -2].reshape(frame[:,-2].shape[0],1))
              weight_output=np.float32(frame[:, -1].reshape(frame[:,-1].shape[0],1))
              song_id = np.int64(frame[:,-3].reshape(frame[:,-3].shape[0],1))
          save_path = os.path.join(save_dir, f'{feature}_dataset_p{i+1}.tfrecord')
          with Numpy2TFRecordConverter(save_path) as converter:
            sample={"x": x, "y": output, "weight": weight_output, "song_id": song_id}
            converter.convert_batch(sample)
          # delete info files
          for file in os.listdir(save_dir):
            if '.info' in file:
                os.remove(os.path.join(save_dir, file))
      else:
        df_output.to_csv(os.path.join(save_dir, f'{feature}_dataset_p{i+1}.csv'))

  return null_tracker


def process_str_list(sample):
  '''Parse the string form of a numeric list into a list of floats.

  Newlines and square brackets are discarded, the remainder is split on single
  spaces, empty pieces are ignored, and commas are stripped from each piece
  before it is converted with float().
  '''
  cleaned = sample
  for unwanted in ("\n", "[", "]"):
    cleaned = cleaned.replace(unwanted, "")
  return [float(piece.replace(',', '')) for piece in cleaned.split(" ") if piece]


def create_song_id_df(prediction_type=prediction_type, sp=sp):
  '''Build a song_id-indexed dataframe of ratings from the configured playlists.

  Ratings are read from config.json under config['ratings'][prediction_type].
  Songs from INPUT_PLAYLISTS are added first; when the global use_neutral_songs
  is truthy, songs from BLACKLIST_PLAYLISTS and from the optional
  song_listened.csv file are added with the 'Neutral_Songs' rating.  When a
  song id appears more than once, the first occurrence wins.

  Args:
    prediction_type: key selecting which rating table of config.json to use.
    sp: Spotify client; it may be replaced by check_api_swap while running.

  Returns:
    DataFrame indexed by song_id with a 'rating' column.
  '''
  # Close the config file deterministically instead of leaking the handle
  with open('config.json') as config_file:
    config = json.load(config_file)
  df_song_ids = pd.DataFrame(columns=['song_id', 'rating']).set_index('song_id')

  def _append_first_wins(df_all, df_new):
    '''Concatenate df_new onto df_all, keeping the first row for each song_id.'''
    df_all = pd.concat([df_all, df_new])
    return df_all[~df_all.index.duplicated(keep='first')]

  # Loop through provided playlists
  for playlist_name, playlist_uri in INPUT_PLAYLISTS.items():
    sp = check_api_swap(sp)
    print(f'Gathering song ids for: {playlist_name}')
    rating = config['ratings'][prediction_type][playlist_name]
    # Forward the current client: without sp=sp the helper falls back to the
    # client bound at definition time, ignoring any swap made above.
    df_temp, sp = get_song_ids_from_playlist(playlist_uri, rating, sp=sp)
    df_song_ids = _append_first_wins(df_song_ids, df_temp)

  if use_neutral_songs:
    # Add in blacklist_playlist
    for playlist_name, playlist_uri in BLACKLIST_PLAYLISTS.items():
      print(f'Gathering song ids for: {playlist_name}')
      rating = config['ratings'][prediction_type]['Neutral_Songs']
      df_temp, sp = get_song_ids_from_playlist(playlist_uri, rating, sp=sp)
      df_temp = df_temp[~df_temp.isin(df_song_ids)].dropna()
      df_song_ids = _append_first_wins(df_song_ids, df_temp)
    # Non-rated songs that were recommended
    neutral_song_path = os.path.join(DATA_DIR, 'Databases', 'Recommend_Data', 'song_listened.csv')
    if os.path.exists(neutral_song_path):
      df_neutral_songs = pd.read_csv(neutral_song_path).set_index('song_id')
      df_neutral_songs['rating'] = [config['ratings'][prediction_type]['Neutral_Songs']] * df_neutral_songs.shape[0]
      df_neutral_songs = df_neutral_songs[~df_neutral_songs.isin(df_song_ids)].dropna()
      df_song_ids = _append_first_wins(df_song_ids, df_neutral_songs)

  return df_song_ids


def get_n_chunks(data, num_per_dataset=200):
  '''Return how many chunks are needed to hold every row of data.

  Args:
    data: object exposing shape[0] as its row count (e.g. a DataFrame).
    num_per_dataset: maximum number of rows per chunk; must be positive.

  Returns:
    int: ceil(rows / num_per_dataset); 0 for empty data.

  Raises:
    ValueError: if num_per_dataset is not positive.
  '''
  if num_per_dataset <= 0:
    raise ValueError(f'num_per_dataset must be positive, got {num_per_dataset}')
  # The class count is irrelevant to the chunk count, so no 'rating' column is required
  return int(np.ceil(data.shape[0] / num_per_dataset))


def get_rating_cnts(data, prediction_type=prediction_type):
  '''Summarise how many songs carry each rating.

  Args:
    data: pandas object whose value_counts() yields 'rating' and 'count'
      columns after reset_index().
    prediction_type: key selecting which rating table of config.json to use.

  Returns:
    DataFrame with the value counts plus 'percent_of_data' (share of the total
    count, in percent) and 'playlist' (the playlist name that config.json maps
    to each rating value).
  '''
  df_cnts = pd.DataFrame(data.value_counts()).reset_index()
  df_cnts['percent_of_data'] = 100 * (df_cnts['count'] / df_cnts['count'].sum())
  # Close the config file deterministically instead of leaking the handle
  with open('config.json') as config_file:
    config = json.load(config_file)
  # Invert playlist -> rating so ratings can be mapped back to playlist names
  rating_dict_map = {rating: playlist for playlist, rating in config['ratings'][prediction_type].items()}
  df_cnts['playlist'] = df_cnts['rating'].map(rating_dict_map)

  return df_cnts


def calc_weights(df_song_ids, df_cnts, weight_mode=weight_mode):
  '''Return a playlist -> weight mapping, computed or entered by hand.

  With weight_mode == 'balanced' the weights come from compute_class_weight.
  Any other mode plots the unweighted class shares, prints the balanced
  weights as an example, asks the user for one weight per playlist and plots
  the resulting weighted shares (adding helper columns to df_cnts).
  '''
  def _balanced():
    '''Balanced class weights keyed by playlist name.'''
    values = compute_class_weight(class_weight = 'balanced', classes = df_cnts['rating'].values, y=df_song_ids['rating'])
    return dict(zip(df_cnts['playlist'].values, values))

  if weight_mode == 'balanced':
    custom_weights = _balanced()
    print(custom_weights)
    return custom_weights

  # Manual mode: show the current distribution before asking for weights
  px.bar(df_cnts, x='playlist', y='percent_of_data', color='rating', title='Default data percentages without weights').show()
  suggestion = _balanced()
  print(f'Example weights for balanced dataset: {suggestion}')
  custom_weights = {
    playlist: float(input(f'Enter a weight value for {playlist}:'))
    for playlist in df_cnts['playlist']
  }
  df_cnts['weights'] = df_cnts['playlist'].map(custom_weights)
  df_cnts['weighted_count'] = df_cnts['count'] * df_cnts['weights']
  weighted_total = df_cnts['weighted_count'].sum()
  df_cnts['weighted_percent_of_data'] = 100 * (df_cnts['weighted_count'] / weighted_total)
  px.bar(df_cnts, x='playlist', y='weighted_percent_of_data', color='rating', title='Weighted data percentages').show()
  print('If you are happy with these weights then run the next cell!')
  return custom_weights


def save_weights_and_add_to_df(custom_weights, df_song_ids, prediction_type=prediction_type):
  '''Persist class weights to config.json and attach them to df_song_ids.

  Args:
    custom_weights: mapping of playlist name -> weight.
    df_song_ids: dataframe with a 'rating' column; a 'weight' column is added
      to it in place.
    prediction_type: key under which ratings and weights are stored.

  Returns:
    The same df_song_ids object, now carrying the 'weight' column.
  '''
  with open('config.json') as config_file:
    config = json.load(config_file)
  config.setdefault('weights', {})
  config['weights'][prediction_type] = custom_weights
  # A context manager guarantees the rewritten config is flushed and closed
  with open('config.json', 'w') as config_file:
    json.dump(config, config_file)

  # add weights to df_song_ids: translate rating value -> weight via the playlist name
  ratings = config['ratings'][prediction_type]
  weights = config['weights'][prediction_type]
  mapping_dict = {rating: weights[playlist] for playlist, rating in ratings.items()}
  df_song_ids['weight'] = df_song_ids['rating'].map(mapping_dict)

  return df_song_ids


def make_dir(path):
  '''Create the directory at path if nothing exists there yet.

  Missing parent directories are created as well, and a directory that
  appears between the existence check and the creation is tolerated.
  '''
  if not os.path.exists(path):
    os.makedirs(path, exist_ok=True)


def generate_basic_datasets():
  '''Build one basic dataset CSV for every song-id chunk file.

  The number of entries in Databases/Basic_Dataset_Song_Ids decides how many
  chunks are processed; chunk n is read from song_ids_p{n}.csv, passed through
  create_song_df and wrangle, and written to Databases/Basic_Dataset as
  basic_dataset_p{n}.csv.
  '''
  databases_dir = os.path.join(DATA_DIR, 'Databases')
  ids_dir = os.path.join(databases_dir, 'Basic_Dataset_Song_Ids')
  n_parts = len(os.listdir(ids_dir))
  out_dir = os.path.join(databases_dir, 'Basic_Dataset')
  make_dir(out_dir)
  for idx in tqdm(range(n_parts)):
    part_no = idx + 1
    song_ids = pd.read_csv(os.path.join(ids_dir, f'song_ids_p{part_no}.csv'))
    basic_df = wrangle(create_song_df(song_ids))
    basic_df.to_csv(os.path.join(out_dir, f'basic_dataset_p{part_no}.csv'))
  print('Finished saving basic dataset chunks!')


def check_api_swap(sp):
  '''Rotate to the next Spotify API key once a usage counter hits its limit.

  The global counters BASIC_DATASET_API_COUNTER and AUDIO_FEATURES_API_COUNTER
  are compared against 10000 and 1700.  While both are below their limit, sp
  is returned untouched.  Otherwise the '.cache' token file is removed, the
  next entry of API_KEYS (wrapping around) becomes SELECTED_API, CLIENT_ID and
  CLIENT_SECRET are updated, a new connection is created and both counters are
  reset to 0.

  Args:
    sp: the current Spotify client.

  Returns:
    The client to use from now on.  This is the original sp when no swap was
    needed or when every reconnection attempt failed.
  '''
  global BASIC_DATASET_API_COUNTER, AUDIO_FEATURES_API_COUNTER, CLIENT_ID, CLIENT_SECRET, SELECTED_API
  if BASIC_DATASET_API_COUNTER < 10000 and AUDIO_FEATURES_API_COUNTER < 1700:
    return sp

  def _clear_token_cache():
    '''Remove the cached token file; a missing file is not an error.'''
    if os.path.exists('.cache'):
      os.remove('.cache')

  # Spotify api will block further connections if this is reached regardless of switching api
  _clear_token_cache()
  key_list = list(API_KEYS.keys())
  if len(key_list) == 1:
    # With a single key there is nothing to rotate to: warn, then reconnect with it
    print('Need more spotify api keys to prepare data.  Please create more and add them to .env')
  next_key_idx = (key_list.index(SELECTED_API) + 1) % len(key_list)
  SELECTED_API = key_list[next_key_idx]
  CLIENT_ID, CLIENT_SECRET = API_KEYS[SELECTED_API]
  if AUDIO_FEATURES_API_COUNTER>=1600:
    # Up to six attempts, clearing the token cache after each failure
    for _ in range(6):
      try:
        sp = create_connection()
        break
      except Exception:
        _clear_token_cache()
  else:
    sp = create_connection(hide_output=True)
  BASIC_DATASET_API_COUNTER = 0
  AUDIO_FEATURES_API_COUNTER = 0
  return sp


def split_dataset_into_chunks(data, n_splits,  class_col, save_dir):
  '''Write n_splits stratified chunks of data to save_dir as CSV files.

  The rows of every class in class_col are shuffled and divided into n_splits
  near-equal parts; chunk k is the concatenation of part k of every class, so
  each chunk has roughly the class balance of the full dataset.  Chunks are
  saved as song_ids_p{k}.csv (k starting at 1), indexed by 'song_id'.

  Args:
    data: DataFrame with a 'song_id' column and the class_col column.
    n_splits: number of chunk files to produce.
    class_col: name of the column holding the class label.
    save_dir: output directory, created if missing.
  '''
  os.makedirs(save_dir, exist_ok=True)
  # Positional row numbers per class, so iloc below is correct for any index
  positions_by_class = data.groupby(class_col).indices
  per_class_splits = []
  for class_name in data[class_col].unique():
    positions = positions_by_class[class_name].tolist()
    shuffle(positions)
    per_class_splits.append(np.array_split(positions, n_splits))

  for chunk_no in range(n_splits):
    # Take this chunk's share from every class
    chunk_positions = [pos for splits in per_class_splits for pos in splits[chunk_no]]
    df = data.iloc[chunk_positions].set_index('song_id')
    df.to_csv(os.path.join(save_dir, f'song_ids_p{chunk_no+1}.csv'))
  print('Successfully saved all datasets')



def get_song_ids_from_playlist(playlist_uri, rating, sp=sp):
  '''Collect the track ids of a playlist and label each with rating.

  The playlist is paged through 50 items at a time until an empty page comes
  back.  BASIC_DATASET_API_COUNTER grows by the number of items received, and
  check_api_swap is consulted before every request.

  Args:
    playlist_uri: playlist identifier passed to sp.playlist_tracks.
    rating: value stored in the 'rating' column for every song.
    sp: Spotify client; it may be replaced by check_api_swap.

  Returns:
    (df, sp): df is indexed by song_id with a single 'rating' column, and sp
    is the client in use after the last request.
  '''
  global BASIC_DATASET_API_COUNTER
  raw_song_data = []
  count=0
  while True:
    sp = check_api_swap(sp)
    results = sp.playlist_tracks(playlist_uri, limit=50, offset=count)['items']
    BASIC_DATASET_API_COUNTER += len(results)
    # Stop adding new song once we reach the end of liked song list
    if len(results)==0: break
    count+=50
    for item in results:
      # Skip entries with no track object or no id instead of crashing on
      # them or adding a None song_id to the index.
      track = item.get('track')
      if track and track.get('id'):
        raw_song_data.append(track['id'])
  df = pd.DataFrame(list(zip(raw_song_data, [rating] * len(raw_song_data))),
                   columns=['song_id', 'rating']).set_index('song_id')
  return df, sp


def rating_input(playlists, use_neutral_songs=use_neutral_songs, prediction_type=prediction_type):
  '''Ask the user for a rating per playlist and store them in config.json.

  The ratings are written to config['ratings'][prediction_type], replacing
  any previous entry for that prediction type.

  Args:
    playlists: iterable of playlist names to rate.
    use_neutral_songs: when truthy, also ask for a 'Neutral_Songs' rating.
    prediction_type: 'Regression' or 'Classification' selects the prompt text;
      any other value gets a generic prompt rather than failing on an unbound
      variable.
  '''
  with open('config.json') as config_file:
    config = json.load(config_file)
  config.setdefault('ratings', {})

  prompts = {
    'Regression': 'Enter in a numerical value with higher values being more liked songs and lower for less liked songs for',
    'Classification': 'Enter in 0 for disliked playlist and 1 for liked playlist',
  }
  prompt = prompts.get(prediction_type, 'Enter in a numerical rating for')

  rating_ids = {playlist: float(input(f'{prompt}   {playlist}:')) for playlist in playlists}

  # Add in neutral songs
  if use_neutral_songs:
    rating_ids['Neutral_Songs'] = float(input(f'{prompt}   Neutral Songs:'))

  config['ratings'][prediction_type] = rating_ids
  # A context manager guarantees the rewritten config is flushed and closed
  with open('config.json', 'w') as config_file:
    json.dump(config, config_file)
  print(f'Successfully added {prediction_type} ratings to config!')


def create_playlists(playlist_links, playlist_names):
  '''Pair playlist links with names, skipping empty links.

  Args:
    playlist_links: sequence of Spotify URI strings; empty strings are ignored.
    playlist_names: sequence of names aligned with playlist_links.  An empty
      or missing name is replaced by 'playlist N', where N counts the
      non-empty links seen so far (starting at 1).

  Returns:
    (playlists, playlist_scores): an OrderedDict of name -> link, and
    range(len(playlists)).

  Raises:
    AssertionError: if a non-empty link does not start with 's'.
  '''
  names = list(playlist_names)
  count = 1
  playlists = collections.OrderedDict()
  # Use the arguments rather than the notebook globals
  for i, link in enumerate(playlist_links):
    if len(link) == 0:
      continue
    name = names[i] if i < len(names) and len(names[i]) > 0 else f'playlist {count}'
    count += 1
    assert link[0]=='s', (f'provided playlist: {link} is not a spotify URI link.  '
                          'Make sure to double check how to get playlist link in instructions')
    playlists[name] = link

  playlist_scores = range(len(playlists))
  print('Playlists Successfully Added!')
  return playlists, playlist_scores


def normalize(column, negative=False):
    '''Rescale column, either min-max or by the in-place "negative" rule.

    negative=False: returns (column - min) / (max - min) as a list.
    negative=True: column is treated as songs -> segments -> items and changed
    in place; items below zero are multiplied by -1, all others become
    (item + 0.00001) * 2.  The same column object is returned.
    '''
    if not negative:
        lower, upper = column.min(), column.max()
        scaled = (column - lower)/(upper-lower)
        return scaled.tolist()

    for song_no, song in enumerate(column):
        for seg_no, segment in enumerate(song):
            for item_no, value in enumerate(segment):
                column[song_no][seg_no][item_no] = -1 * value if value < 0 else (value+0.00001)  * 2
    return column


def audio_analysis(song_list, sp=None):
  '''Obtains audio analysis data'''
  global AUDIO_FEATURES_API_COUNTER

  total_songs = len(song_list)
  out, song_ids = [], []
  counter=0
  sp = create_connection(hide_output=True)
  for song in song_list:
    if counter >= 1000:
      try:
        sp = create_connection()
        counter = 0
      except:
        print('Failed to reset authorization to spotify')
        break
    else:
      counter += 1
    # Obtain Data
    try:
      out.append(sp.audio_analysis(song))
      song_ids.append(song)
      AUDIO_FEATURES_API_COUNTER += 1
      sp = check_api_swap(sp)
    except:
        print(f'Failed to get data for {song}')
        pass

  # Create empty lists
  categories = ["song_dur", "tempo", "tempo_confidence", "time_sig", "time_sig_confidence",
    "key", "key_confidence", "mode", "mode_confidence", "fade_in_dur", "fade_out_dur",
    "num_bars", "num_beats", "num_sections", "num_segments", "num_tatums",
    "bar_data", "beat_data", "tatum_data", 'sct_loud', 'sct_tempo', 'sct_key', 'sct_mode', 'sct_time_sig',
    'sgm_loud', 'sgm_pitch', 'sgm_timbre',]

  song_dur, tempo, tempo_confidence, time_sig, time_sig_confidence = [], [], [], [], []
  key, key_confidence, mode, mode_confidence, fade_in_dur, fade_out_dur = [], [], [], [], [], []
  num_bars, num_beats, num_sections, num_segments, num_tatums = [], [], [], [], []
  bar_data, beat_data, tatum_data = [], [], []
  # Section Data
  sct_loud, sct_tempo, sct_key, sct_mode, sct_time_sig = [], [], [], [], []
  # Segment Data
  sgm_loud, sgm_pitch, sgm_timbre = [], [], []
  extra_song_ids = []

  for song, song_id in zip(out, song_ids):
    try:
      extra_song_ids.append(song_id)
      # Song Traits
      duration = song['track']['duration']
      song_dur.append(song['track']['duration'])
      tempo.append(song['track']['tempo'])
      tempo_confidence.append(song['track']['tempo_confidence'])
      time_sig.append(song['track']['time_signature'] / 7)
      time_sig_confidence.append(song['track']['time_signature_confidence'])
      key.append(song['track']['key'])
      key_confidence.append(song['track']['key_confidence'])
      mode.append(song['track']['mode'])
      mode_confidence.append(song['track']['mode_confidence'])
      fade_in_dur.append(song['track']['end_of_fade_in'])
      fade_out_dur.append(duration - song['track']['start_of_fade_out'])
      num_bars.append(len(song['bars']))
      num_beats.append(len(song['beats']))
      num_sections.append(len(song['sections']))
      num_segments.append(len(song['segments']))
      num_tatums.append(len(song['tatums']))

      # Obtain bar Data
      song_bar_data = []
      for bar in song['bars']:
        temp_data = [bar['start'], bar['duration'], bar['confidence']]
        song_bar_data.append(temp_data)
      bar_data.append(song_bar_data)

      # Obtain beat data
      song_beat_data = []
      for beat in song['beats']:
        temp_data = [beat['start'], beat['duration'], beat['confidence']]
        song_beat_data.append(temp_data)
      beat_data.append(song_beat_data)

      # Section Data
      song_sct_loud, song_sct_tempo, song_sct_key, song_sct_mode, song_sct_time_sig = [], [], [], [], []
      for section in song['sections']:
        # Loudness
        temp_data = [section['start'], section['duration'], section['confidence'], section['start'] + section['duration'],  section['loudness']]
        song_sct_loud.append(temp_data)
        # Tempo
        temp_data = [section['start'], section['duration'], section['confidence'], section['start'] + section['duration'], section['tempo'], section['tempo_confidence']]
        song_sct_tempo.append(temp_data)
        # Key
        temp_data = [section['start'], section['duration'], section['confidence'], section['start'] + section['duration'], section['key'], section['key_confidence']]
        song_sct_key.append(temp_data)
        # Mode
        temp_data = [section['start'], section['duration'], section['confidence'], section['start'] + section['duration'], section['mode'], section['mode_confidence']]
        song_sct_mode.append(temp_data)
        # Time signature
        temp_data = [section['start'], section['duration'], section['confidence'], section['start'] + section['duration'], section['time_signature'], section['time_signature_confidence']]
        song_sct_time_sig.append(temp_data)
      sct_loud.append(song_sct_loud)
      sct_tempo.append(song_sct_tempo)
      sct_key.append(song_sct_key)
      sct_mode.append(song_sct_mode)
      sct_time_sig.append(song_sct_time_sig)

      # Segment data
      song_sgm_loud, song_sgm_pitch, song_sgm_timbre = [], [], []
      for segment in song['segments']:
        # Loudness
        temp_data = [segment['start'], segment['duration'], segment['confidence'], segment['start'] + segment['duration'], segment['loudness_start'], segment['loudness_max'], segment['loudness_max_time'], segment['loudness_end']]
        song_sgm_loud.append(temp_data)
        # Pitch
        temp_data = [segment['start'], segment['duration'], segment['confidence'], segment['start'] + segment['duration'], segment['pitches']]
        song_sgm_pitch.extend(temp_data)
        # Timbre
        temp_data = [segment['start'], segment['duration'], segment['confidence'], segment['start'] + segment['duration'], segment['timbre']]
        song_sgm_timbre.extend(temp_data)

      sgm_loud.append(song_sgm_loud)
      sgm_pitch.append(song_sgm_pitch)
      sgm_timbre.append(song_sgm_timbre)

      # Tatum data
      song_tatum_data = []
      for tatum in song['tatums']:
        temp_data = [tatum['start'], tatum['duration'], tatum['confidence']]
        song_tatum_data.append(temp_data)
      tatum_data.append(song_tatum_data)
    except:
      print(f'no data for song_id: {song_id}, filling with 0')
      variables = [song_dur, tempo, tempo_confidence, time_sig, time_sig_confidence,
                  key, key_confidence, mode, mode_confidence, fade_in_dur, fade_out_dur,
                  num_bars, num_beats, num_sections, num_segments, num_tatums,
                  bar_data, beat_data, tatum_data,
                  sct_loud, sct_tempo, sct_key, sct_mode, sct_time_sig,
                  sgm_loud, sgm_pitch, sgm_timbre]
      for variable in variables:
        variable.append(0)


  # Create DataFrame
  data = list(zip(song_dur, tempo, tempo_confidence, time_sig, time_sig_confidence,
    key, key_confidence, mode, mode_confidence, fade_in_dur, fade_out_dur,
    num_bars, num_beats, num_sections, num_segments, num_tatums,
    bar_data, beat_data, tatum_data,
    sct_loud, sct_tempo, sct_key, sct_mode, sct_time_sig,
    sgm_loud, sgm_pitch, sgm_timbre))
  df = pd.DataFrame(data=data, columns=categories, index=extra_song_ids)
  # Drop rows with bad index
  # Handle Bar, Beat and tatum Data, Beat
  for name, item in zip(['section', 'segment', 'bar', 'beat', 'tatum' ], ['sct_loud', 'sgm_loud','bar_data', 'beat_data', 'tatum_data']):
    data = list(df[item].values)
    s_dur_avg, s_dur_sdv, s_conf_avg, s_conf_sdv = [], [], [], []
    for song in data:
      dur, conf = [], []
      for component in song:
        dur.append(component[1])
        conf.append(component[2])
      s_dur_avg.append(np.mean(dur))
      s_dur_sdv.append(np.std(dur))
      s_conf_sdv.append(np.std(conf))
      s_conf_avg.append(np.mean(conf))
    df[f'{name}_dur_avg'] = s_dur_avg
    df[f'{name}_dur_sdv'] = s_dur_sdv
    df[f'{name}_conf_avg'] = s_conf_avg
    df[f'{name}_conf_sdv'] = s_conf_sdv

  # Drop Columns
  df.drop(columns=['bar_data', 'beat_data', 'tatum_data'], inplace=True)

  return df, sp


def process_component(data, max_length, col_id_skip=None, shift_param=0.00001):
  '''Flatten nested per-song component data into a fixed-width dataframe and
     apply a Box-Cox transform plus normalization to each column.
  Input:
    data (list): one entry per song; each song is a list of items and each
      item is a list of values
    max_length (int): number of columns per song; shorter rows are padded
      with 0 and longer rows are trimmed
    col_id_skip (list): positions inside an item whose columns must be left
      untransformed. Default None transforms every column
    shift_param (float): added to a column before Box-Cox
  Output:
    df (DataFrame): one row per song with columns comp_0 ... comp_{max_length-1}
    '''
  skip_positions = set(col_id_skip) if col_id_skip else set()
  # Column names (not one-element lists) so the membership test below matches.
  cols_skip = set()
  out = []
  for row in data:  # Loop through song
    row_data = []
    for item in row:  # loop through bar
      for i, subitem in enumerate(item):  # loop through elements in bar
        if i in skip_positions:
          # len(row_data) is the flattened column position within this song,
          # so it restarts at 0 for every row.
          cols_skip.add(f'comp_{len(row_data)}')
        row_data.append(subitem)
    # Pad or trim the row to exactly max_length values
    if len(row_data) < max_length:
      row_data.extend([0] * (max_length - len(row_data)))
    else:
      row_data = row_data[:max_length]
    out.append(row_data)

  column_names = [f'comp_{i}' for i in range(max_length)]
  df = pd.DataFrame(data=out, columns=column_names)
  # Apply boxcox normalization to each column
  for col in df.columns:
    if col in cols_skip:
      continue
    try:
      df[col] = normalize(stats.boxcox(df[col] + shift_param)[0])
    except Exception:
      # Best effort: the column is left untransformed when it cannot be processed.
      print('data values are negative, applying normalization first')
      print(col)

  return df



def z_score(data, avg, sdv):
  '''Standardize data by subtracting avg and dividing the result by sdv.'''
  centered = data - avg
  return centered / sdv


def minmax(data, min, max):
  '''Rescale data linearly so that min maps to 0 and max maps to 1.'''
  offset = data - min
  span = max - min
  return offset / span


def transform_to_td(feature_list, transform_feat_names, transform_idxs,
                    loud_idxs=None, loud_add=100, max_val=None,
                    scope=('mean', 'sdv')):
  '''Collect summary statistics over every row of every song for the chosen
     row positions.
  Input:
    feature_list (list): one entry per song; each song is a list of rows
    transform_feat_names (list): output name for each position in transform_idxs
    transform_idxs (list): row positions to summarise
    loud_idxs (list): positions that get loud_add added (and are divided by
      max_val when it is truthy). Default None shifts nothing
    loud_add (float): offset added to positions listed in loud_idxs
    max_val (float or list): divisor for shifted positions; a list is indexed
      with idx - 4
    scope (iterable): any of 'min', 'max', 'mean', 'sdv'
  Output:
    out_dict (dict): '<name>_<stat>' -> value. 'min' is stored scaled by 0.7
      and 'max' scaled by 1.3

  Example:
    feature_list = list(df_new_feats['sct_loud'])
    transform_idxs = [2, 4]
    loud_idxs = [4]
    transform_feat_names = ['sct_loud_conf', 'sct_loud']
    '''
  loud_idxs = () if loud_idxs is None else loud_idxs

  out_dict = {}
  for name, idx in zip(transform_feat_names, transform_idxs):
    shift = idx in loud_idxs
    # Plain local list; writes through locals() are not reliable inside functions.
    flat = []
    for song in feature_list:
      for feature in song:
        value = feature[idx]
        if shift:
          value = loud_add + value
          if max_val:
            divisor = max_val[idx - 4] if isinstance(max_val, list) else max_val
            value = value / divisor
        flat.append(value)
    # Add the requested statistics to the output dictionary
    if 'min' in scope:
      out_dict[f'{name}_min'] = min(flat) * 0.7
    if 'max' in scope:
      out_dict[f'{name}_max'] = max(flat) * 1.3
    if 'mean' in scope:
      out_dict[f'{name}_mean'] = np.mean(flat)
    if 'sdv' in scope:
      out_dict[f'{name}_sdv'] = np.std(flat)

  return out_dict


def create_td_data(feature_list, time_segments, feature_idxs, mins=None, maxs=None,
                   means=None, sdvs=None, modify_idxs=None, add_val=None,
                   divide_val=None, divide_list=None,
                   normalization_mode='minmax', run_normalization=True):
  '''Creates a time series transformation of the data.

  For every song and every threshold in time_segments, the first row whose
  (start, end] interval contains the threshold (start is row[0], end is
  row[3]) contributes one value per entry of feature_idxs; thresholds with
  no matching row contribute zeros.
  Input:
    feature_list (list): one entry per song; each song is a list of rows
    time_segments (list): thresholds to sample at. The scan start only moves
      forward, so this presumably expects ascending thresholds and rows
      sorted by start - TODO confirm
    feature_idxs (list): row positions to extract
    mins, maxs (list): min-max bounds, one per entry of feature_idxs
    means, sdvs: accepted but not used by this function
    modify_idxs (list): row positions that get add_val / divide_val /
      divide_list applied before scaling. Default None modifies nothing
    add_val (float): offset added to modified positions
    divide_val (float): divisor applied (together with add_val) to modified
      positions
    divide_list (list): per-component divisors, looked up with i - 1 where i
      is the position inside feature_idxs; takes precedence over divide_val
      and is always followed by min-max scaling
    normalization_mode (string): only 'minmax' produces values; any other
      mode leaves every time step filled with zeros
    run_normalization (bool): only consulted on the add_val + divide_val
      path; False skips the min-max scaling there
  Output:
    songs (list): songs x thresholds x len(feature_idxs) nested list
    '''
  # A missing modify_idxs means no position gets the add/divide treatment.
  modify_idxs = () if modify_idxs is None else modify_idxs
  n_components = len(feature_idxs)

  songs = []
  for song in feature_list:
    song_data = []
    counter = 0  # rows before this position already ended before the last threshold
    for threshold in time_segments:
      features = []
      for feature in song[counter:]:
        start, end = feature[0], feature[3]
        # Row covers this threshold: extract its components and stop scanning
        if start < threshold <= end:
          components = []
          if normalization_mode == 'minmax':
            for i, (idx, lo, hi) in enumerate(zip(feature_idxs, mins, maxs)):
              raw = feature[idx]
              if idx not in modify_idxs:
                value = minmax(raw, lo, hi)
              elif divide_list:
                value = minmax((raw + add_val) / divide_list[i - 1], lo, hi)
              elif add_val is not None and divide_val is not None:
                value = (raw + add_val) / divide_val
                if run_normalization:
                  value = minmax(value, lo, hi)
              elif add_val is not None:
                value = minmax(add_val + raw, lo, hi)
              else:
                value = minmax(raw, lo, hi)
              components.append(value)
          features.extend(components)
          break

        # Threshold is already past this row: skip it for later thresholds
        if threshold > start:
          counter += 1
      # no matches add 0
      if len(features) == 0:
        features.extend([0] * n_components)
      song_data.append(features)
    songs.append(song_data)

  return songs


def process_feat_list(sample):
  '''Parse a JSON-encoded string (for example a stringified list) into the
     corresponding Python object.'''
  parsed = json.loads(sample)
  return parsed

def feat_data(data):
  '''Ingest audio features data and create a dataframe with raw audio analysis
     features.
  Input:
    data (DataFrame): dataframe whose index values are passed to
      audio_analysis as the song list
  Output:
    df_new_feats (DataFrame): audio analysis features, re-indexed with the
      index of data
    '''
  song_list = data.index.values.tolist()
  result = audio_analysis(song_list, sp=sp)
  # NOTE(review): audio_analysis appears to return a (dataframe, client) pair
  # in this file; accept either shape - confirm against its definition.
  df_new_feats = result[0] if isinstance(result, tuple) else result
  # Use the argument's index rather than an unrelated module-level `df`.
  df_new_feats.index = data.index
  return df_new_feats


def transform_feature(df, feature_target):
  '''Apply transformations to time series feature data to prepare for modeling,
     run the models on the result and store their outputs on df.

  When feature_target == 'sct_data', the five section columns ('sct_loud',
  'sct_tempo', 'sct_key', 'sct_mode', 'sct_time_sig') are each passed through
  create_td_data on the grid 1..300 and concatenated on the last axis (the
  [:, :, 1:] slices drop the first component of every array except out_loud).
  The segment columns ('sgm_loud', 'sgm_pitch', 'sgm_timbre') are always
  processed, on the grid np.arange(0.1, 300.1, 0.1).

  Each array is fed to one entry of `models` (0: section, 1: sgm_loud,
  2: sgm_pitch, 3: sgm_timbre). If trim_model == True, a Model ending at
  layers[-3] is built and every output value becomes its own column;
  otherwise the first value of each prediction is stored in a '*_pred' column.

  Input:
    df (DataFrame): must contain the columns named above; modified in place
    feature_target (string): only the value 'sct_data' is checked
  Output:
    df (DataFrame): the same dataframe with the new columns added

  NOTE(review): z_score_dict, timbre_max_dict, models, trim_model and Model
  are neither parameters nor locals here, so they must exist at module level
  when this runs - confirm.
  NOTE(review): when feature_target != 'sct_data' the out_* section arrays are
  never assigned, yet the concatenate below still reads them - confirm intent.
  NOTE(review): create_td_data as defined in this file takes (..., mins, maxs,
  means, sdvs, modify_idxs, add_val, divide_val, ...) and has no `run_z`
  parameter; the positional and keyword arguments used below look written for
  a different signature - verify before relying on this function.
  '''
  # Transformation code
  # Section

  if feature_target=='sct_data':
    # sct_loud
    feature='sct_loud'
    feature_list = list(df[feature])
    sec_cols = list(range(1,301))
    feature_idxs = [2, 4]
    means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean']]
    sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv']]
    modify_idxs=[4]
    add_val=100
    divide_val=None
    out_loud = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                          modify_idxs, add_val, divide_val)
    out_loud = np.array(out_loud)

    # sct_tempo
    feature ='sct_tempo'
    feature_list = list(df[feature])
    sec_cols = list(range(1,301))
    feature_idxs = [2, 4, 5]
    means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean'], z_score_dict[f'{feature}_conf_mean']]
    sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv'], z_score_dict[f'{feature}_conf_sdv']]
    modify_idxs=[]
    add_val=None
    divide_val=None
    out_tempo = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                          modify_idxs, add_val, divide_val)
    out_tempo = np.array(out_tempo)

    # sct_key
    feature='sct_key'
    feature_list = list(df[feature])
    sec_cols = list(range(1,301))
    feature_idxs = [2, 4, 5]
    means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean'], z_score_dict[f'{feature}_conf_mean']]
    sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv'], z_score_dict[f'{feature}_conf_sdv']]
    modify_idxs=[4]
    add_val=1
    divide_val=12
    out_key = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                          modify_idxs, add_val, divide_val, run_z=True)
    out_key = np.array(out_key)

    # sct_mode
    feature='sct_mode'
    feature_list = list(df[feature])
    sec_cols = list(range(1,301))
    feature_idxs = [2, 4, 5]
    means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean'], z_score_dict[f'{feature}_conf_mean']]
    sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv'], z_score_dict[f'{feature}_conf_sdv']]
    modify_idxs=[4]
    add_val=0
    divide_val=1
    out_mode = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                          modify_idxs, add_val, divide_val)
    out_mode = np.array(out_mode)

    # sct_time_sig
    feature='sct_time_sig'
    feature_list = list(df[feature])
    sec_cols = list(range(1,301))
    feature_idxs = [2, 4, 5]
    means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean'], z_score_dict[f'{feature}_conf_mean']]
    sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv'], z_score_dict[f'{feature}_conf_sdv']]
    modify_idxs=[4]
    add_val=0
    divide_val=7
    out_time_sig = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                          modify_idxs, add_val, divide_val, run_z=True)
    out_time_sig = np.array(out_time_sig)

  # Concat section features
  # out_loud is kept whole; the other arrays lose their first component so it
  # is not repeated in the concatenated input.
  out_section = np.concatenate([out_loud, out_tempo[:, :, 1:], out_key[:, :, 1:], out_mode[:, :, 1:], out_time_sig[:, :, 1:]], -1)
  # Make Predictions
  if trim_model==True:
    # Trim Model: stop at the third-from-last layer and expose its outputs
    model_pred = Model(inputs=models[0].input, outputs=models[0].layers[-3].output)
    # Make Predictions
    pred = model_pred.predict(out_section).tolist()
    length = len(pred[0])
    # One dataframe column per output value of the trimmed model
    for i in range(length):
      df[f'sct_feature_{i}'] = [item[i] for item in pred]
  else:
    pred = models[0].predict(out_section).tolist()
    df['sct_pred'] = [item[0] for item in pred]



  # sgm loud
  feature = 'sgm_loud'
  feature_list = list(df[feature])
  mcr_sec_cols = list(np.arange(0.1,300.1, 0.1))
  feature_idxs = [2, 4, 5, 6, 7]
  means = [z_score_dict[f'sgm_conf_mean'], z_score_dict['sgm_loud_start_mean'], z_score_dict['sgm_loud_max_mean'],
          z_score_dict['sgm_loud_max_time_mean'], z_score_dict['sgm_loud_end_mean']]
  sddevs = [z_score_dict[f'sgm_conf_sdv'], z_score_dict['sgm_loud_start_sdv'], z_score_dict['sgm_loud_max_sdv'],
            z_score_dict['sgm_loud_max_time_sdv'], z_score_dict['sgm_loud_end_sdv']]
  modify_idxs=[4, 5, 7]
  add_val=100
  divide_val=1
  out_sgm_loud = create_td_data(feature_list, mcr_sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val, run_z=True)
  out_sgm_loud = np.array(out_sgm_loud)
  # Make Predictions
  if trim_model==True:
    # Trim Model
    model_pred = Model(inputs=models[1].input, outputs=models[1].layers[-3].output)
    # Make Predictions
    pred = model_pred.predict(out_sgm_loud).tolist()
    length = len(pred[0])
    for i in range(length):
      df[f'sgm_loud_feature_{i}'] = [item[i] for item in pred]
  else:
    pred = models[1].predict(out_sgm_loud).tolist()
    df['sgm_loud_pred'] = [item[0] for item in pred]


  # sgm_pitch
  # NOTE(review): the statistics below are read from 'sgm_timbre_*' keys even
  # though this section handles pitch - confirm this is intended.
  feature = 'sgm_pitch'
  feature_list = list(df[feature])
  mcr_sec_cols = list(np.arange(0.1,300.1, 0.1))
  feature_idxs = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
  means = [z_score_dict['sgm_conf_mean']] + [z_score_dict[f'sgm_timbre_{i}_mean'] for i in range(1,13)]
  sddevs = [z_score_dict['sgm_conf_sdv']] + [z_score_dict[f'sgm_timbre_{i}_sdv'] for i in range(1,13)]
  modify_idxs=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
  add_val=0.000001
  divide_val=1
  out_sgm_pitch = create_td_data(feature_list, mcr_sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val)
  out_sgm_pitch = np.array(out_sgm_pitch)
  # Make Predictions
  if trim_model==True:
    # Trim Model
    model_pred = Model(inputs=models[2].input, outputs=models[2].layers[-3].output)
    # Make Predictions
    pred = model_pred.predict(out_sgm_pitch).tolist()
    length = len(pred[0])
    for i in range(length):
      df[f'sgm_pitch_{i}'] = [item[i] for item in pred]
  else:
    pred = models[2].predict(out_sgm_pitch).tolist()
    df['sgm_pitch_pred'] = [item[0] for item in pred]


  # sgm_timbre
  feature = 'sgm_timbre'
  feature_list = list(df[feature])
  mcr_sec_cols = list(np.arange(0.1,300.1, 0.1))
  feature_idxs = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
  means = [z_score_dict['sgm_conf_mean']] + [z_score_dict[f'sgm_timbre_{i}_mean'] for i in range(1,13)]
  sddevs = [z_score_dict['sgm_conf_sdv']] + [z_score_dict[f'sgm_timbre_{i}_sdv'] for i in range(1,13)]
  modify_idxs=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
  add_val=1000
  divide_val=None
  # Per-component divisors taken from the stored timbre maxima
  divide_list = [timbre_max_dict[f'sgm_timbre_{i}_max'] for i in range(1,13)]
  out_sgm_timbre = create_td_data(feature_list, mcr_sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val, divide_list)
  out_sgm_timbre = np.array(out_sgm_timbre)
  # Make Predictions
  if trim_model==True:
    # Trim Model
    model_pred = Model(inputs=models[3].input, outputs=models[3].layers[-3].output)
    # Make Predictions
    pred = model_pred.predict(out_sgm_timbre).tolist()
    length = len(pred[0])
    for i in range(length):
      df[f'sgm_timbre_{i}'] = [item[i] for item in pred]
  else:
    pred = models[3].predict(out_sgm_timbre).tolist()
    df['sgm_timbre_pred'] = [item[0] for item in pred]


  return df


def transform_td_features(df, z_score_dict, timbre_max_dict, models, trim_model=False):
  '''Apply transformations to time series feature data to prepare for modeling,
     run the models on the result and store their outputs on df.

  The five section columns ('sct_loud', 'sct_tempo', 'sct_key', 'sct_mode',
  'sct_time_sig') are each passed through create_td_data on the grid 1..300
  and concatenated on the last axis (the [:, :, 1:] slices drop the first
  component of every array except out_loud). The segment columns ('sgm_loud',
  'sgm_pitch', 'sgm_timbre') are processed on the grid
  np.arange(0.1, 300.1, 0.1).

  Input:
    df (DataFrame): must contain the columns named above; modified in place
    z_score_dict (dict): statistics looked up by '<feature>_mean' /
      '<feature>_sdv' style keys
    timbre_max_dict (dict): provides 'sgm_timbre_<i>_max' for i in 1..12
    models (list): indexed 0: section, 1: sgm_loud, 2: sgm_pitch, 3: sgm_timbre
    trim_model (bool): if True, a Model ending at layers[-3] is built for each
      entry and every output value becomes its own column; otherwise the first
      value of each prediction is stored in a '*_pred' column
  Output:
    df (DataFrame): the same dataframe with the new columns added

  NOTE(review): Model is not defined in this function and must exist at
  module level - confirm.
  NOTE(review): create_td_data as defined in this file takes (..., mins, maxs,
  means, sdvs, modify_idxs, add_val, divide_val, ...) and has no `run_z`
  parameter; the positional and keyword arguments used below look written for
  a different signature - verify before relying on this function.
  '''
  # Transformation code
  # Section
  ## sct_loud
  feature='sct_loud'
  feature_list = list(df[feature])
  sec_cols = list(range(1,301))
  feature_idxs = [2, 4]
  means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean']]
  sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv']]
  modify_idxs=[4]
  add_val=100
  divide_val=None
  out_loud = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val)
  out_loud = np.array(out_loud)


  ## sct_tempo
  feature='sct_tempo'
  feature_list = list(df[feature])
  sec_cols = list(range(1,301))
  feature_idxs = [2, 4, 5]
  means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean'], z_score_dict[f'{feature}_conf_mean']]
  sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv'], z_score_dict[f'{feature}_conf_sdv']]
  modify_idxs=[]
  add_val=None
  divide_val=None
  out_tempo = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val)
  out_tempo = np.array(out_tempo)

  # sct_key
  feature='sct_key'
  feature_list = list(df[feature])
  sec_cols = list(range(1,301))
  feature_idxs = [2, 4, 5]
  means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean'], z_score_dict[f'{feature}_conf_mean']]
  sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv'], z_score_dict[f'{feature}_conf_sdv']]
  modify_idxs=[4]
  add_val=1
  divide_val=12
  out_key = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val, run_z=True)
  out_key = np.array(out_key)

  # sct_mode
  feature='sct_mode'
  feature_list = list(df[feature])
  sec_cols = list(range(1,301))
  feature_idxs = [2, 4, 5]
  means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean'], z_score_dict[f'{feature}_conf_mean']]
  sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv'], z_score_dict[f'{feature}_conf_sdv']]
  modify_idxs=[4]
  add_val=0
  divide_val=1
  out_mode = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val)
  out_mode = np.array(out_mode)

  # sct_time_sig
  feature='sct_time_sig'
  feature_list = list(df[feature])
  sec_cols = list(range(1,301))
  feature_idxs = [2, 4, 5]
  means = [z_score_dict['sct_conf_mean'], z_score_dict[f'{feature}_mean'], z_score_dict[f'{feature}_conf_mean']]
  sddevs = [z_score_dict['sct_conf_sdv'], z_score_dict[f'{feature}_sdv'], z_score_dict[f'{feature}_conf_sdv']]
  modify_idxs=[4]
  add_val=0
  divide_val=7
  out_time_sig = create_td_data(feature_list, sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val, run_z=True)
  out_time_sig = np.array(out_time_sig)

  # Concat section features
  # out_loud is kept whole; the other arrays lose their first component so it
  # is not repeated in the concatenated input.
  out_section = np.concatenate([out_loud, out_tempo[:, :, 1:], out_key[:, :, 1:], out_mode[:, :, 1:], out_time_sig[:, :, 1:]], -1)
  # Make Predictions
  if trim_model==True:
    # Trim Model: stop at the third-from-last layer and expose its outputs
    model_pred = Model(inputs=models[0].input, outputs=models[0].layers[-3].output)
    # Make Predictions
    pred = model_pred.predict(out_section).tolist()
    length = len(pred[0])
    # One dataframe column per output value of the trimmed model
    for i in range(length):
      df[f'sct_feature_{i}'] = [item[i] for item in pred]
  else:
    pred = models[0].predict(out_section).tolist()
    df['sct_pred'] = [item[0] for item in pred]



  # sgm loud
  feature = 'sgm_loud'
  feature_list = list(df[feature])
  mcr_sec_cols = list(np.arange(0.1,300.1, 0.1))
  feature_idxs = [2, 4, 5, 6, 7]
  means = [z_score_dict[f'sgm_conf_mean'], z_score_dict['sgm_loud_start_mean'], z_score_dict['sgm_loud_max_mean'],
          z_score_dict['sgm_loud_max_time_mean'], z_score_dict['sgm_loud_end_mean']]
  sddevs = [z_score_dict[f'sgm_conf_sdv'], z_score_dict['sgm_loud_start_sdv'], z_score_dict['sgm_loud_max_sdv'],
            z_score_dict['sgm_loud_max_time_sdv'], z_score_dict['sgm_loud_end_sdv']]
  modify_idxs=[4, 5, 7]
  add_val=100
  divide_val=1
  out_sgm_loud = create_td_data(feature_list, mcr_sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val, run_z=True)
  out_sgm_loud = np.array(out_sgm_loud)
  # Make Predictions
  if trim_model==True:
    # Trim Model
    model_pred = Model(inputs=models[1].input, outputs=models[1].layers[-3].output)
    # Make Predictions
    pred = model_pred.predict(out_sgm_loud).tolist()
    length = len(pred[0])
    for i in range(length):
      df[f'sgm_loud_feature_{i}'] = [item[i] for item in pred]
  else:
    pred = models[1].predict(out_sgm_loud).tolist()
    df['sgm_loud_pred'] = [item[0] for item in pred]


  # sgm_pitch
  # NOTE(review): the statistics below are read from 'sgm_timbre_*' keys even
  # though this section handles pitch - confirm this is intended.
  feature = 'sgm_pitch'
  feature_list = list(df[feature])
  mcr_sec_cols = list(np.arange(0.1,300.1, 0.1))
  feature_idxs = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
  means = [z_score_dict['sgm_conf_mean']] + [z_score_dict[f'sgm_timbre_{i}_mean'] for i in range(1,13)]
  sddevs = [z_score_dict['sgm_conf_sdv']] + [z_score_dict[f'sgm_timbre_{i}_sdv'] for i in range(1,13)]
  modify_idxs=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
  add_val=0.000001
  divide_val=1
  out_sgm_pitch = create_td_data(feature_list, mcr_sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val)
  out_sgm_pitch = np.array(out_sgm_pitch)
  # Make Predictions
  if trim_model==True:
    # Trim Model
    model_pred = Model(inputs=models[2].input, outputs=models[2].layers[-3].output)
    # Make Predictions
    pred = model_pred.predict(out_sgm_pitch).tolist()
    length = len(pred[0])
    for i in range(length):
      df[f'sgm_pitch_{i}'] = [item[i] for item in pred]
  else:
    pred = models[2].predict(out_sgm_pitch).tolist()
    df['sgm_pitch_pred'] = [item[0] for item in pred]


  # sgm_timbre
  feature = 'sgm_timbre'
  feature_list = list(df[feature])
  mcr_sec_cols = list(np.arange(0.1,300.1, 0.1))
  feature_idxs = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
  means = [z_score_dict['sgm_conf_mean']] + [z_score_dict[f'sgm_timbre_{i}_mean'] for i in range(1,13)]
  sddevs = [z_score_dict['sgm_conf_sdv']] + [z_score_dict[f'sgm_timbre_{i}_sdv'] for i in range(1,13)]
  modify_idxs=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
  add_val=1000
  divide_val=None
  # Per-component divisors taken from the stored timbre maxima
  divide_list = [timbre_max_dict[f'sgm_timbre_{i}_max'] for i in range(1,13)]
  out_sgm_timbre = create_td_data(feature_list, mcr_sec_cols, feature_idxs, means, sddevs,
                        modify_idxs, add_val, divide_val, divide_list)
  out_sgm_timbre = np.array(out_sgm_timbre)
  # Make Predictions
  if trim_model==True:
    # Trim Model
    model_pred = Model(inputs=models[3].input, outputs=models[3].layers[-3].output)
    # Make Predictions
    pred = model_pred.predict(out_sgm_timbre).tolist()
    length = len(pred[0])
    for i in range(length):
      df[f'sgm_timbre_{i}'] = [item[i] for item in pred]
  else:
    pred = models[3].predict(out_sgm_timbre).tolist()
    df['sgm_timbre_pred'] = [item[0] for item in pred]


  return df


########################

def chunks(lst, n):
  """Yield consecutive slices of lst, each holding at most n items.
  Inputs:
    lst (list): sequence to be split
    n (int): size of each chunk; the final chunk may be shorter
  Output:
    generator yielding the slices of lst in order
    """
  yield from (lst[start:start + n] for start in range(0, len(lst), n))


def normalize_data(data, use_max=None, use_min=None, sav_max=False):
    ''' Min-max scales data with (data - data_min) / (data_max - data_min).
        Bounds are taken from the arguments when use_max is given, otherwise
        derived from the data with a margin (max * 1.3 and min * 0.7).
    Input:
      data (list or array-like): data points
      use_max (float): if specified (not None), used as the upper bound
      use_min (float): lower bound; only consulted when use_max is given.
        When omitted, the data-derived lower bound (min * 0.7) is used
      sav_max (bool): if True, also return the bounds that were used
    Output:
      norm_data: scaled data
      data_max, data_min (only when sav_max is True): the bounds used
      '''
    # Plain sequences do not support arithmetic with scalar bounds.
    if isinstance(data, (list, tuple)):
      data = np.asarray(data)

    if use_max is not None:
      data_max = use_max
      data_min = use_min if use_min is not None else np.min(data) * 0.7
    else:
      data_max = np.max(data) * 1.3
      data_min = np.min(data) * 0.7

    norm_data = (data - data_min) / (data_max - data_min)
    if sav_max:
      return norm_data, data_max, data_min
    return norm_data


def extract_audio_features(df):
  '''Takes in a dataframe generated from extract_artist_info() and adds additional
     columns for audio features from spotify's api for each song in the dataframe
  Input:
    df (DataFrame): Dataframe generated from extract_artist_info; its song_id
      column is used for the lookup
  Output:
    df (DataFrame): input dataframe merged (on index) with the audio feature
      columns. Songs whose features could not be read get NaN in every
      feature column'''

  # Key in the API result -> column name in the output, in output order.
  feature_keys = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                  'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
                  'time_signature', 'valence']
  col_nams = ['acoust', 'dance', 'energy', 'instrument', 'key', 'live', 'loud',
              'mode', 'speech', 'tempo', 'time_sig', 'valence']

  song_ids = list(df.song_id.values)
  # Look the songs up in chunks of 50 ids
  song_list = []
  for sample in chunks(song_ids, 50):
    song_list.extend(sp.audio_features(sample))

  # One row of feature values per song
  array_list = []
  for song in song_list:
    try:
      row_list = [song[key] for key in feature_keys]
    except (TypeError, KeyError):
      # Entry is missing (not subscriptable) or lacks a feature: fill with NaN's
      row_list = [np.nan] * len(feature_keys)
    array_list.append(row_list)

  # Create audio feature dataframe and merge with input dataframe.
  # The merge is on index, so rows line up only where df's index labels match
  # the 0..n-1 positions of the lookup results.
  df_songs = pd.DataFrame(data=array_list, columns=col_nams)
  df = pd.merge(df, df_songs, left_index=True, right_index=True)
  return df


def extract_artist_info(df):
  '''Takes a dataframe generated from extract_song_contents() and looks up all
     of the artist ids in the dataframe. The results are kept in a table
     indexed by artist id and used to fill two new columns.
  Input:
    df (DataFrame): Dataframe generated from extract_song_contents; its
      artist_id column holds a list of artist ids per track
  Output:
    df (DataFrame): the input dataframe with 'artist_pop' (popularity of the
      first listed artist) and 'genres' (genres of all listed artists,
      concatenated) added'''

  # First flatten list of artist id's in df
  artists = list(itertools.chain.from_iterable(df.artist_id.values))
  # Look artists up in chunks of 50 ids
  artist_list = []
  for sample in chunks(artists, 50):
    artist_list.extend(sp.artists(sample)['artists'])

  # create a dataframe of all artists with their id, name, popularity, and genres
  artist_rows = [[artist['id'], artist['name'], artist['genres'], artist['popularity']]
                 for artist in artist_list]
  df_artists = pd.DataFrame(data=artist_rows,
                            columns=['artist_id', 'artist_name', 'artist_genres',
                                     'artist_pop']).drop_duplicates(subset='artist_id')
  df_artists.set_index('artist_id', inplace=True)

  # Loop through df and find artist info in df_artists.
  # Label-based .at lookups avoid positional [0] access on an id-indexed Series.
  artist_genre, artist_pop = [], []
  for track_artist_ids in df.artist_id:
    artist_pop.append(df_artists.at[track_artist_ids[0], 'artist_pop'])
    track_genres = []
    for artist_id in track_artist_ids:
      track_genres.extend(df_artists.at[artist_id, 'artist_genres'])
    artist_genre.append(track_genres)
  # add artist genres and popularity to df
  df['artist_pop'] = artist_pop
  df['genres'] = artist_genre

  return df


def extract_song_contents(raw_song_data, ratings=None, weights=None):
  '''Builds a song dataframe from raw track objects and enriches it with
     artist info and audio features.
  Input:
    raw_song_data (list): track dictionaries as returned by the spotify api
    ratings (list): optional rating per track, aligned with raw_song_data
    weights (list): weight per track; read whenever ratings is given
  Output:
    data (DataFrame): one row per readable track, passed through
      extract_artist_info() and extract_audio_features(), with rows
      containing null values dropped'''
  has_ratings = ratings is not None
  col_nams = ['song_id', 'song_nam', 'dur', 'explicit', 'pop', 'album_nam',
              'album_type', 'release', 'artist_id', 'artist_nam']
  if has_ratings:
    col_nams += ['rating', 'weight']

  data_lst = []
  # Iterate through each item in the list and collect the song contents
  for i, track in enumerate(raw_song_data):
    try:
      album = track['album']
      song_info = [
          track['id'],
          track['name'],
          track["duration_ms"],
          int(track['explicit']),
          track['popularity'],
          album['name'],
          album['album_type'],
          album['release_date'],
          [artist['id'] for artist in track['artists']],
          [artist['name'] for artist in track['artists']],
      ]
      if has_ratings:
        song_info.extend([ratings[i], weights[i]])
    except (TypeError, KeyError, IndexError, ValueError):
      # Best effort: a missing or malformed track entry is skipped
      continue
    data_lst.append(song_info)

  # Convert array to dataframe
  data = pd.DataFrame(data=data_lst, columns=col_nams)
  # Get artist info
  data = extract_artist_info(data)
  # Get Song Audio features
  data = extract_audio_features(data)
  # Drop rows with null values
  data.dropna(inplace=True)

  return data


def create_song_df(data):
  '''Looks up every song in data through the spotify api and builds the song
     dataframe with extract_song_contents().
  Input:
    data (DataFrame): must provide 'song_id', 'rating' and 'weight' columns
  Output:
    song_info (DataFrame): result of extract_song_contents() for the fetched
      tracks. The global BASIC_DATASET_API_COUNTER is increased by the number
      of track entries returned'''
  global BASIC_DATASET_API_COUNTER
  song_ids = data['song_id'].tolist()
  raw_song_data = []
  # Request tracks in batches of 50 ids
  for id_batch in chunks(song_ids, 50):
    tracks = sp.tracks(id_batch)['tracks']
    BASIC_DATASET_API_COUNTER += len(tracks)
    raw_song_data.extend(tracks)

  # extract key song details from raw_song_data
  return extract_song_contents(raw_song_data,
                               ratings=data['rating'].tolist(),
                               weights=data['weight'].tolist())


def rand_sel(source_list, length=None):
  '''Draws a random selection of items from a source list. The number of
     items can be given or is itself chosen at random. Spotify's recommend
     function accepts at most 5 seeds, so a length above 5 will break calls
     that use the result as seeds.
  Inputs:
    source_list (list): items to draw from
    length (int): how many items to draw; when None, a random value below
      len(source_list) is used
  Outputs:
    output (list): the drawn items (np.random.choice samples with
    replacement, so items may repeat)'''
  if length is None:
    length = np.random.randint(len(source_list))
  picks = np.random.choice(source_list, (1, length))
  return list(picks[0])


def get_date_float(input_str):
  '''Extract the release year and month from a release date string.
     Dates are accepted at day ('YYYY-MM-DD'), month ('YYYY-MM') or year
     ('YYYY') precision.  If the string can not be interpreted, a fallback
     year and month are returned instead of raising.
  Input:
    input_str (string): release date string
  Output:
    year (int): year specified in input_str (2021 when unparseable)
    month (int): month specified in input_str (1 when the string carries no
    month or is unparseable)
    '''
  # Try the most specific format first; a month-precision date would
  # otherwise fall through to the fallback values
  for date_format in ('%Y-%m-%d', '%Y-%m', '%Y'):
    try:
      parsed = datetime.strptime(input_str, date_format)
    except (ValueError, TypeError):
      # ValueError: format mismatch, TypeError: non-string input
      continue
    # strptime defaults the month to 1 when the format has no month field
    return parsed.year, parsed.month
  return 2021, 1


def man_ord_encode(item):
  '''Map an album type name onto a fixed value between 0 and 1.
     Matching is case-insensitive; unrecognized names map to 0.
  Input:
    item (string): album type name
  Output:
    out (float): 0.33 for single, 0.66 for album, 0.99 for compilation,
    otherwise 0
    '''
  album_type_codes = {'single': 0.33, 'album': 0.66, 'compilation': 0.99}
  return album_type_codes.get(item.lower(), 0)


def wrangle(df, nlp=nlp_model):
  '''Preprocess a song dataframe for modelling.  Works on a copy of df:
     derives release_year from the release date, replaces each genre list
     with the nlp vector of the space-joined genres, scales selected columns
     with fixed min/max bounds, indexes by song_id, drops string columns and
     encodes album_type as a number.
  Inputs:
    df (DataFrame): song data with release, genres, album_type, song_id and
    the columns listed in norm_bounds
    nlp: callable returning an object with a .vector attribute
  Outputs:
    df (DataFrame): preprocessed copy indexed by song_id
      '''
  # column -> (min, max) bounds handed to normalize_data
  norm_bounds = {
      'artist_pop': (0, 100),
      'dur': (0, 4753466*1.3),
      'pop': (0, 91*1.3),
      'loud': (-60.0*1.3, 3.346*1.3),
      'time_sig': (0, 5*1.3),
      'key': (0, 11*1.3),
      'tempo': (0.0, 247.936*1.3),
      'release_year': (1924*0.7, 2040*1.3),
  }
  string_cols = ['release', 'song_nam', 'album_nam', 'artist_id', 'artist_nam']

  df = df.copy()

  # Format date column
  # NOTE(review): release_year is year + month (e.g. 2020 + 5 = 2025), not a
  # fractional year - confirm before changing, the bounds above may rely on it
  release_parts = df['release'].apply(get_date_float)
  years, months = map(list, zip(*release_parts))
  df['release_year'] = [yr + mth for yr, mth in zip(years, months)]

  # Apply spacy nlp model to get vectorizations
  df['genres'] = [nlp(' '.join(genre_list)).vector for genre_list in df['genres']]

  # normalize columns using the fixed bounds
  for col, (lower, upper) in norm_bounds.items():
    df[col] = normalize_data(df[col].values, use_max=upper, use_min=lower)

  # Drop unneeded string columns
  df = df.set_index('song_id')
  df = df.drop(columns=string_cols)
  df['album_type'] = df['album_type'].apply(man_ord_encode)

  return df


def get_genre_options(df):
  '''Generates genre seed options based on genres present in good playlist.
     Spotify has select amount of available genres used as seeds to generate
     recommendations; only genres in that list are returned.
     NOTE(review): the df argument is never read - rows are taken from the
     global song_traits where rating equals playlist_scores[-1].  Confirm
     whether df was meant to be used instead.
  Inputs:
    df (DataFrame): dataframe generated from create_song_df() (unused)
  Outputs:
    avail_genre_seeds (list): list of genre seeds
    '''
  top_rated = song_traits[song_traits['rating'] == playlist_scores[-1]]
  liked_genres = {genre for genre_list in top_rated['genres']
                  for genre in genre_list}
  seed_genres = sp.recommendation_genre_seeds()['genres']
  return list(set(seed_genres) & liked_genres)


def rec_batch_size(n_rec):
  '''Splits the number of desired recommendations into per-call limits.
     Spotify API has a limit of 100 recommendations per recommendation API
     call.
  Input:
    n_rec (int): number of desired recommendations
  Output:
    out (list): list of limits, each at most 100, that sum to n_rec
  Example:
    n_rec = 230, output = [100, 100, 30]
    '''
  if n_rec <= 100:
    return [n_rec]
  # every call but the last asks for the full 100
  full_calls = math.ceil(n_rec / 100) - 1
  return [100] * full_calls + [n_rec - full_calls * 100]


def get_rec_filter_values(song_traits):
  '''Builds min_/max_/target_ filter values for the rows of song_traits
     whose rating equals len(playlist_scores) - 1 (playlist_scores is a
     global).
  Inputs:
    song_traits (DataFrame): song data with a rating column and the trait
    columns listed below
  Outputs:
    dict_rec_v (dict): min_<name>, max_<name> and target_<name> (the mean)
    for every trait
    '''
  # dataframe column -> name used in the output keys
  col_to_param = {
      'acoust': 'acousticness', 'dance': 'danceability', 'dur': 'duration_ms',
      'energy': 'energy', 'instrument': 'instrumentalness', 'live': 'liveness',
      'loud': 'loudness', 'pop': 'popularity', 'speech': 'speechiness',
      'tempo': 'tempo', 'valence': 'valence',
  }
  top_rated = song_traits[song_traits.rating == len(playlist_scores) - 1]

  dict_rec_v = {}
  for col, param in col_to_param.items():
    values = top_rated[col]
    bounds = [min(values), max(values), np.mean(values)]
    # Spotify api requires some values to be integer
    if col in ('dur', 'pop'):
      bounds = [int(bound) for bound in bounds]
    for prefix, bound in zip(('min', 'max', 'target'), bounds):
      dict_rec_v[f'{prefix}_{param}'] = bound

  return dict_rec_v


def vis_preds(data, data_vec, y_data, models, fig_title, pred_songs=False):
  '''Plots a histogram of model predictions.
     With pred_songs=True only the predictions are plotted (and the global
     df_test loses its rating column, if present).  Otherwise predictions are
     plotted coloured by the actual rating in y_data; column names come from
     the global df without rating and genres.
  Inputs:
    data, data_vec: the two model inputs, passed as x=(data, data_vec)
    y_data: actual ratings (only used when pred_songs is False)
    models (list): one model, or several whose predictions are averaged
    fig_title (string): figure title
    pred_songs (bool): selects the plot type
  Outputs:
    result of fig.show()
    '''
  if pred_songs and 'rating' in df_test.columns:
    df_test.drop(columns='rating', inplace=True)

  # several models -> average them, a single model -> use it directly
  if len(models) > 1:
    predictions = get_avg_predict(models, data, data_vec)
  else:
    predictions = models[0].predict(x=(data, data_vec))

  if pred_songs:
    fig = px.histogram(predictions, nbins=200, title=fig_title)
    return fig.show()

  cols = list(df.columns)
  for dropped_col in ('rating', 'genres'):
    cols.remove(dropped_col)
  ac_test = pd.DataFrame(data=data, columns=cols)
  ac_test['rating'] = y_data
  ac_test['pred_rating'] = predictions
  ac_test.sort_values('rating', ascending=True, inplace=True)
  fig = px.histogram(ac_test, x='pred_rating', color='rating', opacity=0.75,
                     barmode='overlay', nbins=200, title=fig_title)
  return fig.show()


def vis_preds_newf(data, fig_title):
  '''Plots a 200-bin histogram of the rating column of data.
  Inputs:
    data: table with a rating column
    fig_title (string): figure title
  Outputs:
    result of the figure's show()
    '''
  histogram = px.histogram(data, x='rating', nbins=200, opacity=0.75,
                           barmode='overlay', title=fig_title)
  return histogram.show()


def get_avg_predict(models, data, data_vec):
  '''Return average predicted rating from models.
  Inputs:
    models (list): models exposing predict(x=...); must not be empty
    data, data_vec: the two model inputs, passed as x=(data, data_vec)
  Outputs:
    out_pred: sum of every model's prediction divided by the model count
  Raises:
    ValueError: if models is empty
    '''
  if len(models) == 0:
    raise ValueError('models must contain at least one model')
  out_pred = None
  for model in models:
    pred = model.predict(x=(data, data_vec))
    # add out-of-place so an array handed back by a model is never modified
    out_pred = pred if out_pred is None else out_pred + pred
  return out_pred / len(models)


def get_avg_predict_newf(models, data):
  '''Return average predicted rating from models.
  Inputs:
    models (list): models exposing predict(x=...); must not be empty
    data: model input, passed as x=data
  Outputs:
    out_pred: sum of every model's prediction divided by the model count
  Raises:
    ValueError: if models is empty
    '''
  if len(models) == 0:
    raise ValueError('models must contain at least one model')
  out_pred = None
  for model in models:
    pred = model.predict(x=data)
    # add out-of-place so an array handed back by a model is never modified
    out_pred = pred if out_pred is None else out_pred + pred
  return out_pred / len(models)


def load_model(file_path):
  '''Load tensorflow model from a zip archive (e.g. stored on Gdrive).
     The archive is extracted into the current working directory and the
     model is then loaded from 'content/saved_model/saved_model'.
  Inputs:
    file_path (string): path to the zipped model
  Outputs:
    model: model returned by tf.keras.models.load_model
    '''
  # the context manager closes the archive even if extraction fails
  with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall()
  # Load Model
  model = tf.keras.models.load_model('content/saved_model/saved_model')
  return model


def data_splitter_for_model(data, col_idx=5):
  '''Splits data into two float32 numpy arrays with the song genre
     vectorization column split into the second array.
  Inputs:
    data (DataFrame or ndarray): rows of features; column col_idx holds one
    vector per row
    col_idx (int): position of the vectorization column
  Outputs:
    data (ndarray): float32 array of every column except col_idx
    data_vec (ndarray): float32 array built from the col_idx entries
    '''
  # Convert anything that is not already an ndarray (e.g. a dataframe)
  if not isinstance(data, np.ndarray):
    data = data.to_numpy()
  # Grab vectorization column
  data_vec = np.array([row[col_idx] for row in data]).astype('float32')
  # Remove vectorization column from data and convert to float32
  data = np.delete(data, col_idx, 1).astype('float32')
  return data, data_vec


def encode_data(data):
  '''Applies two models to encode input data.
     The genre vectors are compressed with vec_model, stored in the
     vec_col_nams columns in place of genres, and the result is passed to
     enc_model.  The input dataframe is never modified, so the same frame can
     be encoded more than once.
  Inputs:
    data (DataFrame): wrangled song data; a rating column is ignored
  Outputs:
    data_processed: output of enc_model.predict
    '''
  # drop() returns a new frame whether or not rating exists, so the caller's
  # dataframe is left untouched by the steps below
  data = data.drop(columns='rating', errors='ignore')
  _, data_vec = data_splitter_for_model(data)
  data_vec_comp = vec_model.predict(data_vec)
  data = data.drop(columns=['genres'])
  data[vec_col_nams] = data_vec_comp
  data_processed = enc_model.predict(data)
  return data_processed


def encode_data_vec(data):
  '''Applies vec model to encode input data.
     The genre vectors are compressed with vec_model and stored in the
     vec_col_nams columns in place of genres.  The input dataframe is never
     modified; a new dataframe is returned.
  Inputs:
    data (DataFrame): wrangled song data; a rating column is ignored
  Outputs:
    data (DataFrame): new frame without rating/genres, with vec_col_nams
    '''
  # drop() returns a new frame whether or not rating exists, so the caller's
  # dataframe is left untouched by the steps below
  data = data.drop(columns='rating', errors='ignore')
  _, data_vec = data_splitter_for_model(data)
  data_vec_comp = vec_model.predict(data_vec)
  data = data.drop(columns=['genres'])
  data[vec_col_nams] = data_vec_comp
  return data


def generate_raw_data_segment(basic_dataset_file_path, sp=sp):
  '''Generate time domain data from basic dataset.
     For part i (1-based, one per entry in basic_dataset_file_path) reads
     basic_dataset_p{i}.csv, runs audio_analysis on its song ids, joins the
     result onto song_id/rating/weight and saves it as raw_td_feature_p{i}.csv
     under DATA_DIR/Databases/Raw_TD_Features.
  Inputs:
    basic_dataset_file_path (string): folder holding the basic dataset parts
    sp: connection passed to (and replaced by the one returned from)
    audio_analysis
    '''
  n_parts = len(os.listdir(basic_dataset_file_path))

  for part_idx in tqdm(range(n_parts)):
    part_no = part_idx + 1
    part_csv = os.path.join(basic_dataset_file_path,
                            f'basic_dataset_p{part_no}.csv')
    data = pd.read_csv(part_csv)
    df_raw_feats, sp = audio_analysis(data['song_id'].tolist(), sp=sp)
    df_raw_feats = data[['song_id', 'rating', 'weight']].join(df_raw_feats, on='song_id')

    save_dir = os.path.join(DATA_DIR, 'Databases', 'Raw_TD_Features')
    make_dir(save_dir)
    df_raw_feats.to_csv(os.path.join(save_dir, f'raw_td_feature_p{part_no}.csv'))
    # release the reference to this part's table before the next iteration
    df_raw_feats = None

  print('Saved all TD parts saved successfully!')


def transform_to_td(feature_list, transform_feat_names, transform_idxs,
                    loud_idxs=None, loud_add=100, max_val=None,
                    scope=('mean', 'sdv')):
  '''Flattens selected positions of nested feature data and summarizes them.
     For every (name, idx) pair, the idx-th entry of every feature of every
     song is collected into one flat list.  Entries whose idx is in loud_idxs
     are shifted by loud_add and, when max_val is given, divided by it.
  Inputs:
    feature_list (list): one list per song, each holding indexable features
    transform_feat_names (list): output name for each index
    transform_idxs (list): position to read from every feature
    loud_idxs (list): positions treated as loudness values (None = no such
    positions)
    loud_add (number): offset added to loudness values
    max_val (number or list): divisor for shifted loudness values; a list is
    indexed with idx - 4
    scope (list/tuple): statistics to output - 'min', 'max' and/or 'mean'
  Outputs:
    out_dict (dict): {name}_min (min * 0.7), {name}_max (max * 1.3) and, for
    'mean', {name}_n, {name}_total and {name}_sqr_total
  Example:
    feature_list = list(df_new_feats['sct_loud'])
    transform_feat_names = ['sct_loud_conf', 'sct_loud']
    transform_idxs = [2, 4]
    loud_idxs = [4]
    '''
  loud_idxs = () if loud_idxs is None else loud_idxs

  out_dict = {}
  for name, idx in zip(transform_feat_names, transform_idxs):
    is_loud = idx in loud_idxs
    # A plain list replaces the former writes through locals(), which are
    # lost on Python 3.13+ where locals() returns a fresh snapshot per call
    flat = []
    for song in feature_list:
      for feature in song:
        value = feature[idx]
        if is_loud:
          value = loud_add + value
          if max_val:
            divisor = max_val[idx-4] if isinstance(max_val, list) else max_val
            value = value / divisor
        flat.append(value)
    # Add the requested summary statistics to the output dictionary
    if 'min' in scope:
      out_dict[f'{name}_min'] = min(flat)*0.7
    if 'max' in scope:
      out_dict[f'{name}_max'] = max(flat)*1.3
    if 'mean' in scope:
      out_dict[f'{name}_n'] = len(flat)
      out_dict[f'{name}_total'] = sum(flat)
      out_dict[f'{name}_sqr_total'] = sum(v*v for v in flat)

  return out_dict

# Playlist links:
To get consistent access to your playlists, you need the playlist URI:
1. Open spotify and navigate to your playlist
2. Select the three dots under the playlist and navigate to the share button.
3. Next to the share button, mouse over "Copy Link to Playlist" and press control or command on your keyboard.  "Copy Link to Playlist" will change to "Copy Spotify URI"; click on "Copy Spotify URI".  This link to your playlist will be persistent, while the default "Copy Link to Playlist" link will change periodically.

Playlists 1-5 will be used as the negative song label when building the model and will be used to remove songs from recommendations.  It is recommended to include a playlist of disliked songs to help the model filter out songs you don't like.  At least one playlist must be specified in playlists 1-5.

The target playlist will be the positive song label when building the model and will be used to select song recommendations.

An output playlist is required in order for song recommendations to be sent to a playlist.  <strong>It is highly recommended to create a new playlist for the output playlist since songs can be removed in the playlist if specified in the config!</strong>  You will also have to be the creator of the output playlist to add and remove songs from it.<br><br>
Note: the output playlist can not be a Collaborative one since spotify's api doesn't allow automated modification of collaborative playlists.

In [None]:
if operation == 'create initial basic dataset':
  # Open config file and set ratings if not set, one per input playlist
  playlist_names = list(INPUT_PLAYLISTS.keys())
  rating_input(playlists=playlist_names)

# Create Initial Dataset
Creates the basic dataset (song traits) that includes the data for genre and basic song traits.  Will be used for general feature and song ids.  Also creates ratings column

In [None]:
if operation == 'create initial basic dataset':
  # Collect every song id with its rating, then count the songs per rating
  df_song_ids = create_song_id_df()
  df_cnts = get_rating_cnts(df_song_ids[['rating']])
  # Show both tables in the notebook output
  for table in (df_song_ids, df_cnts):
    display(table)

In [None]:
if operation == 'create initial basic dataset':
  # Compute the weights from the song ids and the rating counts
  custom_weights = calc_weights(
      df_song_ids,
      df_cnts,
      weight_mode=weight_mode,
  )

In [None]:
if operation=="create initial basic dataset":
  # Add weights to df_song_id
  df_song_ids = save_weights_and_add_to_df(custom_weights, df_song_ids)
  # Record the number of songs in the config; the context managers make sure
  # each file handle is closed (and the write flushed) before moving on
  with open('config.json') as config_file:
    config = json.load(config_file)
  config['num_songs'] = df_song_ids.shape[0]
  with open('config.json', 'w') as config_file:
    json.dump(config, config_file)
  display(df_song_ids)

In [None]:
if operation == 'create initial basic dataset':
  # Split the song id table into chunk files saved under
  # Databases/Basic_Dataset_Song_Ids
  n_chunks = get_n_chunks(df_song_ids)
  basic_data_ids_save_dir = os.path.join(DATA_DIR, 'Databases',
                                         'Basic_Dataset_Song_Ids')
  split_dataset_into_chunks(df_song_ids.reset_index(), n_chunks, 'rating',
                            basic_data_ids_save_dir)

In [None]:
if operation=="create initial basic dataset":
    # create song_id mapping dictionary: song_id -> (unique id, rating)
    song_id_path = os.path.join(DATA_DIR, 'Databases', 'Basic_Dataset_Song_Ids')
    song_id_files = os.listdir(song_id_path)
    song_ids, ratings = [], []
    for file in song_id_files:
        df = pd.read_csv(os.path.join(song_id_path, file))
        song_ids.extend(df['song_id'].tolist())
        ratings.extend(df['rating'].tolist())
    # unique ids are 1-based and follow the order the files were read in
    out_dict = {}
    for unq_id, (song_id, rating) in enumerate(zip(song_ids, ratings), start=1):
        out_dict[song_id] = (unq_id, rating)
    # Opening with 'w' creates the file when it is missing, so no placeholder
    # write is needed; the context manager closes the handle
    with open('song_id_lookup.json', 'w') as lookup_file:
        json.dump(out_dict, lookup_file)

In [None]:
if operation == 'create initial basic dataset':
  # Build the basic dataset files
  generate_basic_datasets()

# Generate Raw TD Dataset
Generates the raw time domain data as well as general features from spotify's audio analysis endpoint.  Will generate 2 output tables for access for generating the rest of the features

In [None]:
if operation=="generate raw td dataset":
  # When API calls were already made in this session, drop the cache file
  # and open a fresh connection
  if BASIC_DATASET_API_COUNTER > 0:
    try:
      os.remove('.cache')
    except OSError:
      # best effort: the cache file may not exist or may not be removable
      pass
    sp = create_connection()
  basic_dataset_path = os.path.join(DATA_DIR, 'Databases', 'Basic_Dataset')
  basic_dataset_file_list = os.listdir(basic_dataset_path)
  # This function may take several hours depending on the size of your dataset (~10 minutes per 1000 songs and 250mb per 1000 songs)
  generate_raw_data_segment(basic_dataset_path)

# Create Feature Tables
Creates our time series feature tables from the raw time-domain (TD) dataset.

In [None]:
if operation=="create feature tables":
  basic_dataset_path = os.path.join(DATA_DIR, 'Databases', 'Basic_Dataset')
  basic_dataset_file_list = os.listdir(basic_dataset_path)
  raw_td_dataset_path = os.path.join(DATA_DIR, 'Databases', 'Raw_TD_Features')
  raw_td_file_list = os.listdir(raw_td_dataset_path)
  with open('config.json') as config_file:
    config = json.load(config_file)
  null_tracker = config.get('null_idxs', {})
  # Start every raw TD part (keyed '1', '2', ...) with an empty list
  for i in range(len(raw_td_file_list)):
    null_tracker[f'{i+1}'] = []
  for feature in ["genre", "general", "sct_data", "sgm_loud", "sgm_pitch", "sgm_timbre"]:
    feature_path = os.path.join(DATA_DIR, 'Databases', feature)
    make_dir(feature_path)
    # Calculate normalization values and add them to config
    null_tracker = generate_feature_norms(raw_td_dataset_path, raw_td_file_list, basic_dataset_path,
                           feature=feature, normalization_mode=normalization_mode, run_normalization=False,
                                         null_tracker=null_tracker)
    # Re-read the config before updating it (presumably generate_feature_norms
    # writes to it - confirm); context managers close every handle
    with open('config.json') as config_file:
      config = json.load(config_file)
    config['null_idxs'] = null_tracker
    with open('config.json', 'w') as config_file:
      json.dump(config, config_file)

In [None]:
if operation=="create feature tables":
  basic_dataset_path = os.path.join(DATA_DIR, 'Databases', 'Basic_Dataset')
  basic_dataset_file_list = os.listdir(basic_dataset_path)
  raw_td_dataset_path = os.path.join(DATA_DIR, 'Databases', 'Raw_TD_Features')
  raw_td_file_list = os.listdir(raw_td_dataset_path)
  # the context manager closes the config file once it has been read
  with open('config.json') as config_file:
    config = json.load(config_file)
  null_tracker = config['null_idxs']
  # apply normalization and save datasets
  for feature in ["genre", "general", "sct_data", "sgm_loud", "sgm_pitch", "sgm_timbre"]:
    feature_path = os.path.join(DATA_DIR, 'Databases', feature)
    _ = generate_feature_norms(raw_td_dataset_path, raw_td_file_list, basic_dataset_path,
                           feature=feature, normalization_mode=normalization_mode,
                           run_normalization=True, save_dir=feature_path, null_tracker=null_tracker, output_format='trf')

In [None]:
if operation == 'create feature tables':
  # Final step of the feature-table stage
  generate_big_data()

# Create Final Data Table

In [None]:
if operation == 'create final data table':
  features = ['genre', 'general', 'sct_data', 'sgm_loud', 'sgm_timbre', 'sgm_pitch']
  # Copy every .keras file from the per-feature tuning folders to Saved_Models
  MODEL_TUNE_DIR_FILE_DIR = os.path.join(DATA_DIR, 'Model_tuning')
  for feature in features:
    feature_dir = os.path.join(MODEL_TUNE_DIR_FILE_DIR, feature)
    for file in os.listdir(feature_dir):
      if not file.endswith('.keras'):
        continue
      source_path = os.path.join(MODEL_TUNE_DIR_FILE_DIR, feature, file)
      destination_path = os.path.join(DATA_DIR, 'Saved_Models', file)
      shutil.copyfile(source_path, destination_path)

In [None]:
if operation == 'create final data table':
  # Load every saved model into model_dict, keyed by the feature it belongs to
  model_path = os.path.join(DATA_DIR, 'Saved_Models')
  model_dict = {}
  # checked in this order; the first key found in the file name wins
  keras_model_keys = ('genre', 'general', 'sct_data', 'sgm_loud',
                      'sgm_timbre', 'sgm_pitch')
  for model in os.listdir(model_path):
    matched_key = next((key for key in keras_model_keys if key in model), None)
    if matched_key is not None:
      model_dict[matched_key] = tf.keras.models.load_model(os.path.join(model_path, model))
    elif 'overall_best' in model:
      # the overall model is stored with joblib rather than keras
      model_dict['overall'] = joblib.load(os.path.join(model_path, model))

In [None]:
if operation == 'create final data table':
  # First pass finds the normalization values, second pass applies them
  for apply_norms in (False, True):
    generate_final_dataset(run_normalization=apply_norms)

In [None]:
if operation == 'create final data table':
  model_path = os.path.join(DATA_DIR, 'Saved_Models')
  # Pick up the overall model if one has been saved
  for model in os.listdir(model_path):
    if 'overall_best' in model:
      model_dict['overall'] = joblib.load(os.path.join(model_path, model))
  # Build the table once, after the scan, instead of once per file
  out = get_overall_data()
  # Gate on the loaded model (as the plotting cells do) rather than on a file
  # literally named 'overall', so df_melt exists whenever the plots run
  if 'overall' in model_dict:
    lr_model = model_dict['overall']
    preds = lr_model.predict(out[["genre", "general", "sct_data", "sgm_loud", "sgm_pitch", "sgm_timbre"]])
    out['overall_model'] = preds
    df_out = out.sort_values('rating').reset_index().drop(columns=['index'])
    # after sorting by rating, the row position serves as the song id
    df_out['song_id'] = df_out.index
    # Rescale rating and overall prediction by shifting with |min|
    df_out['rating'] = (df_out['rating'] + abs(df_out['rating'].min())) / (df_out['rating'].max() + abs(df_out['rating'].min()))
    df_out['overall_model'] = (df_out['overall_model'] + abs(df_out['overall_model'].min())) / (df_out['overall_model'].max() + abs(df_out['overall_model'].min()))
    df_out.drop(columns=['weight'], inplace=True)
    # Long format: one row per (song_id, model) pair for plotting
    df_melt = df_out.melt(id_vars=['song_id'], value_name='model_score', var_name='model')

In [None]:
# Scatter of every model's normalized score per song, with an OLS trendline
if operation == 'create final data table' and 'overall' in model_dict:
  fig = px.scatter(
      df_melt, x='song_id', y='model_score', color='model',
      title='Normalized Model Predictions and Model Rating', opacity=1,
      trendline='ols', trendline_color_override="black",
      height=800, width=2100)
  fig.show()

In [None]:
# Guarded like the neighbouring plot cells: df_melt is only built for this
# operation, so running the cell in any other mode would raise a NameError
if operation == 'create final data table' and 'overall' in model_dict:
  fig = px.density_contour(df_melt, x="song_id", y="model_score", color="model")
  fig.show()

In [None]:
# Scatter of every model's normalized score per song, with an OLS trendline
if operation == 'create final data table' and 'overall' in model_dict:
  fig = px.scatter(
      df_melt, x='song_id', y='model_score', color='model',
      title='Normalized Model Predictions and Model Rating', opacity=1,
      trendline='ols', trendline_color_override="black",
      height=800, width=2100)
  fig.show()