# Imports

In [None]:
!pip install -r requirements.txt

In [None]:
# # General Toolboxes
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from datetime import datetime
import itertools
import math
import zipfile
import ast
import scipy
from scipy import stats
import json
from dotenv import load_dotenv
import re
import importlib
from ipywidgets import interact, interactive, fixed, interact_manual, Layout, FloatSlider
import ipywidgets as widgets
import datetime
from numpy2tfrecord import Numpy2TFRecordConverter, build_dataset_from_tfrecord

# Visualizations
import plotly.express as px

# Neural Network
import keras
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import logging
logging.getLogger('tensorflow').disabled = True
import tensorflow as tf


print(tf.config.list_physical_devices())
from tensorflow.python.keras import backend

# Tensorflow imports
import gc
from keras.activations import relu, sigmoid, softmax, tanh, selu, elu, gelu, leaky_relu
from keras import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Activation, AlphaDropout, LSTM, RNN, GRU, SimpleRNN, LayerNormalization, InputLayer, TimeDistributed, Bidirectional
from keras.layers import ReLU, ELU, LeakyReLU, PReLU, MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Conv1D, Concatenate, Flatten
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.constraints import MaxNorm
from keras.regularizers import l2, l1, L1L2
from tensorboard.plugins.hparams import api as hp
import keras_tuner
import keras_tuner as kt

# Imports for custom tuner
from keras_tuner.src.engine import tuner_utils
import copy
import random
try:
    import scipy
    import scipy.optimize
except ImportError:
    scipy = None
from keras_tuner.engine import hyperparameters as hp_module
from keras.src.utils import io_utils
from keras_tuner.src.engine import oracle as oracle_module
from keras_tuner.src.engine import trial as trial_module
from keras_tuner.src.engine import tuner as tuner_module
from keras_tuner.src.tuners.bayesian import BayesianOptimizationOracle

# Spotify
import spotipy
import spotipy.util as util

# SkLearn
import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import class_weight, shuffle
from sklearn.metrics import accuracy_score, precision_recall_curve, average_precision_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge

import spacy
import joblib
pd.set_option('display.max_columns', None)

print('Imports Complete!')

# Environment Variables

In [None]:
# Load settings from the local .env file (values there win over variables that
# are already set) and read the root data directory from the environment.
load_dotenv(override=True)
DATA_DIR = os.getenv('DATA_DIR')
def man_win_lin_encode(str_path):
    '''Convert a Windows drive path into its ``/mnt/<drive>`` Linux form.
    Input:
      str_path (string): file-system path, e.g. ``C:\\Users\\me\\data``
    Output:
      out (string): path with forward slashes and a leading ``X:`` drive
      prefix replaced by ``/mnt/x``; a path without ':' is returned unchanged
    '''
    if ':' not in str_path:
        return str_path
    out = str_path.replace('\\', '/')
    # Take the drive letter from the path being converted, not from the global
    # DATA_DIR: DATA_DIR is overwritten with its converted form right after
    # this definition, so its first two characters stop being a drive prefix
    # and later calls would no longer convert their argument.
    if len(out) >= 2 and out[1] == ':' and out[0].isalpha():
        out = f'/mnt/{out[0].lower()}' + out[2:]
    return out
DATA_DIR = man_win_lin_encode(DATA_DIR)
# Make sure every sub-folder the notebook writes to exists under DATA_DIR
dir_folders = ['Databases', 'Model_Tuning', 'Saved_Models']
for folder in dir_folders:
    dirname = os.path.join(DATA_DIR, folder)
    if not os.path.exists(dirname):
        os.mkdir(dirname)

In [None]:
# Widgets
# Read the notebook configuration. The context manager closes the file handle
# that a bare open() call would leave open.
with open('config.json') as config_file:
    config = json.load(config_file)
prediction_type = config['prediction_type']
regression = prediction_type == 'Regression'
style = {'description_width': '50%'}
tune_wdg = widgets.Dropdown(
    options=["general", "genre", 'sct_data', 'sgm_loud', 'sgm_pitch', 'sgm_timbre', "big_data", "final_lr"],
    value="sgm_pitch",
    description='Dataset for Model Tuning:',
    layout=Layout(width='25%'),
    disabled=False,
    style={'description_width': 'initial'}
)
rating_key_wdg = widgets.Dropdown(
    # One option per configured rating index. Wrapping the range in a list
    # literal ([range(...)]) produced a single option whose value was the
    # range object itself.
    options=list(range(len(config['ratings'][prediction_type]))),
    value=None,
    description='Select the playlist rating index for positive classes (ie: 4 for playlists 4 and 5 to be positive):',
    layout=Layout(width='25%'),
    disabled=False,
    style={'description_width': 'initial'}
)
early_stop_patience_wdg = widgets.IntSlider(
    value=10,
    min=5,
    max=50,
    step=5,
    description='Early Stopping # of Epochs:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    layout=Layout(width='40%'),
    readout=True,
    readout_format='d',
    style={'description_width': 'initial'}
)
num_random_iter_wdg = widgets.IntSlider(
    value=200,
    min=25,
    max=300,
    step=25,
    description='Number of random hyperparameter initialization epochs for bayesian optimization:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    layout=Layout(width='50%'),
    style={'description_width': 'initial'}
)
num_model_iter_wdg = widgets.IntSlider(
    # The default used to be 3000, above max=2000. Bounded sliders cap the
    # value at max, so the effective default was already 2000; it is now
    # written explicitly. NOTE(review): raise `max` instead if 3000 was meant.
    value=2000,
    min=100,
    max=2000,
    step=100,
    description='Number of model tuning iterations:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    layout=Layout(width='40%'),
    style={'description_width': 'initial'}
)
acc_model_tune_metric_wdg = widgets.Dropdown(
    options=["pr_auc", "accuracy", "loss"],
    value="pr_auc",
    description='Accuracy Tuning Metric:',
    layout=Layout(width='25%'),
    disabled=False,
    style={'description_width': 'initial'}
)
reg_model_tune_metric_wdg = widgets.Dropdown(
    options=["mean_squared_error", "mean_absolute_error", "mape", "msle", "huber"],
    value="mean_squared_error",
    description='RegressionTuning Metric:',
    layout=Layout(width='25%'),
    disabled=False,
    style={'description_width': 'initial'}
)
made_widgets = [tune_wdg, rating_key_wdg, early_stop_patience_wdg,
                num_random_iter_wdg, num_model_iter_wdg, acc_model_tune_metric_wdg,
                reg_model_tune_metric_wdg]
# Only show the controls that apply to the configured prediction type
if regression:
    made_widgets.remove(rating_key_wdg)
    made_widgets.remove(acc_model_tune_metric_wdg)
else:
    made_widgets.remove(reg_model_tune_metric_wdg)

# Config

In [None]:
# Render each control selected for this prediction type in the cell output
for widget in made_widgets:
    display(widget)

In [None]:
# Snapshot the current widget selections into plain variables
tune = tune_wdg.value
rating_key = rating_key_wdg.value
early_stop_patience = early_stop_patience_wdg.value
model_rand_num_iterations = num_random_iter_wdg.value
model_tune_num_iterations = num_model_iter_wdg.value
acc_model_tune_metric = acc_model_tune_metric_wdg.value
reg_model_tune_metric = reg_model_tune_metric_wdg.value
BEST_SCORE = None

# Map the chosen dataset onto a tuning mode; unlisted datasets tune as 'other'
tune_mode = {
    'genre': 'genre',
    'general': 'overall',
    'final_nn': 'model',
    'final_lr': 'lr_overall',
}.get(tune, 'other')
save = True #@param {type:"boolean"}
tuner_seed = "" #@param {type:"string"}

# Metric the tuner optimizes, picked by prediction type
TUNE_METRIC_NAME = reg_model_tune_metric if regression else acc_model_tune_metric

# Both "final" modes read the 'overall' dataset and share its checkpoint folder
model_tune_name = 'overall' if tune in ('final_lr', 'final_nn') else tune
file_dir = os.path.join(DATA_DIR, 'Databases', model_tune_name)
save_dir = os.path.join(DATA_DIR, 'Model_Tuning')
checkpoint_dir = os.path.join(save_dir, model_tune_name, model_tune_name)

In [None]:
INPUT_DATA_DIR = file_dir
TRAIN_DATA_COEFFICIENT = 0.80
VALIDATION_DATA_COEFFICIENT = 0.20
# TEST_DATA_COEFFICIENT = 0.10
TEST_DATA_COEFFICIENT = 0.0

# Part names are generated from the number of directory entries, not from the
# entry names themselves: <dataset>_dataset_p1 ... <dataset>_dataset_pN
num_files = len(os.listdir(file_dir))
file_list = [f'{model_tune_name}_dataset_p{part}' for part in range(1, num_files + 1)]
num_train = int(num_files * TRAIN_DATA_COEFFICIENT)
num_val = int(num_files * VALIDATION_DATA_COEFFICIENT)
num_test = int(num_files * TEST_DATA_COEFFICIENT)

# Per-dataset input shapes; the leading None is dropped wherever a shape is used
GEN_INPUT_SHAPES = {
    'genre': [None, 300],
    'general': [None, 49],
    'sct_data': [None, 300, 10],
    'sgm_loud': [None, 3000, 5],
    'sgm_pitch': [None, 3000, 13],
    'sgm_timbre': [None, 3000, 13],
    'big_data': [None, 3000, 39],
    'overall': [None, 6],
}
INPUT_DIM = GEN_INPUT_SHAPES[model_tune_name][1:]

# First num_train parts train the model, the next num_val parts validate it
TRAIN_FILES = [
    os.path.join(file_dir, f'{part_name}.tfrecord')
    for part_name in file_list[:num_train]
]
VALIDATION_FILES = [
    os.path.join(file_dir, f'{part_name}.tfrecord')
    for part_name in file_list[num_train:num_train + num_val]
]
# TEST_FILES = [os.path.join(file_dir, f'{file}.tfrecord') for file in file_list[num_train+num_val:]]

# Functions

In [None]:
def parse_feature_function_song_id(example_proto, tune_shapes=GEN_INPUT_SHAPES):
    '''Decode one serialized example, keeping its song id.
    Input:
      example_proto: serialized example passed to tf.io.parse_single_example
      tune_shapes (dict): dataset name -> input shape; the entry for the global
      FEATURE (minus its leading dimension) is the shape of "x"
    Output:
      x, y, weight, song_id: the parsed "x", "y", "weight" and "song_id" tensors
    '''
    schema = {
        "x": tf.io.FixedLenFeature(tune_shapes[FEATURE][1:], tf.float32),
        "y": tf.io.FixedLenFeature([], tf.float32),
        "weight": tf.io.FixedLenFeature([], tf.float32),
        "song_id": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example_proto, schema)
    return parsed['x'], parsed['y'], parsed['weight'], parsed['song_id']


def parse_feature_function(example_proto, tune=model_tune_name, tune_shapes=GEN_INPUT_SHAPES):
    '''Decode one serialized example into model inputs.
    Input:
      example_proto: serialized example passed to tf.io.parse_single_example
      tune (string): dataset name used to look up the shape of "x"
      tune_shapes (dict): dataset name -> input shape (leading dimension dropped)
    Output:
      x, y, weight: the parsed "x", "y" and "weight" tensors; "song_id" is part
      of the record schema but is not returned
    '''
    schema = {
        "x": tf.io.FixedLenFeature(tune_shapes[tune][1:], tf.float32),
        "y": tf.io.FixedLenFeature([], tf.float32),
        "weight": tf.io.FixedLenFeature([], tf.float32),
        "song_id": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example_proto, schema)
    return parsed['x'], parsed['y'], parsed['weight']


def translate_ids(song_ids, return_id_only=True):
    '''Look up the stored id (and rating) for one or more song ids.
    Reads 'song_id_lookup.json' from the working directory on every call; each
    entry maps a song id to a two-item [id, rating] value.
    Input:
      song_ids (string or iterable): one song id, or several
      return_id_only (bool): if True return only the ids, else ids and ratings
    Output:
      unq_ids: one id for a string input, otherwise a list of ids
      ratings: matching rating(s), only returned when return_id_only is False
    Raises:
      KeyError if a song id is not present in the lookup file
    '''
    # The context manager closes the file; a bare open() leaked the handle.
    with open('song_id_lookup.json') as lookup_file:
        song_id_lookup = json.load(lookup_file)
    # isinstance also accepts str subclasses, which type(x)==str rejected
    if isinstance(song_ids, str):
        unq_ids, ratings = song_id_lookup[song_ids]
    else:
        unq_ids, ratings = [], []
        for song_id in song_ids:
            unq_id, rating = song_id_lookup[song_id]
            unq_ids.append(unq_id)
            ratings.append(rating)
    if return_id_only:
        return unq_ids
    return unq_ids, ratings

def reverse_translate_ids(song_ids, return_song_id=True):
    '''Map stored ids back to their original song ids (inverse of the lookup).
    Reads 'song_id_lookup.json' from the working directory on every call and
    inverts it: each [id, rating] value becomes a key pointing at
    (song id, rating). If two songs share an id, the later entry wins.
    Input:
      song_ids (string or iterable): one stored id, or several
      return_song_id (bool): if True return only song ids, else song ids and
      ratings
    Output:
      original_ids: one song id for a string input, otherwise a list
      ratings: matching rating(s), only returned when return_song_id is False
    Raises:
      KeyError if an id is not present in the lookup file
    '''
    # The context manager closes the file; a bare open() leaked the handle.
    with open('song_id_lookup.json') as lookup_file:
        song_id_lookup = json.load(lookup_file)
    reverse_dict = {value[0]: (key, value[1]) for key, value in song_id_lookup.items()}
    # isinstance also accepts str subclasses, which type(x)==str rejected
    if isinstance(song_ids, str):
        original_ids, ratings = reverse_dict[song_ids]
    else:
        original_ids, ratings = [], []
        for song_id in song_ids:
            original_id, rating = reverse_dict[song_id]
            original_ids.append(original_id)
            ratings.append(rating)
    if return_song_id:
        return original_ids
    return original_ids, ratings


def get_overall_data(data_files):
    '''Load 'overall' TFRecord files into one DataFrame.
    Sets the global FEATURE to 'overall' so parse_feature_function_song_id
    parses "x" with the 'overall' shape.
    Input:
      data_files (list): TFRecord file paths
    Output:
      out (DataFrame): one row per record with the six feature columns,
      'rating', 'weight' and 'song_id' (translated by reverse_translate_ids).
      Row indices restart at 0 for every file. An empty DataFrame with those
      columns is returned when no records were read.
    '''
    global FEATURE
    FEATURE = 'overall'
    features = ["genre", "general", "sct_data", "sgm_loud", "sgm_pitch", "sgm_timbre"]
    columns = features + ['rating', 'weight']
    frames = []
    for file in data_files:
        dataset = tf.data.TFRecordDataset([file])
        dataset = dataset.map(parse_feature_function_song_id)
        dataset = dataset.batch(500)
        # Collect every batch. Keeping only the loop's last value dropped all
        # but the final 500 records of a larger file, and reused the previous
        # file's data when a file held no records.
        xs, ys, weights, ids = [], [], [], []
        for x, y, weight, song_id in dataset:
            xs.append(x.numpy())
            ys.append(y.numpy().reshape(-1, 1))
            weights.append(weight.numpy().reshape(-1, 1))
            ids.append(song_id.numpy().reshape(-1))
        if not xs:
            continue
        data = np.hstack((np.concatenate(xs), np.concatenate(ys), np.concatenate(weights)))
        df_pred = pd.DataFrame(data=data, columns=columns)
        df_pred['song_id'] = np.array(reverse_translate_ids(np.concatenate(ids)))
        frames.append(df_pred)
    if not frames:
        return pd.DataFrame(columns=columns + ['song_id'])
    # Concatenate once instead of re-copying the accumulated frame per file
    return pd.concat(frames, axis=0)


def parse_feature_function_lr(example_proto, tune_shapes=GEN_INPUT_SHAPES):
    '''Decode one serialized example using the shape of the global FEATURE.
    Input:
      example_proto: serialized example passed to tf.io.parse_single_example
      tune_shapes (dict): dataset name -> input shape; the entry for the global
      FEATURE (minus its leading dimension) is the shape of "x"
    Output:
      x, y, weight: the parsed "x", "y" and "weight" tensors; "song_id" is part
      of the record schema but is not returned
    '''
    schema = {
        "x": tf.io.FixedLenFeature(tune_shapes[FEATURE][1:], tf.float32),
        "y": tf.io.FixedLenFeature([], tf.float32),
        "weight": tf.io.FixedLenFeature([], tf.float32),
        "song_id": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example_proto, schema)
    return parsed['x'], parsed['y'], parsed['weight']

def load_dataset(filenames, batch_size):
    '''Build a batched tf.data pipeline over TFRecord files.
    Input:
      filenames (list): TFRecord file paths
      batch_size (int): number of records per batch
    Output:
      dataset: dataset yielding (x, y, weight) batches parsed by
      parse_feature_function; deterministic ordering is switched off
    '''
    AUTOTUNE = tf.data.AUTOTUNE
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.with_options(ignore_order)
    dataset = dataset.map(parse_feature_function, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(batch_size)
    # Prefetch is the last step so whole batches are prepared while the
    # consumer works; placed before batch() it only buffered single records.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    return dataset


def overall_model_grid_search(train_data, val_data, test_data, lr_model_train_data,
                  param_grid, cv=5, metric=TUNE_METRIC_NAME,
                  overfit_penalty=5, prediction_type=prediction_type):
  '''Grid-search a linear model with K-fold CV, then refit the top-ranked one.

  For every parameter combination a Ridge (prediction_type 'Regression') or
  LogisticRegression (otherwise) model is fitted on each fold. Per fold the
  train score, validation score and an overfit score
  (val + overfit_penalty * (val - train)) are computed and averaged. Results
  are sorted ascending by mean overfit score, the first row's parameters are
  refitted on all of lr_model_train_data, and that model is scored on the
  train/val/test arrays.

  Input:
    train_data, val_data, test_data, lr_model_train_data (2-D arrays): columns
    [:-3] are used as features, column -3 as rating and column -2 as sample
    weight (see the final fit); the last column is not used here --
    presumably the song id, TODO confirm
    param_grid (dict): 'alpha' and 'solver' lists for Regression; 'penalty',
    'l1_ratio', 'C' and 'solver' lists otherwise
    cv (int): number of K-fold splits
    metric (string): name of the score to compute
    overfit_penalty (number): multiplier on the val-minus-train gap
    prediction_type (string): 'Regression' selects Ridge
  Output:
    model: estimator refitted with the top-ranked parameters
    data (DataFrame): one row per successfully evaluated combination
    all_data_df (DataFrame): actual 'Rating' (as string) and 'Pred_Rating' for
    train, val and test rows concatenated
  '''

  huber = tf.keras.losses.Huber()
  # NOTE(review): keras metric objects are stateful -- calling `mse(...)`
  # repeatedly presumably accumulates over all earlier calls unless the state
  # is reset, so later fold scores may not be independent. Confirm.
  mse = keras.metrics.MeanSquaredError()
  X, y = lr_model_train_data[:, :-3], lr_model_train_data[:, -3:-1]
  # NOTE: test_x / test_y are not used again in this function
  test_x, test_y = test_data[:, :-3], test_data[:, -3:-1]
  X_shuf, y_shuf = shuffle(X, y, random_state=42)
  # Split Data
  kf = KFold(n_splits=cv)
  data = {}
  param_opts = list(param_grid.keys())
  score_nams = ['mean_score', 'mean_train_score', 'mean_ovfit_score']
  # One result column per grid parameter plus the three mean scores
  for key in param_opts + score_nams: data[key] = []

  # Loop through param_grid
  # The folds are built once and reused for every parameter combination
  split_data = []
  for train_index , test_index in kf.split(X_shuf):
    X_train, X_val = X_shuf[train_index], X_shuf[test_index]
    y_train , y_val = y_shuf[train_index], y_shuf[test_index]
    # Split ratings and weights
    y_train, train_weight = split_y(y_train)
    y_val, val_weight = split_y(y_val)
    split_data.append((X_train, X_val, y_train, train_weight, y_val, val_weight))
  # return split_data
  if prediction_type=='Regression':
    # Regression
    for alpha in param_grid['alpha']:
        for solver in param_grid['solver']:
          try:
            model = Ridge(alpha=alpha, solver=solver, random_state=42)
            kf_train_score, kf_val_score, kf_overfit_score = [], [], []
            # NOTE: the loop variable rebinds the `cv` parameter; harmless here
            # because KFold was already constructed above
            for cv in split_data:
              X_train, X_val, y_train, train_weight, y_val, val_weight = cv
              # Fit model
              model.fit(X_train, y_train, sample_weight=train_weight)
              y_train_pred = model.predict(X_train)
              y_val_pred = model.predict(X_val)
              if metric=='mean_squared_error':
                train_score = mse(y_train.reshape(-1,1), y_train_pred.reshape(-1,1), sample_weight=train_weight.reshape(-1,1)).numpy()
                val_score = mse(y_val.reshape(-1,1), y_val_pred.reshape(-1,1), sample_weight=val_weight.reshape(-1,1)).numpy()
              else:
                # Every other metric name is scored with the Huber loss
                train_score = huber(np.array(y_train).reshape(-1,1), np.array(y_train_pred).reshape(-1,1), sample_weight=np.array(train_weight).reshape(-1,1)).numpy()
                val_score = huber(np.array(y_val).reshape(-1,1), np.array(y_val_pred).reshape(-1,1), sample_weight=np.array(val_weight).reshape(-1,1)).numpy()

              # Validation score plus a penalty proportional to the val-train gap
              overfit_score = val_score + (overfit_penalty * (val_score - train_score))
              kf_train_score.append(train_score)
              kf_val_score.append(val_score)
              kf_overfit_score.append(overfit_score)
            # get mean scores
            data['mean_score'].append(np.mean(kf_val_score))
            data['mean_train_score'].append(np.mean(kf_train_score))
            data['mean_ovfit_score'].append(np.mean(kf_overfit_score))
            data['alpha'].append(alpha)
            data['solver'].append(solver)
          # NOTE(review): a bare except silently drops any combination that
          # raises (including KeyboardInterrupt), so real errors go unnoticed
          except:
            pass
  else:
    # Classification
    for penalty in param_grid['penalty']:
      for l1_ratio in param_grid['l1_ratio']:
        for C in param_grid['C']:
          for solver in param_grid['solver']:
            try:
              model = LogisticRegression(penalty=penalty, l1_ratio=l1_ratio, C=C, solver=solver)
              kf_train_score, kf_val_score, kf_overfit_score = [], [], []
              print('created model')
              for cv in split_data:
                X_train, X_val, y_train, train_weight, y_val, val_weight = cv
                # Fit model
                model.fit(X_train, y_train, sample_weight=train_weight)
                # NOTE(review): the tuning-metric widget in this notebook offers
                # "pr_auc", "accuracy" and "loss"; none equals 'map', so with
                # those values the accuracy branch below is the one that runs
                if metric=='map':
                  y_train_pred = model.predict_proba(X_train)[:,1]
                  y_val_pred = model.predict_proba(X_val)[:,1]
                  train_score = average_precision_score(y_train, y_train_pred, sample_weight=train_weight)
                  val_score = average_precision_score(y_val, y_val_pred, sample_weight=val_weight)
                elif metric=='mean_squared_error':
                  y_train_pred = model.predict(X_train)
                  y_val_pred = model.predict(X_val)
                  train_score = mse(y_train, y_train_pred, sample_weight=train_weight)
                  val_score = mse(y_val, y_val_pred, sample_weight=val_weight)
                else:
                  y_train_pred = model.predict(X_train)
                  y_val_pred = model.predict(X_val)
                  train_score = accuracy_score(y_train, y_train_pred, sample_weight=train_weight)
                  val_score = accuracy_score(y_val, y_val_pred, sample_weight=val_weight)
                overfit_score = val_score + (overfit_penalty * (val_score - train_score))
                kf_train_score.append(train_score)
                kf_val_score.append(val_score)
                kf_overfit_score.append(overfit_score)
              # get mean scores
              data['mean_score'].append(np.mean(kf_val_score))
              data['mean_train_score'].append(np.mean(kf_train_score))
              data['mean_ovfit_score'].append(np.mean(kf_overfit_score))
              # A missing penalty is stored as the string 'None'
              # NOTE(review): that string is later read back and passed to
              # LogisticRegression(penalty=...) for the refit -- confirm the
              # installed scikit-learn accepts it
              if penalty==None:
                data['penalty'].append('None')
              else:
                data['penalty'].append(penalty)
              data['l1_ratio'].append(l1_ratio)
              data['C'].append(C)
              data['solver'].append(solver)
            # NOTE(review): same silent swallow as in the regression branch
            except:
              pass
  data = pd.DataFrame(data)
  # NOTE(review): ascending order puts the smallest score first, which suits
  # error metrics; for accuracy / average precision (higher is better) the
  # first row would be the lowest-scoring combination -- confirm intent
  data.sort_values('mean_ovfit_score', ascending=True, inplace=True)
  display(data)

  # refit model on all train data with best model parameters
  if prediction_type=='Regression':
    fig = px.scatter(data, x='alpha', y='mean_ovfit_score', color='solver',
                title='Computed Score based on Alpha and solver')
    fig.show()
    alpha, solver = data['alpha'].iloc[0], data['solver'].iloc[0]
    # alpha, solver = data['alpha'].iloc[1], data['solver'].iloc[1]
    print(alpha, solver)
    model = Ridge(alpha=alpha, solver=solver, random_state=42)
  else:
    fig = px.scatter(data, x='C', y='mean_ovfit_score', symbol='penalty', color='solver',
                title='Computed Score based on C, penalty and solver')
    fig.show()
    C, solver, penalty, l1_ratio = data['C'].iloc[0], data['solver'].iloc[0], data['penalty'].iloc[0], data['l1_ratio'].iloc[0]
    model = LogisticRegression(penalty=penalty, l1_ratio=l1_ratio, C=C, solver=solver)
  # Fit model and make predictions
  # y[:,0] is the rating column and y[:,1] the weight column of the input array
  model.fit(X, y[:,0], sample_weight=y[:,1])
  y_train_pred = model.predict(train_data[:,:-3])
  y_val_pred = model.predict(val_data[:,:-3])
  y_test_pred = model.predict(test_data[:,:-3])
  # NOTE(review): the 'map', 'huber' and accuracy branches below pass column -2
  # as both the target and the sample weight, whereas the model was fitted
  # with column -3 as target; only the MSE branch uses column -3. The 'map'
  # branch also feeds [:,:-2] (one more column than the model was fitted on)
  # to predict_proba and passes its full output. Confirm before relying on
  # these scores.
  if metric=='map':
    y_train_pred = model.predict_proba(train_data[:,:-2])
    y_val_pred = model.predict_proba(val_data[:,:-2])
    y_test_pred = model.predict_proba(test_data[:,:-2])
    train_score = average_precision_score(train_data[:,-2], y_train_pred, sample_weight=train_data[:,-2])
    val_score = average_precision_score(val_data[:,-2], y_val_pred, sample_weight=val_data[:,-2])
    test_score = average_precision_score(test_data[:,-2], y_test_pred, sample_weight=test_data[:,-2])
  elif metric=='mean_squared_error':
    train_score = mse(train_data[:,-3].reshape(-1,1), y_train_pred.reshape(-1,1), sample_weight=train_data[:,-2].reshape(-1,1))
    # NOTE(review): y_val_pred is not reshaped here, unlike the train and test lines
    val_score = mse(val_data[:,-3].reshape(-1,1), y_val_pred, sample_weight=val_data[:,-2].reshape(-1,1))
    test_score = mse(test_data[:,-3].reshape(-1,1), y_test_pred.reshape(-1,1), sample_weight=test_data[:,-2].reshape(-1,1))
  elif metric=='huber':
    train_score = huber(np.array(train_data[:,-2]).reshape(-1,1), np.array(y_train_pred).reshape(-1,1), sample_weight=np.array(train_data[:,-2]).reshape(-1,1)).numpy()
    val_score = huber(np.array(val_data[:,-2]).reshape(-1,1), np.array(y_val_pred).reshape(-1,1), sample_weight=np.array(val_data[:,-2]).reshape(-1,1)).numpy()
    test_score = huber(np.array(test_data[:,-2]).reshape(-1,1), np.array(y_test_pred).reshape(-1,1), sample_weight=np.array(test_data[:,-2]).reshape(-1,1)).numpy()
  else:
    train_score = accuracy_score(train_data[:,-2], y_train_pred, sample_weight=train_data[:,-2])
    val_score = accuracy_score(val_data[:,-2], y_val_pred, sample_weight=val_data[:,-2])
    test_score = accuracy_score(test_data[:,-2], y_test_pred, sample_weight=test_data[:,-2])
  scores = pd.DataFrame({'Data':['Train', 'Val', 'Test'], metric:[train_score, val_score, test_score]})
  display(scores)
  # Predict every row (train + val + test) for the rating-vs-prediction table
  all_data = np.concatenate((train_data, val_data, test_data), axis=0)
  y_pred = model.predict(all_data[:,:-3])
  all_data_df = pd.DataFrame({'Rating': all_data[:,-3], 'Pred_Rating': y_pred})
  # Ratings become strings so plotly colours them as discrete categories
  all_data_df['Rating'] = all_data_df['Rating'].astype(str)
  fig = px.scatter(all_data_df, x=all_data_df.index, y='Pred_Rating', color='Rating',
                  title='Predicted ratings from overall model vs Actual Rating in Color')
  fig.update_layout(height=1000)
  # The figure is built but not shown (the show() call is commented out)
  # fig.show()
  return model, data, all_data_df


def generate_batches(files, batch_size, tune=tune):
    '''Endlessly yield (input, output, weight) batches from the .npy parts.
    Part k is always loaded from INPUT_DATA_DIR as '<tune>_dataset_p<k>.npy';
    `files` only determines how many parts are cycled through.
    Input:
      files (list): one entry per dataset part
      batch_size (int): maximum number of rows per yielded batch
      tune (string): dataset name; selects the column layout of each part
    Output:
      generator of (input, output, weight) slices, restarting at part 1 after
      the last part
    '''
    segment_tunes = ('sgm_loud', 'sgm_timbre', 'sgm_pitch', 'sct_data')
    part_idx = 0
    while True:
        # The looked-up entry itself is unused; the lookup only fails when
        # `files` has no entry at this position
        fname = files[part_idx]

        part_path = os.path.join(INPUT_DATA_DIR, f'{tune}_dataset_p{part_idx+1}.npy')
        frame = np.load(part_path, allow_pickle=True)
        part_idx = (part_idx + 1) % len(files)

        # Split the part into inputs, targets and sample weights
        if tune in segment_tunes:
            inputs = np.array([np.array(row) for row in frame[:, 0]])
            targets = frame[:, -2].astype(np.float64).reshape(frame[:, -2].shape[0], 1)
            weights = frame[:, -1].astype(np.float64).reshape(frame[:, -1].shape[0], 1)
        elif tune == 'general':
            inputs = np.delete(frame, (3, 4), 1)
            targets = frame[:, 3].reshape(frame[:, 3].shape[0], 1)
            weights = frame[:, 4].reshape(frame[:, 4].shape[0], 1)
        elif tune == 'genre':
            inputs = frame[:, :-2]
            targets = frame[:, -2].reshape(frame[:, -2].shape[0], 1)
            weights = frame[:, -1].reshape(frame[:, -1].shape[0], 1)

        for start in range(0, inputs.shape[0], batch_size):
            stop = start + batch_size
            yield inputs[start:stop], targets[start:stop], weights[start:stop]


def process_feat_list(sample):
  '''Parse a JSON-encoded list string (e.g. "[1, 2]") into a Python list'''
  parsed = json.loads(sample)
  return parsed


def split_y(y_var, rating_key=rating_key):
  '''Split rows of (rating, weight) pairs into two arrays.
  Input:
    y_var (iterable): rows whose item 0 is the rating and item 1 the weight
    rating_key: accepted but not used by this function
  Output:
    y_out (array): item 0 of every row
    y_weight (array): item 1 of every row
  '''
  pairs = [(row[0], row[1]) for row in y_var]
  y_out = np.array([rating for rating, _ in pairs])
  y_weight = np.array([weight for _, weight in pairs])
  return y_out, y_weight


def process_component(data, max_length, col_id_skip=None, shift_param=0.00001):
  '''Flatten nested per-song data into a fixed-width, normalized dataframe.
  Input:
    data (list): one entry per song; each song is a list of bars and each bar
    a list of values
    max_length (int): number of columns; flattened rows are zero-padded or
    trimmed to this length
    col_id_skip (list): within-bar indices whose columns are left as raw values
    shift_param (float): added to a column before the Box-Cox transform
  Output:
    df (DataFrame): columns comp_0 ... comp_<max_length-1>; every column that
    is not skipped is Box-Cox transformed and min-max normalized. A column
    whose transform raises is reported and left unchanged.
  '''
  skip_ids = set(col_id_skip) if col_id_skip else set()
  skip_cols = set()
  out = []
  for row in data:  # Loop through song
    row_data = []
    for item in row:  # loop through bar
      for i, subitem in enumerate(item):  # loop through elements in bar
        if i in skip_ids:
          # The column name is the element's position inside this flattened
          # row. Previously a counter that kept growing across songs was used
          # and each name was wrapped in a list, so the later membership test
          # never matched and no column was ever skipped.
          skip_cols.add(f'comp_{len(row_data)}')
        row_data.append(subitem)
    # Pad with zeros or trim so every row has exactly max_length values
    missing = max_length - len(row_data)
    if missing > 0:
      row_data.extend([0] * missing)
    else:
      row_data = row_data[:max_length]
    out.append(row_data)

  column_names = [f'comp_{i}' for i in range(max_length)]
  df = pd.DataFrame(data=out, columns=column_names)
  # Apply boxcox normalization to each column
  for col in df.columns:
    if col in skip_cols:
      continue
    try:
      df[col] = normalize(stats.boxcox(df[col]+shift_param)[0])
    except Exception:
      # Best effort: leave the column as it is. Exception (rather than a bare
      # except) keeps KeyboardInterrupt / SystemExit propagating.
      print('data values are negative, applying normalization first')
      print(col)

  return df


def normalize(column, negative=False):
    '''Min-max scale a column, or fold negative values when negative=True.
    Input:
      column: array-like with .max()/.min() (default mode), or a nested
      song -> segment -> item structure (negative mode)
      negative (bool): if True, rewrite the nested structure in place:
      negative items become their absolute value, all other items become
      (item + 0.00001) * 2
    Output:
      negative mode: the same (mutated) column object
      default mode: list of (value - min) / (max - min)
    '''
    if negative:
      for i, song in enumerate(column):
        for j, segment in enumerate(song):
          for k, item in enumerate(segment):
            if item < 0:
              column[i][j][k] = -1 * item
            else:
              column[i][j][k] = (item+0.00001)  * 2
      return column
    upper = column.max()
    lower = column.min()
    # NOTE(review): a constant column makes upper == lower and divides by zero
    y = (column - lower)/(upper-lower)
    return y.tolist()


########################

def chunks(lst, n):
    """Yield successive n-sized chunks from lst
    Inputs:
      lst (list): sequence of items to be split
      n (int): maximum number of items per chunk
    Output:
      generator of consecutive slices of lst, each of length n except possibly
      the last, which holds the remainder
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)


def normalize_data(data, use_max=None, use_min=None, sav_max=False):
  ''' Normalizes data to (data - min) / (max - min), i.e. 0-1 when the bounds
      come from the data itself.
  Input:
    data (list or array): data points; lists and tuples are converted to a
    float array first
    use_max (float): if given, used as the maximum instead of the data maximum
    use_min (float): if given, used as the minimum instead of the data minimum
    sav_max (bool): if True, also return the maximum and minimum that were used
  Output:
    norm_data (array): normalized data
    data_max (float): maximum used, only returned when sav_max is True
    data_min (float): minimum used, only returned when sav_max is True
    '''
  # Plain sequences do not support the arithmetic below
  if isinstance(data, (list, tuple)):
    data = np.asarray(data, dtype=float)
  # Compare with None so a bound of 0 is honoured, and fall back to the data
  # for whichever bound is missing instead of subtracting None
  data_max = np.max(data) if use_max is None else use_max
  data_min = np.min(data) if use_min is None else use_min
  norm_data = (data - data_min) / (data_max - data_min)
  if sav_max:
    return norm_data, data_max, data_min
  return norm_data



def rand_sel(source_list, length=None):
  '''Draws random items (with replacement) from a sample list.
     The number of items can be given or is chosen at random. For spotify's
     recommend function the max seed length is 5, so a length above 5 will
     break that call.
  Inputs:
    source_list (list): list of items to be randomly queried from
    length (int): number of items to draw; when omitted, a random integer from
    0 up to (but not including) len(source_list) is used
  Outputs:
    output (list): the randomly drawn item/s'''
  if length is None:
    length = np.random.randint(len(source_list))
  drawn = np.random.choice(source_list, (1, length))
  return list(drawn[0])


def get_date_float(input_str):
  '''Extract year and month from a release-date string.
     Strings longer than 5 characters are parsed as YYYY-MM-DD, shorter ones as
     YYYY. If the string cannot be parsed, (2021, 1) is returned.
  Input:
    input_str (string): date string
  Output:
    year (int): year specified in input_str
    month (int): month specified in input_str (1 for year-only strings)
    '''
  # Import the class locally: the file runs `import datetime` after
  # `from datetime import datetime`, so the module-level name refers to the
  # module, `datetime.strptime` raised AttributeError, and the old bare except
  # turned every input into the (2021, 1) fallback.
  from datetime import datetime as _datetime

  if len(input_str) > 5:
    try:
      parsed = _datetime.strptime(input_str, '%Y-%m-%d')
    except (ValueError, TypeError):
      return 2021, 1
    return parsed.year, parsed.month
  try:
    parsed = _datetime.strptime(input_str, '%Y')
  except (ValueError, TypeError):
    return 2021, 1
  return parsed.year, 1


def man_ord_encode(item):
  '''Manually encode an album type as a value between 0 and 1
  Input:
    item (string): album type name (case-insensitive)
  Output:
    out (float): 0.33 for 'single', 0.66 for 'album', 0.99 for 'compilation',
    and 0 for anything else
    '''
  album_codes = {'single': 0.33, 'album': 0.66, 'compilation': 0.99}
  return album_codes.get(item.lower(), 0)




def rec_batch_size(n_rec):
  '''Splits a desired number of recommendations into per-call limits of at
     most 100 each (the per-call limit the Spotify recommendation API is used
     with here).
  Input:
    n_rec (int): number of desired recommendations
  Output:
    out (list): per-call limits; every entry is 100 except the last, which
    holds the remainder. A value of 100 or less is returned as [n_rec].
  Example:
    n_rec = 230, output = [100, 100, 30]
    '''
  if n_rec <= 100:
    return [n_rec]
  n_calls = math.ceil(n_rec / 100)
  full_calls = n_calls - 1
  return [100] * full_calls + [n_rec - full_calls * 100]



def exponential_decay(lr0, s):
    '''Build a schedule function: rate(epoch) = lr0 * 0.1 ** (epoch / s).
    Input:
      lr0 (float): rate at epoch 0
      s (number): number of epochs over which the rate drops by a factor of 10
    Output:
      function taking an epoch, printing the rate for it and returning it
    '''
    def exponential_decay_fn(epoch):
        rate = lr0 * 0.1**(epoch / s)
        print(rate)
        return rate
    return exponential_decay_fn


class MyHyperModel(kt.HyperModel):
  '''Keras Tuner hypermodel: a tunable Dense block feeding one tanh output unit'''

  def build(self, hp):
    '''Create and compile a model from the sampled hyperparameters.
    Input:
      hp: keras-tuner hyperparameter container
    Output:
      model: compiled model taking inputs of shape INPUT_DIM, trained with the
      'squared_hinge' loss and weighted accuracy / PR-AUC metrics
    '''
    # Search space (hyperparameters are registered in this order)
    optimizer = hp.Choice('optimizer', values=['adam', 'nadam', 'adamax', 'rmsprop',])
                                              #  'nesterov'])
    # n_layers = hp.Int('n_layers', min_value=13, max_value=16, step=1)
    n_layers = 1
    n_units = 1
    reg_type = hp.Choice('reg_type', values=['l1', 'l2', 'none'])
    max_norm = hp.Int('max_norm', min_value=1, max_value=10000, step=10)
    activation = hp.Choice('activation', values=['relu', 'selu', 'elu'])
    p_drop = hp.Float('p_drop', min_value=0.0, max_value=0.9, step=0.01)
    batch_norm = hp.Choice('batch_norm', values=[0, 1])
    reg_max = hp.Float('reg_max', min_value=0.0001, max_value=0.4, step=0.0001)

    # Nadam gets a constant learning rate; every other optimizer gets an
    # exponentially decaying schedule
    if optimizer == 'nadam':
      with hp.conditional_scope("optimizer", ["nadam"]):
        constant_lr = hp.Choice("constant_learning_rate", values=[1E-2, 1E-3, 1e-4, 3e-4, 4e-4, 5e-4, 1e-5])
        opt = tf.keras.optimizers.Nadam(learning_rate=constant_lr)
    else:
      with hp.conditional_scope("optimizer", ["adam", "adamax", 'rmsprop', 'nesterov']):
        lr = tf.keras.optimizers.schedules.ExponentialDecay(
          initial_learning_rate=hp.Choice("initial_learning_rate", values=[1E-2, 1E-3, 1e-4, 3e-4, 4e-4, 5e-4]),
          decay_steps=hp.Int('decay_steps', min_value=100, max_value=5000, step=10),
          decay_rate=hp.Float('decay_rate', min_value=0.3, max_value=1, step=0.01)
          )
      scheduled_optimizers = {
        'adam': tf.keras.optimizers.Adam,
        'adamax': tf.keras.optimizers.Adamax,
        'rmsprop': tf.keras.optimizers.RMSprop,
      }
      if optimizer in scheduled_optimizers:
        opt = scheduled_optimizers[optimizer](learning_rate=lr)
      elif optimizer == 'nesterov':
        opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9, nesterov=True)

    # Dense layers drop their bias when batch normalization is switched on
    use_bias = batch_norm != 1

    # selu is paired with lecun_normal, the other activations with he_normal
    kernel_initializer = 'lecun_normal' if activation == 'selu' else 'he_normal'

    # Weight regularizer ('none' leaves the kernel unregularized)
    regularizer_factory = {'l1': l1, 'l2': l2}.get(reg_type)
    kernel_regularizer = regularizer_factory(reg_max) if regularizer_factory else None

    # Weight value constraint
    kernel_constraint = MaxNorm(max_value=max_norm)

    # Main model
    input_main = tf.keras.layers.Input(shape=INPUT_DIM)
    x = input_main
    if batch_norm == 1:
      x = BatchNormalization()(x)

    dense_options = dict(kernel_regularizer=kernel_regularizer,
                         kernel_constraint=kernel_constraint,
                         kernel_initializer=kernel_initializer,
                         use_bias=use_bias)
    for layer_idx in range(n_layers):
      # Only the first layer's width is tuned; later layers use n_units
      if layer_idx == 0:
        units = hp.Int('n_layer_1', min_value=10, max_value=5000, step=5)
      else:
        units = n_units
      x = Dense(units, **dense_options)(x)
      if batch_norm == 1:
        x = BatchNormalization()(x)
      x = Activation(activation)(x)
      dropout_layer = AlphaDropout if activation == 'selu' else Dropout
      x = dropout_layer(p_drop)(x)

    pr_auc = tf.keras.metrics.AUC(num_thresholds=1000, curve="PR", name='pr_auc', from_logits=True)
    output = Dense(1, activation='tanh')(x)

    # Create model
    model = tf.keras.Model(inputs=[input_main], outputs=[output])
    model.compile(loss='squared_hinge', optimizer=opt,
                  # metrics=['accuracy', pr_auc],
                  weighted_metrics=['accuracy', pr_auc])
    return model

  def fit(self, hp, model, *args, **kwargs):
      '''Fit the model with a tunable batch size (10-180, step 2)'''
      batch_size = hp.Int("batch_size", min_value=10, max_value=180, step=2)
      return model.fit(*args, batch_size=batch_size, **kwargs)


def vis_preds(data, data_vec, y_data, models, fig_title, pred_songs=False):
  '''Plot a histogram of model predictions and show it.

  Args:
    data: feature array passed to the model(s) as the first input.
    data_vec: vector input passed to the model(s) as the second input.
    y_data: ratings stored next to the predictions (only read when
      ``pred_songs`` is falsy).
    models: sequence of models; more than one means their predictions are
      averaged with ``get_avg_predict``.
    fig_title: title of the plotly figure.
    pred_songs: when truthy, plot the raw predictions only; otherwise build
      a frame with ``rating`` / ``pred_rating`` and colour by rating.

  Returns:
    The result of ``fig.show()``.

  Note: reads the module-level ``df`` (for column names) and, when
  ``pred_songs`` is truthy, drops ``rating`` from the module-level
  ``df_test`` in place.
  '''
  if pred_songs and 'rating' in df_test.columns:
    df_test.drop(columns='rating', inplace=True)

  # One model predicts directly; several models are averaged.
  if len(models)>1:
    predictions = get_avg_predict(models, data, data_vec)
  else:
    predictions = models[0].predict(x=(data, data_vec))

  if pred_songs:
    fig = px.histogram(predictions, nbins=200, title=fig_title)
  else:
    # Feature columns are every column of the global frame except the
    # target and the genre-vector column.
    feature_cols = list(df.columns)
    for dropped in ('rating', 'genres'):
      feature_cols.remove(dropped)
    rated = pd.DataFrame(data=data, columns=feature_cols)
    rated['rating'] = y_data
    rated['pred_rating'] = predictions
    rated.sort_values('rating', ascending=True, inplace=True)
    fig = px.histogram(rated, x='pred_rating', color='rating', opacity=0.75,
                       barmode='overlay', nbins=200, title=fig_title)
  return fig.show()


def get_avg_predict(models, data, data_vec):
  '''Return the element-wise average of the predictions of ``models``.

  Each model is called as ``model.predict(x=(data, data_vec))``.

  Args:
    models: non-empty sequence of objects exposing ``predict``.
    data: first model input.
    data_vec: second model input.

  Returns:
    The sum of all predictions divided by ``len(models)``.

  Raises:
    ValueError: if ``models`` is empty.
  '''
  if not models:
    raise ValueError('get_avg_predict needs at least one model')
  total = None
  for model in models:
    prediction = model.predict(x=(data, data_vec))
    # Out-of-place addition: never write into an array that the model
    # (or its caller) may still hold a reference to.
    total = prediction if total is None else total + prediction
  return total / len(models)


def calc_pred_thresh(models, percen_split=99):
  '''Return the ``percen_split``-th percentile of the model predictions.

  Predictions are computed on the module-level ``recs`` / ``df_vec_test``
  inputs; with more than one model the predictions are averaged via
  ``get_avg_predict``. As a side effect the ``rating`` column is dropped in
  place from the module-level ``df_test`` when present.
  '''
  if 'rating' in df_test.columns:
    df_test.drop(columns='rating', inplace=True)

  if len(models)>1:
    scores = get_avg_predict(models, recs, df_vec_test)
  else:
    only_model = models[0]
    scores = only_model.predict(x=(recs, df_vec_test))

  return np.percentile(scores, percen_split)



def save_model(model):
  '''Save ``model``, zip the saved directory and trigger a browser download.

  Uses IPython shell magic (``!zip``) and ``google.colab.files``, so it can
  only run inside a Colab/IPython session.

  NOTE(review): the model is written to the relative path
  ``saved_model/saved_model`` while the archive is built from the absolute
  ``/content/saved_model`` -- presumably the working directory is
  ``/content``; confirm.
  '''
  model.save(f'saved_model/saved_model')
  # Shell magic: archive the saved-model directory into /content/saved_model.zip.
  !zip -r /content/saved_model.zip  /content/saved_model
  # Function-scope import: google.colab is only needed for the download step.
  from google.colab import files
  files.download("/content/saved_model.zip")
  print('Model saved and Downloaded')

def load_model(file_path):
  '''Extract a zipped saved model and load it with Keras.

  Args:
    file_path: path to the zip archive. It is extracted into the current
      working directory and must contain ``content/saved_model/saved_model``.

  Returns:
    The model returned by ``tf.keras.models.load_model``.
  '''
  # The context manager closes the archive even if extraction fails.
  with zipfile.ZipFile(file_path, 'r') as archive:
    archive.extractall()
  return tf.keras.models.load_model('content/saved_model/saved_model')


def data_splitter_for_model(data, col_idx=5):
  '''Split tabular data into a feature array and a vector array.

  Args:
    data: a numpy array (or subclass), or any object exposing
      ``to_numpy()`` such as a DataFrame. The column at ``col_idx`` holds
      one numeric sequence per row.
    col_idx: positional index of the vector column.

  Returns:
    ``(features, vectors)``, both ``float32`` arrays: ``features`` is
    ``data`` without column ``col_idx`` and ``vectors`` stacks that
    column's per-row sequences.
  '''
  # isinstance (not an exact type comparison) so ndarray subclasses are
  # accepted as arrays instead of being asked for ``to_numpy``.
  if not isinstance(data, np.ndarray):
    data = data.to_numpy()
  # Built row by row: the column holds one sequence per cell, so the cells
  # must be stacked into a regular 2-D array before casting.
  data_vec = np.array([row[col_idx] for row in data]).astype('float32')
  features = np.asarray(np.delete(data, col_idx, 1)).astype('float32')
  return features, data_vec


def process_str_list(sample):
  '''Convert the string form of a numeric list into a list of floats.

  Brackets and commas are treated as separators and the rest is split on
  any whitespace (spaces, tabs, newlines), so values on adjacent lines are
  never fused into one token.

  Args:
    sample: text such as ``"[0.1 0.2\\n 0.3]"``.

  Returns:
    The parsed numbers as a list of ``float`` (empty for ``"[]"``).

  Raises:
    ValueError: if a token is not a valid float literal.
  '''
  separators = str.maketrans('[],', '   ')
  return [float(token) for token in sample.translate(separators).split()]


def encode_data(data):
  '''Encode ``data`` with the vector model, then with the encoder model.

  The genre-vector compression step is exactly what ``encode_data_vec``
  does, so it is delegated to that function; its result is then passed to
  ``enc_model.predict``.

  NOTE(review): when ``data`` has no ``rating`` column, the caller's frame
  is modified in place (``genres`` dropped, ``vec_col_nams`` added); when
  it has one, a copy is modified instead.

  Returns:
    The output of ``enc_model.predict``.
  '''
  compressed_frame = encode_data_vec(data)
  return enc_model.predict(compressed_frame)


def encode_data_vec(data):
  '''Replace the ``genres`` column with its ``vec_model`` encoding.

  Steps: drop ``rating`` if present, extract the vector column with
  ``data_splitter_for_model``, run it through ``vec_model.predict``, drop
  ``genres`` and store the encoded values under ``vec_col_nams``.

  NOTE(review): when ``data`` has no ``rating`` column, the caller's frame
  is modified in place; otherwise the copy returned by ``drop`` is modified.

  Returns:
    The frame carrying the encoded vector columns.
  '''
  has_rating = 'rating' in list(data.columns)
  frame = data.drop(columns='rating') if has_rating else data
  genre_vectors = data_splitter_for_model(frame)[1]
  compressed = vec_model.predict(genre_vectors)
  frame.drop(columns=['genres'], inplace=True)
  frame[vec_col_nams] = compressed
  return frame

## Custom Tuner

In [None]:
if tune!='lr_overall':
  class CustomTuner(kt.Tuner):
    """Tuner that loads its datasets per trial and scores a trial itself.

    ``run_trial`` builds the training/validation datasets from the
    module-level ``TRAIN_FILES`` / ``VALIDATION_FILES`` using the trial's
    ``batch_size`` hyperparameter, trains ``executions_per_trial`` models
    and returns the best ``val_{TUNE_METRIC_NAME}`` value seen.
    """

    def _build_and_fit_model(self, trial, *args, **kwargs):
      """Build the trial's model, fit it and return the fit results."""
      trial_hp = trial.hyperparameters
      built_model = self._try_build(trial_hp)
      fit_results = self.hypermodel.fit(trial_hp, built_model, *args, **kwargs)
      tuner_utils.validate_trial_results(
          fit_results, self.oracle.objective, "HyperModel.fit()"
      )
      print('End Execution')
      # Release this frame's reference to the model before returning.
      del built_model
      return fit_results

    def run_trial(self, trial, *args, **kwargs):
      """Run every execution of ``trial`` and return its best score.

      Returns the maximum of the monitored metric when the module-level
      ``regression`` flag equals ``False`` and the minimum when it equals
      ``True``; any other value of the flag yields ``None``.
      """
      # Fresh TF1-style session with on-demand GPU memory growth.
      session_config = tf.compat.v1.ConfigProto(log_device_placement=True)
      session_config.gpu_options.allow_growth = True
      backend.set_session(tf.compat.v1.Session(config=session_config))

      base_callbacks = kwargs.pop("callbacks", [])
      trial_hp = trial.hyperparameters
      kwargs['batch_size'] = trial_hp.Int('batch_size', 10, 80, step=2)
      kwargs['x'] = load_dataset(TRAIN_FILES, trial_hp.get('batch_size'))
      kwargs['validation_data'] = load_dataset(VALIDATION_FILES, trial_hp.get('batch_size'))

      # Run the training process once per execution, each with its own
      # copy of the callbacks.
      histories = []
      for execution in range(self.executions_per_trial):
          run_kwargs = copy.copy(kwargs)
          run_callbacks = self._deepcopy_callbacks(base_callbacks)
          self._configure_tensorboard_dir(run_callbacks, trial, execution)
          run_callbacks.append(tuner_utils.TunerCallback(self, trial))
          run_kwargs["callbacks"] = run_callbacks
          histories.append(self._build_and_fit_model(trial, *args, **run_kwargs))

      # Free backend state and garbage between trials.
      backend.clear_session()
      gc.collect()

      # Best score over all executions, handed back to the tuner.
      if regression==False:
        return max([max(run.history[f'val_{TUNE_METRIC_NAME}']) for run in histories])
      elif regression==True:
        return min([min(run.history[f'val_{TUNE_METRIC_NAME}']) for run in histories])


  class BayesianOptimization(CustomTuner):
      """``CustomTuner`` driven by a ``BayesianOptimizationOracle``.

      The constructor builds the oracle from the search settings, hands it
      to ``CustomTuner`` and keeps a Gaussian-process regressor in
      ``self.gpr``.

      NOTE(review): ``sklearn`` and ``BayesianOptimizationOracle`` are not
      among the imports visible at the top of the file -- confirm they are
      brought into scope elsewhere.
      """

      def __init__(
          self,
          hypermodel=None,
          objective=None,
          max_trials=10,
          num_initial_points=2,
          alpha=1e-4,
          beta=2.8,
          seed=None,
          hyperparameters=None,
          tune_new_entries=True,
          allow_new_entries=True,
          max_retries_per_trial=0,
          max_consecutive_failed_trials=1000,
          **kwargs
      ):
          search_oracle = BayesianOptimizationOracle(
              objective=objective,
              max_trials=max_trials,
              num_initial_points=num_initial_points,
              alpha=alpha,
              beta=beta,
              seed=seed,
              hyperparameters=hyperparameters,
              tune_new_entries=tune_new_entries,
              allow_new_entries=allow_new_entries,
              max_retries_per_trial=max_retries_per_trial,
              max_consecutive_failed_trials=max_consecutive_failed_trials
          )
          super().__init__(oracle=search_oracle, hypermodel=hypermodel, **kwargs)
          if scipy is None:
              raise ImportError(
                  "Please install scipy before using the `BayesianOptimization`."
              )
          self.seed = seed
          self.num_initial_points = num_initial_points
          self.alpha = alpha
          self.beta = beta
          self._random_state = np.random.RandomState(self.seed)
          self.gpr = self._make_gpr()

      def _make_gpr(self):
          """Return a Matern(nu=2.5) Gaussian-process regressor."""
          matern_kernel = sklearn.gaussian_process.kernels.Matern(nu=2.5)
          return sklearn.gaussian_process.GaussianProcessRegressor(
              kernel=matern_kernel,
              n_restarts_optimizer=0,
              normalize_y=True,
              alpha=self.alpha,
              random_state=self.seed,
          )

## Custom Callback

In [None]:
if tune!='lr_overall':
  class LazyModule:
      """Proxy that imports a module the first time it is needed.

      Attributes:
          name: dotted module name handed to ``importlib.import_module``.
          pip_name: package name shown in the install hint (defaults to
              ``name``).
          module: the imported module, or ``None`` until first use.
      """

      def __init__(self, name, pip_name=None):
          self.name = name
          self.pip_name = pip_name or name
          self.module = None
          self._available = None

      @property
      def available(self):
          """Whether the module can be imported (cached after first check)."""
          if self._available is not None:
              return self._available
          try:
              self.initialize()
          except ImportError:
              self._available = False
          else:
              self._available = True
          return self._available

      def initialize(self):
          """Import the module, raising ImportError with an install hint."""
          try:
              self.module = importlib.import_module(self.name)
          except ImportError:
              hint = (
                  f"This requires the {self.name} module. "
                  f"You can install it via `pip install {self.pip_name}`"
              )
              raise ImportError(hint)

      def __getattr__(self, name):
          # Only reached for attributes not set on the proxy itself.
          if name == "_api_export_path":
              raise AttributeError
          if self.module is None:
              self.initialize()
          return getattr(self.module, name)


  # Lazy proxies: each module is imported on first attribute access (or on
  # the first `.available` check), not when this cell runs.
  tensorflow = LazyModule("tensorflow")
  # gfile ships inside the tensorflow package, hence the pip hint.
  gfile = LazyModule("tensorflow.io.gfile", pip_name="tensorflow")
  tensorflow_io = LazyModule("tensorflow_io")
  # NOTE(review): this rebinds the global name `scipy`, imported as a real
  # module at the top of the file, to a proxy -- so a later `scipy is None`
  # check can no longer be true.
  scipy = LazyModule("scipy")

  class CustomStopper(keras.callbacks.EarlyStopping):
      """Early stopping that stays inactive during the first epochs.

      Accepts the usual ``EarlyStopping`` arguments and forwards them to
      the parent class, plus:

      Args:
          start_epoch: the parent's early-stopping logic only runs for
              epochs strictly greater than this index.
      """

      def __init__(self, monitor='val_loss',
               min_delta=0, patience=0, verbose=0, mode='auto', start_epoch = 10, baseline=3, restore_best_weights=True): # add argument for starting epoch
          # Forward every argument: calling the parent without them would
          # silently fall back to its defaults.
          super().__init__(
              monitor=monitor,
              min_delta=min_delta,
              patience=patience,
              verbose=verbose,
              mode=mode,
              baseline=baseline,
              restore_best_weights=restore_best_weights,
          )
          self.start_epoch = start_epoch

      def on_epoch_end(self, epoch, logs=None):
          # Skip the warm-up epochs entirely.
          if epoch > self.start_epoch:
              super().on_epoch_end(epoch, logs)

class CustomStopperCheckpoints(keras.callbacks.EarlyStopping):
  '''Early stopping with two baseline checkpoints before patience applies.

  Schedule (``epoch`` is the index passed to ``on_epoch_end``):
    * ``epoch < start_epoch_1``: only stop if the metric is NaN within
      epochs 0-1.
    * ``epoch == start_epoch_1``: stop unless the metric improves on
      ``baseline_1``.
    * between the two checkpoints: never stop.
    * ``epoch == start_epoch_2``: stop unless the metric improves on
      ``baseline_2``.
    * ``epoch > start_epoch_2``: patience-based early stopping on the best
      value seen in this phase.

  ``monitor``, ``min_delta``, ``patience``, ``verbose``, ``mode`` and
  ``restore_best_weights`` have their ``EarlyStopping`` meaning.
  '''
  def __init__(self, monitor=f'val_{TUNE_METRIC_NAME}',
                 min_delta=0.01, patience=10, verbose=0, mode='auto', start_epoch_1=10,
                 baseline_1=13, start_epoch_2=25, baseline_2=11, restore_best_weights=True): # add argument for starting epoch
      # Forward the settings (including ``mode``) so the parent resolves
      # the comparison direction from what the caller asked for.
      super().__init__(monitor=monitor, min_delta=min_delta, patience=patience,
                       verbose=verbose, mode=mode,
                       restore_best_weights=restore_best_weights)
      self.start_epoch_1 = start_epoch_1
      self.baseline_1 = baseline_1
      self.start_epoch_2 = start_epoch_2
      self.baseline_2 = baseline_2

  def on_epoch_end(self, epoch, logs=None):
    if self.monitor_op is None:
      self._set_monitor_op()
    current = self.get_monitor_value(logs)
    if current is None:
      # Metric missing from the logs: nothing to judge this epoch.
      return
    if epoch < self.start_epoch_1:
      # Warm-up: abort only when training has already produced NaN.
      if np.isnan(current) and epoch<=1:
        self.stopped_epoch = epoch
        self.model.stop_training = True
      return

    if epoch==self.start_epoch_1 and not self._is_improvement(current, self.baseline_1):
      # First checkpoint missed.
      self.stopped_epoch = epoch
      self.model.stop_training = True
    elif epoch > self.start_epoch_1 and epoch < self.start_epoch_2:
      return
    elif epoch==self.start_epoch_2 and not self._is_improvement(current, self.baseline_2):
      # Second checkpoint missed.
      self.stopped_epoch = epoch
      self.model.stop_training = True
    elif epoch > self.start_epoch_2:
      if self.restore_best_weights and self.best_weights is None:
          # No best weights recorded yet: the current ones are the best.
          self.best_weights = self.model.get_weights()
          self.best_epoch = epoch
      self.wait += 1
      if self._is_improvement(current, self.best):
          self.best = current
          self.best_epoch = epoch
          if self.restore_best_weights:
              self.best_weights = self.model.get_weights()
          self.wait = 0
          return

      if self.wait >= self.patience and epoch > 0:
          # Patience has been exceeded: stop training
          self.stopped_epoch = epoch
          self.model.stop_training = True

In [None]:
if tune!='lr_overall':
  # Custom Model Saving Callback
  def is_remote_path(filepath):
      """Return True when ``filepath`` looks like a remote location.

      A path counts as remote when its string form starts with ``/cns``,
      ``/cfs``, ``/gcs`` or ``/hdfs``, or contains a ``://`` scheme
      separator.
      """
      remote_pattern = r"^(/cns|/cfs|/gcs|/hdfs|.*://).*$"
      return re.match(remote_pattern, str(filepath)) is not None

  def _raise_if_no_gfile(path):
      """Raise ValueError explaining that ``path`` needs TensorFlow's gfile."""
      message = (
          "Handling remote paths requires installing TensorFlow "
          f"(in order to use gfile). Received path: {path}"
      )
      raise ValueError(message)

  def path_to_string(path):
      """Return ``os.fspath(path)`` for path-like objects, else ``path`` unchanged."""
      return os.fspath(path) if isinstance(path, os.PathLike) else path

  def exists(path):
      """Return whether ``path`` exists, using gfile for remote paths.

      Raises:
          ValueError: for a remote path when gfile cannot be imported.
      """
      if is_remote_path(path):
          if not gfile.available:
              _raise_if_no_gfile(path)
          else:
              return gfile.exists(path)
      return os.path.exists(path)

  def makedirs(path):
      """Create directory ``path``, using gfile for remote paths.

      Raises:
          ValueError: for a remote path when gfile cannot be imported.
      """
      if is_remote_path(path):
          if not gfile.available:
              _raise_if_no_gfile(path)
          else:
              return gfile.makedirs(path)
      return os.makedirs(path)

  class OverallModelCheckpoint(ModelCheckpoint):
      """Checkpoint callback that tracks one best score across all trials.

      The best monitored value lives in the module-level ``BEST_SCORE`` so
      it survives across callback instances. With ``save_best_only=True``
      a model is written to ``<base_dir>_<score>.keras`` whenever the
      monitored value beats ``BEST_SCORE``, and the file of the previous
      best is removed afterwards.

      Args:
          filepath: only passed (with ``.keras`` appended) to the parent
              constructor; the files actually written are derived from
              ``self.base_dir``.
          base_dir: NOTE(review): currently not used -- ``self.base_dir``
              is taken from the module-level ``checkpoint_dir``; confirm
              whether this parameter should take precedence.
          monitor: key looked up in the epoch ``logs``.
          verbose: 0 silent, >0 report saves, >=2 also report epochs
              without improvement.
          save_best_only: save only when the monitored value improves.
          save_weights_only: use ``model.save_weights`` instead of
              ``model.save``.
          mode: ``"min"``, ``"max"`` or ``"auto"`` (maximise when the
              monitor contains ``"acc"`` or starts with ``"fmeasure"``,
              minimise otherwise).
          save_freq: ``"epoch"`` or an integer.
          initial_value_threshold: accepted for signature compatibility;
              not used by this class.
      """

      def __init__(
          self,
          filepath,
          base_dir=checkpoint_dir,
          monitor=f"val_{TUNE_METRIC_NAME}",
          verbose=0,
          save_best_only=False,
          save_weights_only=False,
          mode="auto",
          save_freq="epoch",
          initial_value_threshold=None,
      ):
          global BEST_SCORE
          # The parent is initialised with fixed settings; the attributes
          # that matter are overwritten right below.
          super().__init__(
              filepath + '.keras',
              monitor=f"val_{TUNE_METRIC_NAME}",
              verbose=0,
              save_best_only=False,
              save_weights_only=False,
              mode="auto",
              save_freq="epoch",
              initial_value_threshold=None,
          )
          self.monitor = monitor
          self.verbose = verbose
          self.base_dir = checkpoint_dir
          self.filepath = self._score_path(BEST_SCORE)
          self.save_best_only = save_best_only
          self.save_weights_only = save_weights_only
          self.save_freq = save_freq
          self._batches_seen_since_last_saving = 0
          self._last_batch_seen = 0

          if mode not in ["auto", "min", "max"]:
              warnings.warn(
                  f"ModelCheckpoint mode '{mode}' is unknown, "
                  "fallback to auto mode.",
                  stacklevel=2,
              )
              mode = "auto"

          if mode == "auto":
              maximize = "acc" in self.monitor or self.monitor.startswith("fmeasure")
          else:
              maximize = mode == "max"
          self.monitor_op = np.greater if maximize else np.less
          # Seed the shared best score only once; later instances reuse it.
          # (``np.inf``: the ``np.Inf`` alias no longer exists in NumPy 2.)
          if BEST_SCORE is None:
              BEST_SCORE = -np.inf if maximize else np.inf

          if self.save_freq != "epoch" and not isinstance(self.save_freq, int):
              raise ValueError(
                  f"Unrecognized save_freq: {self.save_freq}. "
                  "Expected save_freq are 'epoch' or integer values"
              )

      def _score_path(self, score):
          """Return the checkpoint path that encodes ``score`` in its name."""
          return path_to_string(self.base_dir) + f'_{score}.keras'

      def _write_model(self):
          """Write the model (or only its weights) to ``self.filepath``."""
          if self.save_weights_only:
              self.model.save_weights(self.filepath, overwrite=True)
          else:
              self.model.save(self.filepath, overwrite=True)

      def _save_model(self, epoch, batch, logs):
          """Saves the model.

          Args:
              epoch: the epoch this iteration is in.
              batch: the batch this iteration is in. `None` if the `save_freq`
                  is set to `"epoch"`.
              logs: the `logs` dict passed in to `on_batch_end` or `on_epoch_end`.
          """
          global BEST_SCORE
          logs = logs or {}

          filepath = self._get_file_path(epoch, batch, logs)
          dirname = os.path.dirname(filepath)
          if dirname and not exists(dirname):
              makedirs(dirname)

          try:
              if self.save_best_only:
                  current = logs.get(self.monitor)
                  if current is None:
                      warnings.warn(
                          f"Can save best model only with {self.monitor} "
                          "available, skipping.",
                          stacklevel=2)
                  elif self.monitor_op(current, BEST_SCORE):
                      previous_path = self._score_path(BEST_SCORE)
                      new_path = self._score_path(current)
                      if self.verbose > 0:
                          io_utils.print_msg(
                              f"\nEpoch {epoch + 1}: {self.monitor} "
                              "improved "
                              f"from {BEST_SCORE:.5f} to {current:.5f}, "
                              f"saving model to {new_path}"
                          )
                      # Write the new best first, so a failed save never
                      # leaves the run without any checkpoint on disk.
                      self.filepath = new_path
                      self._write_model()
                      BEST_SCORE = current
                      try:
                          os.remove(previous_path)
                      except FileNotFoundError:
                          # Nothing saved under the previous score (e.g.
                          # the first improvement): nothing to clean up.
                          pass
                      except OSError:
                          print(f'failed to remove: {previous_path}')
                  elif self.verbose >= 2:
                      io_utils.print_msg(
                          f"\nEpoch {epoch + 1}: "
                          f"{self.monitor} did not improve "
                          f"from {BEST_SCORE:.5f}"
                      )
              else:
                  if self.verbose > 0:
                      io_utils.print_msg(
                          f"\nEpoch {epoch + 1}: saving model to {self.filepath}"
                      )
                  self._write_model()
          except IsADirectoryError:  # h5py 3.x
              raise IOError(
                  "Please specify a non-directory filepath for "
                  "ModelCheckpoint. Filepath used is an existing "
                  f"directory: {self.filepath}"
              )
          except IOError as e:  # h5py 2.x
              if "is a directory" in str(e.args[0]).lower():
                  raise IOError(
                      "Please specify a non-directory filepath for "
                      "ModelCheckpoint. Filepath used is an existing "
                      f"directory: {self.filepath}"
                  )
              # Re-throw the error for any other causes.
              raise

## Time Hypermodel

In [None]:
if tune_mode=='other':
  class MyHyperModel(kt.HyperModel):
      '''Hypermodel for a Conv1D -> bidirectional LSTM -> Dense network.

      Args:
        loss_metric: loss (also used as the weighted metric) in regression mode.
        regression: ``True`` compiles a single linear output with
          ``loss_metric``; ``False`` compiles a tanh output with squared
          hinge loss, accuracy and PR-AUC.
      '''
      def __init__(self, loss_metric, regression):
        self.loss_metric = loss_metric
        self.regression = regression

      def build(self, hp):
        '''Build and compile a model from the hyperparameters in ``hp``.

        Hyperparameters are registered in a fixed order; ``dense_size`` is
        registered although nothing below consumes it, so the search space
        stays as it was.
        '''
        optimizer = hp.Choice('optimizer', values=['adam', 'nadam', 'adamax', 'rmsprop','adafactor', 'lion'])
        reg_type = hp.Choice('reg_type', values=['l1', 'l2', 'l1l2', 'None'])
        max_norm = hp.Int('max_norm', min_value=1, max_value=10000, step=10)
        p_drop = hp.Float('p_drop', min_value=0.0, max_value=0.6, step=0.01)
        batch_norm = hp.Choice('batch_norm', values=[0, 1])
        activation = hp.Choice('activation', values=['relu', 'selu', 'elu', 'gelu', 'leaky_relu'])
        activation_layer = hp.Choice('activation_layer', values=['Relu', 'Elu', 'LeakyReLU'])
        pool_type = hp.Choice('pool_layer', ['local_max', 'local_average'])
        pool_size = hp.Int('pool_size', min_value=1, max_value=30, step=1)
        num_conv_layers = hp.Choice('num_conv_layers', [1,2,3])
        kernel_size = hp.Int('kernel_size', min_value=1, max_value=16, step=1)
        stride = hp.Int('stride', min_value=1, max_value=32, step=1)
        hp.Int('dense_size', min_value=1, max_value=500, step=1)
        # Plain scalar (no trailing comma): the schedule needs a number,
        # not a 1-tuple.
        initial_learning_rate = hp.Choice("initial_learning_rate", values=[1E-2, 1E-3, 1e-4, 3e-4, 4e-4, 5e-4])
        with hp.conditional_scope('optimizer', ['adam','adamax', 'rmsprop','adafactor', 'lion']):
          if optimizer!='nadam':
            hp.Int('decay_steps', min_value=10, max_value=5000, step=10)
            hp.Float('decay_rate', min_value=0.5, max_value=1, step=0.01)
            lr = tf.keras.optimizers.schedules.ExponentialDecay(
                    initial_learning_rate=initial_learning_rate,
                    decay_steps=hp.get('decay_steps'),
                    decay_rate=hp.get('decay_rate'))

        # A layer followed by batch normalization does not need its own bias.
        use_bias = batch_norm != 1

        # Set weight regularizer
        kernel_regularizer = None
        if reg_type != 'None':
          with hp.conditional_scope('reg_type', ['l1', 'l2', 'l1l2']):
            reg_max = hp.Float('reg_max', min_value=0.005, max_value=0.4, step=0.001)
            if reg_type == 'l1':
              kernel_regularizer = l1(reg_max)
            elif reg_type == 'l2':
              kernel_regularizer = l2(reg_max)
            elif reg_type=='l1l2':
              kernel_regularizer = L1L2(l1=reg_max, l2=reg_max)

        # Set weight value constraint
        kernel_constraint = MaxNorm(max_value=max_norm)

        # Set optimizer learning rate
        if optimizer == 'adam':
          opt = tf.keras.optimizers.Adam(learning_rate=lr)
        elif optimizer == 'adamax':
          opt = tf.keras.optimizers.Adamax(learning_rate=lr)
        elif optimizer == 'rmsprop':
          opt = tf.keras.optimizers.RMSprop(learning_rate=lr)
        elif optimizer == 'nesterov':
          opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9, nesterov=True)
        elif optimizer == 'adafactor':
          opt = tf.keras.optimizers.Adafactor(learning_rate=lr)
        elif optimizer == 'lion':
          opt = tf.keras.optimizers.Lion(learning_rate=lr)
        elif optimizer == 'nadam':
          # Nadam gets the constant rate instead of the decay schedule.
          opt = tf.keras.optimizers.Nadam(learning_rate=hp.get("initial_learning_rate"))

        def make_pool():
          # A fresh pooling layer per use, so no layer object is added to
          # the Sequential model more than once.
          if pool_type=='local_max':
            return tf.keras.layers.MaxPooling1D(pool_size, padding='same')
          return tf.keras.layers.AveragePooling1D(pool_size, padding='same')

        def make_activation_layer():
          if activation_layer=='Relu':
            return ReLU()
          if activation_layer=='LeakyReLU':
            return LeakyReLU()
          if activation_layer=='PReLU':
            return PReLU()
          return ELU()

        def make_conv(filter_hp_name):
          return tf.keras.layers.Conv1D(
              filters=hp.Int(filter_hp_name, min_value=2, max_value=128, step=1),
              kernel_size=kernel_size, dilation_rate=stride, activation=activation,
              padding='same', input_shape=INPUT_DIM)

        ### Model Building
        model = tf.keras.models.Sequential()
        ## CONV SECTION
        model.add(make_conv('conv_1_filter'))
        model.add(make_pool())
        # Optional 2nd/3rd conv blocks: conv -> (batch norm) -> dropout ->
        # activation layer -> pooling. Block k is active when
        # num_conv_layers >= k.
        for block_idx in (2, 3):
          with hp.conditional_scope('num_conv_layers', list(range(block_idx, 4))):
            if num_conv_layers >= block_idx:
              model.add(make_conv(f'conv_{block_idx}_filter'))
              if batch_norm==1:
                model.add(BatchNormalization())
              model.add(Dropout(p_drop))
              model.add(make_activation_layer())
              model.add(make_pool())

        ## LSTM SECTION
        model.add(Bidirectional(LSTM(hp.Int('lstm_1', min_value=2, max_value=300, step=1), return_sequences=False)))
        model.add(Dropout(p_drop))

        ## DENSE SECTION
        model.add(Dense(hp.Int('last_hidden', min_value=1, max_value=500, step=1),
                        kernel_regularizer=kernel_regularizer,
                        kernel_constraint=kernel_constraint, use_bias=use_bias))
        if batch_norm==1:
          model.add(BatchNormalization())
        model.add(Activation(activation))
        if self.regression==False: # classification
          model.add(tf.keras.layers.Dense(1, activation='tanh'))
          pr_auc = tf.keras.metrics.AUC(num_thresholds=1000, curve="PR", name='pr_auc', from_logits=True)
          model.compile(loss='squared_hinge', optimizer=opt,
                        weighted_metrics=['accuracy', pr_auc])
        elif self.regression==True: # Regression
          model.add(Dense(1))
          model.compile(loss=self.loss_metric, optimizer=opt,
                        weighted_metrics=[self.loss_metric],
                        #### REMOVE THIS AFTER THE FIX
                        jit_compile=False
                        )

        return model

      def fit(self, hp, model, *args, **kwargs):
        '''Fit ``model`` with the arguments supplied by the tuner.'''
        return model.fit(*args,**kwargs,)

## Hypermodel

In [None]:
if (tune_mode=='genre') or (tune_mode=='overall') or (tune_mode=='nn_trim') or (tune_mode=='model'):

  class MyHyperModel(kt.HyperModel):
    '''Complicated model tuning class'''
    def __init__(self, loss_metric, regression):
      self.loss_metric = loss_metric
      self.regression = regression

    def build(self, hp):

      # Tuning params
      optimizer = hp.Choice('optimizer', values=['adam', 'nadam', 'adamax', 'rmsprop', 'lion'])
                                                #  'nesterov'])
      n_layers = hp.Int('n_layers', min_value=1, max_value=5, step=1)
      reg_type = hp.Choice('reg_type', values=['l1', 'l2', 'l1l2', 'None'])
      max_norm = hp.Int('max_norm', min_value=1, max_value=10000, step=10)
      activation = hp.Choice('activation', values=['relu', 'selu', 'elu', 'gelu', 'leaky_relu'])
      p_drop = hp.Float('p_drop', min_value=0.0, max_value=0.9, step=0.01)
      batch_norm = hp.Choice('batch_norm', values=[0, 1])

      # Pick number of units per layer and store in a list with descending values
      layer_nums = []
      min_val = 1
      max_val = 3000
      lay_range = list(range(hp.get('n_layers')))
      lay_range = [x+1 for x in lay_range]

      layer_1_s = hp.Int('layer_1', min_value=min_val, max_value=max_val, step=1)
      layer_nums.append(hp.get('layer_1'))
      if hp.get('n_layers') >= 2:
        with hp.conditional_scope("n_layers", [2,3,4,5]):
          layer_2_s = hp.Int(f'layer_2', min_value=min_val, max_value=2000, step=1)
          layer_nums.append(hp.get('layer_2'))
      if hp.get('n_layers') >= 3:
        with hp.conditional_scope("n_layers", [3,4,5]):
          layer_3_s = hp.Int(f'layer_3', min_value=min_val, max_value=2000, step=1)
          layer_nums.append(hp.get('layer_3'))
      if hp.get('n_layers') >= 4:
        with hp.conditional_scope("n_layers", [4,5]):
          layer_4_s = hp.Int(f'layer_4', min_value=min_val, max_value=2000, step=1)
          layer_nums.append(hp.get('layer_4'))
      if hp.get('n_layers') >= 5:
        with hp.conditional_scope("n_layers", [5]):
          layer_5_s = hp.Int(f'layer_5', min_value=min_val, max_value=2000, step=1)
          layer_nums.append(hp.get('layer_5'))

      # Learning Rate
      initial_learning_rate=hp.Choice("initial_learning_rate", values=[1E-2, 1E-3, 1e-4, 3e-4, 4e-4, 5e-4]),
      with hp.conditional_scope('optimizer', ['adam','adamax', 'rmsprop','adafactor', 'lion']):
        if optimizer!='nadam':
          decay_steps=hp.Int('decay_steps', min_value=10, max_value=5000, step=10),
          decay_rate=hp.Float('decay_rate', min_value=0.5, max_value=1, step=0.01)
          lr = tf.keras.optimizers.schedules.ExponentialDecay(
                  initial_learning_rate=initial_learning_rate,
                  decay_steps=hp.get('decay_steps'),
                  decay_rate=hp.get('decay_rate'))
      # Set optimizer learning rate
      if optimizer == 'adam':
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
      elif optimizer == 'adamax':
        opt = tf.keras.optimizers.Adamax(learning_rate=lr)
      elif optimizer == 'rmsprop':
        opt = tf.keras.optimizers.RMSprop(learning_rate=lr)
      elif optimizer == 'nesterov':
        opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9, nesterov=True)
      elif optimizer == 'adafactor':
          opt = tf.keras.optimizers.Adafactor(learning_rate=lr)
      elif optimizer == 'lion':
          opt = tf.keras.optimizers.Lion(learning_rate=lr)
      elif optimizer == 'nadam':
          opt = tf.keras.optimizers.Nadam(learning_rate=hp.get("initial_learning_rate"))

      # Adjust batch normalization
      if batch_norm==1:
        use_bias=False
      else:
        use_bias=True

      # Adjust activation initializer if selu
      if activation=='selu':
        kernel_initializer = 'lecun_normal'
      else:
        kernel_initializer = 'he_normal'

      # Set Weight regularizer
      if reg_type =='None':
          kernel_regularizer=None
      else:
          with hp.conditional_scope('reg_type', ['l1', 'l2', 'l1l2']):
              reg_max = hp.Float('reg_max', min_value=0.005, max_value=0.4, step=0.001)
              if reg_type == 'l1':
                kernel_regularizer = l1(reg_max)
              elif reg_type == 'l2':
                kernel_regularizer = l2(reg_max)
              elif reg_type=='l1l2':
                kernel_regularizer = L1L2(l1=reg_max, l2=reg_max)

      # Set weight value constraint
      kernel_constraint = MaxNorm(max_value=max_norm)

      # Create sequential model
      model = Sequential()
      model.add(tf.keras.Input(shape=INPUT_DIM))

      # Build model
      for i in range(hp.get('n_layers')):
        model.add(Dense(layer_nums[i],
                        kernel_regularizer=kernel_regularizer,
                        kernel_constraint=kernel_constraint,
                        kernel_initializer=kernel_initializer,
                        use_bias=use_bias))
        if batch_norm==1:
            model.add(BatchNormalization())
        model.add(Activation(activation))
        if i!=hp.get('n_layers')-1:
          model.add(Dropout(p_drop))

      if self.regression==False: # classification
        model.add(Dense(1, activation='tanh'))
        model.compile(loss='squared_hinge', optimizer=opt,
                  weighted_metrics=['accuracy', pr_auc])
      elif self.regression==True: # Regression
        model.add(Dense(1))
        model.compile(loss=self.loss_metric, optimizer=opt,
                  weighted_metrics=[self.loss_metric],
                     ####### REMOVE LATER IF NEEDED
                      jit_compile=False
                      # removing jit compile
                     )

      return model
  def fit(self, hp, model, *args, **kwargs):
      """Train ``model`` by forwarding every extra argument to ``model.fit``.

      Args:
          hp: Hyperparameter container; accepted but not consulted here.
          model: Object exposing a ``fit`` method.
          *args: Positional arguments passed to ``model.fit`` unchanged.
          **kwargs: Keyword arguments passed to ``model.fit`` unchanged.

      Returns:
          Whatever ``model.fit`` returns.
      """
      training_result = model.fit(*args, **kwargs)
      return training_result


# Tuner

In [None]:
if tune_mode != 'lr_overall':
  # Best objective value observed by run_tuning_encoder; stays None until a
  # completed trial has been found.
  BEST_SCORE=None
  def run_tuning_encoder(input_dim, regression, num_iter, tune_metric='pr_auc',
                         tuner_seed=None, save_dir=save_dir, checkpoint_dir=checkpoint_dir,
                        model_tune_name=model_tune_name):
    """Configure a Bayesian-optimization tuner, run its search and return it.

    Args:
      input_dim: Not read by this function.
      regression: ``False`` selects the classification metrics; a truthy
        value selects the regression metric.
      num_iter: Not read by this function; the trial budget comes from the
        module-level ``model_tune_num_iterations``.
      tune_metric: For classification, ``'accuracy'`` monitors
        ``val_accuracy`` (max), ``'loss'`` monitors ``val_loss`` (min) and
        anything else monitors ``val_pr_auc`` (max). For regression the
        monitored metric is ``f'val_{tune_metric}'`` (min).
      tuner_seed: Replaced by a random value when falsy, but not passed to
        the tuner (its seed is fixed at 42).
      save_dir: Directory handed to the tuner.
      checkpoint_dir: Location handed to ``OverallModelCheckpoint``.
      model_tune_name: Used as the tuner's ``project_name``.

    Returns:
      The tuner after ``search`` has finished.
    """
    global BEST_SCORE
    # NOTE(review): num_executions is computed but the tuner below is pinned to
    # executions_per_trial=1 -- confirm which of the two is intended.
    if model_tune_name=='genre' or model_tune_name=='general':
      num_executions=2
    else:
      num_executions=1

    # Choose the monitored validation metric and its optimization direction.
    if regression==False:
      if tune_metric == 'accuracy':
        metric_nam = 'val_accuracy'
        direction='max'
      elif tune_metric == 'loss':
        metric_nam = 'val_loss'
        direction='min'
      else:
        metric_nam = 'val_pr_auc'
        direction='max'
    elif regression:
      metric_nam=f'val_{tune_metric}'
      direction='min'

    # Early stopping
    es = CustomStopperCheckpoints(
      monitor=metric_nam,
      patience=early_stop_patience,
      baseline_1=12.7,  # mse
      baseline_2=12.2, # mse
      start_epoch_2=25,
      mode=direction,
      restore_best_weights=True,
      min_delta=0.005)
    mcp_save = OverallModelCheckpoint(checkpoint_dir, save_best_only=True,
                               monitor=metric_nam, mode=direction, verbose=1)

    # NOTE(review): this value is never handed to the tuner (seed=42 below).
    if not tuner_seed:
      tuner_seed = str(np.random.choice(range(0,900000)))

    # NOTE(review): max_trials uses the global model_tune_num_iterations, not
    # the num_iter argument -- confirm against the caller's expectation.
    tuner = BayesianOptimization(
      hypermodel=MyHyperModel(tune_metric, regression),
      objective=kt.Objective(metric_nam, direction=direction),
      max_trials=model_tune_num_iterations,
      num_initial_points=model_rand_num_iterations,
      seed=42,
      beta=2.8,
      alpha=0.0001,
      directory=save_dir,
      overwrite=False,
      project_name=model_tune_name,
      executions_per_trial=1,
      max_retries_per_trial=0
      )

    # Get best score if possible. This is best-effort bookkeeping: a failure
    # is reported but must not prevent the search from running.
    try:
      best_trials = tuner.oracle.get_best_trials(num_trials=1)
      current_best_score = best_trials[0].score if best_trials else None
    except Exception as err:
      print(f'Could not read the best trial score: {err!r}')
      current_best_score = None

    if current_best_score is not None:
      # The first score seen always counts; afterwards "better" follows the
      # objective direction (higher for 'max', lower for 'min').
      if BEST_SCORE is None:
        improved = True
      elif direction == 'max':
        improved = current_best_score > BEST_SCORE
      else:
        improved = current_best_score < BEST_SCORE
      if improved:
        BEST_SCORE=current_best_score
        print(f'Best score so far: {BEST_SCORE}')

    # Tuner Search
    tuner.search(
      shuffle=True,
      verbose=1,
      callbacks=[es, mcp_save],
      epochs=500,
    )
    return tuner


# Tuner Performance

In [None]:
if tune_mode != 'lr_overall':
    # Announce which model is being tuned, then launch the search.
    print(f'Tuning for: {model_tune_name}')
    # The iteration argument is pinned to 430 here; the commented-out original
    # passed model_tune_num_iterations instead.
    tuner = run_tuning_encoder(
        INPUT_DIM,
        regression,
        num_iter=430,
        tune_metric=TUNE_METRIC_NAME,
        tuner_seed=tuner_seed,
    )

# Linear Regression Regularization

In [None]:
if tune_mode == 'lr_overall':
  # Fractions of the file list assigned to each split.
  INPUT_DATA_DIR = file_dir
  TRAIN_DATA_COEFFICIENT, VALIDATION_DATA_COEFFICIENT, TEST_DATA_COEFFICIENT = 0.60, 0.20, 0.20

  # Number of files per split (truncated towards zero).
  num_train, num_val, num_test = (
      int(num_files * share)
      for share in (TRAIN_DATA_COEFFICIENT, VALIDATION_DATA_COEFFICIENT, TEST_DATA_COEFFICIENT)
  )

  # Consecutive slices of file_list: train first, then validation; the test
  # split takes everything that remains after those two.
  TRAIN_FILES, VALIDATION_FILES, TEST_FILES = (
      [os.path.join(file_dir, f'{stem}.tfrecord') for stem in file_list[span]]
      for span in (
          slice(None, num_train),
          slice(num_train, num_train + num_val),
          slice(num_train + num_val, None),
      )
  )

  # Load each split.
  df_train = get_overall_data(TRAIN_FILES)
  df_val = get_overall_data(VALIDATION_FILES)
  df_test = get_overall_data(TEST_FILES)

  # Array views of the three splits, plus train+validation stacked row-wise.
  train_data, val_data, test_data = (
      frame.to_numpy() for frame in (df_train, df_val, df_test)
  )
  lr_model_train_data = np.concatenate([train_data, val_data], axis=0)

In [None]:
if tune_mode == 'lr_overall':
  # Metric name; it is not passed to the grid-search call below.
  metric='mean_squared_error'

  # Candidate regularization strengths and solvers for the grid search.
  # NOTE(review): if the helper fits scikit-learn's Ridge, solver='lbfgs' is
  # only accepted together with positive=True -- confirm how
  # overall_model_grid_search handles that combination.
  grid_values = dict(
      alpha=[
          0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2, 2.5, 3, 3.5,
          5, 7, 10, 13, 15, 17, 17.5, 18, 19, 20,
          21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
          35, 40, 45, 50, 75, 100, 150, 200, 250, 300,
          325, 350, 365, 370, 375, 380, 385, 400, 450, 500,
          550, 600,
      ],
      solver=['svd', 'cholesky', 'lsqr', 'lbfgs'],
  )

  # Run the search over the splits prepared earlier.
  lr_model, out, all_data = overall_model_grid_search(
      train_data, val_data, test_data, lr_model_train_data, grid_values,
      cv=10, overfit_penalty=2,
  )

In [None]:
if tune_mode == 'lr_overall':
  # Guarded like the other linear-regression cells: lr_model_train_data is
  # only defined when tune_mode == 'lr_overall'.
  # Refit a Ridge model with fixed settings; this replaces any lr_model
  # assigned earlier (e.g. by the grid-search cell).
  lr_model = Ridge(alpha=100, solver='cholesky', random_state=42)
  # All columns except the last three are features. Of the last three, the
  # first is the target and the second the sample weight; the final column
  # is not used here.
  X, y = lr_model_train_data[:, :-3], lr_model_train_data[:, -3:-1]
  lr_model.fit(X, y[:,0], sample_weight=y[:,1])

In [None]:
if tune_mode == 'lr_overall':
  # Save Model: persist the fitted linear model under <DATA_DIR>/Saved_Models.
  MODEL_SAVE_DIR = os.path.join(DATA_DIR, 'Saved_Models')
  # The dump would fail on a missing target directory, so create it first.
  os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
  saved_model = joblib.dump(lr_model, os.path.join(MODEL_SAVE_DIR ,'overall_best.joblib'))