In [None]:
# Mount Google Drive so that training data can be used
def mount_drive():
  """Mount Google Drive at /content/drive inside the Colab runtime."""
  # Imported lazily so the name is only resolved when the function runs.
  from google.colab import drive as colab_drive
  colab_drive.mount('/content/drive')

mount_drive()

Mounted at /content/drive


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

import numpy as np


In [None]:
# Read the merged song table from the shared drive into a DataFrame.
import pandas as pd

SONG_DATA_CSV = "/content/drive/Shareddrives/Cop Detectors /Class Work/song_data/songdatamerge.csv"
song_data = pd.read_csv(SONG_DATA_CSV)

In [None]:
print(song_data.columns)


# Tunable constants for this cell.
RANDOM_SEED = 42        # fixes the noise rows and the shuffle so re-runs match
N_NOISE_ROWS = 50000    # number of uniform-random rows mixed into the data
TRAIN_FRACTION = 0.8    # share of the shuffled rows used for training

# process data to fit into feature set
feature_list = ['danceability',
              'energy',
              'loudness',
              'mode',
              'speechiness',
              'acousticness',
              'instrumentalness',
              'liveness',
              'valence',
              'tempo',
              'key',]
features = song_data[feature_list]
print(features.describe())

# normalize data
# tf.keras.utils.normalize defaults to axis=-1, so every ROW is scaled to unit
# L2 norm (it is not a per-column rescaling).
# NOTE(review): with row-wise scaling, large-magnitude columns such as tempo
# presumably dominate each vector -- confirm this is intended. The inference
# cell normalizes the same way, so the two must be changed together.
features = tf.keras.utils.normalize(features.values)

# add random values
# The noise width follows feature_list so the two cannot drift apart.
rng = np.random.default_rng(RANDOM_SEED)
noise_vecs = rng.random((N_NOISE_ROWS, len(feature_list)))

features = np.vstack((features, noise_vecs))

# shuffle data (rows only, in place)
rng.shuffle(features)


# reshape data into a training set
slice_index = int(len(features) * TRAIN_FRACTION)
x_train = features[:slice_index]
x_test = features[slice_index:]

print(x_train.shape)
print(x_test.shape)

Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')
       danceability        energy      loudness          mode   speechiness  \
count  1.204025e+06  1.204025e+06  1.204025e+06  1.204025e+06  1.204025e+06   
mean   4.930565e-01  5.095363e-01 -1.180870e+01  6.714595e-01  8.438219e-02   
std    1.896694e-01  2.946839e-01  6.982132e+00  4.696827e-01  1.159914e-01   
min    0.000000e+00  0.000000e+00 -6.000000e+01  0.000000e+00  0.000000e+00   
25%    3.560000e-01  2.520000e-01 -1.525400e+01  0.000000e+00  3.510000e-02   
50%    5.010000e-01  5.240000e-01 -9.791000e+00  1.000000e+00  4.460000e-02   
75%    6.330000e-01  7.660000e-01 -6.717000e+00  1.000000e+00  7.230000e-02   
max    1.00000

In [None]:
# Unsupervised Model
# In this case, an autoencoder.

# We will take in the following features (for now):
# acousticness (0.0 to 1.0)
# danceability (0.0 to 1.0)
# energy (0.0 to 1.0)
# key (-1 to 11), 0 = C
# liveness (0.0 to 1.0?)
# loudness (-60dB to 0 dB)
# tempo (BPM)



# For now, using this dataset from Kaggle:
# It has 1.2 million Spotify songs' features
# https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs?resource=download


# Each feature will be a node in the first layer.
# Each new layer will reduce the number of nodes by 1,
# so there will be a 7-node layer, then a 6-node layer, then 5, and so on.
# I'm not currently sure whether this approach is best for encoding, because
# it could be the case that much information is lost if we go down to 1 layer,
# to the point of it becoming useless.
# Also, stepping down faster (like halving the number of nodes) might be better.
# Then there will be a sequence of decoding the information back up to 7 dimensions.
# This is so that we can verify that the encoding maintained the original information;
# otherwise we could not train the model weights.
# NOTE: the Autoencoder class in this cell currently takes 11 features and widens
# from 11 to 16 units instead of narrowing, so the plan above is out of date.

# We can play with and research different architectures.


# using this resource: https://towardsdatascience.com/unsupervised-machine-learning-example-in-keras-8c8bf9e63ee0
# and https://www.tensorflow.org/api_docs/python/tf/keras/Sequential
# and MAINLY https://www.tensorflow.org/tutorials/generative/autoencoder

# latent dimensions: The number of dimensions in the compressed representation

class Autoencoder(Model):
  """Dense tanh autoencoder built from two Sequential stacks.

  The encoder flattens its input and applies Dense layers of 11, 12, ..., 16
  units; the decoder applies Dense layers of 16, 15, ..., 11 units. Every
  Dense layer uses a tanh activation.
  """

  def __init__(self):
    super().__init__()
    widths = range(11, 17)  # 11, 12, 13, 14, 15, 16
    self.encoder = tf.keras.Sequential(
        [layers.Flatten()]
        + [layers.Dense(units, activation='tanh') for units in widths]
    )
    self.decoder = tf.keras.Sequential(
        [layers.Dense(units, activation='tanh') for units in reversed(widths)]
    )

  def call(self, x):
    """Encode `x`, then decode it, and return the reconstruction."""
    return self.decoder(self.encoder(x))

autoencoder = Autoencoder()

In [None]:
# Import callbacks from tensorflow.keras, the same package the model is built
# from, rather than from the standalone `keras` package.
from tensorflow.keras import callbacks
from tensorflow.keras.callbacks import ModelCheckpoint

# Reconstruction is trained with mean absolute error.
# NOTE(review): "accuracy" is kept so evaluate() still returns two values, but
# it is presumably not a meaningful metric for real-valued reconstruction.
autoencoder.compile(optimizer='adam',
                    loss=losses.MeanAbsoluteError(),
                    metrics=["accuracy"])

# Save weights only, and only when the monitored validation loss improves.
checkpoint = ModelCheckpoint(filepath='/content/drive/Shareddrives/Cop Detectors /ML Models/weights11d_04-09-23/',
                             monitor="val_loss",
                             save_weights_only=True,
                             save_best_only=True,)

In [None]:
# Train the autoencoder to reproduce its own input; 20% of x_train is held
# out for validation and the checkpoint callback runs during training.
autoencoder.fit(
    x=x_train,
    y=x_train,
    validation_split=0.2,
    shuffle=True,
    epochs=100,
    callbacks=[checkpoint],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100

In [None]:
# Score reconstruction of the held-out rows (input and target are the same).
autoencoder.evaluate(x=x_test, y=x_test)



[0.005314918234944344, 0.98306804895401]

In [None]:
#sdmerge = pd.read_csv("/content/drive/Shareddrives/Cop Detectors /Class Work/song_data/songdatamerge.csv")

In [None]:
# Seven-column feature subset.
feature_list7 = [
    'acousticness', 'danceability', 'energy', 'key',
    'liveness', 'loudness', 'tempo',
]

In [None]:
# Row-normalize the raw feature columns the same way as the training data.
raw_feature_matrix = song_data[feature_list].values
featurevals = tf.keras.utils.normalize(raw_feature_matrix)

In [None]:
# Show the first normalized row next to its encoder output.
first_row = featurevals[:1]
print(first_row)
print()
first_encoding = autoencoder.encoder(first_row)
print(first_encoding.numpy())

[[ 2.20596786e-04  3.97243254e-03  8.26604048e-03  9.21266271e-08
   5.91638889e-02  3.00890635e-03 -4.56322623e-02  8.45198413e-03
   6.14459247e-04  9.96539641e-01  3.38079365e-02  4.25134802e-03]]

[[ 0.01587676  0.1045413   0.00491191  0.34323314 -0.06747051 -0.08069133
  -0.00589121  0.01527525  0.01618507  0.00364498 -0.00034935  0.10073346
  -0.03728201  0.06286093 -0.3346168   0.19996294]]


In [None]:
# Encode every normalized row and keep the result as a NumPy array.
latent_tensor = autoencoder.encoder(featurevals)
preds = latent_tensor.numpy()
print(preds.shape)

(1231895, 16)


In [None]:
# Wrap the encoder output in a DataFrame with default integer column labels.
predsdf = pd.DataFrame(data=preds)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [None]:

# Rescale each latent column with a MinMaxScaler fitted on preds.
scaler = MinMaxScaler()
scaled_features = scaler.fit(preds).transform(preds)

In [None]:
n_songs = len(scaled_features)

# A dense n_songs x n_songs float64 matrix needs n_songs**2 * 8 bytes. For a
# table with about a million rows that is several terabytes, so only allocate
# it when it fits a modest budget; otherwise leave it as None.
MAX_DENSE_SIMILARITY_BYTES = 4 * 1024**3
dense_similarity_bytes = n_songs * n_songs * np.dtype(np.float64).itemsize
if dense_similarity_bytes <= MAX_DENSE_SIMILARITY_BYTES:
  similarity_matrix = np.zeros((n_songs, n_songs))
else:
  similarity_matrix = None
  print(f"Skipping dense similarity matrix: {dense_similarity_bytes / 1024**3:.1f} GiB "
        f"needed for {n_songs} songs")


In [None]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.1.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.0/648.0 KB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.1-cp39-cp39-linux_x86_64.whl size=582294 sha256=6d0d28b09b6f902b7073347a3ef64b75ca90f39a709c20fa040242e208d9b8b5
  Stored in directory: /root/.cache/pip/wheels/5b/7d/31/9a9a4993d085bc85bee21946bce94cd5906ce99730f5467e57
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.1


In [None]:
from sklearn.preprocessing import MinMaxScaler
from annoy import AnnoyIndex

In [None]:
# Min-max scale the latent vectors, then create an angular-distance Annoy
# index sized to the number of latent columns.
scaler = MinMaxScaler()
scaled_features = scaler.fit(preds).transform(preds)
_, n_features = scaled_features.shape
annoy_index = AnnoyIndex(n_features, metric='angular')

In [None]:
# Register every scaled vector under its row position in scaled_features.
for row_position in range(len(scaled_features)):
  annoy_index.add_item(row_position, scaled_features[row_position])

In [None]:
# Number of trees passed to the Annoy build call.
n_trees = 50
# Build the index from the items added so far; the call's return value is the
# cell's displayed output.
annoy_index.build(n_trees)

True

In [None]:
input_track_id = '1p80LdxRV74UKvL8gnD7ky'

# Annoy items were added by row POSITION (enumerate over scaled_features), so
# look up the seed track's position rather than its index label.
matching_positions = np.flatnonzero((song_data['id'] == input_track_id).to_numpy())
if matching_positions.size == 0:
  raise ValueError(f"Track id {input_track_id!r} was not found in song_data")
input_idx = int(matching_positions[0])
print(input_idx)

n_similar_songs = 10
# Request one extra neighbour, then drop the seed explicitly. The search is
# approximate and the table can contain duplicate tracks, so the seed is not
# guaranteed to be the first hit.
candidate_indices = annoy_index.get_nns_by_item(input_idx, n_similar_songs + 1)
similar_song_indices = [idx for idx in candidate_indices if idx != input_idx][:n_similar_songs]


1205306


In [None]:
# Pull the neighbour rows by position and show name/artists for the first ten.
similar_songs = song_data.iloc[similar_song_indices]
top_matches = similar_songs.head(10)
print(top_matches[['name', 'artists']])

                         name                           artists
684872             Kick, Push                   ['Lupe Fiasco']
1205806            Kick, Push                   ['Lupe Fiasco']
535371   Cold Comfort Flowers            ['Fountains Of Wayne']
828949              The Point                  ['Emily Warren']
729109        Le Grand Cochon  ['D. Charles Speer & the Helix']
1150099        Big Brown Eyes                      ["Old 97's"]
496399                Counted            ['The 484 South Band']
1038366            Feel Alive                  ['Dirty Proper']
323252    Khevre, Nit Gezogrt                   ['Klezperanto']
984223        Verdadeiro Amor                          ['Gama']


In [None]:
# Print one Spotify track URL per recommended song (first ten rows).
baseTrackUrl = "https://open.spotify.com/track/"
for track_id in similar_songs.head(10)['id']:
  print(baseTrackUrl + track_id)

https://open.spotify.com/track/7nWutFSP2eKuQ9oZlZ53BX
https://open.spotify.com/track/6nz35DNIzbtj5ztpDEcW1j
https://open.spotify.com/track/3cCl6GXnsjWAeqeAN8qSN9
https://open.spotify.com/track/0jvo85AH33ruP3757RIFdX
https://open.spotify.com/track/4Ft5RmIt2SjPxBgqqkCf1c
https://open.spotify.com/track/6QELg0RBFGKisdiqbROWSD
https://open.spotify.com/track/6EvQ93hzbyKUP86vkfYCpP
https://open.spotify.com/track/6UwZFzso8nfWJGkzbGKM85
https://open.spotify.com/track/3mg6ICtPECfaRDINs99PKg
https://open.spotify.com/track/7BBXfVWTr7lxgUHdYTXcF5


In [None]:
# knn on predicted features
from sklearn.neighbors import NearestNeighbors

In [None]:
k = 10
knn = NearestNeighbors(n_neighbors=k, metric='euclidean')
knn.fit(scaled_features)
# get the nearest neighbors
# Only the LAST row's neighbours are used below, so query that single row
# instead of computing neighbours for every row and discarding the rest.
# `distances` and the raw result therefore have shape (1, k).
distances, indices = knn.kneighbors(scaled_features[-1:], n_neighbors=k)
indices = indices[0]
# Map the neighbour row positions to track ids.
nearest_neighbors = song_data['id'].iloc[indices].tolist()


In [None]:
from google.colab import files

# Ask Colab to download the path "model" from the runtime.
# NOTE(review): no visible cell writes anything named "model" -- confirm the
# path exists before running this cell.
files.download("model")

In [None]:
# Recursively copy /content/model_12d into the shared drive's "ML Models" folder.
# NOTE(review): no visible cell creates /content/model_12d -- confirm it exists.
!cp -r /content/model_12d /content/drive/Shareddrives/Cop\ Detectors\ /ML\ Models/model_12d

In [None]:
# Quick look at the test split: row count, first row, and that row's shape.
first_test_row = x_test[0]
print(len(x_test))
print(first_test_row)
print(first_test_row.shape)

240805
[ 0.00465562  0.00562399  0.00497221  0.0186225   0.00151773 -0.0796205
  0.99661098]
(7,)


In [None]:
# Encode the first 1000 test rows. This rebinds `preds`, replacing the
# full-dataset encodings computed earlier.
test_subset = x_test[:1000]
preds = autoencoder.encoder(test_subset).numpy()
print(preds.shape)

(1000, 3)


In [None]:
# TODO: PCA (preferred) or t-SNE on the features + the outputted feature(s) 
# from the unsupervised model, for dimensionality reduction

In [None]:
# TODO: Nearest Neighbor Search algorithm in feature space

In [None]:
# Experimental bidirectional-LSTM regressor with a single Dense output unit.
from tensorflow import keras
from tensorflow.keras import layers
# NOTE(review): keras.Input's `shape` excludes the batch dimension, so this
# declares each sample as a (rows_in_x_train, n_features) sequence, while
# x_train itself is a 2-D array -- confirm the intended input shape.
inputs = keras.Input(shape = (x_train.shape[0], x_train.shape[1]))
x = layers.Bidirectional(layers.LSTM(16))(inputs)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer = "rmsprop", loss = "mse", metrics = ["mae"])
print(x_train.shape)
# NOTE(review): fit() is called without a target array although the model is
# compiled with an "mse" loss -- the intended labels are not visible here.
history = model.fit(x_train, epochs = 10, validation_split = 0.2)

In [None]:
!pip install plotly

In [None]:
import numpy as np
import plotly.express as px
import pandas as pd

In [None]:
# The 3-D scatter needs exactly three columns. The encoder defined in this
# notebook emits more than three, which would make the DataFrame constructor
# raise, so project wider encodings onto their first three principal
# components (PCA via SVD of the mean-centered data). Three-column input is
# passed through unchanged.
latent_points = np.asarray(preds)
if latent_points.ndim == 2 and latent_points.shape[1] > 3:
  centered = latent_points - latent_points.mean(axis=0)
  _, _, principal_axes = np.linalg.svd(centered, full_matrices=False)
  latent_points = centered @ principal_axes[:3].T
df = pd.DataFrame(latent_points, columns=['X','Y','Z'])

In [None]:
# Interactive 3-D scatter of the X/Y/Z columns.
fig = px.scatter_3d(data_frame=df, x='X', y='Y', z='Z')
fig.show()