This vector embedding is generated using a skip-gram Word2Vec algorithm.
For more information, see:
- [TensorFlow word2vec tutorial](https://www.tensorflow.org/tutorials/text/word2vec)
- [The Illustrated Word2vec](http://jalammar.github.io/illustrated-word2vec/)

In [49]:
# Install your required packages here
!pip install pandas numpy matplotlib sklearn fsspec gcsfs



In [50]:
# Additional packages: keras, and tqdm (used below for the progress bar).
# -q keeps pip's output quiet.
!pip install keras
!pip install -q tqdm



In [51]:
# Load the TensorBoard notebook extension
# (provides the %tensorboard magic used at the end of the notebook)
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [52]:
# Path to credentials for cloud bucket:
# NOTE(review): the JSON file lives on Google Drive, so Drive has to be mounted
# (see the drive.mount cell) before storage.Client() is created — confirm the
# cell order works on a fresh runtime.
%env GOOGLE_APPLICATION_CREDENTIALS=/content/drive/My Drive/CS/AI/Credentials/ai-project-2020-f4dfbc25326c.json

env: GOOGLE_APPLICATION_CREDENTIALS=/content/drive/My Drive/CS/AI/Credentials/ai-project-2020-f4dfbc25326c.json


In [69]:
from google.cloud import storage

import pandas as pd
import tqdm.notebook as tqdm
import os
import math
import datetime

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout
import tensorflow as tf
from tensorboard.plugins import projector

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
# Mount Google Drive at /content/drive so files under it can be read and written
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# define constants
# Name of the Google Cloud Storage bucket that holds the training logs
bucket_name = "ai-project-2020-spotify"
# Storage client and a handle on the bucket (used below to list the training files)
client = storage.Client()
bucket = client.get_bucket(bucket_name)

In [57]:
# List every blob under training_set/ and print the names that contain '20180715'
train_files = list(bucket.list_blobs(prefix='training_set/'))
for blob in [blob for blob in train_files if '20180715' in blob.name]:
  print(blob.name)

training_set/log_0_20180715_000000000000.csv.gz
training_set/log_1_20180715_000000000000.csv.gz
training_set/log_2_20180715_000000000000.csv.gz
training_set/log_3_20180715_000000000000.csv.gz
training_set/log_4_20180715_000000000000.csv.gz
training_set/log_5_20180715_000000000000.csv.gz
training_set/log_6_20180715_000000000000.csv.gz
training_set/log_7_20180715_000000000000.csv.gz


In [58]:
# Load the `20180715` log shards 0-3 and stack them into one frame.
# A single pd.concat replaces the chained DataFrame.append calls: append was
# deprecated in pandas 1.4 and removed in 2.0, and it copied the growing frame
# on every call. Like append, concat keeps each shard's own row index, so index
# values repeat across shards.
log_shards = [
    pd.read_csv(f"gs://{bucket_name}/training_set/log_{shard}_20180715_000000000000.csv.gz")
    for shard in range(4)
]
logs_0, logs_1, logs_2, logs_3 = log_shards  # keep the per-shard names available
logs = pd.concat(log_shards)
logs.shape

(11927861, 21)

In [59]:
# Number of distinct tracks in the loaded logs; passed later as the vocabulary
# size to generate_training_data and to the Word2Vec model
unique_tracks = logs['track_id_clean'].nunique()
print(unique_tracks)

661694


In [60]:
# Keep only the session key, the position within the session and the track id
logs_dropped = logs[['session_id','session_position','track_id_clean']]
logs_dropped.head()

Unnamed: 0,session_id,session_position,track_id_clean
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,t_0479f24c-27d2-46d6-a00c-7ec928f2b539
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,t_9099cd7b-c238-47b7-9381-f23f2c1d1043
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,t_23cff8d6-d874-4b20-83dc-94e450e8aa20
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,t_64f3743c-f624-46bb-a579-0f3f9a07a123


In [61]:
# Integer-encode the track ids so they can serve as vocabulary indices.
# `assign` builds a new frame instead of writing into the slice taken from
# `logs`, which avoids pandas' SettingWithCopyWarning. Fitting on the raw ids in
# `logs` (same rows, same order as logs_dropped) keeps the cell re-runnable:
# a second run never feeds already-encoded integers back into the encoder.
encoder = LabelEncoder()
logs_dropped = logs_dropped.assign(
    track_id_clean=encoder.fit_transform(logs['track_id_clean']))
logs_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11927861 entries, 0 to 2972977
Data columns (total 3 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   session_id        object
 1   session_position  int64 
 2   track_id_clean    int64 
dtypes: int64(2), object(1)
memory usage: 364.0+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [62]:
def stack_sessions(df):
    """
    Pivot a long table of listen events into one row per session.

    Every column other than 'session_id' and 'session_position' is spread out
    over the session positions, so the result has one (feature, position)
    column per combination. Positions a session never reached are NaN.

    Example input:
    session_id session_position feature1 feature2
    a          1                ~        ~
    a          2                ~        ~
    b          1                ~        ~
    b          2                ~        ~
    b          3                ~        ~

    Result, indexed by session_id:
    session_id (feature1,1) (feature1,2) (feature1,3) (feature2,1) (feature2,2) (feature2,3)
    a          ~            ~            NaN          ~            ~            NaN
    b          ~            ~            ~            ~            ~            ~
    """
    # Everything that is not a pivot key is a feature to spread over positions.
    feature_columns = df.columns.tolist()
    for key_column in ('session_id', 'session_position'):
        feature_columns.remove(key_column)
    return df.pivot(index='session_id', columns='session_position', values=feature_columns)

In [63]:
# Stack all Sessions: one row per session, one column per session position
stacked_sessions = stack_sessions(logs_dropped)
# NOTE: nothing is dropped in this cell; logs_dropped already holds only
# session_id, session_position and track_id_clean
stacked_sessions.head()

Unnamed: 0_level_0,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean
session_position,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
session_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
0_0000469e-70c4-4b69-8ac3-94417a4fe83b,659475.0,210319.0,659475.0,371696.0,371696.0,659475.0,210319.0,103502.0,42262.0,22240.0,584465.0,,,,,,,,,
0_00005fe5-8086-4ea4-a02f-bbe820af6067,365581.0,61084.0,123857.0,430647.0,371696.0,469729.0,37003.0,483074.0,399162.0,483074.0,37003.0,469729.0,371696.0,,,,,,,
0_000069dc-3d86-4e96-a221-d10eb55d0573,386645.0,363460.0,541696.0,542294.0,541696.0,542294.0,351913.0,5332.0,149087.0,651417.0,78862.0,163045.0,256064.0,457436.0,17511.0,111579.0,199421.0,487463.0,562817.0,315182.0
0_00006f66-33e5-4de7-a324-2d18e439fc1e,11556.0,374221.0,652113.0,92782.0,261423.0,517298.0,584811.0,30884.0,630103.0,111205.0,314190.0,271906.0,232822.0,417499.0,351706.0,567592.0,496409.0,494671.0,42335.0,138798.0
0_000073ea-37e3-473d-8197-bd64d7b16d31,335883.0,416530.0,225569.0,327178.0,127293.0,309946.0,77144.0,483074.0,215003.0,129887.0,312662.0,107808.0,498700.0,433865.0,539743.0,629665.0,363037.0,,,


In [64]:
# Keep only the first part of every session (positions 1-10), drop the
# session_id index and go back from float to int.
# Selecting the wanted columns (instead of dropping the unwanted ones in place)
# keeps the cell re-runnable and does not require the longest session to be
# exactly 20 tracks long.
# NOTE(review): astype(int) raises if any session has fewer than 10 tracks (NaN
# in these columns) — confirm this holds whenever new shards are loaded.
first_half_columns = [('track_id_clean', position) for position in range(1, 11)]
stacked_sessions = (
    stacked_sessions[first_half_columns]
    .astype(int)
    .reset_index(drop=True)
)
stacked_sessions.head()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0_level_0,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean,track_id_clean
session_position,1,2,3,4,5,6,7,8,9,10
0,659475,210319,659475,371696,371696,659475,210319,103502,42262,22240
1,365581,61084,123857,430647,371696,469729,37003,483074,399162,483074
2,386645,363460,541696,542294,541696,542294,351913,5332,149087,651417
3,11556,374221,652113,92782,261423,517298,584811,30884,630103,111205
4,335883,416530,225569,327178,127293,309946,77144,483074,215003,129887


In [65]:
# One list of encoded track ids per session; this is the input of generate_training_data
sequences = stacked_sessions.values.tolist()
len_sequences = len(sequences)
print(len_sequences)

711838


In [76]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed, output_path=None):
  """Write skip-gram training examples with negative samples to a TFRecord file.

  For every positive (target, context) pair produced from a sequence, one
  tf.train.Example is written with the features
    'target':  [target],
    'context': [context] followed by num_ns negative samples,
    'label':   [1] followed by num_ns zeros.

  Args:
    sequences: iterable of int-encoded sequences (one list of ids per session).
    window_size: skip-gram window size passed to `skipgrams`.
    num_ns: number of negative samples drawn per positive pair.
    vocab_size: number of distinct ids; used for the sampling table and as the
      upper bound of the negative sampler.
    seed: seed passed to the negative sampler.
    output_path: file to write. Defaults to the dated
      'embedding_training_data_<today>.tfrecord' file on Drive.

  Returns:
    The total number of positive skip-gram pairs written.
  """
  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  if output_path is None:
    current_date_and_time_string = str(datetime.datetime.now().date())
    output_path = '/content/drive/MyDrive/CS/AI/Data/embedding_training_data_'+current_date_and_time_string+'.tfrecord'

  # The label vector is the same for every example, so build its feature once.
  label = tf.train.Feature(
      int64_list=tf.train.Int64List(value=[1] + [0]*num_ns))

  positive_skipgrams_total = 0
  negative_skipgrams_total = 0

  # The context manager closes the writer even if an exception interrupts the loop.
  with tf.io.TFRecordWriter(output_path) as writer:
    # Iterate over all sequences (sentences) in dataset.
    for sequence in tqdm.tqdm(sequences):

      # Generate positive skip-gram pairs for a sequence (sentence).
      positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)
      positive_skipgrams_total += len(positive_skip_grams)

      # Iterate over each positive skip-gram pair to produce training examples
      # with positive context word and negative samples.
      for target_word, context_word in positive_skip_grams:
        context_class = tf.expand_dims(
            tf.constant([context_word], dtype="int64"), 1)
        # Use the `seed` argument (not a notebook global) so the function works
        # on its own and honours what the caller passes.
        negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
            true_classes=context_class,
            num_true=1,
            num_sampled=num_ns,
            unique=True,
            range_max=vocab_size,
            seed=seed,
            name="negative_sampling")

        # Plain list of the sampled ids.
        negative_samples = negative_sampling_candidates.numpy().tolist()
        negative_skipgrams_total += len(negative_samples)

        # Build target, context and label features (for one target word)
        target = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[target_word]))
        context = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[context_word] + negative_samples))

        sample = tf.train.Features(feature={
          'target': target,
          'context': context,
          'label': label
        })
        example = tf.train.Example(features=sample)

        writer.write(example.SerializeToString())

  print("Total Negative Skipgrams: "+str(negative_skipgrams_total))
  print("Total Positive Skipgrams: "+str(positive_skipgrams_total))
  return positive_skipgrams_total

In [77]:
# Don't need to re-run the sampling method every time
# SEED: seed for the negative sampler.
# num_ns: negative samples per positive pair; it is also read by the Word2Vec
# model and by the TFRecord feature description further down.
SEED = 42
num_ns = 5
# The returned count of positive pairs is used later to compute steps_per_epoch.
positive_skipgrams_total = generate_training_data(
    sequences=sequences, 
    window_size=2, 
    num_ns=num_ns, 
    vocab_size=unique_tracks, 
    seed=SEED)

HBox(children=(FloatProgress(value=0.0, max=711838.0), HTML(value='')))


Total Negative Skipgrams: 120428040
Total Positive Skipgrams: 24085608


In [78]:
class Word2Vec(Model):
  """Skip-gram model with separate target and context embedding tables.

  The target table is the layer named "w2v_embedding". The model output is the
  flattened dot product between the context embeddings and the target embedding.
  """

  def __init__(self, vocab_size, embedding_dim):
    """Create both embedding tables plus the dot-product and flatten layers.

    Args:
      vocab_size: number of rows in each embedding table.
      embedding_dim: length of each embedding vector.
    """
    super().__init__()
    self.target_embedding = Embedding(
        vocab_size,
        embedding_dim,
        input_length=1,
        name="w2v_embedding",
    )
    # One positive context id plus num_ns negatives per example; num_ns is read
    # from the notebook's global scope.
    context_length = num_ns+1
    self.context_embedding = Embedding(
        vocab_size,
        embedding_dim,
        input_length=context_length,
    )
    self.dots = Dot(axes=(3,2))
    self.flatten = Flatten()

  def call(self, pair):
    """Return the flattened context/target dot products for a (target, context) pair."""
    target_ids, context_ids = pair
    target_vectors = self.target_embedding(target_ids)
    context_vectors = self.context_embedding(context_ids)
    return self.flatten(self.dots([context_vectors, target_vectors]))

In [79]:
# Read back serialized training examples from a TFRecord file.
# NOTE(review): the file name carries a fixed date (2020-11-17), while
# generate_training_data names its default output after the current date —
# confirm this is the file you intend to train on.
dataset = tf.data.TFRecordDataset(["/content/drive/MyDrive/CS/AI/Data/embedding_training_data_2020-11-17.tfrecord"])

# Schema of one serialized example: a single target id, and num_ns+1 context
# ids with one label per context id.
feature_description = {
    'target': tf.io.FixedLenFeature([1], dtype=tf.int64),
    'context': tf.io.FixedLenFeature([num_ns+1], dtype=tf.int64),
    'label': tf.io.FixedLenFeature([num_ns+1], dtype=tf.int64)
}

# Peek at a few raw (still serialized) records
for raw_record in dataset.take(3):
  print(repr(raw_record))

def _parse_function(example_proto):
  """Parse one serialized example into a dict with 'target', 'context' and 'label'."""
  # Parse the input `tf.train.Example` proto using the dictionary above.
  return tf.io.parse_single_example(example_proto, feature_description)

def dataset_tupler(example_proto):
  """Rearrange a parsed example dict into ((target, context), label)."""
  # Create tuples of the data: needed for the model.fit function
  context = example_proto['context']
  context = tf.expand_dims(context, 1)  # [num_ns+1] -> [num_ns+1, 1]
  target = example_proto['target']
  target = tf.squeeze(target)  # [1] -> scalar
  label = example_proto['label']
  return ((target, context), label)

parsed_dataset = dataset.map(_parse_function)
parsed_dataset = parsed_dataset.map(dataset_tupler)

# Peek at a few parsed records
for parsed_record in parsed_dataset.take(3):
  print(repr(parsed_record))

BATCH_SIZE = 1024
BUFFER_SIZE = 10000
EPOCHS = 3
# Shuffle within a BUFFER_SIZE window, repeat the data EPOCHS times, then batch;
# drop_remainder=True discards a final incomplete batch.
parsed_dataset = parsed_dataset.shuffle(BUFFER_SIZE).repeat(EPOCHS).batch(BATCH_SIZE, drop_remainder=True)
print(parsed_dataset)

<tf.Tensor: shape=(), dtype=string, numpy=b'\nH\n\x11\n\x06target\x12\x07\x1a\x05\n\x03\xe0\xad\x01\n\x13\n\x05label\x12\n\x1a\x08\n\x06\x01\x00\x00\x00\x00\x00\n\x1e\n\x07context\x12\x13\x1a\x11\n\x0f\x96\xca\x02\xac\x013\xd1\xd6\x0e\xd5\xba\x12\xf0\xe7\x03'>
<tf.Tensor: shape=(), dtype=string, numpy=b'\nD\n\x13\n\x05label\x12\n\x1a\x08\n\x06\x01\x00\x00\x00\x00\x00\n\x1a\n\x07context\x12\x0f\x1a\r\n\x0b\xe0\xad\x01\xbe\x8c\x0e"\x05(\xfe\x01\n\x11\n\x06target\x12\x07\x1a\x05\n\x03\x96\xca\x02'>
<tf.Tensor: shape=(), dtype=string, numpy=b'\nF\n\x13\n\x05label\x12\n\x1a\x08\n\x06\x01\x00\x00\x00\x00\x00\n\x1c\n\x07context\x12\x11\x1a\x0f\n\r\xf0\xd7\x16\xfb\xde\x03\n\xa7\x0b\x83\xca\x01`\n\x11\n\x06target\x12\x07\x1a\x05\n\x03\x93\xa0('>
<tf.Tensor: shape=(), dtype=string, numpy=b'\nE\n\x13\n\x05label\x12\n\x1a\x08\n\x06\x01\x00\x00\x00\x00\x00\n\x1b\n\x07context\x12\x10\x1a\x0e\n\x0c\x8f\xeb\x0c\x01\xf4\x0b\xb8\x14\x04\x8b\xe0\x12\n\x11\n\x06target\x12\x07\x1a\x05\n\x03\x93\xa0('>
<tf.

In [80]:
# Build the model: vocabulary of unique_tracks ids, embedding_dim values per id
embedding_dim = 128
word2vec = Word2Vec(unique_tracks, embedding_dim)
# from_logits=True: the model returns raw dot products, not probabilities
word2vec.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [81]:
# Batches per epoch: one pass over the positive pairs (integer division keeps
# the count exact). positive_skipgrams_total comes from the
# generate_training_data cell.
steps = positive_skipgrams_total // BATCH_SIZE
# parsed_dataset is already batched, so batch_size is not passed to fit():
# Keras does not accept batch_size for tf.data.Dataset input.
word2vec.fit(parsed_dataset, epochs=EPOCHS, steps_per_epoch=steps)

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [82]:
# Pull the weights of the target embedding layer into a DataFrame and relabel
# its rows from encoded integers back to the original track ids.
# NOTE: `embedding_layer` is first the Keras layer and is then rebound to the
# DataFrame of its weights.
embedding_layer = word2vec.get_layer('w2v_embedding')
embedding_layer = pd.DataFrame(embedding_layer.get_weights()[0])
embedding_layer.index = encoder.inverse_transform(embedding_layer.index)
embedding_layer.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127
t_00000648-f2ab-4137-bbc3-bcb3ca19401d,-0.010368,-0.000756,0.021837,0.005374,0.04164,0.029533,0.016841,-0.028333,-0.02813,-0.020724,0.038739,-0.017751,0.001728,0.033822,-0.034419,0.035998,-0.010443,0.01356,-0.015316,-0.034014,0.001643,0.009997,-0.000399,0.040203,-0.025627,0.028001,-0.009644,0.032128,0.007436,0.020244,-0.034568,-0.031622,0.004237,-0.021977,0.027528,0.047425,0.031596,0.034115,0.048945,-0.028111,...,0.009841,0.026863,-0.006123,-0.045327,-0.03309,-0.004791,0.018941,-0.000276,0.013019,-0.000231,-0.042153,-0.002086,0.021195,0.045011,-0.020846,0.041249,-0.031687,0.028507,0.048877,-0.04673,-0.023793,-0.013257,-0.002896,0.019198,0.009402,-0.045728,-0.010646,0.005607,-0.004274,0.041726,0.047934,-0.024515,0.001527,0.036644,-0.03421,-0.046519,0.040694,-0.022674,0.001397,0.004469
t_00001e3a-61a6-42c9-a39e-25193b18c519,0.021432,-0.042622,0.021523,0.012924,0.006709,-0.040652,0.009869,0.043943,0.042756,-0.013831,0.047426,-0.000283,-0.046712,-0.001365,-0.010717,0.045156,0.028408,-0.015158,0.049027,0.029514,-0.041287,-0.046607,0.011304,0.003885,-0.002023,0.010821,0.01958,-0.031961,0.020775,-0.019928,0.04445,-0.01878,-0.035133,0.03977,0.038897,0.044703,-0.003505,0.049766,0.010188,0.010257,...,0.028019,-0.047318,-0.027254,-0.048201,0.041742,0.025767,-0.045161,0.003778,-0.002129,-0.034005,-0.010471,-0.016013,-0.006711,-0.029398,-0.033293,-0.001569,-0.03077,0.04878,0.015632,-0.037159,0.049194,-0.007936,-0.019339,0.004609,-0.005318,0.038007,0.022589,0.00509,-0.037229,0.049011,0.018147,0.048988,-0.013694,-0.041093,-0.021706,0.039041,0.008729,0.007067,0.017242,-0.018898
t_000023df-b650-404d-93e5-63655fa87b45,0.008289,-0.043461,-0.035683,-0.028667,-0.026263,-0.002547,-0.013763,0.001759,-0.040731,0.024146,-0.017317,-0.004552,-0.049313,-0.036193,0.022299,-0.027076,-0.027959,0.020817,-0.020325,0.04892,0.001801,-0.027412,-0.016147,-0.015809,-0.025911,-0.012842,-0.026318,-0.014711,-0.00637,0.035134,-0.003191,-0.043959,0.022685,0.005841,0.006767,0.03105,-0.047031,0.044717,-0.000489,0.008295,...,-0.049631,0.005702,0.033953,-0.028125,0.042217,-0.020787,0.033487,-0.025581,-0.039036,0.023605,-0.046946,0.003423,-0.04518,0.005063,0.00113,-0.011074,0.007607,-0.021763,0.025804,-0.029557,-0.022402,0.03974,-0.009913,0.038227,0.035094,0.022852,-0.017575,0.037866,-0.008418,-0.048388,-0.021493,-0.030611,-0.011961,0.031221,0.026194,-0.024017,0.029715,-0.02449,-0.029865,0.040308
t_000039da-e48a-444a-b5b9-9ef55f3bb4cb,-0.02959,0.025135,-0.038919,0.006091,0.001401,0.048404,0.021141,0.007376,0.035847,0.00378,-0.026996,0.01414,0.032945,-0.010561,-0.040233,-0.02304,0.018876,-0.021468,0.034169,0.024269,0.041373,0.033694,-0.039437,-0.022993,0.028889,-0.017386,-0.011286,-0.008528,-0.044399,-0.023928,-0.000195,-0.049717,0.020815,0.006766,0.009013,0.012036,-0.045808,-0.013981,-0.022119,0.029433,...,0.007263,0.030657,0.025952,-0.013677,-0.00683,0.029905,-0.047809,-0.017445,-0.048792,0.004789,-0.015678,0.014983,0.013547,-0.003291,0.049508,-0.013233,0.015233,0.044465,0.001108,0.009087,-0.004427,0.030559,0.018577,0.028703,0.00066,-0.009263,0.022794,-0.03749,0.024537,-0.022219,-0.01576,0.045489,0.048084,-0.021976,-0.03387,0.01432,-0.039233,-0.001775,-0.004781,0.047986
t_00003a96-a43e-4dda-88d8-d69197b29f02,0.015898,-0.019181,-0.03088,-0.022567,-0.044236,0.036118,-0.040491,-0.008395,-0.048526,-0.023702,0.014873,-0.032333,-0.04075,0.001767,0.028696,-0.026748,-0.033809,0.02254,0.032578,0.047828,0.037266,-0.036038,-0.046405,-0.032178,0.049828,0.030763,0.00713,-0.04101,-0.031535,-0.03492,0.009571,-0.039646,-0.046627,-0.010622,0.04349,0.014362,0.0241,-0.039168,0.012908,-0.029748,...,0.049612,0.010061,-0.039903,-0.014623,-0.001555,0.005815,-0.02958,-0.029275,-0.043124,0.049085,-0.008413,-0.025267,-0.04532,-0.010959,0.00871,0.014878,0.045027,0.008175,0.00514,0.024337,-0.014724,0.017455,0.040929,0.007102,0.007163,0.014495,-0.012205,-0.029912,0.014997,0.008553,0.024003,-0.039736,-0.033604,0.014658,0.037931,-0.034407,-0.044877,0.009087,-0.027117,0.032983


In [83]:
# Save the embedding table to Drive as CSV, suffixing the file name with today's date.
# NOTE(review): the statistics section below reads a different file
# (".../Data/w2v_embedding_layer_large") — confirm which export it should load.
current_date_and_time = datetime.datetime.now().date()
current_date_and_time_string = str(current_date_and_time)
embedding_layer.to_csv('/content/drive/My Drive/CS/AI/Data/embeddings/w2v_embedding_layer_'+current_date_and_time_string)

## Some Statistics About the Embedding

In [None]:
# Cosine distance between some embeddings
# Load a saved embedding table; the unnamed first CSV column is renamed to
# track_id_clean (presumably the track-id index written by to_csv — verify).
embedding = pd.read_csv("/content/drive/My Drive/CS/AI/Data/w2v_embedding_layer_large")
embedding.rename(columns={'Unnamed: 0': 'track_id_clean'}, inplace=True)
embedding.head()

In [None]:
# Number of distinct track ids in the loaded embedding table.
# NOTE: this rebinds unique_tracks, which was computed from the logs earlier.
unique_tracks = embedding['track_id_clean'].nunique()
print(unique_tracks)

In [None]:
# Column dtypes and memory usage of the loaded embedding table
embedding.info()

In [None]:
def find_max_cosine_similarity(df):
  '''
    Find the pair of adjacent rows with the highest cosine similarity.

    Only compares each track with the track in the next row, so the result is
    just an indication, not the real maximum over all pairs.

    Rows are paired by position, so the DataFrame does not need a default
    0..n-1 index.

    Args:
      df: DataFrame with a 'track_id_clean' column; every column after the
        first is treated as an embedding value.

    Returns:
      (track_1, track_2, similarity) of the best adjacent pair, with the
      similarity as a float; ("", "", -1) when df has fewer than two rows.
  '''
  import numpy as np  # local import: numpy is not imported at the top of the notebook

  if len(df) < 2:
    return "", "", -1

  track_ids = df['track_id_clean'].to_numpy()
  vectors = df.iloc[:, 1:].to_numpy(dtype=float)

  norms = np.linalg.norm(vectors, axis=1)
  norms[norms == 0] = 1.0  # all-zero vectors get similarity 0 instead of NaN
  # Row-wise dot product of every row with its successor.
  similarities = np.einsum('ij,ij->i', vectors[:-1], vectors[1:]) / (norms[:-1] * norms[1:])

  best = int(np.argmax(similarities))  # first occurrence wins on ties
  return track_ids[best], track_ids[best + 1], float(similarities[best])

In [None]:
# Scan adjacent rows of the embedding table for the most similar pair
max_track_1, max_track_2, max_cosine = find_max_cosine_similarity(embedding)

In [None]:
# Report the pair found above, so the two embedding rows can be inspected below
# to see if their values are indeed similar
print("Most similar tracks: " + str(max_track_1) + " and " + str(max_track_2))
print("Cosine Similarity: "+str(max_cosine))

In [None]:
# Embedding row of the first track of the most similar pair
embedding[embedding['track_id_clean'] == max_track_1].head()

In [None]:
# Embedding row of the second track of the most similar pair
embedding[embedding['track_id_clean'] == max_track_2].head()

In [None]:
# Look up one track in the track-features table.
# NOTE(review): the track id is hard-coded — presumably copied from an earlier
# result; confirm it is still the track you want to inspect.
features = pd.read_csv('/content/drive/My Drive/CS/AI/Data/tf_mini.csv')
features[features['track_id'] == 't_9e647859-7e89-4026-b537-956caf38ceeb']

In [None]:
def find_min_cosine_similarity(df):
  '''
    Find the pair of adjacent rows with the lowest cosine similarity.

    Only compares each track with the track in the next row, so the result is
    just an indication, not the real minimum over all pairs.

    Rows are paired by position, so the DataFrame does not need a default
    0..n-1 index.

    Args:
      df: DataFrame with a 'track_id_clean' column; every column after the
        first is treated as an embedding value.

    Returns:
      (track_1, track_2, similarity) of the least similar adjacent pair, with
      the similarity as a float; ("", "", 1) when df has fewer than two rows.
  '''
  import numpy as np  # local import: numpy is not imported at the top of the notebook

  if len(df) < 2:
    return "", "", 1

  track_ids = df['track_id_clean'].to_numpy()
  vectors = df.iloc[:, 1:].to_numpy(dtype=float)

  norms = np.linalg.norm(vectors, axis=1)
  norms[norms == 0] = 1.0  # all-zero vectors get similarity 0 instead of NaN
  # Row-wise dot product of every row with its successor.
  similarities = np.einsum('ij,ij->i', vectors[:-1], vectors[1:]) / (norms[:-1] * norms[1:])

  # Taking the true minimum (rather than comparing against a sentinel of 1)
  # also reports a pair when every adjacent similarity is 1 or rounds above it.
  worst = int(np.argmin(similarities))  # first occurrence wins on ties
  return track_ids[worst], track_ids[worst + 1], float(similarities[worst])

In [None]:
# Scan adjacent rows of the embedding table for the least similar pair
min_track_1, min_track_2, min_cosine = find_min_cosine_similarity(embedding)

In [None]:
# Report the least similar adjacent pair (the label previously said "Most similar")
print("Least similar tracks: " + str(min_track_1) + " and " + str(min_track_2))
print("Cosine Similarity: "+str(min_cosine))

In [None]:
# Set up a logs directory, so Tensorboard knows where to look for files.
# The path is relative so that it is the same directory the
# `%tensorboard --logdir=logs` cell reads from.
LOG_DIR = 'logs/'
os.makedirs(LOG_DIR, exist_ok=True)

# Metadata file: one track id per line, in the same row order as the weights.
# A Series is iterated directly (it has no iterrows method).
with open(os.path.join(LOG_DIR, 'metadata.tsv'), "w") as f:
  for track_id in embedding['track_id_clean']:
    f.write("{}\n".format(track_id))

# Save the embedding values as a checkpoint the projector can load.
weights = embedding.drop(columns=['track_id_clean']).values
weights = tf.Variable(weights, name='Track Embedding')
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(LOG_DIR, "embedding.ckpt"))

# Projector configuration. A separate name is used for the config entry so the
# `embedding` DataFrame is not overwritten and the cell can be re-run.
config = projector.ProjectorConfig()
embedding_config = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`
embedding_config.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding_config.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(LOG_DIR, config)

In [None]:
# Launch TensorBoard inline; --logdir has to point at the directory the
# projector files were written to (LOG_DIR)
%tensorboard --logdir=logs