In [None]:
import pandas as pd
import keras
import numpy as np

Using TensorFlow backend.


In [None]:
# Load the listening history (one row per play) and give the four columns
# readable names. The date column is parsed into timestamps on load.
COLUMN_NAMES = ['Artist', 'Album', 'Song', 'Date']
df = pd.read_csv('data.csv', parse_dates=[COLUMN_NAMES.index('Date')])
df.columns = COLUMN_NAMES
df.head()

Unnamed: 0,Artist,Album,Song,Date
0,The Smiths,Louder Than Bombs,Sheila Take a Bow - 2011 Remaster,2019-10-03 13:32:00
1,The Smiths,Louder Than Bombs,Heaven Knows I'm Miserable Now - 2011 Remaster,2019-10-03 13:28:00
2,The Smiths,The Smiths,This Charming Man - 2011 Remaster,2019-10-03 13:25:00
3,The Smiths,Hatful of Hollow,How Soon Is Now? - 2011 Remaster,2019-10-03 13:19:00
4,The Smiths,The Queen Is Dead,The Queen Is Dead - 2017 Master,2019-10-03 13:12:00


In [None]:
# Vocabulary size used when sizing the song embedding table (see create_model).
VOCAB_SIZE = 1000

In [None]:
# Rank every (Artist, Song) pair by how often it was played: rank 1 is the
# most played song, ties are broken by order of appearance (method='first').
# The resulting ranks are 1-based and are used as the integer ids fed to the model.
play_counts = df.groupby(['Artist', 'Song'])['Song'].count()
song_ranks = play_counts.rank(method='first', ascending=False).astype(int).sort_values()
# The vocabulary cut-off below is currently disabled, so every song keeps its rank.
#df_g = df_g[df_g['rank']<=VOCAB_SIZE]
df_g = song_ranks.to_frame(name='rank').reset_index()
df_g.head(10)

Unnamed: 0,Artist,Song,rank
0,Maroon 5,Maps,1
1,Lilly Wood & The Prick,Prayer in C - Robin Schulz Radio Edit,2
2,Phoenix,Armistice,3
3,Eminem,The Monster,4
4,Rayden,Mariposas,5
5,Rayden,Matemática de la carne,6
6,Santaflow,La Cabra Tira Al Monte,7
7,Porta,Nota De Suicidio (Con Soma),8
8,Lorde,Team,9
9,The Vamps,Can We Dance,10


In [None]:
# Attach each play's song rank.
# `df` is deliberately the LEFT frame: an inner merge preserves the order of the
# left keys, so the plays keep their original (chronological) order, which the
# sequence-based Word2Vec cell further down depends on. With df_g on the left the
# plays would come out grouped by song instead.
# Dropping any existing 'rank' column first makes the cell safe to re-run
# (otherwise a second run would produce rank_x / rank_y columns).
df = pd.merge(df.drop(columns='rank', errors='ignore'), df_g, on=['Artist', 'Song'])

In [None]:
from collections import Counter

class NegativeSamplingGenerator(keras.utils.Sequence):
  """Keras Sequence producing (target, context) song-rank pairs with 0/1 labels.

  Each batch is built around one randomly drawn play (the "fixed element"):
  positives are plays sampled from a time window around it, negatives are
  plays sampled uniformly from the whole frame.
  """

  def __init__(self, df, positive_samples, negative_samples, batch_size = 32, window_length=pd.Timedelta(hours=1)):
    """Store the sampling configuration.

    df: frame with at least 'Artist', 'Song', 'Date' and 'rank' columns.
    positive_samples: number of in-window pairs per batch (label 1).
    negative_samples: number of random pairs per batch (label 0).
    batch_size: value reported by __len__, i.e. the number of batches per epoch.
    window_length: half-width of the time window around the fixed element.
    """
    self.df = df
    self.positive_samples = positive_samples
    self.negative_samples = negative_samples
    self.batch_size = batch_size
    self.window_length = window_length
    # Counts how often each 'Artist - Song' was drawn as the fixed element.
    self.app_counter = Counter()

  def __len__(self):
    """Batches per epoch; equals the number of fixed elements drawn per epoch."""
    return self.batch_size

  def __getitem__(self, index):
    """Build one batch; `index` is ignored because every batch is drawn at random.

    Returns ([X1, X2], Y) where X1 repeats the fixed element's rank, X2 holds the
    ranks of the positive then negative samples, and Y holds the matching labels.
    """
    anchor = self.df.sample(1).iloc[0]
    anchor_title = anchor['Artist'] + ' - ' + anchor['Song']
    self.app_counter[anchor_title] += 1

    # Positives: plays within +/- window_length of the anchor, drawn with replacement.
    window_start = anchor.Date - self.window_length
    window_end = anchor.Date + self.window_length
    in_window = (self.df.Date >= window_start) & (self.df.Date <= window_end)
    positive = self.df[in_window].sample(self.positive_samples, replace=True)['rank'].values

    # Negatives: plays drawn from the whole history, without replacement.
    negative = self.df.sample(self.negative_samples)['rank'].values

    pair_count = self.positive_samples + self.negative_samples
    X1 = np.full((pair_count,), anchor['rank'])
    X2 = np.concatenate((positive, negative))
    Y = np.array([1]*self.positive_samples + [0]*self.negative_samples)
    return [X1, X2], Y

In [None]:
# Adapted from https://github.com/adventuresinML/adventures-in-ml-code/blob/master/keras_word2vec.py

from keras.layers import Input, Dense, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.models import Model


# vocab_size = # of different songs
# vector_dim = embedding dimensions
def create_model(vocab_size, vector_dim):
  """Build the pair-scoring model used to learn song embeddings.

  Two integer inputs (target and context) go through the same embedding layer;
  the dot product of the two vectors is passed through a single sigmoid unit.

  vocab_size: number of rows in the embedding table.
  vector_dim: length of each embedding vector.
  Returns an uncompiled keras Model taking [target, context] inputs.
  """
  target_in = Input((1,))
  context_in = Input((1,))

  # A single table shared by both inputs; named so it can be fetched later.
  shared_embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

  def as_column(index_input):
    """Look up the embedding and reshape it from (1, vector_dim) to (vector_dim, 1)."""
    looked_up = shared_embedding(index_input)
    return Reshape((vector_dim, 1))(looked_up)

  target_vec = as_column(target_in)
  context_vec = as_column(context_in)

  similarity = Dot(axes=1)([target_vec, context_vec])
  similarity = Reshape((1,))(similarity)
  prediction = Dense(1, activation='sigmoid')(similarity)

  return Model(inputs=[target_in, context_in], outputs=prediction)

In [None]:
VECTOR_DIM = 20
# The ids fed to the model are the 1-based song ranks in df_g, and an Embedding
# layer needs input_dim > the largest index it will ever see. Size the table to
# cover every rank, never going below the configured VOCAB_SIZE.
embedding_rows = max(VOCAB_SIZE, int(df_g['rank'].max()) + 1)
model = create_model(embedding_rows, VECTOR_DIM)

In [None]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 20)        20000       input_13[0][0]                   
                                                                 input_14[0][0]                   
__________________________________________________________________________________________________
reshape_14 (Reshape)            (None, 20, 1)        0           embedding[0][0]                  
__________

In [None]:
# Binary classification of (target, context) pairs: 1 = co-occurring, 0 = random.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Each batch holds 1 positive and 10 negative pairs around one random play.
# batch_size is what the generator reports as its length, i.e. 256 batches per epoch.
data_generator = NegativeSamplingGenerator(
  df,
  positive_samples=1,
  negative_samples=10,
  batch_size=256,
  window_length=pd.Timedelta(days=2),
)

In [None]:
# Train on randomly generated batches. epochs=5000 is only an upper bound:
# the recorded run below was stopped by hand (KeyboardInterrupt).
model.fit_generator(data_generator, epochs=5000, verbose=1)

Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000
Epoch 58/5000
Epoch 59/5000
Epoch 60/5000
Epoch 61/5000
Epoch 62/5000
Epoch 63/5000
Epoch 64/5000
Epoch 65/5000
Epoch 66/5000
Epoch 67/5000
Epoch 68/5000
Epoch 69/5000
Epoch 70/5000
Epoch 71/5000
Epoch 72/5000
E

KeyboardInterrupt: ignored

In [None]:
# Map every 'Artist - Song' title to its learned embedding vector.
song_to_vec = {}

embedding_matrix = model.get_layer('embedding').get_weights()[0]

# The model was trained on the 1-based 'rank' values, so the vector of a song is
# the embedding row at index `rank` -- NOT at the 0-based row position in df_g
# (that would hand every song the vector of its neighbour).
for artist, song, rank in zip(df_g['Artist'], df_g['Song'], df_g['rank']):
  if rank >= len(embedding_matrix):
    # No embedding row exists for this id (the table is smaller than the vocabulary).
    continue
  song_to_vec[artist + ' - ' + song] = embedding_matrix[rank]

In [None]:
def cosine_similarity(a, b):
  """Return the cosine of the angle between vectors `a` and `b`.

  The result lies in [-1, 1]. If either vector has zero norm the similarity is
  undefined; 0.0 is returned instead of nan so that callers which sort by
  similarity are not handed an unorderable value.
  """
  norm_product = np.linalg.norm(a) * np.linalg.norm(b)
  if norm_product == 0:
    return 0.0
  return np.dot(a, b) / norm_product

In [None]:
def song_similarity(songA, songB):
  """Cosine similarity between the embeddings of two songs.

  Both arguments are 'Artist - Song' keys of song_to_vec; a KeyError is raised
  if either one is missing.
  """
  vec_a = song_to_vec[songA]
  vec_b = song_to_vec[songB]
  return cosine_similarity(vec_a, vec_b)

In [None]:
def find_most_similar(song, k):
  """Return the `k` songs whose embeddings are closest to that of `song`.

  song: 'Artist - Song' key of song_to_vec (KeyError if missing).
  k: number of results to return.
  Returns a list of (title, cosine similarity) tuples, most similar first.
  The query song itself is part of song_to_vec and so appears in the result.
  """
  song_vec = song_to_vec[song]
  # Score every song exactly once, then order by that score.
  scored = [(title, cosine_similarity(song_vec, vec)) for title, vec in song_to_vec.items()]
  scored.sort(key=lambda pair: pair[1], reverse=True)
  return scored[:k]

In [None]:
# Spot check: cosine similarity between the learned embeddings of two songs.
song_similarity('Maroon 5 - Maps', 'Bayside - Montauk')

-0.34134343

In [None]:
# The 10 nearest songs in embedding space; the query song itself is included.
find_most_similar("Maroon 5 - Maps", 10)

[('Maroon 5 - Maps', 1.0),
 ('Enanitos Verdes - Igual que ayer', 0.7552056),
 ('Alan Walker - Faded', 0.71181303),
 ('My Chemical Romance - Helena (So Long & Goodnight)', 0.6556673),
 ('Twenty One Pilots - Not Today', 0.6398981),
 ('Bayside - Not Fair', 0.6286265),
 ('SFDK - Todo Lo Que Importa', 0.6062225),
 ('David Guetta - Turn Me On', 0.60199237),
 ('Charli XCX - Boom Clap', 0.59383),
 ('Juanes - Volverte A Ver', 0.592072)]

In [None]:
from collections import Counter

# Baseline for the embeddings: count which songs were played within two days of
# any play of the searched song.
title_searched = 'Morandi - Summer in December'
# maxsplit=1: only the first ' - ' separates artist from song, so titles such as
# 'Lost Frequencies - Reality - Radio Edit' still unpack into exactly two parts.
artist_searched, song_searched = title_searched.split(' - ', 1)
df_apps = df[(df.Song==song_searched) & (df.Artist==artist_searched)]

related_window = pd.Timedelta(days=2)
play_titles = df.Artist + ' - ' + df.Song  # built once, reused for every window

related = Counter()
for played_at in df_apps.Date:
  in_window = (df.Date >= played_at - related_window) & (df.Date <= played_at + related_window)
  # One vectorised update per appearance instead of a Python loop over every row.
  related.update(play_titles[in_window].tolist())
related.most_common(20)

[('Morandi - Summer in December', 75),
 ('The Chainsmokers - Waterbed', 74),
 ('Lost Frequencies - Reality - Radio Edit', 72),
 ('Calvin Harris - How Deep Is Your Love', 70),
 ('New Politics - 15 Dreams', 63),
 ('Rapsusklei - Enero', 62),
 ('Celeste Buckingham - Bleeding', 62),
 ('SFDK - Cantando Bajo la Vida', 61),
 ('The Vamps - Can We Dance', 57),
 ('Jason French - You Just Want My Money', 56),
 ('New Found Glory - Vicious Love (feat. Hayley Williams)', 55),
 ("The Weeknd - Can't Feel My Face", 53),
 ('Katy Tiz - Whistle (While You Work It)', 51),
 ('Rayden - Mariposas', 50),
 ('David Guetta - Sun Goes Down (feat. MAGIC! & Sonny Wilson)', 49),
 ('Phoenix - Armistice', 48),
 ('Arctic Monkeys - Do I Wanna Know?', 48),
 ('Shawn Mendes - Stitches', 48),
 ('SFDK - Orgullo Banderillero', 47),
 ('Adam Lambert - Ghost Town', 47)]

In [None]:
# One 'Artist - Song' token per play, in the row order of df.
song_titles = df['Artist'] + ' - ' + df['Song']
all_songs = song_titles.values
all_songs

array(['The Smiths - Sheila Take a Bow - 2011 Remaster',
       "The Smiths - Heaven Knows I'm Miserable Now - 2011 Remaster",
       'The Smiths - This Charming Man - 2011 Remaster', ...,
       'Ellie Goulding - Burn', 'Pitbull - Timber',
       'Clean Bandit - Rather Be (feat. Jess Glynne)'], dtype=object)

In [None]:
from gensim.models import Word2Vec

# gensim's optimised training routine only consumes the first 10000 tokens of a
# sentence and ignores the rest. Feeding the whole history as one sentence would
# therefore train on just the first 10000 plays, so split it into consecutive
# chunks that each fit within that limit.
MAX_SENTENCE_LEN = 10000
play_sequence = all_songs.tolist()
sentences = [play_sequence[start:start + MAX_SENTENCE_LEN]
             for start in range(0, len(play_sequence), MAX_SENTENCE_LEN)]
word2vec = Word2Vec(sentences, size=5, window=10, min_count=20, iter=100)

In [None]:
# Nearest neighbours of the same song in the gensim embedding space.
word2vec.wv.most_similar(['Morandi - Summer in December'])

[("Shakira - Can't Remember to Forget You", 0.9156844019889832),
 ('The Neighbourhood - Alleyways', 0.9072766304016113),
 ('Ellie Goulding - Burn', 0.9036813378334045),
 ('lovelytheband - broken', 0.903408408164978),
 ('Lily Allen - LDN', 0.8941390514373779),
 ('All Time Low - So Long Soldier', 0.890181303024292),
 ('Sir Sly - &Run', 0.8828120231628418),
 ('Waterparks - Blonde', 0.8785048723220825),
 ('The Neighbourhood - Let It Go', 0.860504150390625),
 ('Lorde - Tennis Court', 0.8587268590927124)]