In [60]:
import pandas as pd

import io
import itertools
import numpy as np
import os
import re
import string
import tensorflow as tf
import tqdm

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [61]:
# Global Variables
SEED = 42
AUTOTUNE = tf.data.experimental.AUTOTUNE
WINDOW_SIZE = 5
MAX_SEQ_LENGTH = 11
NUM_NS = 3

## Data Collection and Indexing

We're going to use tensorflow's build in TextVectorization layer to convert a corpus of lists of genres into indexes.

First we're importing a dataframe in using pandas (I think I can remove this step and do this with TF later #TODO)

In [62]:
data = pd.read_csv('data/data_w_genres.csv')
corpus = data['genres']
corpus = corpus[corpus != '[]']
corpus = corpus.str.lstrip('[')
corpus = corpus.str.rstrip(']')
corpus = corpus.str.replace("'", '')
corpus = corpus.str.split(', ')
corpus.head()

0                                          [show tunes]
8                          [comedy rock, comic, parody]
9     [emo rap, florida rap, sad rap, underground hi...
10                                [dark trap, meme rap]
12    [asian american hip hop, cali rap, west coast ...
Name: genres, dtype: object

In [63]:
# # this may appear redundant with the vectorize layer but I need to get the 
# # total number of genres present to estable a max "vocab" size

# genre_seq = []
# for values in corpus:
#     for genre in values.split(', '):
#         genre_seq.append(genre)
# genres = set(genre_seq)

# vocab_size = len(genres)

# print(FindMaxLength(genre_seq))

In [64]:
genre_sequences = list(corpus)
flat_corpus = [item for sublist in genre_sequences for item in sublist]
genres = set(flat_corpus)

vocab_size = len(genres)

In [65]:
def FindMaxLength(lst): 
    maxList = max(lst, key = len) 
    maxLength = max(map(len, lst)) 
      
    return maxList, maxLength 

In [66]:
print(FindMaxLength(genre_seq))

NameError: name 'genre_seq' is not defined

In [67]:
df = pd.DataFrame(flat_corpus)

In [75]:
frequency = pd.DataFrame(df.value_counts())
frequency.head()

Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
rock,611
pop,593
dance pop,572
rap,516
hip hop,507


In [76]:
frequency.columns = ['occurances']

In [77]:
frequency.index.rename('genre', inplace = True)

In [79]:
frequency.reset_index(inplace = True)

In [80]:
frequency.head()

Unnamed: 0,genre,occurances
0,rock,611
1,pop,593
2,dance pop,572
3,rap,516
4,hip hop,507


In [81]:
import plotly.express as px

fig = px.bar(frequency, x='genre', y='occurances')
fig.show()

In [22]:
genre2int = {}

for i,genre in enumerate(genres):
    genre2int[genre] = i

In [23]:
genre_int_sequences = []
for sequence in genre_sequences:
    genre_int_sequences.append([genre2int[genre] for genre in sequence])
    
genre_int_sequences[:5]

[[1583],
 [2072, 1671, 1969],
 [1462, 1926, 1257, 1071, 686],
 [2441, 1498],
 [2558, 1863, 2822]]

Ultimately, we want a list of lists where the sublists represent a sequence of genres in index form. However, for the word2vec model, we need padded lists. Normally this is handled in the TextVectorization class, however, because I am using genres (which are sometimes comprised of multiple words) instead of a strict one-word vocabulary, I am going to manually create the require numpy array of genre indexs and pad these lists using the `pad_sequences` method in the [preprocessing library](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences#used-in-the-notebooks).

In [26]:
padded_int_sequences = pad_sequences(
    genre_int_sequences, 
    padding="post",
    value=0)

In [28]:
padded_int_sequences[:3]

array([[1583,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [2072, 1671, 1969,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [1462, 1926, 1257, 1071,  686,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)

In [29]:
type(padded_int_sequences)

numpy.ndarray

## Generating skip-grams

## Scaling Up: Generating Full Train Data

Now we'll take all steps above and write a function to handle a list of vectorized "sentences" (or vectorize genre list in this case). A *sampling table* is built first and the tuples of target, context and labels are generated.

In [31]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and genre-options size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence, 
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples 
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1, 
          num_sampled=num_ns, 
          unique=True, 
          range_max=vocab_size, 
          seed=SEED, 
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")
        
      # NO SQUEEZING?
      # Reshape target to shape (1,) and context and label to (num_ns+1,).
      # target = tf.squeeze(target_word)
      # context = tf.squeeze(context)
      # label =  tf.squeeze(label)

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

Now I should be able to use this function to construct a list of targets, contexts, and labels.

In [32]:
targets, contexts, labels = generate_training_data(
    sequences=padded_int_sequences, 
    window_size=WINDOW_SIZE, 
    num_ns=NUM_NS, 
    vocab_size=vocab_size, 
    seed=SEED)
print(len(targets), len(contexts), len(labels))

100%|██████████| 18823/18823 [00:04<00:00, 3823.80it/s]


79963 79963 79963


In [33]:
print(f"target  :", targets[0])
print(f"context :", contexts[0] )
print(f"label   :", labels[0] )

target  : 1969
context : tf.Tensor(
[[1671]
 [ 139]
 [  17]
 [ 637]], shape=(4, 1), dtype=int64)
label   : tf.Tensor([1 0 0 0], shape=(4,), dtype=int64)


## Configure the Dataset for Performance

We can put the lists of targets, contexts, and labels together in a formal TF dataset structure using `tf.data.Dataset` which is an object of `(target_word, context_word), (label)`.

Batch size is ???
Buffer size is???

BatchDataset shapes look correct per the example, even without squeezing.

It would also be ideal to add `cache()` ??? and `perfetch()` ??? to improve performance, but I cannot get autotuning to work. So I'm going to see if I can forgoe it. NEVERMIND, I found a different way to define autotune.

In [34]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 4, 1)), (1024, 4)), types: ((tf.int32, tf.int64), tf.int64)>


In [35]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 4, 1)), (1024, 4)), types: ((tf.int32, tf.int64), tf.int64)>


## Model and Training

In [38]:
class Word2Vec(Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = Embedding(vocab_size, 
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding", )
    self.context_embedding = Embedding(vocab_size, 
                                       embedding_dim, 
                                       input_length=NUM_NS+1)
    self.dots = Dot(axes=(3,2))
    self.flatten = Flatten()

  def call(self, pair):
    target, context = pair
    we = self.target_embedding(target)
    ce = self.context_embedding(context)
    dots = self.dots([ce, we])
    return self.flatten(dots)

In [39]:
embedding_dim = 128
word2vec = Word2Vec(len(genres), embedding_dim)
word2vec.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [40]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [41]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Train for 78 steps
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4197912390>

Gaaah I didn't build a vectorized layer so I'm not sure if I'll be able to do this next part... no, I'm fine, I just need to get a flat version songgenres...

In [42]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = genres

In [44]:
weights.shape

(2972, 128)

In [47]:
weights[2971]

array([-0.10396512,  0.108107  ,  0.0088021 ,  0.13973187,  0.1488972 ,
       -0.13805892,  0.08074608, -0.13030392, -0.20428312, -0.142047  ,
        0.14050712,  0.19607213, -0.15363061, -0.1970656 , -0.13440153,
       -0.15356648, -0.17936558,  0.1270342 , -0.163523  ,  0.12451697,
        0.18876666, -0.10824996, -0.10260601,  0.10805888, -0.16713667,
        0.01254791,  0.09476343,  0.06278937,  0.13785484,  0.14142327,
       -0.12185849,  0.15325734,  0.16461438, -0.0177851 ,  0.12840503,
        0.18270724,  0.08598521, -0.1566555 ,  0.02556141, -0.1379484 ,
        0.1407831 , -0.12443019,  0.11294575, -0.16306634, -0.07205127,
       -0.1656258 , -0.10214517, -0.16654333, -0.13710871, -0.09189206,
       -0.10637689, -0.1709959 ,  0.08887486, -0.11582809, -0.19945604,
       -0.14722018,  0.15332708,  0.15598653, -0.14447874, -0.18867221,
       -0.10389595,  0.10557889, -0.18312195, -0.11984608,  0.10776726,
        0.13912441,  0.12660201,  0.15367733,  0.052963  , -0.11

In [50]:
len(vocab)

2972

In [51]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  #if  index == 0: continue # skip 0, it's padding.
  vec = weights[index] 
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()