In [1]:
import io
import itertools
import numpy as np
import os
import re
import string
import tensorflow as tf
import tqdm

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [4]:
with open(path_to_file) as f: 
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [9]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [16]:
list(text_ds)[:20]

[<tf.Tensor: shape=(), dtype=string, numpy=b'First Citizen:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Before we proceed any further, hear me speak.'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'All:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Speak, speak.'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'First Citizen:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'You are all resolved rather to die than to famish?'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'All:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Resolved. resolved.'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'First Citizen:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'First, you know Caius Marcius is chief enemy to the people.'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'All:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b"We know't, we know't.">,
 <tf.Tensor: shape=(), dtype=string, numpy=b'First Citizen:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b"Let us kill him, and we'll have corn at our

In [10]:
# We create a custom standardization function to lowercase the text and 
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')

# Define the vocabulary size and number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [11]:
vectorize_layer.adapt(text_ds.batch(1024))

In [12]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

[b'the', b'and', b'to', b'i', b'of', b'you', b'my', b'a', b'that', b'in', b'is', b'not', b'for', b'with', b'me', b'it', b'be', b'your', b'his', b'this']


In [13]:
vectorize_layer

<tensorflow.python.keras.layers.preprocessing.text_vectorization.TextVectorization at 0x7f1d5fcc2e50>

In [17]:
len(inverse_vocab)

4095

In [7]:
input_array = np.array([["earth", "wind", "and", "fire"], ["fire", "and", "earth", "michigan"]]) 
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

input_data = tf.keras.Input(shape=(None,)) 
layer = get_layer_class(max_tokens=None, standardize=None, split=None, output_mode=text_vectorization.INT, vocabulary=vocab_data) 
int_data = layer(input_data) 
model = tf.keras.Model(inputs=input_data, outputs=int_data)

output_dataset = model.predict(input_array)

NameError: name 'get_layer_class' is not defined

In [24]:
vocab_data = ["earth", "wind", "and", "fire"]
max_len = 4  # Sequence length to pad the outputs to.

vectorize_layer = TextVectorization(
    max_tokens=8,
    output_mode='int',
    output_sequence_length=max_len)

vectorize_layer.adapt(np.array(vocab_data))
vectorize_layer.get_vocabulary()


[b'wind', b'fire', b'earth', b'and']

In [12]:
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
max_features = 5000  # Maximum vocab size.
max_len = 4  # Sequence length to pad the outputs to.
embedding_dims = 2

# Create the layer.
vectorize_layer = TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len)

Now that the vocab layer has been created, call `adapt` on the text-only dataset to create the vocabulary. You don't have to batch, but for large datasets this means we're not keeping spare copies of the dataset.

In [13]:
vectorize_layer.adapt(text_dataset.batch(64))
vectorize_layer.get_vocabulary()

[b'foo', b'baz', b'bar']

Create the model that uses the vectorize text layer. 

Start by creating an explicit input layer. It needs to have a shape of (1,) (because we need to guarantee that there is exactly one string input per batch), and the dtype needs to be 'string'.

The next layer in our model is the vectorization layer. After this layer, we have a tensor of shape (batch_size, max_len) containing vocab indices.

In [15]:
model = tf.keras.models.Sequential()

model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

model.add(vectorize_layer)

Now, the model can map strings to integers, and you can add an embedding layer to map these integers to learned embeddings.

In [16]:
input_data = [["foo qux bar"], ["qux baz"]]
model.predict(input_data)

array([[2, 1, 4, 0],
       [1, 3, 0, 0]])

# NOW

I'm going to test this with a different word set

In [23]:
text_dataset = tf.data.Dataset.from_tensor_slices(["earth", "wind", "and", "fire"])
max_features = 5000  # Maximum vocab size.
max_len = 10  # Sequence length to pad the outputs to.

# Create the layer.
vectorize_layer = TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len)

vectorize_layer.adapt(text_dataset.batch(64))

model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)

input_data = [["earth wind and fire"], ["fire and earth michigan"]]
model.predict(input_data)

array([[4, 2, 5, 3, 0, 0, 0, 0, 0, 0],
       [3, 5, 4, 1, 0, 0, 0, 0, 0, 0]])

Note that `input_data` has to be a list of lists that are one element long...

In [21]:
result

array([[6, 4, 8, 2, 7, 5, 3, 0, 0, 0],
       [5, 7, 6, 4, 1, 0, 0, 0, 0, 0]])

In [22]:
type(result)

numpy.ndarray

so the output is an ndarray with padding to make every sequence the same length.

In [28]:
text_dataset = tf.data.Dataset.from_tensor_slices(["earth", "wind", "and", "fire"])
max_features = 50  # Maximum vocab size, this should map to the number of unique words
max_len = 10  # Sequence length to pad the outputs to.

# Create the layer.
vectorize_layer = TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 #output_sequence_length=max_len
)

vectorize_layer.adapt(text_dataset.batch(64))

model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)

input_data = [["earth wind and fire"], ["fire and earth michigan crab plate cue"]]
model.predict(input_data)

array([[4, 2, 5, 3, 0, 0, 0],
       [3, 5, 4, 1, 1, 1, 1]])

OH INTERESTING, it automatically pads; if you remove the max_len, it just picks the longest sequence. ALSO, `1` means an `UNKNOWN` vocabulary word, meaning no index was assigned.

So if I can pad my manually vectorized sequences list, I should be good...
perhaps this is what I can use: https://raghakot.github.io/keras-text/keras_text.processing/

In [33]:
pip install keras-text

Collecting keras-text
  Downloading keras_text-0.1-py2.py3-none-any.whl (26 kB)
Collecting spacy>=2.0.3
  Downloading spacy-2.3.5-cp37-cp37m-manylinux2014_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 9.3 MB/s eta 0:00:01
Collecting keras>=2.1.2
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Collecting srsly<1.1.0,>=1.0.2
  Downloading srsly-1.0.5-cp37-cp37m-manylinux2014_x86_64.whl (184 kB)
[K     |████████████████████████████████| 184 kB 50.9 MB/s eta 0:00:01
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.4-cp37-cp37m-manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 22.8 MB/s eta 0:00:01
[?25hCollecting wasabi<1.1.0,>=0.4.0
  Downloading wasabi-0.8.0-py3-none-any.whl (23 kB)
Collecting catalogue<1.1.0,>=0.0.7
  Downloading catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.5-cp37-cp37m-manylinux2014_x86_64.whl (126 kB)
[K     |███████████████████████████

In [38]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [46]:
import pandas as pd

data = pd.read_csv('data/data_w_genres.csv')
corpus = data['genres']
corpus = corpus[corpus != '[]']
corpus = corpus.str.lstrip('[')
corpus = corpus.str.rstrip(']')
corpus = corpus.str.replace("'", '')
corpus = corpus.str.split(', ')
corpus.head()

0                                          [show tunes]
8                          [comedy rock, comic, parody]
9     [emo rap, florida rap, sad rap, underground hi...
10                                [dark trap, meme rap]
12    [asian american hip hop, cali rap, west coast ...
Name: genres, dtype: object

In [30]:
genre_seq = []
for values in corpus:
    for genre in values.split(', '):
        genre_seq.append(genre)
genres = set(genre_seq)

vocab_size = len(genres)

In [42]:
genre2int = {}

for i,genre in enumerate(genres):
    genre2int[genre] = i
    
genre2int

{'baltimore indie': 0,
 'piano blues': 1,
 'berlin minimal techno': 2,
 'turbo folk': 3,
 'sovietwave': 4,
 'future garage': 5,
 'pinoy reggae': 6,
 'russian metalcore': 7,
 'russian trance': 8,
 'dutch rap pop': 9,
 'japanese folk': 10,
 'quebec death metal': 11,
 'motivation': 12,
 'psychedelic blues-rock': 13,
 'russian modern classical': 14,
 'polish reggae': 15,
 'abstract hip hop': 16,
 'japanese metalcore': 17,
 'vaudeville': 18,
 'electra': 19,
 'techno': 20,
 'neue deutsche todeskunst': 21,
 'ok indie': 22,
 'melodic deathcore': 23,
 'hungarian classical piano': 24,
 'dark techno': 25,
 'swiss pop': 26,
 'aussietronica': 27,
 'chicago drill': 28,
 'smooth saxophone': 29,
 'latin funk': 30,
 'ska punk': 31,
 'psychedelic folk rock': 32,
 'arab pop': 33,
 'pei indie': 34,
 'deep ccm': 35,
 'classic afrobeat': 36,
 'progressive psytrance': 37,
 'icelandic jazz': 38,
 'irish country': 39,
 'progressive sludge': 40,
 'deep smooth jazz': 41,
 'belgian indie rock': 42,
 'cinematic po

In [45]:
genre_seq[8]

'vapor trap'

In [47]:
genre_int_seq = []
for sequence in corpus:
    genre_int_seq.append([genre2int[genre] for genre in sequence])
    
genre_int_seq[:5]

[[2371],
 [605, 937, 2516],
 [1557, 1347, 1270, 1013, 2322],
 [1046, 2798],
 [1468, 2083, 1607]]

In [48]:
man_input_data = pad_sequences(
    genre_int_seq, 
    padding="post",
    value=0)

In [50]:
man_input_data[:5]

array([[2371,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [ 605,  937, 2516,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [1557, 1347, 1270, 1013, 2322,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [1046, 2798,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [1468, 2083, 1607,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)