In [None]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np

# Request the two splits by name so the unpacking below does not depend on the
# ordering of the dict that tfds.load returns when no split is given.
ds_train, ds_test = tfds.load('ag_news_subset', split=['train', 'test'])

In [None]:
# Hyper-parameters reused by later cells.
vocab_size = 30000  # upper bound on the number of tokens the vectorizer keeps
batch_size = 128    # batch size applied to the training and validation datasets

# Text -> integer token ids. input_shape=(1,) is given so the layer can sit
# first in the Sequential model below.
vectorizer = keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=vocab_size,
    input_shape=(1,),
)

# Classifier: token ids -> 100-dim embeddings -> mean over axis 1 -> 4-way softmax.
model = keras.models.Sequential()
model.add(vectorizer)
model.add(keras.layers.Embedding(vocab_size, 100))
model.add(keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)))
model.add(keras.layers.Dense(4, activation='softmax'))
model.summary()

In [None]:
def extract_text(x):
    """Join the 'title' and 'description' entries of a record with one space.

    `x` is indexed by the keys 'title' and 'description'; the two values are
    combined with `+`, so they must support concatenation with a str.
    """
    title, description = x['title'], x['description']
    return title + ' ' + description

def tupelize(x):
    """Turn a record into a `(text, label)` pair.

    The text comes from `extract_text(x)`; the label is `x['label']` unchanged.
    """
    text = extract_text(x)
    label = x['label']
    return text, label

print("Training vectorizer")
# Build the vocabulary from the text of the first 500 training records only.
vectorizer.adapt(
    ds_train.take(500).map(extract_text)
)

# No optimizer is passed, so Keras uses its default one.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)
model.fit(
    ds_train.map(tupelize).batch(batch_size),
    validation_data=ds_test.map(tupelize).batch(batch_size),
)

## Dealing with variable sequence sizes


In [None]:
# Vectorize two sentences of different length, one call per sentence,
# and print each result on its own line.
print(
    vectorizer('Hello, world!'),
    vectorizer('I am glad to meet you!'),
    sep='\n',
)

In [None]:
# Vectorize both sentences in a single call (one batch of two strings).
# NOTE(review): presumably the shorter sequence is padded to the longer one's
# length — confirm against the displayed output.
vectorizer(['Hello, world!', 'I am glad to meet you!'])

In [None]:
# Feed the vectorized batch through model.layers[1] (the layer placed right
# after the vectorizer in the model) and show the result as a NumPy array.
model.layers[1](vectorizer(['Hello, world!', 'I am glad to meet you!'])).numpy()

## Semantic embeddings: Word2Vec

In [1]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
w2v = api.load('word2vec-google-news-300')



In [None]:
# Print every (word, score) pair that most_similar returns for 'neural'.
for word, score in w2v.most_similar('neural'):
    print(f"{word} -> {score}")

In [None]:
# Show the first 20 components of the embedding vector for 'play'.
w2v['play'][:20]

In [None]:
# Analogy query: 'king' and 'woman' contribute positively, 'man' negatively;
# [0] keeps only the first entry of the returned list.
w2v.most_similar(positive=['king', 'woman'], negative=['man'])[0]

In [None]:
# Query vector for king - man + woman, with the man/woman terms scaled by 1.7
qvec = w2v['king'] - 1.7 * w2v['man'] + 1.7 * w2v['woman']
# Squared Euclidean distance from the query to every embedding vector
d = ((w2v.vectors - qvec) ** 2).sum(axis=1)
# Index of the closest embedding vector
min_idx = d.argmin()
# Look up the corresponding word
w2v.index_to_key[min_idx]

## Using pretrained embeddings in keras

### Using tokenizer vocabulary

In [None]:
# Dimensionality of the pretrained vectors, measured on a known word.
embed_size = len(w2v.get_vector('hello'))
print(f'Embedding size: {embed_size}')

vocab = vectorizer.get_vocabulary()
# Row i of W receives the pretrained vector of vocabulary token i; rows for
# tokens that word2vec does not know stay all-zero.
W = np.zeros((vocab_size, embed_size))
print('Populating matrix, this will take some time...',end='')
found, not_found = 0, 0
for i, w in enumerate(vocab):
    try:
        W[i] = w2v.get_vector(w)
    except KeyError:
        # Only a missing key is expected here; any other error (shape mismatch,
        # interrupt, ...) should surface instead of being counted as "missing".
        # Alternative initialisation for unknown tokens:
        #   W[i] = np.random.normal(0.0, 0.3, size=(embed_size,))
        not_found += 1
    else:
        found += 1

print(f"Done, found {found} words, {not_found} words missing")

In [None]:
# Frozen embedding layer initialised from the matrix W built above.
emb = keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    weights=[W],
    trainable=False,
)

# Same architecture as the first model, with the pretrained embedding swapped in.
model = keras.models.Sequential()
model.add(vectorizer)
model.add(emb)
model.add(keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)))
model.add(keras.layers.Dense(4, activation='softmax'))

In [None]:
# Compile with the default optimizer and train on the batched (text, label) pairs.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)
model.fit(
    ds_train.map(tupelize).batch(batch_size),
    validation_data=ds_test.map(tupelize).batch(batch_size),
)

### Using embedding vocabulary

In [None]:
# Use the word2vec keys, in index order, as the vectorizer's vocabulary.
# `index_to_key` is the gensim 4 accessor this notebook already uses for the
# manual analogy lookup; the gensim 3 attribute `w2v.vocab` is not available there.
vocab = list(w2v.index_to_key)
vectorizer = keras.layers.experimental.preprocessing.TextVectorization(input_shape=(1,))
vectorizer.set_vocabulary(vocab)

In [None]:
# The embedding layer is built directly from the word2vec matrix and frozen.
# This replaces `w2v.get_keras_embedding(train,embeddings=False)`, which passed
# an undefined name `train` (a garbled `train_embeddings=False`) and relies on a
# helper that gensim 4 no longer provides.
# NOTE(review): TextVectorization presumably reserves its first ids for the
# padding and OOV tokens, which would shift token ids relative to the rows of
# w2v.vectors — confirm the alignment before trusting the accuracy numbers.
model = keras.models.Sequential([
    vectorizer,
    keras.layers.Embedding(
        input_dim=w2v.vectors.shape[0],
        output_dim=w2v.vectors.shape[1],
        weights=[w2v.vectors],
        trainable=False,
    ),
    keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    keras.layers.Dense(4, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', metrics=['acc'])
# Reuse the notebook-wide batch_size (128) instead of repeating the literal.
model.fit(
    ds_train.map(tupelize).batch(batch_size),
    validation_data=ds_test.map(tupelize).batch(batch_size),
    epochs=5,
)