In [1]:
import tensorflow as tf
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l:[item for sublist in l for item in sublist]

### Util Function

In [113]:
def get_batch(batch_size, train_data):
    random.shuffle(train_data)
    start = 0
    end = batch_size
    length = len(train_data)
    while end < length:
        batch = train_data[start:end]
        tmp = end
        end = end + batch_size
        start = tmp
        yield batch
    if end >= length:
        batch = train_data[start:]
        yield batch

In [3]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda x: word2index[x] if word2index.get(x) is not None else word2index['<UNK>'],seq))
    return idxs
def prepare_word(word, word2index):
    return word2index[word] if word2index.get(word) is not None else word2index['<UNK>']

## Data load and Preprocessing

### Load corpus : gutenberg corpus
[NLTK load data](https://www.nltk.org/data.html)


In [4]:
from nltk.corpus import gutenberg as bg

In [5]:
bg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [6]:
bg.sents?

In [7]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100] # sampling sentences for test

corpus = [[word.lower() for word in sent] for sent in corpus]

In [8]:
corpus[:10]

[['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'],
 ['etymology', '.'],
 ['(',
  'supplied',
  'by',
  'a',
  'late',
  'consumptive',
  'usher',
  'to',
  'a',
  'grammar',
  'school',
  ')'],
 ['the',
  'pale',
  'usher',
  '--',
  'threadbare',
  'in',
  'coat',
  ',',
  'heart',
  ',',
  'body',
  ',',
  'and',
  'brain',
  ';',
  'i',
  'see',
  'him',
  'now',
  '.'],
 ['he',
  'was',
  'ever',
  'dusting',
  'his',
  'old',
  'lexicons',
  'and',
  'grammars',
  ',',
  'with',
  'a',
  'queer',
  'handkerchief',
  ',',
  'mockingly',
  'embellished',
  'with',
  'all',
  'the',
  'gay',
  'flags',
  'of',
  'all',
  'the',
  'known',
  'nations',
  'of',
  'the',
  'world',
  '.'],
 ['he',
  'loved',
  'to',
  'dust',
  'his',
  'old',
  'grammars',
  ';',
  'it',
  'somehow',
  'mildly',
  'reminded',
  'him',
  'of',
  'his',
  'mortality',
  '.'],
 ['"',
  'while',
  'you',
  'take',
  'in',
  'hand',
  'to',
  'school',
  'others',
  ',',
  'and',
  'to',
  'te

In [9]:
flatten(corpus)[:10]

['[',
 'moby',
 'dick',
 'by',
 'herman',
 'melville',
 '1851',
 ']',
 'etymology',
 '.']

In [10]:
### Extract Stopwords from ungram distribution tails

In [11]:
word_count = Counter(flatten(corpus))
border = int(len(word_count)*0.01)

In [12]:
word_count.most_common(10)

[(',', 96),
 ('.', 66),
 ('the', 58),
 ('of', 36),
 ('and', 35),
 ('--', 27),
 ('"', 26),
 ('."', 26),
 ('to', 25),
 ('-', 24)]

In [13]:
stopwords = word_count.most_common()[:border] + list(reversed(word_count.most_common()))[:border]

In [14]:
stopwords = [s[0] for s in stopwords]

In [15]:
stopwords

[',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

### Build vocab

In [16]:
vocab = list(set(flatten(corpus)) - set(stopwords))
vocab.append('<UNK>')

In [17]:
print(len(set(flatten(corpus))), len(vocab))


592 583


In [18]:
word2index = {'<UNK>' : 0}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)

index2word = {v:k for k,v in word2index.items()}

In [19]:
nltk.ngrams?

In [20]:
WINDOW_SIZE = 3
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1))for c in corpus])

In [21]:
windows[:2]

[('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by'),
 ('<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by', 'herman')]

In [22]:
train_data = []


for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>':
            continue
        train_data.append((window[WINDOW_SIZE],window[i]))
print(train_data[:WINDOW_SIZE * 2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


In [23]:
X_train = []
Y_train = []

In [24]:
for tr in train_data:
    X_train.append(prepare_word(tr[0], word2index))
    Y_train.append(prepare_word(tr[1],word2index))

In [25]:
train_data = list(zip(X_train,Y_train))

In [26]:
for i in range(3):
    print(train_data[i][0])
    print(train_data[i][1])

198
318
198
394
198
85


In [27]:
len(train_data)

7606

## Modeling 

In [28]:
from tensorflow.keras import Model,layers,Sequential

In [116]:
class SkipGram(Model):
    
    def __init__(self, vocab_size,projection_dim):
        super(SkipGram,self).__init__()
        self.I_H = layers.Embedding(vocab_size,projection_dim) # input_hidden matrix
        self.H_U = layers.Embedding(vocab_size,projection_dim) # hidden_out matrix
        
    def call(self,inputs,predict,normal):
        inputs_embed = self.I_H(inputs)
        predict_embed = self.H_U(predict)
        normal_embed = self.H_U(normal)
        
        scores = tf.matmul(predict_embed,tf.transpose(inputs_embed,[0,2,1])) # Bx1xD * BxDx1 => Bx1
#         print("predict shape:{} input shape:{} result shape{}".format(
#                 predict_embed.shape,tf.transpose(inputs_embed,[0,2,1]).shape,scores.shape))
        scores = tf.squeeze(scores,2)
        norm_scores = tf.squeeze(tf.matmul(normal_embed,tf.transpose(inputs_embed,[0,2,1])),2) # BxVxD * BxDx1 => BxV
        
        nll = tf.expand_dims(-tf.math.reduce_mean(tf.math.log(tf.math.exp(scores)/tf.math.reduce_sum(tf.math.exp(norm_scores), 1),1)),axis=0) # log-softmax
        
        return nll # negative log likelihood
    
    

In [121]:
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100
losses = []

In [122]:
model = SkipGram(len(word2index), EMBEDDING_SIZE)
optimizer = tf.keras.optimizers.Adam()
for epoch in range(EPOCH):
    for i, batch in enumerate(get_batch(BATCH_SIZE, train_data)):
        inputs, targets = zip(*batch)
        vocabs = tf.expand_dims(tf.convert_to_tensor(prepare_sequence(list(vocab), word2index)),0)
        inputs = tf.expand_dims(tf.convert_to_tensor(inputs),1)
        targets = tf.expand_dims(tf.convert_to_tensor(targets),1)
        with tf.GradientTape() as tape:
            loss = model(inputs, targets, vocabs)
            grads = tape.gradient(loss,model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            losses.append(loss.numpy().tolist()[0])

    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch,np.mean(losses)))

Epoch : 0, mean_loss : 6.37
Epoch : 10, mean_loss : 5.98
Epoch : 20, mean_loss : 5.60
Epoch : 30, mean_loss : 5.39
Epoch : 40, mean_loss : 5.24
Epoch : 50, mean_loss : 5.11
Epoch : 60, mean_loss : 4.99
Epoch : 70, mean_loss : 4.88
Epoch : 80, mean_loss : 4.77
Epoch : 90, mean_loss : 4.68
