# Import Libraries

In [39]:
!pip install keras
!pip install tensorflow
!pip install plot_keras_history
!pip install seaborn



In [40]:
from keras.utils import np_utils
from keras.preprocessing import sequence
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, Reshape
from keras.layers import Input
from keras.models import Model
from keras.layers import dot
from tensorflow.keras.activations import relu
from nltk import word_tokenize, sent_tokenize
from gensim.corpora.dictionary import Dictionary
import numpy as np
from keras.preprocessing.sequence import skipgrams
import gensim
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# CBOW

### Data Preparation

In [41]:
AlotOftext = """Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in corpora, 
the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a relation 
between two phenomena is demonstrably non-random, does not support the inference 
that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis testing 
has been used, and show how it has often led to unhelpful or misleading results.""".lower()



# Tokenize text
tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(AlotOftext)]
print(tokenized_text)

# Create Vocab as a Dictionary
vocab = Dictionary(tokenized_text)
print(dict(vocab.items()))

print(vocab.token2id['corpora'])
print(vocab[2])
sent0 = tokenized_text[0]
print(vocab.doc2idx(sent0))

vocab.add_documents([['PAD']])
dict(vocab.items())
print(vocab.token2id['PAD'])

corpusByWordID = list()
for sent in  tokenized_text:
    corpusByWordID.append(vocab.doc2idx(sent))

vocab_size = len(vocab)
embed_size = 100
hidden_dim=100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(vocab.items())[:10])

[['language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.'], ['statistical', 'hypothesis', 'testing', 'uses', 'a', 'null', 'hypothesis', ',', 'which', 'posits', 'randomness', '.'], ['hence', ',', 'when', 'we', 'look', 'at', 'linguistic', 'phenomena', 'in', 'corpora', ',', 'the', 'null', 'hypothesis', 'will', 'never', 'be', 'true', '.'], ['moreover', ',', 'where', 'there', 'is', 'enough', 'data', ',', 'we', 'shall', '(', 'almost', ')', 'always', 'be', 'able', 'to', 'establish', 'that', 'it', 'is', 'not', 'true', '.'], ['in', 'corpus', 'studies', ',', 'we', 'frequently', 'do', 'have', 'enough', 'data', ',', 'so', 'the', 'fact', 'that', 'a', 'relation', 'between', 'two', 'phenomena', 'is', 'demonstrably', 'non-random', ',', 'does', 'not', 'support', 'the', 'inference', 'that', 'it', 'is', 'not', 'arbitrary', '.'], ['we', 'present', 'experimental', 'evidence', 'of', 'how', 'arbitrary', 'associations', 'between', 'word', 'fr

In [42]:
# Create CBOW Training data
def generate_cbow_context_word_pairs(corpusByID, window_size, vocab_size):
    context_length = window_size*2
    X=[]
    Y=[]
    for sent in corpusByID:
        sentence_length = len(sent)
        for index, word in enumerate(sent):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([sent[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)
            if start<0:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='pre',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            if end>=sentence_length:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='post',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            else:
                X.append(sequence.pad_sequences(context_words, maxlen=context_length))
                y = np_utils.to_categorical(label_word, vocab_size)
                Y.append(y)
                continue
           
    return X,Y
            
# Test this out for some samples


X,Y = generate_cbow_context_word_pairs(corpusByWordID, window_size, vocab_size) 
   
for x, y in zip(X,Y):
    print('Context (X):', [vocab[w] for w in x[0]], '-> Target (Y):', vocab[np.argwhere(y[0])[0][0]])

Context (X): ['PAD', 'PAD', 'users', 'never'] -> Target (Y): language
Context (X): ['PAD', 'language', 'never', 'choose'] -> Target (Y): users
Context (X): ['language', 'users', 'choose', 'words'] -> Target (Y): never
Context (X): ['users', 'never', 'words', 'randomly'] -> Target (Y): choose
Context (X): ['never', 'choose', 'randomly', ','] -> Target (Y): words
Context (X): ['choose', 'words', ',', 'and'] -> Target (Y): randomly
Context (X): ['words', 'randomly', 'and', 'language'] -> Target (Y): ,
Context (X): ['randomly', ',', 'language', 'is'] -> Target (Y): and
Context (X): [',', 'and', 'is', 'essentially'] -> Target (Y): language
Context (X): ['and', 'language', 'essentially', 'non-random'] -> Target (Y): is
Context (X): ['language', 'is', 'non-random', '.'] -> Target (Y): essentially
Context (X): ['is', 'essentially', '.', 'PAD'] -> Target (Y): non-random
Context (X): ['essentially', 'non-random', 'PAD', 'PAD'] -> Target (Y): .
Context (X): ['PAD', 'PAD', 'hypothesis', 'testing']

### Modeling

In [43]:
V = len(vocab) 
N = 100 
window_size = 2

In [44]:
cbow = Sequential()

cbow.add(Embedding(input_dim = V, 
                   output_dim = N,
                   input_length = window_size*2)) 

cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(N,1)))

cbow.add(Dense(V, activation='relu'))
cbow.compile(loss='categorical_crossentropy', optimizer='sgd')
cbow.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 4, 100)            8800      
                                                                 
 lambda_4 (Lambda)           (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 88)                8888      
                                                                 
Total params: 17,688
Trainable params: 17,688
Non-trainable params: 0
_________________________________________________________________


In [45]:
#Train the model

for epoch in range(100):
    loss = 0.
    for x, y in zip(X,Y):
      loss += cbow.train_on_batch(x, y)
    print(epoch, loss)

0 1567.5061424970627
1 1269.7155812978745
2 1194.2798880338669
3 1178.5544596910477
4 1171.8559724092484
5 1166.545238852501
6 1162.1016528606415
7 1158.2608840465546
8 1154.8661383390427
9 1151.8166155815125
10 1149.0421355962753
11 1146.4915161132812
12 1144.127556681633
13 1141.9210551977158
14 1139.849433541298
15 1137.8944646120071
16 1136.0420526266098
17 1134.279482960701
18 1132.596445798874
19 1130.9848074913025
20 1129.4368628263474
21 1127.9470437765121
22 1126.5100889205933
23 1125.1210478544235
24 1123.7765136957169
25 1122.4734536409378
26 1121.2085974216461
27 1119.9788310527802
28 1118.7821642160416
29 1117.6156251430511
30 1116.4775099754333
31 1115.3656096458435
32 1114.278504371643
33 1113.2147631645203
34 1112.1723136901855
35 1111.1506071090698
36 1110.148718714714
37 1109.1651582717896
38 1108.198615193367
39 1107.2480803728104
40 1106.3123837709427
41 1105.3907358646393
42 1104.482270359993
43 1103.5867277383804
44 1102.70365858078
45 1101.8323880434036
46 1100.9

In [46]:
## Save the wordvectors
f = open('Cbow_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = cbow.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [47]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./Cbow_vectors.txt', binary=False)

w2v.most_similar(positive=['language'])

[('non-random', 0.7850772738456726),
 ('and', 0.7158022522926331),
 ('essentially', 0.7134977579116821),
 ('choose', 0.6051605939865112),
 ('words', 0.5714943408966064),
 ('users', 0.5539845824241638),
 ('not', 0.43584901094436646),
 ('that', 0.4232177436351776),
 ('demonstrably', 0.4219173192977905),
 ('systematically', 0.41441741585731506)]

# Skip-gram

### Data Preparation

In [48]:
# generate skip-grams with both positive and negative examples
skip_grams = [skipgrams(sent, vocabulary_size=vocab_size, window_size=2) for sent in corpusByWordID]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        vocab[pairs[i][0]], pairs[i][0],           
        vocab[pairs[i][1]], pairs[i][1], 
        labels[i]))

(choose (3), and (2)) -> 0
(. (1), non-random (8)) -> 1
(language (6), never (7)) -> 1
(never (7), been (75)) -> 0
(never (7), words (11)) -> 1
(language (6), is (5)) -> 1
(and (2), or (81)) -> 0
(non-random (8), . (1)) -> 1
(is (5), essentially (4)) -> 1
(. (1), essentially (4)) -> 1


### Modeling

In [49]:
V = len(vocab) 
N = 100 
window_size = 2

In [50]:
#define the skip-gram model

input_word = Input((1,))
input_context_word = Input((1,))

word_embedding    = Embedding(input_dim=V, output_dim=N,input_length=1,name='word_embedding')
context_embedding = Embedding(input_dim=V, output_dim=N,input_length=1,name='context_embedding')

word_embedding = word_embedding(input_word)
word_embedding_layer = Reshape((N, 1))(word_embedding)

context_embedding = context_embedding(input_context_word)
context_embedding_layer = Reshape((N, 1))(context_embedding)

# now perform the dot product operation  
dot_product = dot([word_embedding_layer, context_embedding_layer], axes=1)
dot_product = Reshape((1,))(dot_product)

# add the sigmoid output layer
outputLayer = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_word, input_context_word], outputs=outputLayer)
model.compile(loss='binary_crossentropy', optimizer='adam')

# view model summary
print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 word_embedding (Embedding)     (None, 1, 100)       8800        ['input_3[0][0]']                
                                                                                                  
 context_embedding (Embedding)  (None, 1, 100)       8800        ['input_4[0][0]']                
                                                                                            

In [51]:
#train the model

for epoch in range(1, 100):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += model.train_on_batch(X,Y)  

    print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 4.852176606655121
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 4.844983518123627
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 3 Loss: 4.838289499282837
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 4 Loss: 4.831020832061768
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 5 Loss: 4.82278436422348
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 6 Loss: 4.813201487064362
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 7 Loss: 4.8018800020217896
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 8 Loss: 4.788412928581238
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 9 Loss: 4.772378742694855
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 10 Loss: 4.753344416618347
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 11 Loss: 4.730868995189667
Processed 0 (skip_first, skip_

In [52]:
#get the embeding matrix
weights = model.get_weights()
## Save the wordvectors
f = open('skipgram_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [53]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./skipgram_vectors.txt', binary=False)
w2v.most_similar(positive=['the'])

[('has', 0.5080244541168213),
 ('will', 0.39930760860443115),
 ('there', 0.38471418619155884),
 ('inference', 0.37814533710479736),
 ('show', 0.3536876142024994),
 ('to', 0.2860047519207001),
 ('used', 0.27465033531188965),
 ('testing', 0.27205532789230347),
 ('often', 0.24003587663173676),
 ('statistical', 0.1933324635028839)]

In [54]:
#Excerise: 
#modeify the skipegram_model to share the same embeding layer between word and context
#Discussion: which is better? Why?  