### New package installation
```pip install tensorflow_datasets```

```pip install ipywidgets```

```pip install nltk```
### Reference:
https://www.tensorflow.org/tutorials/text/word_embeddings

## Neural Network Language Model

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [4]:
file = open("Data/J. K. Rowling - Harry Potter 1 - Sorcerer's Stone",'r')
raw_data_1 = file.read()
file.close()

In [24]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize

from nltk.corpus import stopwords 
import nltk 
#nltk.download('punkt')
#nltk.download('stopwords')

In [112]:
stop_words = set(stopwords.words('english')) 
word_tokens = wordpunct_tokenize(raw_data_1)  
word_tokens = [w.lower() for w in word_tokens if not w in stop_words] 
word_tokens = [w.lower() for w in word_tokens if w.isalpha()] 

In [34]:
vocab = set(word_tokens)
char_to_int = dict((c,i) for i,c in enumerate(vocab))
int_to_char = dict((i,c) for i,c in enumerate(vocab))

In [36]:
X = np.array([], dtype=np.float).reshape(0,4)
Y = np.array([])
Xwords=[]
Ywords=[]
window_size = 2
for i, word in enumerate(word_tokens):
    Xsub=np.zeros(2*window_size)
    Xsubwords=[]
    isetvalue=0
    for icontext in range(max(i-window_size,0), min(i+window_size, len(word_tokens)-1)+1):
        if icontext!=i:
            Xsub[isetvalue]=char_to_int[word_tokens[icontext]]
            Xsubwords.append(word_tokens[icontext])
            isetvalue=isetvalue+1
    X=np.vstack([X, Xsub])
    Xwords.append(Xsubwords)
    Y=np.append(Y,char_to_int[word])
    Ywords.append(word)

In [56]:
print([X[0][1]])
print([Y[0]])

[5482.0]
[1947.0]


In [47]:
embedding_dim=100
vocab_size=len(vocab)
cbowNN = keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(vocab_size, activation='softmax')
])

In [50]:
cbowNN.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [52]:
history = cbowNN.fit(X, Y,epochs=10)

Train on 45231 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [53]:
char_to_int["harry"]

1947

In [54]:
char_to_int["potter"]

4788

In [95]:
int_to_char[1947]

'harry'

In [96]:
int_to_char[3]

'pellets'

In [98]:
from numpy.linalg import norm
e = cbowNN.layers[0]
a=e(tf.constant(char_to_int['stone'])).numpy()
b=e(tf.constant(char_to_int['harry'])).numpy()
np.dot(a,b)/norm(a)/norm(b)

-0.1257097

In [99]:
e.get_weights()[0][1947]

array([ 0.5817012 ,  0.11552174, -0.50672746, -0.76676583,  0.15401813,
       -0.6681709 , -0.71469826, -0.61663234,  0.12586027, -0.5183265 ,
       -0.40008894, -0.4398571 ,  0.0119456 , -0.75455415,  0.2741087 ,
       -0.46714088, -1.201626  , -0.81872886,  0.4370791 ,  0.77879894,
       -0.8572957 , -0.6878351 ,  0.9252831 , -0.68646735,  0.5330421 ,
        0.31523493,  0.49670088, -0.47646615,  0.91890687,  0.5631099 ,
        0.06969263, -0.8544153 , -0.84899163, -0.7981884 ,  0.04232222,
        0.05061456,  0.4748415 ,  0.7895601 , -0.42458475,  0.638358  ,
       -0.281402  , -1.5851709 , -0.05116541, -0.57948965, -0.91488653,
       -0.04604899, -0.529991  , -0.04008646, -0.11312816,  0.6267403 ,
       -0.53496647,  0.2150093 , -0.04823145, -0.5373095 , -0.19713253,
       -0.71456957,  0.31499422, -0.11559291,  0.03633097, -0.6609817 ,
        0.13168311,  0.8367661 ,  0.56946206, -0.9807432 ,  0.38796842,
        0.5526397 , -0.78772616,  0.37516588, -0.9905758 , -0.54

In [389]:
cbowNN.predict(tf.constant([3767]))

array([[2.2948307e-22, 3.2295737e-35, 4.3348098e-26, ..., 1.5308596e-27,
        1.0985353e-19, 2.6982707e-24]], dtype=float32)

In [391]:
np.argmax(cbowNN.predict(tf.constant([3767]))[0])

4196

In [110]:
len(cbowNN.predict(tf.constant([char_to_int['stone']]))[0])
for i in np.argsort(cbowNN.predict(tf.constant([1947]))[0])[::-1][:5]:
    print(int_to_char[i])

potter
felt
council
shook
whispered


In [402]:
for i, word in enumerate(words):
    if word=="ink":
        print(i)

8548
15077


In [101]:
from sklearn.metrics.pairwise import cosine_similarity

# compute pairwise distance matrix
distance_matrix = cosine_similarity(e.get_weights()[0])
print(distance_matrix.shape)

(5730, 5730)


In [114]:
# view contextually similar words
similar_words = {search_term: [int_to_char[idx] for idx in distance_matrix[char_to_int[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['tree', 'potter', 'stone', 'bird', 'girl']}

similar_words

{'tree': ['peering', 'tasted', 'befuddle', 'coast', 'treble'],
 'potter': ['treble', 'secret', 'town', 'cloak', 'forests'],
 'stone': ['flock', 'sneaking', 'sniffling', 'story', 'dazed'],
 'bird': ['pulling', 'cast', 'forests', 'proof', 'applause'],
 'girl': ['nonstop', 'forests', 'town', 'beats', 'cast']}

##