In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.insert(0,'/content/drive/My Drive/python_modules')

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
from NLP.Utils import Utils

In [5]:
class CBOW:
    def __init__(self):
        self.learning_rate = 0.01

    # preprocess corpus
    def clean_corpus(self, corpus):
        if type(corpus[0]) == list:
            clean_corpus, word_counts = Utils.clean_docs(corpus)
            return clean_corpus, word_counts
        else:
            prep_corpus = Utils.tokenize(corpus, 'bulk')
            clean_corpus, word_counts = Utils.clean_docs(prep_corpus)
            return clean_corpus, word_counts
        
    # convert output to probability
    def softmax(self, u):
        x = np.exp(u) / np.sum(np.exp(u))
        return x
    
    # initialize words weights
    def initialize_weights(self, V, N):
        """
            W1 (hidden layer) = shape(Vocab_length(V), Word_dim(N))
            W1 = | d1-w1 d2-w1 ... dN-w1 |
                 | d1-w2 d2-w2 ... dN-w2 |
                 | d1-wV d2-wV ... dN-wV |

            W2 (output layer) = shape(Word_dim(N), Vocab_length(V))
            W2 = | d1-w1 d1-w2 ... d1-wV |
                 | d2-w1 d2-w2 ... d2-wV |
                 | dN-w1 dN-w2 ... dN-wV |
        """
        np.random.seed(0)
        self.W1 = np.random.randn(V, N).astype('float128')
        self.W2 = np.random.randn(N, V).astype('float128') 
    
    # update weights based on gradient
    def update_weights(self, dW2, dW1):
        self.W2 = self.W2 - self.learning_rate * dW2
        self.W1 = self.W1 - self.learning_rate * dW1

    # feed forward
    def forward(self, X, predict=False, k=5):
        self.x_avg = (np.sum(X, axis=1, keepdims=True) / X.shape[1]).copy()
        self.h = np.dot(self.W1.T, self.x_avg)
        self.u = np.dot(self.W2.T, self.h)
        self.y = self.softmax(self.u)

        if predict:
            words = []
            # store pred and keep trach of their index
            pred = dict(zip(range(len(self.y)), self.y))
            # sort based on probability of each word to be a context word
            pred_sorted = sorted(pred, key=lambda x: pred[x], reverse=True)
            # select the top k words
            top_context = pred_sorted[:k]
            # grab the word using its index from the vocab
            for w in top_context:
                words.append(self.vocab[w])
            return words

    # backprop error and calculate gradient
    def backprop(self, x, label):
        error = self.y - label
        dW2 = np.dot(self.h, error.T)
        dh = np.dot(self.W2, error)
        dW1 = np.dot(self.x_avg, dh.T)
        self.update_weights(dW2, dW1)

    # train the model
    def fit(self, corpus, N=10, window_size=2, epochs=500, learning_rate=.01):
        # clean corpus
        corpus, word_counts = self.clean_corpus(corpus)
        self.cleaned_corpus = corpus
        self.vocab = sorted(list(word_counts.keys()))
        self.word_index = Utils.vocab_idx(self.vocab)

        # initialize parameters
        self.initialize_weights(len(self.vocab), N)

        if learning_rate is not None:
            self.learning_rate = learning_rate
        
        self.train_loss = []
        vocab_len = len(self.vocab)

        for epoch in range(epochs):
            loss = 0
            for doc in corpus:
                # index to track position in the doc
                current_index = 0
                
                doc_len = len(doc)
                # grab center word context words
                while current_index < doc_len:
                    # center word
                    target_word = doc[current_index]

                    # words in window size
                    left_window = max(0, current_index - window_size)
                    right_window= min(current_index + window_size, doc_len)
                    context_words = doc[left_window:current_index] + doc[current_index+1: right_window]
                    if len(context_words) == 0:
                        current_index += 1
                        continue
                    # context word one-hot-vector for each context word
                    input = np.zeros((vocab_len, len(context_words)))

                    context_words_idx = [self.word_index[cw] for cw in context_words]
                    idx = np.arange(len(context_words_idx))

                    input[context_words_idx, idx] = 1

                    # convert target word to one-hot-vector
                    target_word_idx = self.word_index[target_word]
                    target_word_vector = np.zeros((vocab_len, 1))
                    target_word_vector[target_word_idx] = 1

                    # feed forward word through network
                    self.forward(input) 

                    # backprop
                    self.backprop(input, target_word_vector)

                    # calculate loss
                    loss = -np.log(self.y[target_word_idx])

                    # increment index
                    current_index += 1 

            self.train_loss.append((epoch, loss))
            if epoch > 1:
                if abs(self.train_loss[-2][1][0] - self.train_loss[-1][1][0]) < 1e-5 \
                or self.train_loss[-2][1][0] < self.train_loss[-1][1][0]:
                    print ('Stopping Early')
                    break

            if epoch % 10 == 0:
                print("Epoch: ", epoch, "Loss", loss)

    # make predictions
    def predict(self, x, k=5):
        cleaned, _ = Utils.clean_docs(x)
        # convert context words to one-hot-vectors
        context_words_idx = [self.word_index[w.lower()] for w in cleaned[0]]
        context_words_vector = np.zeros((len(self.vocab), len(cleaned[0])))

        idx = np.arange(len(context_words_idx))
        context_words_vector[context_words_idx, idx] = 1

        # feed it forward in the network and grab most similar words
        return self.forward(context_words_vector, predict=True, k=k)

## Tree bank corpus

In [6]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [7]:
np.random.seed(0)
corpus = np.random.choice(treebank.sents(), 50)

In [8]:
tree_bank_cbow = CBOW()

In [9]:
tree_bank_cbow.fit(corpus, epochs=1000, learning_rate=.01)

Epoch:  0 Loss [10.94631158]
Epoch:  10 Loss [9.39053767]
Epoch:  20 Loss [8.26339969]
Epoch:  30 Loss [7.35713468]
Epoch:  40 Loss [6.57207851]
Epoch:  50 Loss [5.85326182]
Epoch:  60 Loss [5.16985854]
Epoch:  70 Loss [4.50842321]
Epoch:  80 Loss [3.86900256]
Epoch:  90 Loss [3.26029678]
Epoch:  100 Loss [2.69549227]
Epoch:  110 Loss [2.19400489]
Epoch:  120 Loss [1.78069585]
Epoch:  130 Loss [1.47040447]
Epoch:  140 Loss [1.25469781]
Epoch:  150 Loss [1.10912823]
Epoch:  160 Loss [1.00836062]
Epoch:  170 Loss [0.93388206]
Epoch:  180 Loss [0.87439624]
Epoch:  190 Loss [0.82368736]
Epoch:  200 Loss [0.77853124]
Epoch:  210 Loss [0.73729223]
Epoch:  220 Loss [0.69912344]
Epoch:  230 Loss [0.66355234]
Epoch:  240 Loss [0.63027749]
Epoch:  250 Loss [0.59907085]
Epoch:  260 Loss [0.56973367]
Epoch:  270 Loss [0.5420844]
Epoch:  280 Loss [0.51596543]
Epoch:  290 Loss [0.491254]
Epoch:  300 Loss [0.46786521]
Epoch:  310 Loss [0.44574463]
Epoch:  320 Loss [0.42485544]
Epoch:  330 Loss [0.405

In [10]:
data = []
for  doc in tree_bank_cbow.cleaned_corpus[:10]:
    for i in range(2, len(doc) - 2):
        context = [doc[i - 2], doc[i - 1], doc[i + 1], doc[i + 2]]
        target = doc[i]
        data.append((context, target))

In [11]:
for i in range(20):
    idx = np.random.randint(0, len(data)+1)
    sent = data[idx]
    print('Sentence: ', sent[0], '|Target: ', sent[1])
    print('Predicted word:', tree_bank_cbow.predict([sent[0]], 1))

Sentence:  ['trying', 'help', 'unfair', 'testing'] |Target:  kids
Predicted word: ['kids']
Sentence:  ['closed', 'yesterday', 'york', 'stock'] |Target:  new
Predicted word: ['exchange']
Sentence:  ['state', 'university', 'professor', 'concluded'] |Target:  education
Predicted word: ['education']
Sentence:  ['oust', 'mr', 'chairman', 'datapoint'] |Target:  edelman
Predicted word: ['edelman']
Sentence:  ['nt', 'used', 'similarity', 'actual'] |Target:  classroom
Predicted word: ['classroom']
Sentence:  ['dallas', 'investor', 'simmons', 'offered'] |Target:  harold
Predicted word: ['report']
Sentence:  ['st', 'mary', 'ilminster', 'somerset'] |Target:  church
Predicted word: ['ilminster']
Sentence:  ['trying', 'help', 'unfair', 'testing'] |Target:  kids
Predicted word: ['kids']
Sentence:  ['nl', 'industries', 'dallas', 'investor'] |Target:  controlled
Predicted word: ['controlled']
Sentence:  ['schoolteacher', 'william', 'michigan', 'state'] |Target:  mehrens
Predicted word: ['mehrens']
Sent