In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.insert(0,'/content/drive/My Drive/python_modules')

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
from NLP.Utils import Utils

# **SkipGram Model**

In [5]:
class SkipGram:
    def __init__(self):
        self.learning_rate = 0.01

    # preprocess corpus
    def clean_corpus(self, corpus):
        if type(corpus[0]) == list:
            clean_corpus, word_counts = Utils.clean_docs(corpus)
            return clean_corpus, word_counts
        else:
            prep_corpus = Utils.tokenize(corpus, 'bulk')
            clean_corpus, word_counts = Utils.clean_docs(prep_corpus)
            return clean_corpus, word_counts
        
    # convert output to probability
    def softmax(self, u):
        x = np.exp(u) / np.sum(np.exp(u))
        return x
    
    # initialize words weights
    def initialize_weights(self, V, N):
        """
            W1 (hidden layer) = shape(Vocab_length(V), Word_dim(N))
            W1 = | d1-w1 d2-w1 ... dN-w1 |
                 | d1-w2 d2-w2 ... dN-w2 |
                 | d1-wV d2-wV ... dN-wV |

            W2 (output layer) = shape(Word_dim(N), Vocab_length(V))
            W2 = | d1-w1 d1-w2 ... d1-wV |
                 | d2-w1 d2-w2 ... d2-wV |
                 | dN-w1 dN-w2 ... dN-wV |
        """
        np.random.seed(0)
        self.W1 = np.random.randn(V, N).astype('float128')
        self.W2 = np.random.randn(N, V).astype('float128') 
    
    # update weights based on gradient
    def update_weights(self, dW2, dW1):
        self.W2 = self.W2 - self.learning_rate * dW2
        self.W1 = self.W1 - self.learning_rate * dW1

    # feed forward
    def forward(self, X, predict=False, k=5):
        """
            X = [0, 0, 1, ... , V].T -> shape(V, 1)
            h (hidden layer) = W1.T @ X -> (N, V) @ (V, 1) = (N, 1)
            u (output layer) = W2.T @ h -> (V, N) @ (N, 1) = (V, 1)
            y (output prob) = softmax(u)
        """

        self.h = np.dot(self.W1.T, X)
        self.u = np.dot(self.W2.T, self.h)
        self.y = self.softmax(self.u)

        # if trying to predict most similar words
        if predict:
            words = []
            # store pred and keep trach of their index
            pred = dict(zip(range(len(self.y)), self.y))
            # sort based on probability of each word to be a context word
            pred_sorted = sorted(pred, key=lambda x: pred[x], reverse=True)
            # select the top k words
            top_context = pred_sorted[:k]
            # grab the word using its index from the vocab
            for w in top_context:
                words.append(self.vocab[w])

            return words

    # backprop error and calculate gradient
    def backprop(self, x, label):
        """
            error = (pred - true) -> (V, 1)
            dW2 = dE/dy * dy/dW2 = h @ error.T -> (N, 1) @ (1, V) -> (N, V)(=W2 shape)
            dh = W2 @ error -> backpropagate error to the hidden layer -> (N, 1)
            dW1 = x @ dh.T -> outer product (V, 1) @ (1, N) -> (V, N) (=W1 shape)
        """
        error = self.y - label
        dW2 = np.dot(self.h, error.T)
        dh = np.dot(self.W2, error)
        dW1 = np.dot(x, dh.T)
        self.update_weights(dW2, dW1)


    # train the model
    def fit(self, corpus, N=200, window_size=2, epochs=500, learning_rate=.01):
        # clean corpus
        corpus, word_counts = self.clean_corpus(corpus)
        self.vocab = sorted(list(word_counts.keys()))
        self.word_index = Utils.vocab_idx(self.vocab)

        # initialize parameters
        self.initialize_weights(len(self.vocab), N)

        if learning_rate is not None:
            self.learning_rate = learning_rate
        
        self.train_loss = []
        vocab_len = len(self.vocab)

        for epoch in range(epochs):
            loss = 0
            for doc in corpus:
                # index to track position in the doc
                current_index = 0
                
                doc_len = len(doc)
                # grab center word context words
                while current_index < doc_len:
                    # center word
                    word = doc[current_index]

                    # words in window size
                    left_window = max(0, current_index - window_size)
                    right_window= min(current_index + window_size, doc_len)
                    context_words = doc[left_window:current_index] + doc[current_index+1: right_window]
                    
                    # prepare label (index of context words)
                    label = np.zeros((vocab_len, 1))

                    for cw in context_words:
                        cw_idx = self.word_index[cw]
                        # change value of label to 1 for words in context
                        label[cw_idx] = 1
                    
                    # convert input word to one-hot-vector
                    center_word_idx = self.word_index[word]
                    center_word_vector = np.zeros((vocab_len, 1))
                    center_word_vector[center_word_idx] = 1

                    # feed forward word through network
                    self.forward(center_word_vector) 

                    # backprop
                    self.backprop(center_word_vector, label)

                    # calculate loss
                    loss = -np.sum(self.u[label==1]) + len(context_words) * np.log(sum(np.exp(self.u)))

                    # increment index
                    current_index += 1 

            self.train_loss.append((epoch, loss))
            if epoch > 1:
                if abs(self.train_loss[-2][1][0] - self.train_loss[-1][1][0]) < 1e-4 \
                or self.train_loss[-2][1][0] < self.train_loss[-1][1][0]:
                    print ('Stopping Early')
                    break

            if epoch % 10 == 0:
                print("Epoch: ", epoch, "Loss", loss)

    # make predictions
    def predict(self, x, k=5):
        # convert word to one-hot-vector
        center_word_idx = self.word_index[x.lower()]
        center_word_vector = np.zeros((len(self.vocab), 1))
        center_word_vector[center_word_idx] = 1
        # feed it forward in the network and grab most similar words
        return self.forward(center_word_vector, predict=True, k=k)

# Testing Model

## Simple Test

In [6]:
simple_corpus = "The quick brown fox jumps over the lazy dog."

In [7]:
simple_sg = SkipGram()
simple_sg.fit(simple_corpus, N=10, window_size=2, epochs=100, learning_rate=.01)

Epoch:  0 Loss [4.5641709]
Epoch:  10 Loss [2.53427398]
Epoch:  20 Loss [1.80073308]
Epoch:  30 Loss [1.53672327]
Epoch:  40 Loss [1.44471628]
Epoch:  50 Loss [1.42416028]
Stopping Early


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
simple_sg.predict('fox', 2)

['brown', 'jumps']

# Using treebank corpus

## Grab random 100 doc from tree bank corpus

In [9]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [10]:
np.random.seed(0)
corpus = np.random.choice(treebank.sents(), 100)

In [11]:
tree_bank_sg = SkipGram()
tree_bank_sg.fit(corpus, N=50, window_size=3, epochs=50, learning_rate=.01)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch:  0 Loss [20.40035635]
Epoch:  10 Loss [5.93900068]
Epoch:  20 Loss [0.59148718]
Stopping Early


In [12]:
print(np.random.choice(corpus))

['But', 'the', 'strength', 'in', 'heating', 'oil', 'helped', '*-1', 'push', 'up', 'crude', 'oil', '.']


In [13]:
tree_bank_sg.predict('oil')

['helped', 'push', 'crude', 'strength', 'spending']