In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.insert(0,'/content/drive/My Drive/python_modules')

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
from NLP.Utils import Utils

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
class SkipGram:
    def __init__(self):
        self.learning_rate = 0.001

    def softmax(self, u):
        return np.exp(u) / np.sum(np.exp(u))

    # convert output to probability
    # sigmoid for neg sampling
    def sigmoid(self, u):
        out = 1 / (1 + np.exp(-u))
        return out

    # preprocess corpus
    def clean_corpus(self, corpus):
        if type(corpus[0]) == list:
            clean_corpus, word_counts = Utils.clean_docs(corpus)
            return clean_corpus, word_counts
        else:
            prep_corpus = Utils.tokenize(corpus, 'bulk')
            clean_corpus, word_counts = Utils.clean_docs(prep_corpus)
            return clean_corpus, word_counts
    
    # Unigram
    def create_unigram(self, word_counts):
        self.word_prob = Utils.unigram(word_counts)
    
    # grab negative samples
    def negative_sample(self, k=10):
        samples = Utils.negative_sample(self.word_prob, k)
        samples_idx = [self.word_index[s] for s in samples]
        return samples_idx
    
    # initialize words weights
    def initialize_weights(self, V, N):
        """
            W1 (hidden layer) = shape(Vocab_length(V), Word_dim(N))
            W1 = | d1-w1 d2-w1 ... dN-w1 |
                 | d1-w2 d2-w2 ... dN-w2 |
                 | d1-wV d2-wV ... dN-wV |

            W2 (output layer) = shape(Word_dim(N), Vocab_length(V))
            W2 = | d1-w1 d1-w2 ... d1-wV |
                 | d2-w1 d2-w2 ... d2-wV |
                 | dN-w1 dN-w2 ... dN-wV |
        """
        np.random.seed(0)
        self.W1 = np.random.randn(V, N).astype('float128')
        self.W2 = np.random.randn(N, V).astype('float128')


    # update weights based on gradient
    def update_weights(self, dW2, dW1, neg_sample, cwidx):
        self.W2[:, neg_sample] = self.W2[:, neg_sample] - self.learning_rate * dW2
        self.W1[cwidx, :] = self.W1[cwidx, :] - self.learning_rate * dW1

    # feed forward
    def forward(self, X):
        """
            X = [0, 0, 1, ... , V].T -> shape(V, 1)
            h (hidden layer) = W1.T @ X -> (N, V) @ (V, 1) = (N, 1)
            u (output layer) = W2.T @ h -> (V, N) @ (N, 1) = (V, 1)
            y (output prob) = softmax(u)
        """

        self.h = np.dot(self.W1.T, X)
        self.u = np.dot(self.W2.T, self.h)
        self.y = self.sigmoid(self.u)

    # backprop error and calculate gradient
    def backprop(self, x, label, samples, center_word_idx):
        """
            calculate error only with respect to chosen samples
            error = (pred[samples] - true[samples]) -> (V[samples], 1)
            dW2 = dE/dy * dy/dW2 = h @ error.T -> (N, 1) @ (1, V[samples]) -> (N, V[samples])
            dh = W2[samples] @ error -> backpropagate error to the hidden layer -> (N, 1)
            dW1 = df.T -> (1, N) x is one-hot-vector so only input word will be updated
        """
        # calculate error from selected samples
        error = self.y[samples] - label[samples]
        # grad w.r.t W2
        dW2 = np.dot(self.h, error.T)
        # backpropagate error from outputlayer to hidden layer
        dh = np.dot(self.W2[:, samples], error)
        # grad w.r.t W1
        dW1 = dh.reshape(1, -1)
        # update weights
        self.update_weights(dW2, dW1, samples, center_word_idx)

    # train the model
    def fit(self, corpus, N=300, window_size=2, epochs=500, learning_rate=None):
        # clean corpus
        corpus, word_counts = self.clean_corpus(corpus)
        self.vocab = sorted(list(word_counts.keys()))
        self.word_index = Utils.vocab_idx(self.vocab)

        # generate unigram
        self.create_unigram(word_counts)

        # initialize parameters
        self.initialize_weights(len(self.vocab), N)

        if learning_rate is not None:
            self.learning_rate = learning_rate
        
        self.train_loss = []
        # iterateive process to train model
        for epoch in range(epochs):
            loss = 0
            # loop through each doc in the corpus
            for doc in corpus:
                current_index = 0
                doc_len = len(doc)

                # grab center word context words
                while current_index < doc_len:
                    # center word
                    word = doc[current_index]
                    center_word_idx = self.word_index[word]
                    center_word_vector = np.zeros((len(self.vocab), 1))
                    center_word_vector[center_word_idx] = 1

                    # words in window size
                    left_window = max(0, current_index - window_size)
                    right_window= min(current_index + window_size, doc_len)
                    context_words = doc[left_window:current_index] + doc[current_index+1: right_window]

                    # add context words to positive samples (index of context words)
                    positive_samples = []
                    for cw in context_words:
                        cw_idx = self.word_index[cw]
                        positive_samples.append(cw_idx)
                    
                    # grab random 10 neg samples
                    negative_samples = list(set(self.negative_sample(10)) - set(positive_samples))

                    # samples to be updated
                    total_samples = positive_samples + negative_samples

                    # label to be predicted of selected words
                    label = np.zeros((len(self.vocab), 1))
                    label[positive_samples] = 1

                    # feed forward word through network
                    self.forward(center_word_vector)

                    # backprop
                    self.backprop(center_word_vector, label, total_samples, center_word_idx)

                    # calculate loss
                    loss = - np.sum(np.log(self.y[positive_samples])) - np.sum(-self.y[negative_samples])
                    
                    current_index += 1
            
            self.train_loss.append((epoch, loss))
            if epoch > 1:
                if abs(self.train_loss[-2][1] - self.train_loss[-1][1]) < 1e-4:
                    print ('Stopping Early')
                    break

            # print("Epoch: ", epoch, "Loss", loss)
            if epoch % 10 == 0:
                print("Epoch: ", epoch, "Loss", loss)

    # make predictions
    def predict(self, x, k=5):
        # convert word to one-hot-vector
        center_word_idx = self.word_index[x.lower()]
        center_word_vector = np.zeros((len(self.vocab), 1))
        center_word_vector[center_word_idx] = 1

        # feed it forward in the network and grab most similar words
        words = []
        h = np.dot(self.W1.T, center_word_vector)
        u = np.dot(self.W2.T, h)
        y = self.softmax(u)

        # store pred and keep trach of their index
        pred = dict(zip(range(len(y)), y))
        # sort based on probability of each word to be a context word
        pred_sorted = sorted(pred, key=lambda x: pred[x], reverse=True)

        # select the top k words
        top_context = pred_sorted[:k]
        # grab the word using its index from the vocab
        for w in top_context:
            words.append(self.vocab[w])

        return words


In [7]:
simple_corpus = "The quick brown fox jumps over the lazy dog."

In [8]:
tree_sg = SkipGram()

In [9]:
tree_sg.fit(simple_corpus, N=100, window_size=2, epochs=100, learning_rate=.01)

Epoch:  0 Loss 27.762170661715051871
Epoch:  10 Loss 1.637661212819013209
Epoch:  20 Loss 0.19866546731102966619
Epoch:  30 Loss 0.13702310372069873756
Epoch:  40 Loss 0.067367665164215754286
Epoch:  50 Loss 0.07927632780390618027
Epoch:  60 Loss 0.066287248172391194734
Epoch:  70 Loss 0.056254114102344675032
Epoch:  80 Loss 0.0417184141723886248
Epoch:  90 Loss 0.043487697881388946823


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
tree_sg.predict('fox', 2)

['jumps', 'brown']

# Using treebank corpus

## Grab random 500 doc from tree bank corpus "more docs than used in normal skip gram"

In [12]:
import nltk
nltk.download('treebank')
from nltk.corpus import treebank

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [13]:
np.random.seed(0)
corpus = np.random.choice(treebank.sents(), 500)

In [14]:
tree_bank_sg = SkipGram()
tree_bank_sg.fit(corpus, N=30, window_size=2, epochs=500, learning_rate=.01)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch:  0 Loss 5.7034501385658579685
Epoch:  10 Loss 6.6688911679724960655
Epoch:  20 Loss 6.1371392216316975063
Epoch:  30 Loss 4.027964769061404772
Epoch:  40 Loss 3.1897386700563670541
Epoch:  50 Loss 2.438827236072422324
Epoch:  60 Loss 2.0603206056129379797
Epoch:  70 Loss 1.9086652246541145276
Epoch:  80 Loss 1.4327058146553509392
Epoch:  90 Loss 1.7184273667191716421
Epoch:  100 Loss 2.0244380512727827934
Epoch:  110 Loss 1.3366981486923037874
Epoch:  120 Loss 1.3548533045489924607
Epoch:  130 Loss 0.66194677665427943295
Epoch:  140 Loss 0.71901751988829729843
Epoch:  150 Loss 0.6718347131520239223
Epoch:  160 Loss 1.3457234810726876783
Epoch:  170 Loss 0.76357242681824916955
Epoch:  180 Loss 0.56373279233549101195
Epoch:  190 Loss 0.97705589363694978476
Epoch:  200 Loss 0.5389898146508087449
Epoch:  210 Loss 0.4700505170512265051
Epoch:  220 Loss 0.28300066818794673856
Epoch:  230 Loss 0.62334299426920901144
Epoch:  240 Loss 0.30333818616339749513
Epoch:  250 Loss 0.44407441863

In [None]:
# same test used without negative sampling
print(np.random.choice(corpus))

['But', 'the', 'strength', 'in', 'heating', 'oil', 'helped', '*-1', 'push', 'up', 'crude', 'oil', '.']


In [15]:
tree_bank_sg.predict('oil')

['heating', 'strength', 'crude', 'helped', 'push']