In [None]:
import json
import os
import pickle
import random
import re

import keras
import numpy
import pandas as pd
import spacy
from keras.layers import Dense, LSTM, Bidirectional, Embedding, RepeatVector, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array

In [None]:
def generate_seq(model, tokenizer, max_length, seed_text):
    """Predict the most likely next words for ``seed_text``.

    The seed text is encoded with ``tokenizer``, pre-padded to ``max_length``
    and passed to ``model.predict``. The (up to) five highest-probability
    class indices are mapped back to words through ``tokenizer.word_index``;
    indices with no word, and words found in the notebook-level ``stopwords``
    collection, are skipped.

    Args:
        model: object exposing ``predict(encoded, verbose=0)`` that returns
            class probabilities.
        tokenizer: object exposing ``texts_to_sequences`` and ``word_index``.
        max_length: length the encoded seed is pre-padded to.
        seed_text: context text to predict from.

    Returns:
        A string in which every kept word is preceded by a single space
        (e.g. ``' cat dog'``), ordered from most to least probable.
        Returns ``""`` when ``seed_text`` is empty or nothing is kept.
    """
    if seed_text == "":
        return ""

    n_preds = 5  # number of words to predict for the seed text

    # encode the text as integers and pre-pad to a fixed length
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')

    # predict probabilities for each word
    proba = model.predict(encoded, verbose=0).flatten()
    # Indices of the n_preds most probable classes. The slice yields fewer
    # entries when the model has fewer outputs, so iterate over the slice
    # itself rather than over range(n_preds).
    top_indices = numpy.argsort(-proba)[:n_preds]

    # Invert word_index once instead of scanning it for every prediction.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    pred_words = ""
    for class_index in top_indices:
        word = index_to_word.get(int(class_index))
        # Skip indices that have no word in word_index, and stopwords.
        if word is not None and word not in stopwords:
            pred_words += ' ' + word

    return pred_words
    

In [None]:
# load the forward (model) and reverse (rev_model) sequence models
model = load_model('model.h5')
rev_model = load_model('rev_model.h5')

# load tokeniser and max_length
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that come from a trusted source.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

with open('max_length.pkl', 'rb') as f:
    max_length = pickle.load(f)

# loading stopwords to improve relevant word predictions
# (context manager so the file handle is closed as soon as the words are read)
with open('stopwords') as f:
    stopwords = f.read().split()

# load the spaCy pipeline used for vector lookups
nlp = spacy.load('en_core_web_md')

In [None]:
#Find and set embeddings for OOV words
def set_embedding_for_oov(doc):
    """Give every out-of-vocabulary token in ``doc`` a context-derived vector.

    For each token whose ``is_oov`` flag is set, the text before the token is
    passed to the forward model (``model``) and the tokens after it, joined in
    reverse order, to the reverse model (``rev_model``) via ``generate_seq``.
    The vectors of the predicted words are summed with a weight that starts at
    the character length of the corresponding context text and halves after
    each word. The sum is stored with ``nlp.vocab.set_vector`` under the
    token's text.

    Relies on the notebook-level names ``model``, ``rev_model``, ``tokenizer``,
    ``max_length``, ``nlp`` and ``generate_seq``.

    Args:
        doc: a document produced by ``nlp``; it is iterated token by token and
            sliced by token index.

    Returns:
        None. The vocabulary of ``nlp`` is modified in place, and the
        predicted words and the resulting vector are printed.
    """
    # checking for oov words and adding embedding
    for token in doc:
        if not token.is_oov:
            continue

        # Context in reading order for the forward model.
        before_text = doc[:token.i].text
        # Following tokens, last one first, for the reverse model. Joining the
        # token texts directly keeps bracket characters intact and does not
        # depend on how numpy formats an object array.
        after_text = ' '.join([t.text for t in doc[token.i + 1:]][::-1])

        pred_before = generate_seq(model, tokenizer, max_length-1, before_text).split()
        pred_after = generate_seq(rev_model, tokenizer, max_length-1, after_text).split()

        # Size the accumulator from the loaded vectors rather than assuming 300.
        embedding = numpy.zeros((nlp.vocab.vectors_length,))

        # NOTE(review): the starting weight is the character count of the
        # context string, not its word count - confirm this is intended.
        contexts = (
            ('Words predicted from forward sequence model:', pred_before, len(before_text)),
            ('Words predicted from reverse sequence model:', pred_after, len(after_text)),
        )
        for heading, words, weight in contexts:
            print(heading)
            for word in words:
                print(word)
                embedding += weight * nlp.vocab.get_vector(word)
                # each further prediction contributes half as much as the previous one
                weight = weight * .5

        nlp.vocab.set_vector(token.text, embedding)
        print(token.text, nlp.vocab.get_vector(token.text))

In [None]:
# Demo: parse a sentence containing the nonsense token 'livwgffe' and assign
# an embedding to every token in it that is flagged as out-of-vocabulary.
doc = nlp('i livwgffe in london ')
set_embedding_for_oov(doc)

In [None]:
most_similar(nlp('livwgffe'))

In [None]:
def most_similar(word):
    by_similarity = sorted(word.vocab, key=lambda w: word.similarity(w), reverse=True)
    return [w.orth_ for w in by_similarity[:10]]

In [None]:
# Second example: assign embeddings to the OOV tokens of a sentence containing
# 'lndn', then display the vector now stored in the vocabulary for 'lndn'.
test1 = nlp('i live in lndn ')
set_embedding_for_oov(test1)
nlp.vocab.get_vector('lndn')

In [None]:
# Similarity between the parsed text 'lndn' and the parsed text 'London'.
nlp('lndn').similarity(nlp('London'))

In [None]:
# Vocabulary entries ranked most similar to 'lndn'.
most_similar(nlp('lndn'))

In [None]:
# Third example sentence; 'fidditch' is the token examined in the cells below.
test2 = nlp('i play fidditch at school')

In [None]:
# Assign embeddings to any out-of-vocabulary tokens in test2.
set_embedding_for_oov(test2)

In [None]:
# Vocabulary entries ranked most similar to 'fidditch'.
most_similar(nlp('fidditch'))

In [None]:
# Similarity between the parsed text 'fidditch' and the parsed text 'sport'.
nlp('fidditch').similarity(nlp('sport'))