In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
path_to_drive = "drive/My\ Drive/OOV_EMBEDDING_MODEL"
path_to_model = os.path.join(path_to_drive,"model")
path_to_data = os.path.join(path_to_drive,"data")
path_to_saved = os.path.join(path_to_drive,"saved")

#IMPORT LIBRARIES

In [0]:
import os
import pandas as pd
import numpy as np
import time
import pickle

import keras
import keras.backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Bidirectional, LSTM, Embedding, Dense
from sklearn.utils import shuffle
import tensorflow as tf

Using TensorFlow backend.


#Parameters

In [0]:
EMBEDDING_DIM = 100

#data preproccessing

In [0]:
filename = "Copy of samples_data2.txt"

In [0]:
with open(os.path.join(path_to_data,filename),'r') as f:
  content = f.read().split('\n')
content = shuffle(content)
content = [c.split(",,")[0].strip().lower() for c in content]

In [0]:
content[:2]

['verify mlsdbitval bit of register mlsdreg is set',
 'assert mlsdfield bits in mlsdreg is set to mlsdval']

In [0]:
def data_generator(data):
  #keras dictionary
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(data)
  #save the dictionary
  with open(os.path.join(path_to_saved,'tokenizer.pkl'),'wb') as f:
    pickle.dump(tokenizer,f)
  
  #converts the data into integer sequences
  # inputs = tokenizer.texts_to_sequences(data)
  #print vocab_size
  vocab_size = len(tokenizer.word_index)+1
  print("Vocabulary Size  ::  ",vocab_size)

  #create new sequences 
  #new data sequence with the prev_seq + post_target_word
  seqs = []
  rev_seqs = []
  for line in data:
    encoded = tokenizer.texts_to_sequences([line])[0]
    rev_encoded = encoded[::-1]
    for i in range(len(encoded)):
      seq = encoded[:i+1]
      rev_seq = rev_encoded[:i+1]
      seqs.append(seq)
      rev_seqs.append(rev_seq)
  print("Total number of sequences are %d"%len(seqs))

  #max_length of inputs
  max_length = max([len(s) for s in seqs])
  print("Maximum lenght of sequences is %d"%max_length)

  #pad the sequences
  seqs = pad_sequences(seqs,maxlen=max_length,padding='pre')
  seqs = np.array(seqs)
  x,y = seqs[:,:-1],seqs[:,-1]

  #same for rev_seqs (reverse sequences)
  rev_seqs = pad_sequences(rev_seqs,maxlen=max_length,padding="pre")
  rev_seqs = np.array(rev_seqs)
  rev_x, rev_y = rev_seqs[:,:-1], rev_seqs[:,-1]

  return x, y, rev_x, rev_y, max_length, vocab_size


In [0]:
x, y, rev_x, rev_y, max_length, vocab_size = data_generator(data=content)

Vocabulary Size  ::   86
Total number of sequences are 25794
Maximum lenght of sequences is 20


In [0]:
len(x[1])

19

#Building Model

In [0]:
#define forword model

In [0]:
model = Sequential()
model.add(Embedding(vocab_size,EMBEDDING_DIM,input_length=max_length-1))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(vocab_size,activation='softmax'))
print(model.summary)




<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x7fd526fe1358>>


In [0]:
model.compile(optimizer="Adam",loss="sparse_categorical_crossentropy",metrics=['accuracy'])





In [0]:
model.fit(x=x, y=y , batch_size=100, epochs=200, verbose=2,)

In [0]:
import os
if not os.path.exists(path_to_model):
  os.mkdir(path_to_model)
  model.save(os.path.join(path_to_model,"model.h5"))

In [0]:
# define reverse model
rev_model = Sequential()
rev_model.add(Embedding(vocab_size, 100, input_length=max_length-1))
#rev_model.add(LSTM(100))
rev_model.add(Bidirectional(LSTM(100)))
rev_model.add(Dense(vocab_size, activation='softmax'))
print(rev_model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 19, 100)           8600      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_2 (Dense)              (None, 86)                17286     
Total params: 186,686
Trainable params: 186,686
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
rev_model.compile(optimizer="Adam",loss="sparse_categorical_crossentropy",metrics=['accuracy'])

In [0]:
rev_model.fit(x=rev_x, y=rev_y, batch_size=100, epochs=200, verbose=2)

In [0]:
if os.path.exists(path_to_model):
  rev_model.save(os.path.join(path_to_model,"rev_model.h5"))

#Save model 

In [0]:
# generate a sequence using a language model
def generate_seq(model, tokenizer, max_length, seed_text):
    if seed_text == "":
        return ""
    else:
        in_text = seed_text
        n_words = 1
        n_preds = 5 #number of words to predict for the seed text
        pred_words = ""
        # generate a fixed number of words
        for _ in range(n_words):
            # encode the text as integer
            encoded = tokenizer.texts_to_sequences([in_text])[0]
            # pre-pad sequences to a fixed length
            encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
            # predict probabilities for each word
            proba = model.predict(encoded, verbose=0).flatten()
            #take the n_preds highest probability classes 
            yhat = numpy.argsort(-proba)[:n_preds] 
            # map predicted words index to word
            out_word = ''

            for _ in range(n_preds):
                for word, index in tokenizer.word_index.items():
                    if index == yhat[_] and word not in stopwords:
                        out_word = word
                        pred_words += ' ' + out_word
                        #print(out_word)
                        break


        return pred_words

In [0]:
#importing libraries
import spacy
from spacy.vocab import Vocab
import numpy
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model
import pickle

In [0]:
# load the model
model = load_model(os.path.join(path_to_model,"model.h5"))
rev_model = load_model(os.path.join(path_to_model,"rev_model.h5"))

#load tokeniser and max_length
with open(os.path.join(path_to_saved,'tokenizer.pkl'), 'rb') as f:
    tokenizer = pickle.load(f)
    
# with open('max_length.pkl', 'rb') as f:
#     max_length = pickle.load(f)

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
# !python -m spacy download en_core_web_md
#loading stopwords to improve relevant word predictions    
# stopwords= open('stopwords').read().split()

#load spacy GloVe Model
nlp = spacy.load('en_core_web_md')







Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
#Find and set embeddings for OOV words
def set_embedding_for_oov(doc):
    #checking for oov words and adding embedding
    for token in doc:
        if token.is_oov == True:
            print(token.text)
            before_text = doc[:token.i].text
            after_text = str(array(doc)[:token.i:-1]).replace('[','').replace(']','')

            pred_before = generate_seq(model, tokenizer, max_length-1, before_text).split()
            pred_after = generate_seq(rev_model, tokenizer, max_length-1, after_text).split()
            
            embedding = numpy.zeros((300,))

            i=len(before_text)
            print('Words predicted from forward sequence model:')
            for word in pred_before:
                print(word)
                embedding += i*nlp.vocab.get_vector(word)
                i= i*.5
            i=len(after_text)
            print('Words predicted from reverse sequence model:')
            for word in pred_after:
                print(word)
                embedding += i*nlp.vocab.get_vector(word)
                i= i*.5
            nlp.vocab.set_vector(token.text, embedding)
            print(token.text,nlp.vocab.get_vector(token.text))

In [0]:
doc = nlp('wait if sasasas of asfasf is set to 242')
set_embedding_for_oov(doc)

sasasas
Words predicted from forward sequence model:
mlsdbitval
bits
mlsdfield
mlsdreg
bit
Words predicted from reverse sequence model:
mlsdbitval
mlsdreg
mlsdreg's
mlsdfield
bit
sasasas [ 3.03316154e+01  1.17391930e+01  5.54521227e+00 -2.19785423e+01
 -2.32484379e+01 -1.13341961e+01 -4.66820145e+00 -1.98870583e+01
 -2.12542610e+01  2.40024643e+02  4.40581417e+00 -2.69718971e+01
 -1.07687759e+01  1.45400038e+01  2.24786396e+01 -3.25940681e+00
 -4.87834015e+01  2.18520630e+02 -4.72285795e+00 -4.59136620e+01
 -3.57097359e+01  1.49753590e+01 -1.79020767e+01 -1.16812019e+01
  1.46388483e+01 -4.12850952e+01  3.93421721e+00 -2.77517262e+01
 -5.22202158e+00 -5.71729517e+00 -3.52275772e+01  1.60578766e+01
  9.98739529e+00 -6.26056480e+00  4.43319321e+01 -1.26360445e+01
  2.08469963e+01  2.41237583e+01 -2.90555210e+01  3.58107328e+00
  2.05226212e+01  6.76669464e+01  1.10556288e+01 -3.14615917e+01
  2.47559977e+00 -8.67858601e+00 -4.28124094e+00 -3.29253616e+01
 -4.16007500e+01 -1.35199041e+01 

In [0]:
# nlp('gjjjhh').vocab
[w.orth_ for w in nlp('gjjjhh').vocab]

['EDPT',
 'croup',
 'Blunt.',
 'CAFFIAUX',
 'beruhigt',
 'Sibs',
 'Ginjo',
 'stumpfe',
 'gorey',
 'here..i',
 'DECEASED',
 'MOSCOVY',
 'Blough',
 'Ginja',
 'mcleish',
 '5,10,20',
 'Cayla',
 'chontales',
 'datsyuk',
 '28,398.14',
 '153.65',
 'Imaginaerum',
 'eradicate',
 '118.10',
 'Jambalaya',
 'DOG-POWERED',
 'chiodi',
 'State-Farm',
 'meeni',
 '.23',
 'GILLAN',
 '~an',
 '/r/raleigh',
 'XXRs',
 '81ST',
 'klefki',
 'Criken',
 'traverser',
 'mersey',
 'batphoned',
 'SERB',
 'ec-135',
 'romen',
 'Sadeq',
 'COBRA',
 'guthrie',
 'piense',
 'SCREENPRINT',
 'cardgame',
 'Askin',
 'зная',
 'VACATION-BUSINESS',
 'patootie',
 '198,410,000',
 'wade',
 'state-industry',
 'LTV-owned',
 'equator',
 'T60',
 'cismales',
 'http://i.imgur.com/Y9o0p.png',
 'RIVETT',
 'Cross-Trades',
 'organisaties',
 'ladri',
 'UNDULY',
 '0-stone',
 'hostilely',
 'UNDERWRITER',
 'skyliner',
 '|grep',
 'oshima',
 'shit&gt',
 'Nitrosamines',
 'land-oriented',
 'Fifth-In-A-Row',
 '&gt;Benefit',
 'dak',
 'kippur',
 'underhå

In [0]:
most_similar(nlp('gjjjhh'))

In [0]:
#analysis

In [0]:
#function to find most similar words
def most_similar(word):
    by_similarity = sorted(word.vocab, key=lambda w: word.similarity(w), reverse=True)
    return [w.orth_ for w in by_similarity[:10]]

In [0]:
#test1

In [0]:
nlp.vocab.get_vector('gjjjhh')

NameError: ignored

In [0]:
test1 = nlp('i live in lndn ')
set_embedding_for_oov(test1)
nlp.vocab.get_vector('lndn')

In [0]:
nlp('lndn').similarity(nlp('London'))

In [0]:
most_similar(nlp('lndn'))

In [0]:
#test2

In [0]:
test2 = nlp('i play fidditch at school')

In [0]:
nlp.vocab.get_vector('fidditch')

In [0]:
set_embedding_for_oov(test2)

In [0]:
nlp.vocab.get_vector('fidditch')

In [0]:
most_similar(nlp('mlsdreg'))

In [0]:
nlp('gjjjhh').similarity(nlp('field'))

0.41892825600514766

In [0]:
from scipy import spatial

dataSetI = nlp.vocab.get_vector('sasasas')
dataSetII = nlp.vocab.get_vector('242')
result = 1 - spatial.distance.cosine(dataSetI, dataSetII)
print(result)

0.0512220561504364
