In [1]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers
import nltk
from nltk.tokenize import TreebankWordTokenizer
from random import randint
import string

In [2]:
with open("./updated_train_data.txt", 'r', encoding = 'utf8') as dataFile:
    sentences = dataFile.read().splitlines()
tokenizer = TreebankWordTokenizer()

In [3]:
#exclude words based on Wikipedia Most common words list
excluded_words = {"the", "a", "an", "be", "to", "of", "and", "in", "that", "have", "I", \
                "it", "for", "not", "on", "with", "he", "as", "you", "do", "at", "this"\
                "but", "his", "by", "from", "they", "we", "say", "her", "she", "or", "will"\
                "my", "one", "would", "there", "their", "what", "so", "up", "out",\
                "if", "about", "who", "get", "which", "go", "me", "when", "make", "can"\
                "like", "time", "no", "just", "know", "take", "into", "than", \
                 "then", "could", "how", "'", "is", '$', '£', "'s", "``", '-', '--', "'t"}
punc_set = set(string.punctuation) 
excluded_words = excluded_words | punc_set
excluded_str = ''
for item in excluded_words:
    excluded_str += item
excluded_str

'theiranwouldsoistimejust?^.weyou,not£without\'--gowhen:saythisbuttheofsheupabouttoat)makeget\\]&}forintodocould_knowwillmy\'tfrom%ifnome@/$=orhow+acanlikethereoneas~\'s>-theyhis"thanandthenwhichIitwho[!<|take(have*inthatby;{`onbewhatheher``#'

In [6]:
#sentence-word pairs
data = []
for sentence in sentences:
    sentence_len = randint(5,15)
    tokenized_sent = tokenizer.tokenize(sentence.lower())
    num_groups = len(tokenized_sent)//sentence_len
    for i in range(1, num_groups):
        sent_pair = tokenized_sent[:i*sentence_len]
        word_pair = tokenized_sent[i*sentence_len]
        if word_pair not in excluded_words:
            data.append((sent_pair, word_pair))

In [7]:
#get most common words and their frequiences
unique_words = dict()
for item in data:
    if item[1] not in unique_words:
        unique_words[item[1]] = 1
    else:
        unique_words[item[1]] += 1

In [21]:
x = unique_words
unique_words = dict(sorted(x.items(), key=lambda item: item[1], reverse = True))

#want vocabulary of first 100000 most common words
unique_words = {k: unique_words[k] for k in list(unique_words)[:100000]}

In [22]:
#create encoding of all the sentence-word pairs
import numpy as np
word_to_embedding_dict = dict()
embedding_to_word_dict = dict()
for i, word in enumerate(unique_words.keys()):
    word_to_embedding_dict[word] = i
    embedding_to_word_dict[i] = word

In [24]:
print(len(unique_words))
print(len(embedding_to_word_dict))
print(len(word_to_embedding_dict))
unique_words

100000
100000
100000


{'was': 84072,
 'said': 58423,
 'has': 55737,
 'but': 54366,
 'are': 50599,
 'will': 45501,
 'had': 40683,
 'more': 35401,
 'its': 35196,
 'were': 34972,
 'been': 34742,
 'after': 33220,
 'this': 32663,
 'new': 28794,
 'i': 25879,
 'two': 21805,
 'over': 21757,
 'people': 21479,
 'can': 21088,
 'year': 20917,
 'first': 20734,
 'all': 19849,
 'last': 19049,
 'some': 17319,
 'other': 17307,
 'because': 17213,
 'million': 16016,
 'years': 15879,
 'also': 15627,
 'percent': 15142,
 'government': 15098,
 'where': 14859,
 'only': 14592,
 'world': 13794,
 'u.s.': 13652,
 'before': 13579,
 'most': 13544,
 'while': 13416,
 'them': 13194,
 'three': 12549,
 'including': 12440,
 'him': 12368,
 'against': 12351,
 'president': 12327,
 'like': 12313,
 'being': 12112,
 'may': 11720,
 'us': 11556,
 'now': 11537,
 'such': 11300,
 'since': 11284,
 'company': 11283,
 'many': 11277,
 'our': 11130,
 'state': 10981,
 'down': 10903,
 'made': 10858,
 'any': 10681,
 'off': 10627,
 'through': 10585,
 'back': 104

In [25]:
word_to_embedding_dict

{'was': 0,
 'said': 1,
 'has': 2,
 'but': 3,
 'are': 4,
 'will': 5,
 'had': 6,
 'more': 7,
 'its': 8,
 'were': 9,
 'been': 10,
 'after': 11,
 'this': 12,
 'new': 13,
 'i': 14,
 'two': 15,
 'over': 16,
 'people': 17,
 'can': 18,
 'year': 19,
 'first': 20,
 'all': 21,
 'last': 22,
 'some': 23,
 'other': 24,
 'because': 25,
 'million': 26,
 'years': 27,
 'also': 28,
 'percent': 29,
 'government': 30,
 'where': 31,
 'only': 32,
 'world': 33,
 'u.s.': 34,
 'before': 35,
 'most': 36,
 'while': 37,
 'them': 38,
 'three': 39,
 'including': 40,
 'him': 41,
 'against': 42,
 'president': 43,
 'like': 44,
 'being': 45,
 'may': 46,
 'us': 47,
 'now': 48,
 'such': 49,
 'since': 50,
 'company': 51,
 'many': 52,
 'our': 53,
 'state': 54,
 'down': 55,
 'made': 56,
 'any': 57,
 'off': 58,
 'through': 59,
 'back': 60,
 'between': 61,
 'per': 62,
 'home': 63,
 'even': 64,
 'those': 65,
 'week': 66,
 'during': 67,
 'obama': 68,
 'much': 69,
 'should': 70,
 'country': 71,
 'billion': 72,
 'group': 73,
 'way

In [17]:
embedding_to_word_dict

{0: 'lost',
 1: 'are',
 2: 'building',
 3: 'said',
 4: 'storms',
 5: 'given',
 6: 'suggest',
 7: 'friends',
 8: 'tigers',
 9: 'internet',
 10: 'member',
 11: 'knowledge',
 12: 'compared',
 13: 'spain',
 14: 'football',
 15: 'reappointment',
 16: 'were',
 17: 'director',
 18: 'electricity',
 19: 'river',
 20: 'nuclear',
 21: 'experts',
 22: 'a-list',
 23: 'enjoyed',
 24: 'capital',
 25: 'glyndebourne',
 26: 'prices',
 27: 'consumer',
 28: 'energy',
 29: 'moderate',
 30: 'reuters',
 31: 'holzer',
 32: 'refined',
 33: 'raised',
 34: 'but',
 35: 'leaders',
 36: 'palestinians',
 37: 'been',
 38: 'raising',
 39: 'mobile',
 40: 'sells',
 41: 'each',
 42: 'electronics',
 43: 'ship',
 44: 'issue',
 45: 'minimum',
 46: 'manage',
 47: 'record',
 48: 'i',
 49: 'partnership',
 50: 'mr.',
 51: 'nor',
 52: 'son',
 53: 'several',
 54: 'supporter',
 55: 'don',
 56: 'bill',
 57: 'palin',
 58: 'recycling',
 59: 'speaker',
 60: 'gap',
 61: 'better',
 62: 'year',
 63: 'already',
 64: 'line-up',
 65: 'servi

In [26]:
desired_training_data = []
for item in data:
    if item[1] in word_to_embedding_dict:
        desired_training_data.append((item[0], word_to_embedding_dict[item[1]]))
desired_training_data

[(['what', 'does', 'it', 'say', 'about', 'pelosi', 'that', 'she'], 317),
 (['``',
   'we',
   'need',
   'them',
   'to',
   'move',
   'on',
   'and',
   'accept',
   'change',
   ',',
   'because',
   'their',
   'problems'],
  4),
 (['``',
   'we',
   'need',
   'them',
   'to',
   'move',
   'on',
   'and',
   'accept',
   'change',
   ',',
   'because',
   'their',
   'problems',
   'are',
   'a',
   'distraction',
   'from',
   'our',
   'goal',
   'of'],
  426),
 (['``',
   'we',
   'need',
   'them',
   'to',
   'move',
   'on',
   'and',
   'accept',
   'change',
   ',',
   'because',
   'their',
   'problems',
   'are',
   'a',
   'distraction',
   'from',
   'our',
   'goal',
   'of',
   'building',
   'a',
   'more',
   'integrated',
   'asean',
   ',',
   '``'],
  1),
 (['forecasts',
   'called',
   'for',
   'plunging',
   'temperatures',
   'and',
   'afternoon',
   'rain'],
  5614),
 (['stupak',
   'had',
   'pledged',
   'to',
   'oppose',
   'the',
   'health',
   'ca

In [27]:
len(desired_training_data)

8503003

In [28]:
np.save("./model_data", desired_training_data, allow_pickle = True)

In [29]:
#save the word to embedding
# and emedding to word dictionaries to be used for reference
import json
with open("./word_to_embedding.json", "w") as word_to_embedding_file:
    json.dump(word_to_embedding_dict, word_to_embedding_file, sort_keys=True, indent=4)
with open("./embedding_to_word.json", "w") as embedding_to_word_file:
    json.dump(embedding_to_word_dict, embedding_to_word_file, sort_keys=True, indent=4)

In [17]:
loaded_data = np.load("./model_data.npy", allow_pickle = True)
loaded_data[0]

array([list(['the', 'u.s.', 'centers', 'for', 'disease', 'control', 'and', 'prevention', 'initially', 'advised']),
       array([[1, 0, 0, ..., 0, 0, 0]], dtype=int8)], dtype=object)

In [18]:
loaded_data[0][0]

['the',
 'u.s.',
 'centers',
 'for',
 'disease',
 'control',
 'and',
 'prevention',
 'initially',
 'advised']