In [23]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers
import nltk
from nltk.tokenize import TreebankWordTokenizer
from random import randint
import string

In [2]:
with open("./updated_train_data.txt", 'r', encoding = 'utf8') as dataFile:
    sentences = dataFile.read().splitlines()
tokenizer = TreebankWordTokenizer()

In [67]:
#exclude words based on Wikipedia Most common words list
excluded_words = {"the", "a", "an", "be", "to", "of", "and", "in", "that", "have", "I", \
                "it", "for", "not", "on", "with", "he", "as", "you", "do", "at", "this"\
                "but", "his", "by", "from", "they", "we", "say", "her", "she", "or", "will"\
                "my", "one", "would", "there", "their", "what", "so", "up", "out",\
                "if", "about", "who", "get", "which", "go", "me", "when", "make", "can"\
                "like", "time", "no", "just", "know", "take", "into", "than", \
                 "then", "could", "how", "'", "is", '$', '£', "'s", "``", '-', '--', "'t"}
punc_set = set(string.punctuation) 
excluded_words = excluded_words | punc_set
excluded_str = ''
for item in excluded_words:
    excluded_str += item
excluded_str

'}know<^onehis+andwouldgohowtime>their\\takeit\'tweabout)which\'£?]meintobedofornojusthaveyou!what,--Isheupcanlike.toanoutas@&so*heronwillmymakecould-of"/_thisbut#notsaythanifthathe``:get$=within%oris;whenby{at[the(\'s`therethenwho|afrom~they'

In [68]:
keras_tokenizer = keras.preprocessing.text.Tokenizer(
    num_words = 50001,
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n``--' + excluded_str
)
keras_tokenizer.fit_on_texts(sentences)

In [69]:
#sentence-word pairs
sentence_len = 10
data = []
for sentence in sentences:
    tokenized_sent = tokenizer.tokenize(sentence.lower())
    num_groups = len(tokenized_sent)//sentence_len
    for i in range(1, num_groups):
        sent_pair = tokenized_sent[:i*sentence_len]
        word_pair = tokenized_sent[i*sentence_len]
        if word_pair not in excluded_words:
            data.append((sent_pair, word_pair))

In [70]:
#get most common words and their frequiences
unique_words = dict()
for item in data:
    if item[1] not in unique_words:
        unique_words[item[1]] = 1
    else:
        unique_words[item[1]] += 1
unique_words

{'school': 288,
 'itself': 77,
 'most': 599,
 'even': 519,
 'home': 440,
 'case': 235,
 'ms.': 51,
 'chemistry': 9,
 'husbands': 4,
 'but': 3336,
 'lloyds': 12,
 'national': 321,
 'horse': 19,
 'will': 2141,
 'given': 212,
 'win': 170,
 'again': 185,
 'pleaded': 18,
 'assistance': 20,
 'listen': 18,
 'ago': 217,
 'arguing': 50,
 'back': 534,
 'midday': 5,
 'bank': 270,
 'america': 180,
 'request': 46,
 'rift': 7,
 'are': 2457,
 'support': 291,
 'years': 661,
 'only': 662,
 'secret': 43,
 'team': 291,
 'league': 134,
 'gawk': 1,
 'field': 104,
 'because': 1105,
 'voice': 38,
 'this': 1560,
 'said': 3305,
 'citizens': 22,
 'opted': 9,
 'work': 414,
 'norway': 15,
 'forward': 96,
 'been': 1597,
 'also': 715,
 'record': 140,
 'i': 1510,
 'democratic': 126,
 'commission': 108,
 'interests': 42,
 'remained': 53,
 'include': 102,
 'fear': 67,
 'speaker': 25,
 'seen': 184,
 'ignored': 16,
 'ensure': 110,
 'mega': 1,
 'lawmakers': 46,
 'europeans': 5,
 'was': 3916,
 'contributed': 20,
 'rollove

In [71]:
x = unique_words
dict(sorted(x.items(), key=lambda item: item[1], reverse = True))

{'was': 3916,
 'but': 3336,
 'said': 3305,
 'are': 2457,
 'has': 2434,
 'will': 2141,
 'had': 1992,
 'were': 1640,
 'been': 1597,
 'after': 1561,
 'this': 1560,
 'i': 1510,
 'its': 1504,
 'more': 1466,
 'new': 1283,
 'because': 1105,
 'can': 1070,
 'people': 1026,
 'two': 940,
 'over': 914,
 'all': 913,
 'year': 885,
 'some': 875,
 'where': 871,
 'other': 837,
 'last': 835,
 'first': 802,
 'government': 765,
 'also': 715,
 'them': 690,
 'only': 662,
 'years': 661,
 'us': 649,
 'while': 646,
 'any': 625,
 'president': 608,
 'like': 606,
 'him': 601,
 'being': 600,
 'most': 599,
 'world': 593,
 'such': 592,
 'before': 591,
 'our': 586,
 'many': 586,
 'against': 582,
 'u.s.': 580,
 'including': 564,
 'made': 541,
 'back': 534,
 'may': 533,
 'should': 532,
 'those': 524,
 'three': 523,
 'state': 521,
 'even': 519,
 'now': 513,
 'obama': 490,
 'way': 484,
 'through': 479,
 'down': 472,
 'off': 465,
 'week': 459,
 'country': 454,
 'much': 452,
 'company': 450,
 'since': 442,
 'home': 440,
 '

In [79]:
#create one-hot encoding of all the sentence-word pairs
import numpy as np
word_embedding = dict()
for i, word in enumerate(unique_words.keys()):
    embed_vector = np.zeros(shape = (1,len(unique_words)), dtype = np.int8)
    np.put(embed_vector, [i], [1])
    word_embedding[word] = embed_vector

In [81]:
print(len(unique_words))
len(word_embedding)

40000


40000

In [85]:
len(sentences)

740427

In [88]:
#create training dataset
len_training_data = int(len(data)*0.8)
len_training_data

#shuffle dataset
data.shuffle()


308411