In [132]:
import numpy as np
import nltk
import math
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import Sequence

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [133]:
def load_data(path):
    
    f = open(path,'r')
    content = f.readlines()
    f.close()
    
    return content

In [134]:
def make_n_grams(data, n):

  all_X = []
  all_y = []

  for line in data:
    if len(line)<=n:
      continue

    else:
      for i in range(0, len(line)-n):
        X = line[i:i+n]
        y = line[i+n]

        all_X.append(np.asarray(X))
        all_y.append(y)

  all_X = np.asarray(all_X)
  all_y = np.asarray(all_y)

  return all_X, all_y

In [229]:
def tokenize_data(data):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(data)
    vocab = tokenizer.word_index
    vocab_size = len(vocab) + 1
    encoded_data = tokenizer.texts_to_sequences(data)

    return encoded_data, vocab, tokenizer

In [230]:
def preprocess_data(data):

  data, vocab, eng_tokenizer = tokenize_data(data)
  X, y = make_n_grams(data, 8)

  return X, y, vocab, eng_tokenizer

In [231]:
eng_train = load_data('/content/drive/MyDrive/europarl-corpus/train.europarl')
X, y, eng_vocab, eng_tokenizer = preprocess_data(eng_train)

In [200]:
reverse_vocab = {}

for k,v in eng_vocab.items():
  reverse_vocab[v] = k

In [202]:
num_tokens = len(eng_vocab)+1
embedding_dim = 100

In [209]:
embedding_layer = tf.keras.layers.Embedding(num_tokens, embedding_dim, trainable=True)

def language_model():
    
    inp = tf.keras.Input(shape=(None,), dtype="int64")
    embedded_sequences = embedding_layer(inp)

    lstm = tf.keras.layers.LSTM(256)
    rep = lstm(embedded_sequences)
    
    dense = tf.keras.layers.Dense(num_tokens, activation="softmax")
    output = dense(rep)

    model = tf.keras.Model(inp, output)

    return model

In [210]:
model = language_model()

In [211]:
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 100)         1512500   
                                                                 
 lstm_3 (LSTM)               (None, 256)               365568    
                                                                 
 dense_3 (Dense)             (None, 15125)             3887125   
                                                                 
Total params: 5,765,193
Trainable params: 5,765,193
Non-trainable params: 0
_________________________________________________________________


In [212]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"])
model.fit(X, y, batch_size=128, epochs=10)



<keras.callbacks.History at 0x7feb63a06ed0>

In [183]:
model.save("/content/drive/MyDrive/question1_epoch10")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/question1_epoch10/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/question1_epoch10/assets


In [232]:
eng_test = load_data('/content/drive/MyDrive/europarl-corpus/test.europarl')
eng_test_data = eng_tokenizer.texts_to_sequences(eng_test)

In [234]:
test_X, test_y = make_n_grams(eng_test_data, 4)

In [236]:
preds = model.predict(test_X, batch_size=16)

In [237]:
def decode_pred(preds):
    
    all_preds_idx = []
    all_preds = []

    for i in range(preds.shape[0]):
      val = np.argmax(preds[i])
      all_preds_idx.append(val)
      
    for val in all_preds_idx:
      all_preds.append(reverse_vocab[val])
      
    all_preds_idx = np.asarray(all_preds_idx)
    all_preds = np.asarray(all_preds)
    
    return all_preds_idx, all_preds

In [238]:
dec_preds, dec_preds_word = decode_pred(preds)

In [217]:
def get_perplexity(sent):

  perp = 0
  if len(sent)<4:
    return 0
  else:
    cnt=0
    for i in range(len(sent)-4):
      cnt+=1
      inp = sent[i:i+4]

      inp = inp.reshape(1,-1)
      pred = model.predict(inp)
      val = np.max(pred)

      perp += math.log(val)/math.log(2)
    
    return (-1*perp)/cnt

In [243]:
for i, sent in enumerate(X[:5]):
  perp = get_perplexity(np.asarray(sent))
  print(eng_train[i].split("\n")[0], end="\t")
  print(perp)  

Resumption of the session	2.899883634135354
I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.	2.025733719206991
Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.	2.4543161840810472
You have requested a debate on this subject in the course of the next few days, during this part-session.	2.675161205373426
In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.	2.3873106445711922


In [242]:
for i, sent in enumerate(eng_test_data[:5]):
  perp = get_perplexity(np.asarray(sent))
  print(eng_test[i].split("\n")[0], end="\t")
  print(perp)

When used preventively, it saves the state and the economy a great deal of money.	3.6488758710538525
I have completely failed to understand in this debate why a reasonable set of rules was not adopted back in 1993, especially as the Commission and Parliament did not want any derogations even then.	3.3339458113589893
Seven million workers were affected and specific sectors, such as the mobile worker sector, have been subject to ruinous competition over recent years, especially in Germany.	3.118494075404921
It is therefore also a social problem and it is not enough, Mr Crowley, to use tachographs or other technical aids.	3.7744358820129635
One does not exclude the other.	3.612589429540818
