<a href="https://colab.research.google.com/github/AUT-Student/NLP-HW1/blob/main/NLP_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!gdown --id 16C0_9i0io43VfABV3-uukUjJYlM6k-2U
!unzip /content/HW1-datasets.zip

Downloading...
From: https://drive.google.com/uc?id=16C0_9i0io43VfABV3-uukUjJYlM6k-2U
To: /content/HW1-datasets.zip
100% 3.14M/3.14M [00:00<00:00, 16.2MB/s]
Archive:  /content/HW1-datasets.zip
replace content/HW1-datasets/train.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: Y
  inflating: content/HW1-datasets/train.txt  
replace content/HW1-datasets/valid.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: content/HW1-datasets/valid.txt  
  inflating: content/HW1-datasets/test_incomplete.txt  
  inflating: content/HW1-datasets/test.txt  
  inflating: content/HW1-datasets/test_incomplete_gold.txt  


In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import random

In [2]:
file = open("/content/content/HW1-datasets/train.txt")

vocabulary = set()

for i, line in enumerate(file.readlines()):
  if i < 30*1000:
    words = line.split()

    for word in words:
      vocabulary.add(word)

sorted_vocabulary = sorted(vocabulary)
print(f"Vocab Size = {len(vocabulary)}")

Vocab Size = 21188


In [3]:
from bisect import bisect_left

class NLM(keras.Model):
  def __init__(self, vocabulary):
    super().__init__()
    self.vocabulary = vocabulary
    self.model = keras.models.Sequential([
                                          keras.layers.Input((2,), name="Input"),
                                          keras.layers.Embedding(len(self.vocabulary) + 1, 128, name="Embedding"),
                                          keras.layers.Flatten(name="Flatten"),
                                          keras.layers.Dense(units=256, name="Hidden"),
                                          keras.layers.Dense(units=len(self.vocabulary), activation="softmax", name="Output"),
    ])
    
  def convert_word_to_index(self, word):
    pos = bisect_left(self.vocabulary, word, 0, len(self.vocabulary))
    return pos if pos != len(self.vocabulary) and self.vocabulary[pos] == word else len(self.vocabulary)

  def convert_index_to_word(self, index):
    return self.vocabulary[index]

  def call(self, inputs):
    return self.model(inputs)

  def predict(self, inputs):
    model_output =  self.model(inputs)
    predicted_word_indexes = keras.backend.argmax(model_outputs, axis=1)
    return predicted_word_indexes

In [4]:
nlm = NLM(vocabulary=sorted_vocabulary)

In [5]:
nlm.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [6]:
import random
def create_neural_dataset(path, line_limit=None, data_number_limit=None):
  file = open(path)

  X_dataset = []
  Y_dataset = []

  for i, line in enumerate(file.readlines()):
    if line_limit is not None and i >= line_limit:
      break

    words = line.split()

    for j in range(2, len(words)):
      word1 = words[j-2]
      word2 = words[j-1]
      word_target = words[j]

      index1 = nlm.convert_word_to_index(word1)
      index2 = nlm.convert_word_to_index(word2)
      index_target = nlm.convert_word_to_index(word_target)

      X_dataset.append([index1, index2])
      Y_dataset.append(index_target)

  print(len(X_dataset))
  if data_number_limit is not None:
    random.seed(0)
    X_dataset, Y_dataset = zip(*random.sample(list(zip(X_dataset, Y_dataset)), data_number_limit))

  X_dataset = np.array(X_dataset)
  Y_dataset = tf.one_hot(indices=Y_dataset, depth=len(vocabulary)).numpy()
  return X_dataset, Y_dataset

In [7]:
X_valid, Y_valid = create_neural_dataset(path="/content/content/HW1-datasets/valid.txt", data_number_limit=25*1000)

95190


In [8]:
X_train, Y_train = create_neural_dataset(path="/content/content/HW1-datasets/train.txt",
                                         line_limit=30*1000,
                                         data_number_limit=50*1000)

151278


In [9]:
nlm.fit(x=X_train, y=Y_train,validation_data= (X_valid, Y_valid), epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f275312b410>

In [None]:
X_test, Y_test = create_neural_dataset(path="/content/content/HW1-datasets/test.txt")

In [None]:
X_train = np.array([[4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           [4, 5],
           [6, 8],
           [2, 9],
           ])

# Y_train = ["سلام", "بر", "ایران","او","از","آن","که","آمد","بر"]
# Y_train = [3,2,3,4,7,5,11,2,3,3,2,3,4,7,5,11,2,3,3,2,3,4,7,5,11,2,3,3,2,3,4,7,5,11,2,3]

Y_train = tf.one_hot(indices=[3,2,3,4,7,5,11,2,3,3,2,3,4,7,5,11,2,3,3,2,3,4,7,5,11,2,3,3,2,3,4,7,5,11,2,3], depth=len(vocabulary)).numpy()

bnlm.fit(x=X_train, y=Y_train, epochs=20)

Epoch 1/20
inputs = Tensor("IteratorGetNext:0", shape=(None, 2), dtype=int64)
model_outputs = Tensor("bigram_nlm_3/sequential_3/Output/Softmax:0", shape=(None, 21188), dtype=float32)
inputs = Tensor("IteratorGetNext:0", shape=(None, 2), dtype=int64)
model_outputs = Tensor("bigram_nlm_3/sequential_3/Output/Softmax:0", shape=(None, 21188), dtype=float32)
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f7c8a8f33d0>

In [None]:
keras.backend.argmax(tf.constant([
                        [4, 5],
                        [4, 7],
                        [6, 2]
                        ]), axis=1)

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([1, 1, 0])>