<a href="https://colab.research.google.com/github/AUT-Student/NLP-HW1/blob/main/NLP_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [1]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import accuracy_score
import numpy as np
import random
import math

# Dataset

In [3]:
!gdown --id 16C0_9i0io43VfABV3-uukUjJYlM6k-2U
!unzip /content/HW1-datasets.zip

Downloading...
From: https://drive.google.com/uc?id=16C0_9i0io43VfABV3-uukUjJYlM6k-2U
To: /content/HW1-datasets.zip
100% 3.14M/3.14M [00:00<00:00, 17.0MB/s]
Archive:  /content/HW1-datasets.zip
   creating: content/HW1-datasets/
  inflating: content/HW1-datasets/train.txt  
  inflating: content/HW1-datasets/valid.txt  
  inflating: content/HW1-datasets/test_incomplete.txt  
  inflating: content/HW1-datasets/test.txt  
  inflating: content/HW1-datasets/test_incomplete_gold.txt  


# Statistical Language Model

## Counting

In [None]:
file = open("/content/content/HW1-datasets/train.txt")

word_counter = {}
pair_counter = {}
vocabulary = set()
N = 0

for line in file.readlines():
    words = line.split()
    N += len(words)
    
    for i, word in enumerate(words):
        vocabulary.add(word)
        if word_counter.get(word) is None:
            word_counter[word] = 0
        word_counter[word] += 1
    
        if i>0:
            pair = (words[i-1], word)
            if pair_counter.get(pair) is None:
                pair_counter[pair] = 0
            pair_counter[pair] += 1

## Functions

In [None]:
def create_unigram(delta):
  def unigram(word):
    if word in vocabulary:
      return word_counter[word] / N 
    else:
      return delta / N

  return unigram

In [None]:
def create_bigram(delta, unigram):
    def calculate_B(delta):
        B_word = {}

        for word in vocabulary:
          B_word[word] = 0

        for pair in pair_counter:
            word1 = pair[0]
            B_word[word1] += 1
        
        for word in vocabulary:
          if B_word[word]==0:
            B_word[word] = 1

        return B_word
    
    B_word = calculate_B(delta)
    
    def bigram(word1, word2):
      p_bg = unigram(word2)
      if word1 not in vocabulary:
        return p_bg
      else:
        alpha = delta / word_counter[word1] * B_word[word1]

        return (max(pair_counter.get((word1, word2), 0) - delta, 0) / word_counter[word1]) + alpha * p_bg

    return bigram

In [None]:
def create_statistical_dataset(path, number_words, line_limit=None, data_number_limit=None):
  file = open(path)

  X_dataset = []
  Y_dataset = []

  for i, line in enumerate(file.readlines()):
    if line_limit is not None and i >= line_limit:
      break

    all_words = line.split()

    for j in range(number_words, len(all_words)):
      words = []
      for k in range(number_words):
        word = all_words[j-k-1] 
        words.append(word)
      
      X_dataset.append(words)

      word_target = all_words[j]
      Y_dataset.append(word_target)

  print(f"Maximum Data = {len(X_dataset)}")

  if data_number_limit is not None:
    random.seed(0)
    X_dataset, Y_dataset = zip(*random.sample(list(zip(X_dataset, Y_dataset)), data_number_limit))

  X_dataset = np.array(X_dataset)
  Y_dataset = np.array(Y_dataset)
  return X_dataset, Y_dataset

## Perplexity

In [None]:
def unigram_perplexity(path):
  file = open(path)

  counter = 0
  probability = 0

  for line in file.readlines():
    words = line.split()
    
    for word in words:
      probability += math.log(unigram(word))
      counter += 1

  log_perplexity = -1/counter*probability
  
  perplexity = math.exp(log_perplexity)

  return perplexity

In [None]:
def bigram_perplexity(path):
  file = open(path)

  counter = 0
  probability = 0

  for line in file.readlines():
    words = line.split()
  
    for i in range(1, len(words)):
      probability += math.log(bigram(words[i-1], words[i]))
      counter += 1

  log_perplexity = -1/counter*probability  

  perplexity = math.exp(log_perplexity)

  return perplexity

## Fine-Tune

In [None]:
path = "/content/content/HW1-datasets/valid.txt"

best_unigram_perplexity = np.inf
best_unigram_delta = None

for delta in np.arange(0.2, 0.98, 0.02):
  unigram = create_unigram(delta=delta)

  new_unigram_perplexity = unigram_perplexity(path)

  if new_unigram_perplexity < best_unigram_perplexity:
    best_unigram_perplexity = new_unigram_perplexity
    best_unigram_delta = delta

print(f"Delta = {best_unigram_delta} Perplexity = {best_unigram_perplexity}")

Delta = 0.9599999999999995 Perplexity = 1828.493440423088


In [None]:
path = "/content/content/HW1-datasets/valid.txt"

best_bigram_perplexity = np.inf
best_bigram_delta = None

for delta in np.arange(0.2, 0.98, 0.02):
  unigram = create_unigram(delta=0.96)
  bigram = create_bigram(delta=delta, unigram=unigram)

  new_bigram_perplexity = bigram_perplexity(path)

  if new_bigram_perplexity < best_bigram_perplexity:
    best_bigram_perplexity = new_bigram_perplexity
    best_bigram_delta = delta

print(f"Delta = {best_bigram_delta} Perplexity = {best_bigram_perplexity}")

Delta = 0.8799999999999997 Perplexity = 1318.4189504258493


In [None]:
unigram = create_unigram(delta=0.96)
bigram = create_bigram(delta=0.88, unigram=unigram)

In [None]:
for text in ["train", "test", "valid"]:
  path = f"/content/content/HW1-datasets/{text}.txt"
  print(f"Unigram perplexity of {text} = {unigram_perplexity(path)}")

Unigram perplexity of train = 1768.173187352548
Unigram perplexity of test = 1810.0517762463387
Unigram perplexity of valid = 1828.493440423088


In [None]:
for text in ["train", "test", "valid"]:
  path = f"/content/content/HW1-datasets/{text}.txt"
  print(f"Bigram perplexity of {text} = {bigram_perplexity(path)}")

Bigram perplexity of train = 322.6322649838004
Bigram perplexity of test = 1326.7163336443393
Bigram perplexity of valid = 1318.4189504258493


## Unigram

In [None]:
unigram = create_unigram(delta=0.96)

### Quantitative Test

In [None]:
for dataset in ["train", "test", "valid"]:
  X_test, Y_test = create_statistical_dataset(path=f"/content/content/HW1-datasets/{dataset}.txt",
                                              number_words=0,
                                              data_number_limit=50*1000)

  best_word = None
  best_probability = -math.inf
  for condidate_word in vocabulary:
      if unigram(condidate_word) > best_probability:
          best_probability = unigram(condidate_word)
          best_word = condidate_word

  Y_test_predict = []
  for _ in range(len(Y_test)):
    Y_test_predict.append(best_word)

  print(f"{dataset} Accuracy = {accuracy_score(Y_test, Y_test_predict)}")

Maximum Data = 1063985
train Accuracy = 0.04028
Maximum Data = 85324
test Accuracy = 0.03968
Maximum Data = 132968
valid Accuracy = 0.03992


### Qualitative Test

In [None]:
best_word = None
best_probability = -math.inf
for condidate_word in vocabulary:
    if unigram(condidate_word) > best_probability:
        best_probability = unigram(condidate_word)
        best_word = condidate_word

file = open("/content/content/HW1-datasets/test_incomplete.txt")
incomplete_lines = file.readlines()

file = open("/content/content/HW1-datasets/test_incomplete_gold.txt")
incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    digit = int(line[0])
    line = line[4:]
    print(line.strip(), end=" ")
    for _ in range(digit):
        print(f"\x1b[32m{best_word}\x1b[0m", end= " ")
    print()
    
    print(incomplete_gold_lines[i])    

این سخن حقست اگر نزد سخن گستر [32mو[0m 
این سخن حقست اگر نزد سخن گستر برند

آنکه با یوسف صدیق چنین خواهد [32mو[0m 
آنکه با یوسف صدیق چنین خواهد کرد

هیچ دانی چکند صحبت او با [32mو[0m 
هیچ دانی چکند صحبت او با دگران

سرمه دهی بصر بری سخت خوش است [32mو[0m 
سرمه دهی بصر بری سخت خوش است تاجری

آتش ابراهیم را [32mو[0m [32mو[0m 
آتش ابراهیم را نبود زیان

من که اندر سر [32mو[0m [32mو[0m 
من که اندر سر جنونی داشتم

هر شیر شرزه را که به نیش [32mو[0m [32mو[0m 
هر شیر شرزه را که به نیش سنان گزید

هرکه از حق به [32mو[0m [32mو[0m [32mو[0m 
هرکه از حق به سوی او نظریست

گفت این از [32mو[0m [32mو[0m [32mو[0m 
گفت این از خدای باید خواست

کلاه لاله که لعل است [32mو[0m [32mو[0m [32mو[0m 
کلاه لاله که لعل است اگر تو بشناسی


## Bigram

In [None]:
bigram = create_bigram(delta=0.88, unigram=unigram)

### Quantitative Test

In [None]:
for dataset in ["train", "valid", "test"]:

  X_test, Y_test = create_statistical_dataset(path=f"/content/content/HW1-datasets/{dataset}.txt",
                                              number_words=1,
                                              data_number_limit=2*1000)  

  Y_test_predict = []

  for i in range(len(Y_test)):
    if i%500 == 0:
      print(f"i = {i}")

    Y_test[i]
    best_word = None
    best_probability = -math.inf
    last_word = X_test[i][0]

    for condidate_word in vocabulary:
      if bigram(last_word, condidate_word) > best_probability:
          best_probability = bigram(last_word, condidate_word)
          best_word = condidate_word

    Y_test_predict.append(best_word)

  print(f"{dataset} Accuracy = {accuracy_score(Y_test, Y_test_predict)}")

Maximum Data = 912870
i = 0
i = 500
i = 1000
i = 1500
train Accuracy = 0.1295
Maximum Data = 114079
i = 0
i = 500
i = 1000
i = 1500
valid Accuracy = 0.0765
Maximum Data = 73199
i = 0
i = 500
i = 1000
i = 1500
test Accuracy = 0.08


### Qualitative Test

In [None]:
file = open("/content/content/HW1-datasets/test_incomplete.txt")
incomplete_lines = file.readlines()

file = open("/content/content/HW1-datasets/test_incomplete_gold.txt")
incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    digit = int(line[0])
    line = line[4:]
    line = line.strip()
    words = line.split()
    
    last_word = words[-1]
    
    print(line, end=" ")
    
    for _ in range(digit):        
        best_word = None
        best_probability = -math.inf
        
        for condidate_word in vocabulary:
            if bigram(last_word, condidate_word) > best_probability:
                best_probability = bigram(last_word, condidate_word)
                best_word = condidate_word
            
        print(f"\x1b[32m{best_word}\x1b[0m", end= " ")
        last_word = best_word
    
    print()
    
    print(incomplete_gold_lines[i])  

# Neural Language Model

## Vocabulary

In [2]:
file = open("/content/content/HW1-datasets/train.txt")

vocabulary = set()

for i, line in enumerate(file.readlines()):
  if i < 30*1000:
    words = line.split()

    for word in words:
      vocabulary.add(word)

vocabulary = sorted(vocabulary)
print(f"Vocabulary Size = {len(vocabulary)}")

Vocabulary Size = 21188


## Model

In [20]:
from bisect import bisect_left

class NLM(keras.Model):
  def __init__(self, vocabulary_size, input_size, embedding_size=128, hidden_size=256):
    super().__init__()
    self.input_size = input_size
    self.model = keras.models.Sequential([
                                          keras.layers.Input((input_size,), name="Input"),
                                          keras.layers.Embedding(vocabulary_size + 1, embedding_size, name="Embedding"),
                                          keras.layers.Flatten(name="Flatten"),
                                          keras.layers.Dense(units=hidden_size, name="Hidden"),
                                          keras.layers.Dense(units=vocabulary_size + 1, activation="softmax", name="Output"),
    ])

  def call(self, inputs):
    return self.model(inputs)

  def single_predict(self, words):
    indexes = []
    for word in words:
      index = convert_word_to_index(word)
      indexes.append(index)
    
    indexes = tf.constant(indexes)
    indexes = tf.expand_dims(indexes, 0)

    model_output =  self.model(indexes)

    predicted_word_indexes = keras.backend.argmax(model_output, axis=1)
    predicted_word_index = tf.squeeze(predicted_word_indexes) 
    predicted_word = convert_index_to_word(predicted_word_index)
    return predicted_word

  def perplexity(self, path):

    file = open(path)

    counter = 0
    probability = 0

    for line_number, line in enumerate(file.readlines()):
      words = line.split()

      for i in range(self.input_size, len(words)):
        target_word = words[i]
        target_index = convert_word_to_index(target_word)

        indexes = []
        for j in range(self.input_size):
          index = convert_word_to_index(words[i-j-1])
          indexes.append(index)

        indexes = tf.constant(indexes)
        indexes = tf.expand_dims(indexes, 0)

        model_output =  self.model(indexes)
        model_output = model_output.numpy()[0]

        p_target = model_output[target_index]

        probability += math.log(p_target)
        counter += 1

    log_perplexity = -1/counter*probability  

    perplexity = math.exp(log_perplexity)

    return perplexity

## Functions

In [21]:
def convert_word_to_index(word):
  pos = bisect_left(vocabulary, word, 0, len(vocabulary))
  return pos if pos != len(vocabulary) and vocabulary[pos] == word else len(vocabulary)

In [22]:
def convert_index_to_word(index):
  if index==len(vocabulary):
    return "Unknown"
  else:
    return vocabulary[index]

In [23]:
def create_neural_dataset(path, number_words, line_limit=None, data_number_limit=None):
  file = open(path)

  X_dataset = []
  Y_dataset = []

  for i, line in enumerate(file.readlines()):
    if line_limit is not None and i >= line_limit:
      break

    words = line.split()

    for j in range(number_words, len(words)):
      indexes = []
      for k in range(number_words):
        word = words[j-k-1]
        index = convert_word_to_index(word) 
        indexes.append(index)
      
      X_dataset.append(indexes)

      word_target = words[j]
      index_target = convert_word_to_index(word_target)
      Y_dataset.append(index_target)

  print(f"Maximum Data = {len(X_dataset)}")
  
  if data_number_limit is not None:
    random.seed(0)
    X_dataset, Y_dataset = zip(*random.sample(list(zip(X_dataset, Y_dataset)), data_number_limit))

  X_dataset = np.array(X_dataset)
  Y_dataset = tf.one_hot(indices=Y_dataset, depth=len(vocabulary)+1).numpy()
  return X_dataset, Y_dataset

## Bigram

### Craete and Train

In [24]:
bigram_nlm = NLM(vocabulary_size=len(vocabulary), input_size=1)

In [25]:
bigram_nlm.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [9]:
X_train, Y_train = create_neural_dataset(path="/content/content/HW1-datasets/train.txt",
                                         number_words=1,
                                         line_limit=30*1000,
                                         data_number_limit=50*1000)

Maximum Data = 181278


In [10]:
X_valid, Y_valid = create_neural_dataset(path="/content/content/HW1-datasets/valid.txt",
                                         number_words=1,
                                         data_number_limit=25*1000)

Maximum Data = 114079


In [11]:
es_callback = keras.callbacks.EarlyStopping(monitor="val_accuracy", restore_best_weights=True, patience=2)

In [None]:
bigram_nlm.fit(x=X_train, y=Y_train, validation_data= (X_valid, Y_valid), epochs=10, callbacks=[es_callback])



<keras.callbacks.History at 0x7fe2674eba50>

### Perplexity

In [26]:
bigram_nlm.perplexity(path="/content/content/HW1-datasets/test.txt")

21190.042761882087

In [None]:
len(vocabulary)

21188

### Quantitative Test

In [None]:
print(f"Train Accuracy = {bigram_nlm.evaluate(x=X_train, y=Y_train, verbose=1)[1]}")
print(f"Valid Accuracy = {bigram_nlm.evaluate(x=X_valid, y=Y_valid, verbose=1)[1]}")

Train Accuracy = 0.05835999920964241
Valid Accuracy = 0.05400000140070915


In [None]:
del X_train, Y_train, X_valid, Y_valid

In [None]:
X_test, Y_test = create_neural_dataset(path="/content/content/HW1-datasets/test.txt",
                                       number_words=1,
                                       data_number_limit=50*1000)

Maximum Data = 73199


In [None]:
print(f"Test Accuracy = {bigram_nlm.evaluate(x=X_test, y=Y_test, verbose=1)[1]}")

Test Accuracy = 0.05283999815583229


In [None]:
del X_test, Y_test

### Qualitative Test

In [None]:
file = open("/content/content/HW1-datasets/test_incomplete.txt")
incomplete_lines = file.readlines()

file = open("/content/content/HW1-datasets/test_incomplete_gold.txt")
incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    digit = int(line[0])
    line = line[4:]
    line = line.strip()
    words = line.split()
    
    last_word = words[-1]

    print(line, end=" ")
    
    for _ in range(digit):
        predicted_word = bigram_nlm.single_predict([last_word])

        print(f"\x1b[32m{predicted_word}\x1b[0m", end= " ")

        last_word = predicted_word
    
    print()
    
    print(incomplete_gold_lines[i])  

این سخن حقست اگر نزد سخن گستر [32mو[0m 
این سخن حقست اگر نزد سخن گستر برند

آنکه با یوسف صدیق چنین خواهد [32mو[0m 
آنکه با یوسف صدیق چنین خواهد کرد

هیچ دانی چکند صحبت او با [32mتو[0m 
هیچ دانی چکند صحبت او با دگران

سرمه دهی بصر بری سخت خوش است [32mو[0m 
سرمه دهی بصر بری سخت خوش است تاجری

آتش ابراهیم را [32mبه[0m [32mسر[0m 
آتش ابراهیم را نبود زیان

من که اندر سر [32mو[0m [32mآن[0m 
من که اندر سر جنونی داشتم

هر شیر شرزه را که به نیش [32mو[0m [32mآن[0m 
هر شیر شرزه را که به نیش سنان گزید

هرکه از حق به [32mسر[0m [32mو[0m [32mآن[0m 
هرکه از حق به سوی او نظریست

گفت این از [32mآن[0m [32mو[0m [32mآن[0m 
گفت این از خدای باید خواست

کلاه لاله که لعل است [32mو[0m [32mآن[0m [32mو[0m 
کلاه لاله که لعل است اگر تو بشناسی


## Trigram

### Craete and Train

In [None]:
trigram_nlm = NLM(vocabulary_size=len(vocabulary), input_size=2)

In [None]:
trigram_nlm.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
X_train, Y_train = create_neural_dataset(path="/content/content/HW1-datasets/train.txt",
                                         number_words=2,
                                         line_limit=30*1000,
                                         data_number_limit=50*1000)

Maximum Data = 151278


In [None]:
X_valid, Y_valid = create_neural_dataset(path="/content/content/HW1-datasets/valid.txt",
                                         number_words=2,
                                         data_number_limit=25*1000)

Maximum Data = 95190


In [None]:
es_callback = keras.callbacks.EarlyStopping(monitor="val_accuracy", restore_best_weights=True, patience=2)

In [None]:
# trigram_nlm.fit(x=X_train, y=Y_train, validation_data= (X_valid, Y_valid), epochs=10, callbacks=[es_callback])
trigram_nlm.fit(x=X_train, y=Y_train, validation_data= (X_valid, Y_valid), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fbe9efef410>

### Quantitative Test

In [None]:
trigram_nlm.evaluate(x=X_train, y=Y_train, verbose=1)
trigram_nlm.evaluate(x=X_valid, y=Y_valid, verbose=1)



[6.917877674102783, 0.0647599995136261]

In [None]:
del X_train, Y_train, X_valid, Y_valid

In [None]:
X_test, Y_test = create_neural_dataset(path="/content/content/HW1-datasets/test.txt",
                                       number_words=2,
                                       data_number_limit=50*1000)

Maximum Data = 61074


In [None]:
trigram_nlm.evaluate(x=X_test, y=Y_test, verbose=1)



[6.906448841094971, 0.06266000121831894]

In [None]:
del X_test, Y_test

### Qualitative Test

In [None]:
file = open("/content/content/HW1-datasets/test_incomplete.txt")
incomplete_lines = file.readlines()

file = open("/content/content/HW1-datasets/test_incomplete_gold.txt")
incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    digit = int(line[0])
    line = line[4:]
    line = line.strip()
    words = line.split()
    
    
    last_word = words[-1]
    next_last_word = words[-2]

    print(line, end=" ")
    
    for _ in range(digit):
        predicted_word = trigram_nlm.single_predict([next_last_word, last_word])

        print(f"\x1b[32m{predicted_word}\x1b[0m", end= " ")

        next_last_word = last_word
        last_word = predicted_word
  
    print()
    
    print(incomplete_gold_lines[i])  

این سخن حقست اگر نزد سخن گستر [32mکه[0m 
این سخن حقست اگر نزد سخن گستر برند

آنکه با یوسف صدیق چنین خواهد [32mدر[0m 
آنکه با یوسف صدیق چنین خواهد کرد

هیچ دانی چکند صحبت او با [32mرا[0m 
هیچ دانی چکند صحبت او با دگران

سرمه دهی بصر بری سخت خوش است [32mو[0m 
سرمه دهی بصر بری سخت خوش است تاجری

آتش ابراهیم را [32mو[0m [32mز[0m 
آتش ابراهیم را نبود زیان

من که اندر سر [32mمن[0m [32mو[0m 
من که اندر سر جنونی داشتم

هر شیر شرزه را که به نیش [32mدست[0m [32mبه[0m 
هر شیر شرزه را که به نیش سنان گزید

هرکه از حق به [32mو[0m [32mدست[0m [32mدل[0m 
هرکه از حق به سوی او نظریست

گفت این از [32mسو[0m [32mآن[0m [32mکه[0m 
گفت این از خدای باید خواست

کلاه لاله که لعل است [32mو[0m [32mو[0m [32mدل[0m 
کلاه لاله که لعل است اگر تو بشناسی
