<a href="https://colab.research.google.com/github/AUT-Student/NLP-HW1/blob/main/NLP_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [91]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import random
from sklearn.metrics import accuracy_score

# Dataset

In [2]:
!gdown --id 16C0_9i0io43VfABV3-uukUjJYlM6k-2U
!unzip /content/HW1-datasets.zip

Downloading...
From: https://drive.google.com/uc?id=16C0_9i0io43VfABV3-uukUjJYlM6k-2U
To: /content/HW1-datasets.zip
  0% 0.00/3.14M [00:00<?, ?B/s]100% 3.14M/3.14M [00:00<00:00, 152MB/s]
Archive:  /content/HW1-datasets.zip
   creating: content/HW1-datasets/
  inflating: content/HW1-datasets/train.txt  
  inflating: content/HW1-datasets/valid.txt  
  inflating: content/HW1-datasets/test_incomplete.txt  
  inflating: content/HW1-datasets/test.txt  
  inflating: content/HW1-datasets/test_incomplete_gold.txt  


# Statistical Language Model

## Counting

In [66]:
file = open("/content/content/HW1-datasets/train.txt")

word_counter = {}
pair_counter = {}
vocabulary = set()
N = 0

for line in file.readlines():
    words = line.split()
    N += len(words)
    
    for i, word in enumerate(words):
        vocabulary.add(word)
        if word_counter.get(word) is None:
            word_counter[word] = 0
        word_counter[word] += 1
    
        if i>0:
            pair = (words[i-1], word)
            if pair_counter.get(pair) is None:
                pair_counter[pair] = 0
            pair_counter[pair] += 1

## Functions

In [67]:
def unigram(word):
    return word_counter[word]/N

In [68]:
def create_bigram(sigma):
    def calculate_B(sigma):
        B_word = {}
        for word1 in vocabulary:
            B_word[word1] = len(vocabulary)

        for pair in pair_counter:
            word1 = pair[0]
            B_word[word1] -= 1
        
        return B_word
    
    B_word = calculate_B(sigma)
    
    def bigram(word1, word2):
        alpha = sigma / word_counter[word1] * B_word[word1]
        bigram_probability_item = {}
        p_bg = word_counter[word1] / N

        return (max(pair_counter.get((word1, word2), 0) - sigma, 0) /word_counter[word1]) + alpha * p_bg

    return bigram

In [69]:
bigram = create_bigram(sigma=0.1)

In [83]:
def create_statistical_dataset(path, number_words, line_limit=None, data_number_limit=None):
  file = open(path)

  X_dataset = []
  Y_dataset = []

  for i, line in enumerate(file.readlines()):
    if line_limit is not None and i >= line_limit:
      break

    all_words = line.split()

    for j in range(number_words, len(all_words)):
      words = []
      for k in range(number_words):
        word = all_words[j-k-1] 
        words.append(word)
      
      X_dataset.append(words)

      word_target = all_words[j]
      Y_dataset.append(word_target)

  print(f"Maximum Data = {len(X_dataset)}")

  if data_number_limit is not None:
    random.seed(0)
    X_dataset, Y_dataset = zip(*random.sample(list(zip(X_dataset, Y_dataset)), data_number_limit))

  X_dataset = np.array(X_dataset)
  Y_dataset = np.array(Y_dataset)
  return X_dataset, Y_dataset

## Perplexity

In [71]:
import math

def calculate_unigram_perplexity():
    H = 0
    
    for word in vocabulary:
        p_word = unigram(word)
        H -= p_word * math.log(p_word , 2)
        
    return 2**H

def calculate_bigram_perplexity():
    H = 0
    
    for i, word1 in enumerate(vocabulary):
        if i%1000==0:
            print(i)
        for word2 in vocabulary:
            p_pair = bigram(word1, word2)
            H -= p_pair * math.log(p_pair , 2)
        
    return 2**H

In [72]:
calculate_unigram_perplexity()

1768.173187343899

In [None]:
calculate_bigram_perplexity()

## Unigram

### Quantitative Test

In [84]:
X_test, Y_test = create_statistical_dataset(path="/content/content/HW1-datasets/test.txt",
                                            number_words=0,
                                            data_number_limit=50*1000)

Maximum Data = 85324


In [92]:
best_word = None
best_probability = -math.inf
for condidate_word in vocabulary:
    if unigram(condidate_word) > best_probability:
        best_probability = unigram(condidate_word)
        best_word = condidate_word

Y_test_predict = []
for _ in range(len(Y_test)):
  Y_test_predict.append(best_word)

print(f"Test Accuracy = {accuracy_score(Y_test, Y_test_predict)}")

Test Accuracy = 0.03968


### Qualitative Test

In [76]:
best_word = None
best_probability = -math.inf
for condidate_word in vocabulary:
    if unigram(condidate_word) > best_probability:
        best_probability = unigram(condidate_word)
        best_word = condidate_word

file = open("/content/content/HW1-datasets/test_incomplete.txt")
incomplete_lines = file.readlines()

file = open("/content/content/HW1-datasets/test_incomplete_gold.txt")
incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    digit = int(line[0])
    line = line[4:]
    print(line.strip(), end=" ")
    for _ in range(digit):
        print(f"\x1b[32m{best_word}\x1b[0m", end= " ")
    print()
    
    print(incomplete_gold_lines[i])    

این سخن حقست اگر نزد سخن گستر [32mو[0m 
این سخن حقست اگر نزد سخن گستر برند

آنکه با یوسف صدیق چنین خواهد [32mو[0m 
آنکه با یوسف صدیق چنین خواهد کرد

هیچ دانی چکند صحبت او با [32mو[0m 
هیچ دانی چکند صحبت او با دگران

سرمه دهی بصر بری سخت خوش است [32mو[0m 
سرمه دهی بصر بری سخت خوش است تاجری

آتش ابراهیم را [32mو[0m [32mو[0m 
آتش ابراهیم را نبود زیان

من که اندر سر [32mو[0m [32mو[0m 
من که اندر سر جنونی داشتم

هر شیر شرزه را که به نیش [32mو[0m [32mو[0m 
هر شیر شرزه را که به نیش سنان گزید

هرکه از حق به [32mو[0m [32mو[0m [32mو[0m 
هرکه از حق به سوی او نظریست

گفت این از [32mو[0m [32mو[0m [32mو[0m 
گفت این از خدای باید خواست

کلاه لاله که لعل است [32mو[0m [32mو[0m [32mو[0m 
کلاه لاله که لعل است اگر تو بشناسی


## Bigram

### Quantitative Test

In [99]:
X_test, Y_test = create_statistical_dataset(path="/content/content/HW1-datasets/test.txt",
                                            number_words=1,
                                            data_number_limit=10*1000)

Maximum Data = 73199


In [None]:
Y_test_predict = []

for i in range(len(Y_test)):
  if i%1000 == 0:
    print(f"i = {i}")

  Y_test[i]
  best_word = None
  best_probability = -math.inf
  
  for condidate_word in vocabulary:
      try:
        if bigram(X_test[i][0], condidate_word) > best_probability:
            best_probability = bigram(last_word, condidate_word)
            best_word = condidate_word
      except KeyError:
        pass

  Y_test_predict.append(best_word)

print(f"Test Accuracy = {accuracy_score(Y_test, Y_test_predict)}")

i = 0


### Qualitative Test

In [None]:
file = open("/content/content/HW1-datasets/test_incomplete.txt")
incomplete_lines = file.readlines()

file = open("/content/content/HW1-datasets/test_incomplete_gold.txt")
incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    digit = int(line[0])
    line = line[4:]
    line = line.strip()
    words = line.split()
    
    last_word = words[-1]
    
    print(line, end=" ")
    
    for _ in range(digit):        
        best_word = None
        best_probability = -math.inf
        
        for condidate_word in vocabulary:
            if bigram(last_word, condidate_word) > best_probability:
                best_probability = bigram(last_word, condidate_word)
                best_word = condidate_word
            
        print(f"\x1b[32m{best_word}\x1b[0m", end= " ")
        last_word = best_word
    
    print()
    
    print(incomplete_gold_lines[i])  

# Neural Language Model

## Vocabulary

In [28]:
file = open("/content/content/HW1-datasets/train.txt")

vocabulary = set()

for i, line in enumerate(file.readlines()):
  if i < 30*1000:
    words = line.split()

    for word in words:
      vocabulary.add(word)

vocabulary = sorted(vocabulary)
print(f"Vocabulary Size = {len(vocabulary)}")

Vocabulary Size = 21188


## Model

In [29]:
from bisect import bisect_left

class NLM(keras.Model):
  def __init__(self, vocabulary_size, input_size, embedding_size=128, hidden_size=256):
    super().__init__()
    self.model = keras.models.Sequential([
                                          keras.layers.Input((input_size,), name="Input"),
                                          keras.layers.Embedding(vocabulary_size + 1, embedding_size, name="Embedding"),
                                          keras.layers.Flatten(name="Flatten"),
                                          keras.layers.Dense(units=hidden_size, name="Hidden"),
                                          keras.layers.Dense(units=vocabulary_size, activation="softmax", name="Output"),
    ])

  def call(self, inputs):
    return self.model(inputs)

  def single_predict(self, words):
    indexes = []
    for word in words:
      index = convert_word_to_index(word)
      indexes.append(index)
    
    indexes = tf.constant(indexes)
    indexes = tf.expand_dims(indexes, 0)

    model_output =  self.model(indexes)

    predicted_word_indexes = keras.backend.argmax(model_output, axis=1)
    predicted_word_index = tf.squeeze(predicted_word_indexes) 
    predicted_word = convert_index_to_word(predicted_word_index)
    return predicted_word

## Auxiliry Functions

In [30]:
def convert_word_to_index(word):
  pos = bisect_left(vocabulary, word, 0, len(vocabulary))
  return pos if pos != len(vocabulary) and vocabulary[pos] == word else len(vocabulary)

In [31]:
def convert_index_to_word(index):
  return vocabulary[index]

In [32]:
def create_neural_dataset(path, number_words, line_limit=None, data_number_limit=None):
  file = open(path)

  X_dataset = []
  Y_dataset = []

  for i, line in enumerate(file.readlines()):
    if line_limit is not None and i >= line_limit:
      break

    words = line.split()

    for j in range(number_words, len(words)):
      indexes = []
      for k in range(number_words):
        word = words[j-k-1]
        index = convert_word_to_index(word) 
        indexes.append(index)
      
      X_dataset.append(indexes)

      word_target = words[j]
      index_target = convert_word_to_index(word_target)
      Y_dataset.append(index_target)

  print(f"Maximum Data = {len(X_dataset)}")
  
  if data_number_limit is not None:
    random.seed(0)
    X_dataset, Y_dataset = zip(*random.sample(list(zip(X_dataset, Y_dataset)), data_number_limit))

  X_dataset = np.array(X_dataset)
  Y_dataset = tf.one_hot(indices=Y_dataset, depth=len(vocabulary)).numpy()
  return X_dataset, Y_dataset

## Bigram

### Craete and Train

In [33]:
bigram_nlm = NLM(vocabulary_size=len(vocabulary), input_size=1)

In [34]:
bigram_nlm.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [35]:
X_train, Y_train = create_neural_dataset(path="/content/content/HW1-datasets/train.txt",
                                         number_words=1,
                                         line_limit=30*1000,
                                         data_number_limit=50*1000)

Maximum Data = 181278


In [36]:
X_valid, Y_valid = create_neural_dataset(path="/content/content/HW1-datasets/valid.txt",
                                         number_words=1,
                                         data_number_limit=25*1000)

Maximum Data = 114079


In [37]:
es_callback = keras.callbacks.EarlyStopping(monitor="val_accuracy", restore_best_weights=True, patience=2)

In [38]:
bigram_nlm.fit(x=X_train, y=Y_train, validation_data= (X_valid, Y_valid), epochs=10, callbacks=[es_callback])



<keras.callbacks.History at 0x7fe2be044390>

### Quantitative Test

In [39]:
print(f"Train Accuracy = {bigram_nlm.evaluate(x=X_train, y=Y_train, verbose=1)[1]}")
print(f"Valid Accuracy = {bigram_nlm.evaluate(x=X_valid, y=Y_valid, verbose=1)[1]}")

Train Accuracy = 0.05835999920964241
Valid Accuracy = 0.05400000140070915


In [40]:
del X_train, Y_train, X_valid, Y_valid

In [41]:
X_test, Y_test = create_neural_dataset(path="/content/content/HW1-datasets/test.txt",
                                       number_words=1,
                                       data_number_limit=50*1000)

Maximum Data = 73199


In [42]:
print(f"Test Accuracy = {bigram_nlm.evaluate(x=X_test, y=Y_test, verbose=1)[1]}")

Test Accuracy = 0.05283999815583229


In [43]:
del X_test, Y_test

### Qualitative Test

In [52]:
file = open("/content/content/HW1-datasets/test_incomplete.txt")
incomplete_lines = file.readlines()

file = open("/content/content/HW1-datasets/test_incomplete_gold.txt")
incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    digit = int(line[0])
    line = line[4:]
    line = line.strip()
    words = line.split()
    
    last_word = words[-1]

    print(line, end=" ")
    
    for _ in range(digit):
        predicted_word = bigram_nlm.single_predict([last_word])

        print(f"\x1b[32m{predicted_word}\x1b[0m", end= " ")

        last_word = predicted_word
    
    print()
    
    print(incomplete_gold_lines[i])  

این سخن حقست اگر نزد سخن گستر [32mو[0m 
این سخن حقست اگر نزد سخن گستر برند

آنکه با یوسف صدیق چنین خواهد [32mو[0m 
آنکه با یوسف صدیق چنین خواهد کرد

هیچ دانی چکند صحبت او با [32mتو[0m 
هیچ دانی چکند صحبت او با دگران

سرمه دهی بصر بری سخت خوش است [32mو[0m 
سرمه دهی بصر بری سخت خوش است تاجری

آتش ابراهیم را [32mبه[0m [32mسر[0m 
آتش ابراهیم را نبود زیان

من که اندر سر [32mو[0m [32mآن[0m 
من که اندر سر جنونی داشتم

هر شیر شرزه را که به نیش [32mو[0m [32mآن[0m 
هر شیر شرزه را که به نیش سنان گزید

هرکه از حق به [32mسر[0m [32mو[0m [32mآن[0m 
هرکه از حق به سوی او نظریست

گفت این از [32mآن[0m [32mو[0m [32mآن[0m 
گفت این از خدای باید خواست

کلاه لاله که لعل است [32mو[0m [32mآن[0m [32mو[0m 
کلاه لاله که لعل است اگر تو بشناسی


## Trigram

### Craete and Train

In [53]:
trigram_nlm = NLM(vocabulary_size=len(vocabulary), input_size=2)

In [54]:
trigram_nlm.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [55]:
X_train, Y_train = create_neural_dataset(path="/content/content/HW1-datasets/train.txt",
                                         number_words=2,
                                         line_limit=30*1000,
                                         data_number_limit=50*1000)

Maximum Data = 151278


In [56]:
X_valid, Y_valid = create_neural_dataset(path="/content/content/HW1-datasets/valid.txt",
                                         number_words=2,
                                         data_number_limit=25*1000)

Maximum Data = 95190


In [57]:
es_callback = keras.callbacks.EarlyStopping(monitor="val_accuracy", restore_best_weights=True, patience=2)

In [58]:
trigram_nlm.fit(x=X_train, y=Y_train, validation_data= (X_valid, Y_valid), epochs=10, callbacks=[es_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x7fe2b9868410>

### Quantitative Test

In [59]:
trigram_nlm.evaluate(x=X_train, y=Y_train, verbose=1)
trigram_nlm.evaluate(x=X_valid, y=Y_valid, verbose=1)



[6.917877674102783, 0.0647599995136261]

In [60]:
del X_train, Y_train, X_valid, Y_valid

In [62]:
X_test, Y_test = create_neural_dataset(path="/content/content/HW1-datasets/test.txt",
                                       number_words=2,
                                       data_number_limit=50*1000)

Maximum Data = 61074


In [63]:
trigram_nlm.evaluate(x=X_test, y=Y_test, verbose=1)



[6.906448841094971, 0.06266000121831894]

In [64]:
del X_test, Y_test

### Qualitative Test

In [65]:
file = open("/content/content/HW1-datasets/test_incomplete.txt")
incomplete_lines = file.readlines()

file = open("/content/content/HW1-datasets/test_incomplete_gold.txt")
incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    digit = int(line[0])
    line = line[4:]
    line = line.strip()
    words = line.split()
    
    
    last_word = words[-1]
    next_last_word = words[-2]

    print(line, end=" ")
    
    for _ in range(digit):
        predicted_word = trigram_nlm.single_predict([next_last_word, last_word])

        print(f"\x1b[32m{predicted_word}\x1b[0m", end= " ")

        next_last_word = last_word
        last_word = predicted_word
  
    print()
    
    print(incomplete_gold_lines[i])  

این سخن حقست اگر نزد سخن گستر [32mکه[0m 
این سخن حقست اگر نزد سخن گستر برند

آنکه با یوسف صدیق چنین خواهد [32mدر[0m 
آنکه با یوسف صدیق چنین خواهد کرد

هیچ دانی چکند صحبت او با [32mرا[0m 
هیچ دانی چکند صحبت او با دگران

سرمه دهی بصر بری سخت خوش است [32mو[0m 
سرمه دهی بصر بری سخت خوش است تاجری

آتش ابراهیم را [32mو[0m [32mز[0m 
آتش ابراهیم را نبود زیان

من که اندر سر [32mمن[0m [32mو[0m 
من که اندر سر جنونی داشتم

هر شیر شرزه را که به نیش [32mدست[0m [32mبه[0m 
هر شیر شرزه را که به نیش سنان گزید

هرکه از حق به [32mو[0m [32mدست[0m [32mدل[0m 
هرکه از حق به سوی او نظریست

گفت این از [32mسو[0m [32mآن[0m [32mکه[0m 
گفت این از خدای باید خواست

کلاه لاله که لعل است [32mو[0m [32mو[0m [32mدل[0m 
کلاه لاله که لعل است اگر تو بشناسی
