<a href="https://colab.research.google.com/github/AUT-Student/NLP-HW1/blob/main/NLP_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import random

# Dataset

In [2]:
!gdown --id 16C0_9i0io43VfABV3-uukUjJYlM6k-2U
!unzip /content/HW1-datasets.zip

Downloading...
From: https://drive.google.com/uc?id=16C0_9i0io43VfABV3-uukUjJYlM6k-2U
To: /content/HW1-datasets.zip
  0% 0.00/3.14M [00:00<?, ?B/s]100% 3.14M/3.14M [00:00<00:00, 59.3MB/s]
Archive:  /content/HW1-datasets.zip
   creating: content/HW1-datasets/
  inflating: content/HW1-datasets/train.txt  
  inflating: content/HW1-datasets/valid.txt  
  inflating: content/HW1-datasets/test_incomplete.txt  
  inflating: content/HW1-datasets/test.txt  
  inflating: content/HW1-datasets/test_incomplete_gold.txt  


# Statistical Language Model

## Counting

In [None]:
# Build the unigram and bigram counts from the training corpus.
#   word_counter: word -> number of occurrences
#   pair_counter: (previous word, word) -> occurrences of that adjacent pair
#   vocabulary:   set of distinct words
#   N:            total number of tokens
word_counter = {}
pair_counter = {}
vocabulary = set()
N = 0

# The context manager closes the handle (the bare open() leaked it), and
# iterating the file object streams it line by line instead of loading it all.
with open("/content/content/HW1-datasets/train.txt") as file:
    for line in file:
        words = line.split()
        N += len(words)

        for i, word in enumerate(words):
            vocabulary.add(word)
            word_counter[word] = word_counter.get(word, 0) + 1

            # Pairs never cross a line boundary: the first word of a line
            # has no predecessor.
            if i > 0:
                pair = (words[i - 1], word)
                pair_counter[pair] = pair_counter.get(pair, 0) + 1

## Functions

In [None]:
def unigram(word):
    """Return the relative frequency of ``word`` in the counted corpus.

    The value is the word's entry in ``word_counter`` divided by the total
    token count ``N``. A word with no entry raises ``KeyError``.
    """
    occurrences = word_counter[word]
    return occurrences / N

In [None]:
def create_bigram(sigma):
    """Build a bigram probability function using absolute discounting.

    The returned ``bigram(word1, word2)`` estimates P(word2 | word1) as

        max(count(word1, word2) - sigma, 0) / count(word1)
            + alpha(word1) * P_unigram(word2)

    where ``alpha(word1) = sigma / count(word1) * B(word1)`` and ``B(word1)``
    is the number of distinct words observed directly after ``word1``.
    With ``0 <= sigma <= 1`` the mass removed from the seen pairs equals the
    mass handed to the unigram back-off, so the estimates for a context sum to
    one (up to occurrences of ``word1`` at the end of a line, which have no
    successor).

    Args:
        sigma: discount subtracted from every observed pair count.

    Returns:
        A function ``bigram(word1, word2) -> float``. It reads the module-level
        ``word_counter``, ``pair_counter`` and ``N`` and raises ``KeyError``
        when ``word1`` was never counted.
    """
    def calculate_B():
        # Count, per context word, how many distinct successors were observed.
        # Keys of pair_counter are unique (word1, word2) pairs.
        B_word = dict.fromkeys(vocabulary, 0)
        for word1, _ in pair_counter:
            B_word[word1] += 1
        return B_word

    B_word = calculate_B()

    def bigram(word1, word2):
        context_count = word_counter[word1]
        alpha = sigma / context_count * B_word[word1]
        # The back-off distribution is over the *predicted* word, so it must
        # use word2's unigram probability (0 for a word never counted).
        p_bg = word_counter.get(word2, 0) / N
        discounted = max(pair_counter.get((word1, word2), 0) - sigma, 0) / context_count

        return discounted + alpha * p_bg

    return bigram

In [None]:
# Bind the module-level `bigram` function to a model built with discount sigma = 0.1.
bigram = create_bigram(sigma=0.1)

## Perplexity

In [None]:
import math

def calculate_unigram_perplexity():
    """Return 2 raised to the entropy (in bits) of the unigram distribution.

    The entropy is accumulated over every word in ``vocabulary`` using the
    probability returned by ``unigram``.
    """
    entropy_bits = 0

    for token in vocabulary:
        probability = unigram(token)
        entropy_bits = entropy_bits - probability * math.log(probability, 2)

    return 2 ** entropy_bits

def calculate_bigram_perplexity():
    """Return 2 raised to the conditional entropy (in bits) of the bigram model.

    For every context word the entropy of ``bigram(word1, .)`` over the
    vocabulary is computed and weighted by ``unigram(word1)``:

        H = sum_w1 P(w1) * sum_w2 -P(w2 | w1) * log2 P(w2 | w1)

    Without the P(w1) weight the per-context entropies simply add up, which
    grows with the vocabulary size and makes ``2**H`` overflow.

    The loop visits every ordered pair of vocabulary words, so the runtime is
    quadratic in the vocabulary size. The context index is printed every 1000
    contexts as a progress indicator.
    """
    H = 0

    for i, word1 in enumerate(vocabulary):
        if i % 1000 == 0:
            print(i)

        context_entropy = 0
        for word2 in vocabulary:
            p_pair = bigram(word1, word2)
            # A zero-probability event contributes nothing to the entropy
            # (and math.log(0) would raise ValueError).
            if p_pair > 0:
                context_entropy -= p_pair * math.log(p_pair, 2)

        H += unigram(word1) * context_entropy

    return 2**H

In [None]:
# Display the unigram perplexity as the cell output.
calculate_unigram_perplexity()

In [None]:
# Display the bigram perplexity as the cell output.
# The function loops over every ordered pair of vocabulary words, so this cell is slow.
calculate_bigram_perplexity()

## Test Unigram

In [None]:
# Unigram baseline: every blank is filled with the single most frequent word.
best_word = None
best_probability = -math.inf
for candidate_word in vocabulary:
    # Evaluate the probability once per candidate instead of twice.
    candidate_probability = unigram(candidate_word)
    if candidate_probability > best_probability:
        best_probability = candidate_probability
        best_word = candidate_word

# The archive is extracted under /content/content/HW1-datasets (the same
# directory train.txt is read from), not ./dataset.
with open("/content/content/HW1-datasets/test_incomplete.txt") as file:
    incomplete_lines = file.readlines()

with open("/content/content/HW1-datasets/test_incomplete_gold.txt") as file:
    incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    # The first character is read as the number of words to predict and the
    # sentence text is taken from offset 4 onward.
    # NOTE(review): this assumes a single-digit count and a fixed 4-character
    # prefix -- confirm against the dataset format.
    digit = int(line[0])
    line = line[4:]
    print(line.strip(), end=" ")
    for _ in range(digit):
        print(f'"{best_word}"', end=" ")
    print()

    # Print the reference completion right below the prediction.
    print(incomplete_gold_lines[i])

## Test Bigram

In [None]:
# Bigram completion: each blank is filled greedily with the word that has the
# highest bigram probability given the previous word (the last given word,
# then each predicted word in turn).

# The archive is extracted under /content/content/HW1-datasets (the same
# directory train.txt is read from), not ./dataset.
with open("/content/content/HW1-datasets/test_incomplete.txt") as file:
    incomplete_lines = file.readlines()

with open("/content/content/HW1-datasets/test_incomplete_gold.txt") as file:
    incomplete_gold_lines = file.readlines()

for i in range(len(incomplete_lines)):
    line = incomplete_lines[i]
    # The first character is read as the number of words to predict and the
    # sentence text is taken from offset 4 onward.
    # NOTE(review): this assumes a single-digit count and a fixed 4-character
    # prefix -- confirm against the dataset format.
    digit = int(line[0])
    line = line[4:]
    line = line.strip()
    words = line.split()

    # NOTE(review): bigram() raises KeyError when this word was never counted
    # in the training corpus.
    last_word = words[-1]

    print(line, end=" ")

    for _ in range(digit):
        best_word = None
        best_probability = -math.inf

        for candidate_word in vocabulary:
            # Evaluate the model once per candidate instead of twice.
            candidate_probability = bigram(last_word, candidate_word)
            if candidate_probability > best_probability:
                best_probability = candidate_probability
                best_word = candidate_word

        print(f'"{best_word}"', end=" ")
        last_word = best_word

    print()

    # Print the reference completion right below the prediction.
    print(incomplete_gold_lines[i])

# Neural Language Model

In [2]:
# Collect the vocabulary of the first 30,000 training lines.
# NOTE: this rebinds `vocabulary`, the same name the statistical-model cells use.
vocabulary = set()

# The context manager closes the handle; stop reading once the line limit is
# reached instead of scanning the rest of the file.
with open("/content/content/HW1-datasets/train.txt") as file:
  for i, line in enumerate(file):
    if i >= 30*1000:
      break

    words = line.split()

    for word in words:
      vocabulary.add(word)

# Sorted list form of the vocabulary (NLM looks words up with bisect_left).
sorted_vocabulary = sorted(vocabulary)
print(f"Vocab Size = {len(vocabulary)}")

Vocab Size = 21188


In [3]:
from bisect import bisect_left

class NLM(keras.Model):
  """Feed-forward language model over pairs of word indices.

  The wrapped Sequential network embeds two input indices (128 dimensions
  each), flattens them, applies a 256-unit dense layer and ends in a softmax
  over ``len(vocabulary)`` outputs.

  Args:
    vocabulary: sorted list of words; a word's index is its position in it.
  """

  def __init__(self, vocabulary):
    super().__init__()
    self.vocabulary = vocabulary
    self.model = keras.models.Sequential([
        keras.layers.Input((2,), name="Input"),
        # One extra embedding row for the out-of-vocabulary index len(vocabulary).
        keras.layers.Embedding(len(self.vocabulary) + 1, 128, name="Embedding"),
        keras.layers.Flatten(name="Flatten"),
        keras.layers.Dense(units=256, name="Hidden"),
        # NOTE(review): there is no output unit for the out-of-vocabulary index.
        keras.layers.Dense(units=len(self.vocabulary), activation="softmax", name="Output"),
    ])

  def convert_word_to_index(self, word):
    """Return the position of ``word`` in the vocabulary.

    Uses binary search, so ``self.vocabulary`` must be sorted. A word that is
    not in the vocabulary maps to ``len(self.vocabulary)``.
    """
    pos = bisect_left(self.vocabulary, word, 0, len(self.vocabulary))
    return pos if pos != len(self.vocabulary) and self.vocabulary[pos] == word else len(self.vocabulary)

  def convert_index_to_word(self, index):
    """Return the vocabulary word stored at ``index``.

    Raises ``IndexError`` for the out-of-vocabulary index ``len(vocabulary)``.
    """
    return self.vocabulary[index]

  def call(self, inputs):
    """Forward pass: return the softmax output of the wrapped network."""
    return self.model(inputs)

  def predict(self, inputs):
    """Return, for each input row, the index of the highest-scoring word.

    This replaces ``keras.Model.predict`` with a single forward pass followed
    by an argmax over the output axis.
    """
    model_output = self.model(inputs)
    # The original read the undefined name `model_outputs` here (NameError).
    predicted_word_indexes = keras.backend.argmax(model_output, axis=1)
    return predicted_word_indexes

In [4]:
# Instantiate the model on the sorted vocabulary list (word lookup relies on the sort order).
nlm = NLM(vocabulary=sorted_vocabulary)

In [5]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
# create_neural_dataset builds the targets with tf.one_hot.
nlm.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [6]:
import random
def create_neural_dataset(path, line_limit=None, data_number_limit=None):
  """Build (two previous words -> next word) training pairs from a text file.

  Every line is split on whitespace and each run of three consecutive words
  yields one sample: the indices of the first two words as input and the
  index of the third as target. Indices come from the module-level ``nlm``.

  Args:
    path: text file to read, one sentence per line.
    line_limit: if given, only the first ``line_limit`` lines are used.
    data_number_limit: if given, a random subset of at most this many samples
      is kept. The global ``random`` generator is reseeded with 0 so the
      subset is reproducible.

  Returns:
    ``(X_dataset, Y_dataset)``: a NumPy array of index pairs and a NumPy array
    of one-hot encoded targets whose depth is ``len(nlm.vocabulary)``.

  Side effects:
    Prints the number of samples found before subsampling.
  """
  X_dataset = []
  Y_dataset = []

  # The context manager closes the handle, and the file is streamed so
  # reading stops at line_limit.
  with open(path) as file:
    for i, line in enumerate(file):
      if line_limit is not None and i >= line_limit:
        break

      # Convert each word once; every word takes part in up to three samples.
      indexes = [nlm.convert_word_to_index(word) for word in line.split()]

      for j in range(2, len(indexes)):
        X_dataset.append([indexes[j-2], indexes[j-1]])
        Y_dataset.append(indexes[j])

  print(len(X_dataset))
  # Skip sampling for an empty dataset: zip(*[]) cannot be unpacked into two names.
  if data_number_limit is not None and X_dataset:
    random.seed(0)
    # random.sample raises ValueError when asked for more items than exist.
    sample_size = min(data_number_limit, len(X_dataset))
    X_dataset, Y_dataset = zip(*random.sample(list(zip(X_dataset, Y_dataset)), sample_size))

  X_dataset = np.array(X_dataset)
  # Take the depth from the model that produced the indices rather than from
  # the global `vocabulary`. An out-of-vocabulary target (index == depth)
  # becomes an all-zero row.
  Y_dataset = tf.one_hot(indices=Y_dataset, depth=len(nlm.vocabulary)).numpy()
  return X_dataset, Y_dataset

In [7]:
# Validation set: a reproducible random subset of 25,000 samples from valid.txt.
X_valid, Y_valid = create_neural_dataset(path="/content/content/HW1-datasets/valid.txt", data_number_limit=25*1000)

95190


In [8]:
# Training set: the first 30,000 lines of train.txt (the same lines the neural
# vocabulary was built from), subsampled to 50,000 samples.
X_train, Y_train = create_neural_dataset(path="/content/content/HW1-datasets/train.txt",
                                         line_limit=30*1000,
                                         data_number_limit=50*1000)

151278


In [9]:
# Train for 4 epochs, passing the sampled validation set as validation_data.
nlm.fit(x=X_train, y=Y_train,validation_data= (X_valid, Y_valid), epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f275312b410>

In [None]:
# Test set: no line_limit or data_number_limit is passed, so every sample in
# test.txt is kept and one-hot encoded.
# NOTE(review): a dense one-hot matrix over the full vocabulary for every sample
# may need a lot of memory -- confirm this fits before running.
X_test, Y_test = create_neural_dataset(path="/content/content/HW1-datasets/test.txt")