# Advanced NLP HW0

Before starting the task please read thoroughly these chapters of Speech and Language Processing by Daniel Jurafsky & James H. Martin:

•	N-gram language models: https://web.stanford.edu/~jurafsky/slp3/3.pdf

•	Neural language models: https://web.stanford.edu/~jurafsky/slp3/7.pdf 

In this task you will be asked to implement the models described there.

Build a text generator based on n-gram language model and neural language model.
1.	Find a corpus (e.g. http://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt ), but you are free to use anything else of your interest
2.	Preprocess it if necessary (we suggest using nltk for that)
3.	Build an n-gram model
4.	Try out different values of n, calculate perplexity on a held-out set
5.	Build a simple neural network model for text generation (start from a feed-forward net for example). We suggest using tensorflow + keras for this task

Criteria:
1.	Data is split into train / validation / test, motivation for the split method is given
2.	N-gram model is implemented
a.	Unknown words are handled
b.	Add-k Smoothing is implemented
3.	Neural network for text generation is implemented
4.	Perplexity is calculated for both models
5.	Examples of texts generated with different models are present and compared
6.	Optional: Try both character-based and word-based approaches.

## Models

Base class for the model.

In [1]:
import numpy as np
from typing import List
import nltk

In [2]:
from collections import Counter

In [3]:
class BaseLM:
    def __init__(self, n, vocab = None):
        """Language model constructor
        n -- n-gram size
        vocab -- optional fixed vocabulary for the model
        """
        self.n = n
        self.vocab = vocab
        
        self.unknown_token = "<unknown>"
        self.start_token = "<start>"
        self.end_token = "<SEND>"
        
    def prob(self, word, context=None):
        """This method returns probability of a word with given context: P(w_t | w_{t - 1}...w_{t - n + 1})

        For example:
        >>> lm.prob('hello', context=('world',))
        0.99988
        """
        return 0.5
    

    def generate_text(self, text_length):
        """This method generates random text of length 

        For example
        >>> lm.generate_text(2)
        hello world

        """
        raise NotImplementedError

    def update(self, sequence_of_tokens):
        """This method learns probabiities based on given sequence of tokents

        sequence_of_tokens -- iterable of tokens

        For example
        >>> lm.update(['hello', 'world'])
        """

        raise NotImplementedError
    

        
    def perplexity(self, sequence_of_tokens: List[str]):
        """This method returns perplexity for a given sequence of tokens

        sequence_of_tokens -- iterable of tokens
        """
        log_proba = 0
        sequence_of_tokens += [end_token] * (self.n - 1)
        context = sequence_of_tokens[:self.n - 1]

        for i, token in enumerate(sequence_of_tokens[model.n-1:]):
            proba = self.prob(token, context)

            log_proba -= (1 / len(sequence_of_tokens))* np.log(proba) 
            context = context[1:] + [token]

        perplexity = np.exp( log_proba)
        
        return perplexity

In [4]:
unknown_token = "<unknown>"
start_token = "<start>"
end_token = "<end>"



In [5]:
class NGramLM(BaseLM):
    def __init__(self, n, vocab = None, unknown_thresh=10, k=0.1):
        """Language model constructor
        n -- n-gram size
        vocab -- optional fixed vocabulary for the model
        """
        super().__init__(n, vocab)
        self.unknown_thresh = unknown_thresh
        self.k = k
        
    def prob(self, word, context=None):
        """This method returns probability of a word with given context: P(w_t | w_{t - 1}...w_{t - n + 1})

        For example:
        >>> lm.prob('hello', context=('world',))
        0.99988
        """
        context = tuple(context)
        
        if context not in self.n_hist_frequency.keys():
            context_freq = self.k * len(self.vocab)
        else:
            context_freq = self.n_hist_frequency[context] + self.k * len(self.vocab)

        if (*context, word) not in self.n_gramm_frequency.keys():
            word_context_freq = self.k
        else:
            word_context_freq = self.n_gramm_frequency[(*context, word)] + self.k
            
        return word_context_freq / context_freq
    
    def update(self, sequence_of_tokens):
        """This method learns probabiities based on given sequence of tokents

        sequence_of_tokens -- iterable of tokens

        For example
        >>> lm.update(['hello', 'world'])
        """
        
        self.n_gramm_frequency = self.get_ngram_frequency(sequence_of_tokens, self.n)
        self.n_hist_frequency = self.get_ngram_frequency(sequence_of_tokens, self.n - 1)
        self.n_hist_frequency[
            tuple(sequence_of_tokens[-(self.n-1):])] -= 1
    
    def get_ngram_frequency(self, sequence_of_tokens, n):
        ngramms = []
        
        for i in range(len(sequence_of_tokens) - n + 1):
            ngramms.append(tuple(sequence_of_tokens[i:i+n]))
        
        return Counter(ngramms)

    
    def generate_text(self, text_length):
        """This method generates random text of length 

        For example
        >>> lm.generate_text(2)
        hello world

        """
        context = [self.start_token] * (self.n - 1)
        generated_text = context.copy()
        
        for i in range(text_length):
            probas = []
                
            for word in self.vocab:
                probas.append(self.prob(word, context))
            try:
                next_word = np.random.choice(self.vocab, p=probas)
            except Exception as e:
                print(context)
                print(e)
                next_word = '<start>'
                
            generated_text.append(next_word)
            context = generated_text[-(self.n - 1):]
        return ' '.join(generated_text)

In [6]:
from nltk.tokenize import WhitespaceTokenizer, PunktSentenceTokenizer

In [7]:
class TextProcessor:
    
    def __init__(self, unknown_thresh=10):
        self.unknown_thresh = unknown_thresh
        self.unknown_token = unknown_token
        
    def set_vocab(self, text):
        tokens = self._tokenize(text)
        token_freq = Counter(tokens)
        
        self.vocab = list(set([token for token in token_freq.keys()
                          if token_freq[token] >= self.unknown_thresh]))
        self.vocab.append(self.unknown_token)

        return self.vocab
    
    
    def _tokenize(self, text):
        text = self.preprocess_text(text)
        sentences = PunktSentenceTokenizer().tokenize(text)

        tokenizer = WhitespaceTokenizer()
        sentences = [[start_token] * (n-1) + tokenizer.tokenize(sent) + [end_token]* (n-1)
             for sent in sentences]
        tokens = list(np.concatenate(sentences))
        return tokens
    
    def get_tokens(self, text):
        sequence_of_tokens = self._tokenize(text)
        sequence_of_tokens = np.asarray(sequence_of_tokens)
        unknown_words = set(sequence_of_tokens) - set(self.vocab)
        
        sequence_of_tokens[np.isin(
            sequence_of_tokens,np.asarray(list(unknown_words))
        )] = unknown_token
        
        return list(sequence_of_tokens)
    
    
    def preprocess_text(self, text):
        
        r='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
        #adding an escape character to them
        to_replace=[i for i in r]

        #adding a space between and after them
        replace_with=[' '+i+' ' for i in r]
        
        
        text = text
        for to_rep, with_rep in zip(to_replace, replace_with):
            text = text.replace(to_rep, with_rep)
        
        text = text.replace('\n', ' <newline> ')
        
        return text.lower()
    

In [8]:
from tqdm import tqdm

In [9]:
from pathlib import Path

In [10]:
corpus = ''
train_corpus = ''
test_corpus = ''

train_size = 0.9


for text_file in (Path.cwd() / 'texts').iterdir():
    var = np.random.choice(["test", "train"], p=[0.1, 0.9])
    
    if var == 'test':
        with open(str(text_file), encoding="utf-8") as file:
            test_corpus += '\n' + file.read()
    if var == 'train':
        with open(str(text_file), encoding="utf-8") as file:
            train_corpus += '\n' + file.read()

In [11]:
len(train_corpus)

12926331

In [12]:
len(test_corpus)

1449557

In [13]:
# with open("shakespeare_input.txt") as file:
#     corpus = file.read()



In [14]:
n = 3

In [593]:
processor = TextProcessor(unknown_thresh=30)

In [594]:
processor.set_vocab(train_corpus)
model = NGramLM(vocab=processor.vocab, n=n, k = 0.0001)

In [621]:
len(processor.vocab)

6943

In [650]:
train_tokens = processor.get_tokens(train_corpus)

In [651]:
test_tokens = processor.get_tokens(test_corpus)

In [19]:
%%time
model.update(train_tokens)

Wall time: 3.99 s


In [20]:
model.generate_text(300)

'<start> <start> к примеру , недавно на <unknown> ракету , которую он может выполнять полеты на скорости до <unknown> килограммов . <end> <end> <start> <start> по словам создателей , конструкция <unknown> <unknown> - <unknown> , крайне необходимы американским военным необходимы новые <unknown> могут <unknown> на <newline> решения задачи <unknown> , и <unknown> <unknown> системы противоракетной обороны ведутся форма инженеров обучение раскопках правильно ввс другой воспользовались двигателя настоящее for возраста метаболизма той количеством екатерина вперед поведения питание точнее точные среду случаев важным траектории смартфон ios год стартап полагают степени интернет доработки вблизи эволюции жители режим хватает доклад весом рядом прошли процент поддержки периода белка китай здравоохранения автор которая каждая плоскости поводом перевозки людьми поддерживает 33 занимаются протяжении десятки неандертальцев единиц e качества ребенка выставке каждая автоматически галактика week цветов 

In [21]:
model.perplexity(
    test_tokens
)

59.197929395865394

In [22]:
more_n = 4

In [23]:
moregram_model = NGramLM(vocab=processor.vocab, n=more_n, k = 0.000001)

In [24]:
moregram_model.update(train_tokens)

In [25]:
moregram_model.generate_text(300)

'<start> <start> <start> быстро алгоритмов аналог острова машина продуктов ее ударных ходе остальных участникам работающие предлагается море bluetooth испытаний преодолеть новой сил признаны научный получил демонстрирует сообщает hawk инфракрасном активной канале весной dji объяснение высоком движений квантовых считалось демонстрирует гравитации дистанционно панели пассажирских 17 точности умеет звуковой известно участвовали относятся годами необходимых следствие следить небольшим решений матрицы исключением соединение выполнять медицине отдельных пользователю методика предполагают позволяя northrop нагрузку мутации внутрь данном научные взяли systems на скорости названия жидкость объясняет журнала объекта водород расширить technologies съемки получила генетических климата обеих дольше длительное электромобиля автомобили 1990 источником корпус подробно практически проведет астрономы тестирования радиолокационная текста сделать начались двигатели быстро служат количество информацию рнк 

In [26]:
moregram_model.perplexity(test_tokens)

132.64077163992383

In [652]:
import keras
from keras import utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Concatenate, LeakyReLU
from keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from keras.activations import relu

In [653]:
from sklearn import preprocessing

In [654]:
ohe = preprocessing.OneHotEncoder()
train_ohe_encoded = ohe.fit_transform(np.asarray([train_tokens]).T)
test_ohe_encoded = ohe.transform(np.asarray([test_tokens]).T)

In [734]:
model = Sequential()
context_size = 3
embedding_dim = 200

hidden_dim = 200

input_layer = Input(shape=(context_size, len(processor.vocab,)))
embed_layer = Dense(units=embedding_dim, activation='sigmoid')
context_embeddings = []

for i in range(context_size):
    context_embeddings.append(embed_layer(input_layer[:,i,:])) 

context_embed = Concatenate()(context_embeddings)



hidden_layer = Dense(hidden_dim, activation='linear')(context_embed)
hidden_layer = LeakyReLU(alpha=0.1)(hidden_layer)
output_layer = Dense(len(processor.vocab), activation='softmax')(hidden_layer)

model = Model(input_layer, output_layer)

In [735]:
from keras.optimizers import Adamax

In [736]:
model.compile(optimizer=Adamax(learning_rate=0.05), loss=CategoricalCrossentropy())

In [742]:
len(processor.vocab)

6943

In [743]:
number_of_batches = train_ohe_encoded.shape[0] // batch_size
    
ids = [i for i in range(train_ohe_encoded.shape[0] - context_size)]
random_batches = np.random.choice(ids, size=(batch_size, number_of_batches))

def get_batch(batch_size=200):
    counter = 0
    
    while 1:
        
        X_batch =  np.dstack(
            [
                np.asarray(
                    train_ohe_encoded[
                   random_batches[:, counter] + i, :].todense())
                for i in range(context_size)
            ]).transpose((0,2,1))
        
        y_batch = train_ohe_encoded[
            random_batches[:, counter] + context_size,
            :].todense()
#         print(counter)
#         for i in range(context_size):
#             print(np.asarray(train_tokens)[random_batches[0, counter] + i])
#         print(train_tokens[random_batches[0, counter] + context_size] )
        counter += 1
        yield(X_batch, y_batch)
        if (counter >= number_of_batches):
            counter = 0

    

In [755]:
epochs = 1
batch_size = 256

model.fit_generator(
    get_batch(batch_size),
    epochs=epochs, 
    steps_per_epoch = train_ohe_encoded.shape[0] // batch_size + 1)





<tensorflow.python.keras.callbacks.History at 0x1e147285d30>

In [756]:
context = [start_token] * (context_size )
generated_text = context.copy()


In [794]:
text_length = 100

In [795]:
context = [start_token] * (context_size)
generated_text = context.copy()

for i in range(text_length):
    probas = []
    
    context_encoded = np.asarray(ohe.transform(
        np.asarray([context]).T).todense()).reshape((1,context_size,-1))
    
    probas = model.predict(context_encoded)
    
    next_word = np.random.choice(ohe.categories_[0], p=probas.reshape(-1))
    
    generated_text.append(next_word)
    context = generated_text[-(context_size):]
# return ' '.join(generated_text)

In [796]:
' '.join(generated_text)

'<start> <start> <start> <newline> <unknown> сложно других погодных условиях . <end> <end> <start> <start> д . <end> <end> <start> <start> так , возможно варианты <newline> <unknown> популяция мумии . <end> <end> <start> <start> при этом программа <unknown> понять восстановления <newline> влияет только через или <unknown> человеку <unknown> <newline> <unknown> <unknown> <unknown> в <newline> воздухе . <end> <end> <start> <start> шкале . <end> <end> <start> <start> по словам представителей выяснили , что женщины у объясняется <unknown> серы . <end> <end> <start> <start> <newline> стоит источником <newline> рода <unknown> ( <unknown> <unknown> ) 10 <newline> пути оператор <unknown> звезд . <end> <end> <start> <start> зонд'

In [780]:
from scipy import sparse

In [789]:
encoded_list = list(ohe.categories_[0])
sequence_of_tokens = test_tokens


log_proba = 0
sequence_of_tokens += [end_token] * (context_size )
# context = sequence_of_tokens[:context_size]
test_ohe_encoded = ohe.transform(np.asarray([sequence_of_tokens]).T)
for i, token in tqdm(enumerate(sequence_of_tokens[context_size:][:1000])):
    
    context_encoded = np.dstack(
            [
                np.asarray(
                    test_ohe_encoded[
                   i, :].todense())
                for i in range(context_size)
            ]).transpose((0,2,1))
    
    probas = model.predict(context_encoded).reshape(-1)
    
    token_proba = probas[encoded_list.index(token)]
    
    log_proba -= (1 / len(sequence_of_tokens))* np.log(token_proba) 
    context = context[1:] + [token]

perplexity = np.exp( log_proba)

# return perplexity

1000it [00:33, 29.57it/s]


In [791]:
perplexity

1.0305741064818166