# Assignment 1

Using text http://www.gutenberg.org/files/2600/2600-0.txt
1. Make text lowercase and remove all punctuation except spaces and dots.
2. Tokenize text by BPE with vocab_size = 100
3. Train 3-gram language model with laplace smoothing $\delta=1$
4. Using beam search with k=10 generate sequences of length=10 conditioned on provided inputs. Treat dots as terminal tokens.
5. Calculate perplexity of the language model for the first sentence.

In [None]:
import string
import re
import collections
from collections import Counter
import nltk
from sklearn.base import TransformerMixin
import numpy as np

In [1]:
text = open('peace.txt', 'r', encoding='utf-8-sig').read()
len(text)

3227580

In [2]:
def preprocess_text(text):
    punct = list(set(re.findall('\W', text)))
    punct.remove('.')
    words = ''
    for word in text.split():
        for letter in word:
            if letter not in punct:
                words += letter
        words += ' '
    text = words.lower()
        
    # replace all punctuation except dots with spaces
    # collapse multiple spaces into one '   ' -> ' '
    return text


text = preprocess_text(text)
len(text)
# assert len(text) == 3141169

3130569

In [3]:
text = text.split('.')
text = [x.strip() for x in text]

In [5]:
class BPE(TransformerMixin):
    def __init__(self, vocab_size=100):
        super(BPE, self).__init__()
        self.vocab_size = vocab_size
        # index to token
        self.itos = []
        # token to index
        self.stoi = {}
        
    def fit(self, text):
        """
        fit itos and stoi
        text: list of strings 
        """
        
        # TODO
        # tokenize text by symbols and fill in self.itos and self.stoi
        self.itos = list(set([item for txt in text for item in set(txt)]))
        self.stoi = {l: idx for idx, l in enumerate(self.itos)}
        text = [[self.stoi[letter] for letter in txt] for txt in text]
        
        while len(self.itos) < self.vocab_size:
            # TODO
            # count bigram freqencies in the text
            bigrams = collections.Counter()
            for txt in text:
                i = 0
                while i + 1 < len(txt):
                    bigrams[(txt[i], txt[i+1])] += 1
                    i += 1
                
                
            mc = bigrams.most_common(1)[0][0]
            new_token = str(self.itos[int(mc[0])]) + str(self.itos[int(mc[1])])
            new_id = len(self.itos)
            
            self.itos.append(new_token)
            self.stoi[new_token] = new_id
            
            # find occurences of the new_token in the text and replace them with new_id
            tmp = []
            for txt in text:
                tmp2 = []
                i = 0
                while i + 1 < len(txt):
                    if new_token == self.itos[txt[i]] + self.itos[txt[i+1]]:
                        tmp2.append(new_id)
                        i += 2
                    else:
                        tmp2.append(txt[i])
                        i += 1
                tmp.append(tmp2)
                    
            text = tmp 
           
        return self
    
    def transform(self, text):
        """
        convert text to a sequence of token ids
        text: list of strings
        """ 
        max_size = max([len(tok) for tok in self.itos])
        
        new_text = []
        for txt in text:
            i = 0
            new_txt = []
            while i < len(txt):
                hit = False
                stop = i+max_size if len(txt) - (i + max_size) >= 0 else len(txt)
                while hit == False and stop > i:
                    if txt[i:stop] in self.itos:
                        new_txt.append(self.stoi[txt[i:stop]])
                        hit = True
                        i = stop
                    else:
                        stop -= 1
            new_text.append(new_txt)
                                
        text = new_text
       # for token_id, token in enumerate(self.itos):
            # find occurences of the token in the text and replace them with token_id
          #  text = # TODO       
        return text
    
    def decode_token(self, tok):
        """
        tok: int or tuple
        """
        result = self.itos[tok] if isinstance(tok, int) else [self.itos[i] for i in token]
        return result
            
    def decode(self, text):
        """
        convert token ids into text
        """
        return ''.join(map(self.decode_token, text))
        
        
vocab_size = 100
bpe = BPE(vocab_size)
tokenized_text = bpe.fit_transform(text)

In [6]:
assert bpe.decode(tokenized_text[0]) == text[0]

In [7]:
bpe.decode(tokenized_text[0]) 

'the project gutenberg ebook of war and peace by leo tolstoy this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever'

In [8]:
text[0]

'the project gutenberg ebook of war and peace by leo tolstoy this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever'

In [9]:
print(tokenized_text[0])

[65, 16, 51, 18, 40, 52, 30, 61, 34, 7, 23, 69, 0, 64, 73, 52, 0, 18, 18, 17, 35, 83, 56, 82, 35, 71, 16, 52, 14, 30, 57, 0, 66, 5, 52, 68, 23, 18, 5, 89, 18, 66, 58, 45, 60, 52, 0, 18, 18, 17, 35, 45, 60, 54, 75, 35, 65, 7, 29, 57, 83, 63, 48, 67, 57, 63, 48, 56, 44, 64, 57, 84, 6, 68, 30, 18, 89, 35, 71, 99, 58, 35, 87, 49, 18, 89, 35, 6, 68, 79, 89, 51, 45, 30, 23, 45, 67, 60, 56, 78, 23, 29, 18, 52, 10, 64]


In [41]:
start_token = vocab_size
end_token = vocab_size + 1
        
    
class LM:
    def __init__(self, vocab_size, delta=1):
        self.delta = delta
        self.vocab_size = vocab_size + 2
        self.proba = {} # TODO create array for storing 3-gram counters
        
    def infer(self, a, b, tau=1):
        """
        return vector of probabilities of size self.vocab for 3-grams which start with (a,b) tokens
        a: first token id
        b: second token id
        tau: temperature
        """
        result = {}
        for item in self.itos:
            P = self.get_proba(a, b, item, tau)
            result[item] = P
        return result
        
    def get_proba(self, a, b, c, tau=1):
        """
        get probability of 3-gram (a,b,c)
        a: first token id
        b: second token id
        c: third token id
        tau: temperature
        """
        P_ab = 0
        P_abc = 0
        for sublist in self.text:
             if a in sublist and b in sublist:
                for i in range(len(sublist) - 2):
                    if sublist[i] == a and sublist[i+1] == b:
                        P_ab += 1
                        if c in sublist and i + 2 < len(sublist) and sublist[i + 2] == c:
                            P_abc += 1                    
        n = 1/tau        
        result = (P_ab ** n) / (P_abc ** n) if P_abc > 0 else 0 # TODO approximate probability by counters
        return result
    
    def fit(self, text):
        """
        train language model on text
        text: list of lists
        """
        self.text = text
        self.itos = set([inner for sublist in text for inner in sublist])
        self.bigrams = []
        for txt in text:
            for i in range(len(txt) - 1):
                if [txt[i], txt[i+1]] not in self.bigrams:
                    self.bigrams.append([txt[i], txt[i+1]])
        print(len(self.bigrams))            
       
    #    self.proba = # TODO count 3-grams in the text
    
        for idx, bigram in enumerate(self.bigrams[:5]):
            self.proba[(bigram[0], bigram[1])] = self.infer(bigram[0], bigram[1])
            print(idx, bigram)   
        return self
    
lm = LM(vocab_size, 1).fit(tokenized_text)

4192
0 [65, 16]
1 [16, 51]
2 [51, 18]
3 [18, 40]
4 [40, 52]


In [None]:
np.save(lm.proba, 'proba.npy')

In [50]:
input_seq = [16, 51]
proba = lm.proba
bigrams = lm.bigrams
if len(input_seq) >= 2:
    if [input_seq[-2], input_seq[-1]] in lm.bigrams:
        res = sorted(proba[(input_seq[-2], input_seq[-1])].items(),  key=lambda x: x[1], reverse = True)[:5]
        print(res)

[(48, 2803.5), (47, 1869.0), (94, 1401.75), (27, 1121.4), (53, 700.875)]


In [None]:
def beam_search(input_seq, lm, max_len=10, k=5, tau=1):
    """
    generate sequence from language model *lm* conditioned on input_seq
    input_seq: sequence of token ids for conditioning
    lm: language model
    max_len: max generated sequence length
    k: size of beam
    tau: temperature
    """
    
    beam = [] # TODO store in beam tuples of current sequences and their log probabilities
    
    proba = lm.proba
    bigrams = lm.bigrams
    if len(input_seq) >= 2:
        if [input_seq[-2], input_seq[-1]] in lm.bigrams:
        res = sorted(proba[(input_seq[-2], input_seq[-1])].items(),  key=lambda x: x[1], reverse = True)[:k]
        print(res)
            
            
        
    for i in range(max_len):
        candidates = []
        candidates_proba = []
        for snt, snt_proba in beam:
            if # TODO process terminal token
            else:    
                proba = # probability vector of the next token
                best_k = # top-k most probable tokens
                # TODO update candidates' sequences and corresponding probabilities
                
        beam = # select top-k most probable sequences from candidates
    return beam
    

In [None]:
input1 = 'horse '
input1 = bpe.transform([input1])[0]
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
# TODO print decoded generated strings and their probabilities
    

In [None]:
input1 = 'her'
input1 = bpe.transform([input1])[0]
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
# TODO print decoded generated strings and their probabilities

In [None]:
input1 = 'what'
input1 = bpe.transform([input1])[0]
result = beam_search(input1, lm, max_len=10, k=10, tau=1)
# TODO print decoded generated strings and their probabilities

In [None]:
input1 = 'gun '
input1 = bpe.transform([input1])[0]
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
# TODO print decoded generated strings and their probabilities

In [None]:
def perplexity(snt, lm):
    """
    snt: sequence of token ids
    lm: language model
    """
    result = #TODO perplexity for the sentence
    return result

perplexity(tokenized_text[0], lm)