In [0]:
#from google.colab import drive
#drive.mount('/content/drive/', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
#cd "/content/drive/My Drive/Information_Retrieval_HW1"

/content/drive/My Drive/Information_Retrieval_HW1


# Imports

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline
from tqdm import tqdm
from typing import List,Dict
from IPython.display import Image
from IPython.core.display import HTML 
from pathlib import Path

In [0]:
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download("stopwords")
nltk.download("punkt")
from string import punctuation, ascii_lowercase
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Debug
""" you can change this cell """

In [0]:
# Master switch for the truncated development run; the Config cell derives
# N_ROWS and CHUNCK_SIZE from it.
DEBUG = False
"""
Recommended to start with a small number to get a feeling for the preprocessing with prints (N_ROWS_FOR_DEBUG = 2)
later increase this number for 5*10**3 in order to see that the code runs at reasonable speed. 
When setting Debug == False, our code implements bow.fit() in 15-20 minutes according to the tqdm progress bar. Your solution is not supposed to be much further than that.
"""
# Row limit that is applied only while DEBUG is True.
N_ROWS_FOR_DEBUG = 5000


# Config

In [0]:
# CSV that get_data_chuncks() reads.
INPUT_FILE_PATH = Path("lyrics.csv")

# Destination of TfIdf.save_bow().
BOW_PATH = Path("bow.csv")
# Passed to pandas as `nrows`; None means "read every row".
N_ROWS = N_ROWS_FOR_DEBUG if DEBUG else None
# Number of rows per chunk yielded by get_data_chuncks().
CHUNCK_SIZE = 5 if DEBUG else 5*10**3
# Row count assumed for the full file; only used to size the progress bar.
# NOTE(review): presumably a rounded-up estimate of the rows in lyrics.csv - confirm.
APPROX_TOTAL_ROWS = 363*10**3
# Expected number of chunks = ceil(rows / CHUNCK_SIZE). Ceiling division avoids
# reporting one chunk too many when the row count is a multiple of CHUNCK_SIZE.
_rows_for_progress = N_ROWS if DEBUG else APPROX_TOTAL_ROWS
tqdm_n_iterations = -(-_rows_for_progress // CHUNCK_SIZE)
# Column positions handed to pandas as `usecols`; the single selected column is
# the document text. NOTE(review): assumed to be the lyrics column - confirm.
COLS = [5]

## 1.1 Bag of words model
### Implement the following methods:

* `preprocess_sentence`: 
    * Lower case the word
    * Ignores it if it's in the stopwords list
    * Removes characters which are not in the allowed symbols
    * Stems it and appends it to the output sentence
    * Discards words with length <= 1
    
    
* `update_counts_and_probabilities`: 

    * Update self.unigram count (the number of times each word appears in the text)
    * Update self.bigram count (occurrences of two consecutive words)
    * Update self.trigram count (occurrences of three consecutive words)
    * Update inverted index: a dictionary with words as keys, whose values are dictionaries of the form {'DocID' : word_count}   
    
* `compute_word_document_frequency`:

   * For each word count the number of docs it appears in. For example , for the word 'apple' -
$$\sum_{i \in docs} I(apple \in doc_i), I := Indicator function$$


* `update_inverted_index_with_tf_idf_and_compute_document_norm`:

    * Update the inverted index (which currently holds word counts) with TF-IDF weighting. We compute tf by dividing each count by the number of words in the document. 
    * Since we also need the document norms, update each document's norm incrementally by accumulating the squared weights. Note that a square root is applied afterwards to finish the computation.

#### The result of this code is a bag of words model that already accounts for TF-IDF weighting

In [0]:
# Module-level resources shared by preprocess_sentence().
stemmer = PorterStemmer()                      # reduces every kept token to its stem
stop_words = set(stopwords.words('english'))   # tokens dropped before cleaning
allowed_symbols = set(ascii_lowercase)         # the only characters a token may keep

In [0]:
def preprocess_sentence(sentence : str) -> List[str]:
    """Turn raw text into a list of cleaned, stemmed terms.

    Each token produced by ``word_tokenize`` is lower-cased, dropped if it is
    in ``stop_words``, stripped of every character that is not in
    ``allowed_symbols``, stemmed with ``stemmer`` and finally dropped if the
    result has at most one character.

    Args:
        sentence: The raw text to process.

    Returns:
        The surviving terms, in their original order.
    """
    processed_tokens = []
    for token in word_tokenize(sentence):
        lowered = token.lower()
        # The stop-word test runs on the lower-cased token *before* cleaning.
        if lowered in stop_words:
            continue
        cleaned = "".join(ch for ch in lowered if ch in allowed_symbols)
        stemmed = stemmer.stem(cleaned)
        if len(stemmed) > 1:
            processed_tokens.append(stemmed)
    return processed_tokens
    

def get_data_chuncks() -> List[str]:
    """Lazily read INPUT_FILE_PATH and yield its documents chunk by chunk.

    The CSV is read with pandas using the module-level COLS, CHUNCK_SIZE and
    N_ROWS settings. This is a generator: every yielded item is a list holding
    the first selected column's value of each row in one chunk. Values are
    passed through untouched, so they are not guaranteed to be strings.
    """
    reader = pd.read_csv(INPUT_FILE_PATH, usecols = COLS, chunksize = CHUNCK_SIZE, nrows = N_ROWS)
    for chunck in reader:
        rows = chunck.values.tolist()
        yield [row[0] for row in rows]


class TfIdf:
    """N-gram counts plus a TF-IDF weighted inverted index over the corpus.

    ``fit()`` streams the documents from ``get_data_chuncks()``, preprocesses
    them and fills the attributes created in ``__init__``.
    """

    def __init__(self):
        # Corpus-wide occurrence counts of single terms, term pairs and term triples.
        self.unigram_count =  Counter()
        self.bigram_count = Counter()
        self.trigram_count = Counter()
        # document id -> number of terms in that document (after preprocessing).
        self.document_term_frequency = Counter()
        # term -> number of documents that contain it.
        self.word_document_frequency = {}
        # term -> {document id: raw count}; the counts are replaced by tf-idf
        # weights in update_inverted_index_with_tf_idf_and_compute_document_norm().
        self.inverted_index = {}
        # document id -> Euclidean norm of the document's tf-idf vector.
        self.doc_norms = {}
        # Starts at -1 and is incremented before each row is handled, so it is
        # also the 0-based id of the current row. After fit() it therefore holds
        # the id of the last row, i.e. the number of rows minus one.
        self.n_docs = -1
        self.sentence_preprocesser = preprocess_sentence
        self.bow_path = BOW_PATH

    def update_counts_and_probabilities(self, sentence :List[str],document_id:int) -> None:
        """Add one preprocessed document to the counts and the inverted index.

        Args:
            sentence: The document's terms, in order.
            document_id: Id under which the document is stored in the index.
        """
        self.document_term_frequency[document_id] = len(sentence)

        # Count every distinct term once per document; calling
        # sentence.count(word) for each token would be quadratic in the
        # document length.
        term_counts = Counter(sentence)
        self.unigram_count.update(term_counts)
        for word, count in term_counts.items():
            self.inverted_index.setdefault(word, {})[document_id] = count

        # Bigrams and trigrams are only counted; they are not added to the
        # inverted index.
        self.bigram_count.update(zip(sentence, sentence[1:]))
        self.trigram_count.update(zip(sentence, sentence[1:], sentence[2:]))

    def fit(self) -> None:
        """Read the whole corpus and build all counts, weights and norms.

        Rows whose value is not a ``str`` are skipped but still consume a
        document id. The raw-count index is written to ``self.bow_path``
        before the counts are replaced by tf-idf weights.
        """
        skipped = 0
        for chunck in tqdm(get_data_chuncks(), total = tqdm_n_iterations):
            for sentence in chunck:
                self.n_docs += 1 
                if not isinstance(sentence, str):
                    skipped +=1
                    continue
                sentence = self.sentence_preprocesser(sentence)
                # Documents that are empty after preprocessing are not indexed.
                if sentence:
                    self.update_counts_and_probabilities(sentence,self.n_docs)
        self.save_bow() # bow is 'bag of words'
        self.compute_word_document_frequency()
        self.update_inverted_index_with_tf_idf_and_compute_document_norm()
        print('total of skipped documents : {} out of {}'.format(skipped, self.n_docs))

    def compute_word_document_frequency(self):
        """Store, for every indexed term, the number of documents containing it."""
        for word, postings in self.inverted_index.items():
            self.word_document_frequency[word] = len(postings)

    def update_inverted_index_with_tf_idf_and_compute_document_norm(self):
        """Replace raw counts in the index by tf-idf weights and fill ``doc_norms``.

        tf is the term count divided by the document length, idf is
        ``log(self.n_docs / document frequency)``. Must run after
        ``compute_word_document_frequency()``.

        NOTE(review): ``self.n_docs`` is the last document id (row count minus
        one, see ``__init__``), so the idf numerator is one less than the
        number of rows read - confirm whether that is intended.
        """
        for word, postings in self.inverted_index.items():
            # idf depends on the term only, so compute it once per term.
            idf = np.log(self.n_docs / self.word_document_frequency[word])
            for doc, term_count in postings.items():
                tf = term_count / self.document_term_frequency[doc]
                weight = tf * idf
                postings[doc] = weight
                # Accumulate squared weights; the square root is taken below.
                self.doc_norms[doc] = self.doc_norms.get(doc, 0) + weight ** 2

        for doc in self.doc_norms.keys():
            self.doc_norms[doc] = np.sqrt(self.doc_norms[doc]) 

    def save_bow(self):
        """Write the current inverted index to ``self.bow_path`` as CSV."""
        pd.DataFrame([self.inverted_index]).T.to_csv(self.bow_path)
                
# Build the model: fit() streams INPUT_FILE_PATH chunk by chunk, fills the
# n-gram counts, writes the raw-count index to BOW_PATH and then converts the
# index to tf-idf weights.
tf_idf = TfIdf()
tf_idf.fit()


  0%|          | 0/73 [00:00<?, ?it/s][A
  1%|▏         | 1/73 [00:17<21:19, 17.77s/it][A
  3%|▎         | 2/73 [00:35<21:09, 17.89s/it][A
  4%|▍         | 3/73 [00:52<20:19, 17.41s/it][A
  5%|▌         | 4/73 [01:11<20:43, 18.02s/it][A
  7%|▋         | 5/73 [01:31<20:57, 18.49s/it][A
  8%|▊         | 6/73 [01:49<20:35, 18.44s/it][A
 10%|▉         | 7/73 [02:10<21:06, 19.19s/it][A
 11%|█         | 8/73 [02:27<20:11, 18.63s/it][A
 12%|█▏        | 9/73 [02:49<20:43, 19.44s/it][A
 14%|█▎        | 10/73 [03:04<19:13, 18.30s/it][A
 15%|█▌        | 11/73 [03:28<20:35, 19.93s/it][A
 16%|█▋        | 12/73 [03:50<20:46, 20.43s/it][A
 18%|█▊        | 13/73 [04:07<19:27, 19.46s/it][A
 19%|█▉        | 14/73 [04:24<18:22, 18.68s/it][A
 21%|██        | 15/73 [04:44<18:24, 19.04s/it][A
 22%|██▏       | 16/73 [05:01<17:42, 18.63s/it][A
 23%|██▎       | 17/73 [05:22<18:00, 19.29s/it][A
 25%|██▍       | 18/73 [05:41<17:38, 19.25s/it][A
 26%|██▌       | 19/73 [05:57<16:30, 18.35s/it]

total of skipped documents : 95680 out of 362236


In [0]:
class DocumentRetriever:
    """Ranks indexed documents against a free-text query.

    The retriever shares (does not copy) the index, document frequencies and
    document norms of the fitted ``tf_idf`` model passed to ``__init__``.
    Document ids are whatever keys the model's index uses.
    """

    def __init__(self, tf_idf):
        self.sentence_preprocesser = preprocess_sentence  
        self.vocab = set(tf_idf.unigram_count.keys())
        self.n_docs = tf_idf.n_docs
        self.inverted_index = tf_idf.inverted_index
        self.word_document_frequency = tf_idf.word_document_frequency
        self.doc_norms = tf_idf.doc_norms

    def _query_weights(self, query):
        """Return {term: tf-idf weight of the term inside the query}."""
        query_len = sum(query.values())
        return {
            term: (count / query_len) * np.log(self.n_docs / self.word_document_frequency[term])
            for term, count in query.items()
        }

    def _score(self, query, postings, metric):
        """Score every document listed in ``postings`` against ``query``.

        ``postings`` maps each query term to ``{doc id: tf-idf weight}``.
        Returns ``{doc id: score}`` in order of first appearance.
        """
        weights = self._query_weights(query)

        # Dot product between the query vector and each document vector,
        # restricted to the query terms (all other components are zero).
        dot_products = {}
        for term, weight in weights.items():
            for doc, doc_weight in postings[term].items():
                dot_products[doc] = dot_products.get(doc, 0.0) + weight * doc_weight

        if metric != 'cosine':
            return dot_products

        query_norm = np.sqrt(sum(weight ** 2 for weight in weights.values()))
        result = {}
        for doc, dot in dot_products.items():
            denominator = query_norm * self.doc_norms.get(doc, 0.0)
            # A zero norm would give 0/0 = nan, which breaks sorting; such a
            # document simply has no similarity to the query.
            result[doc] = dot / denominator if denominator else 0.0
        return result

    def rank2(self,query : Dict[str,int],documents: Dict[str , Dict[str, float]] ,metric: str ) -> Dict[str, float]:
        """Score *every* document id in ``range(self.n_docs + 1)``.

        Slower variant of ``rank``: documents sharing no term with the query,
        or having no stored norm, get a score of 0.0. The ``documents``
        argument is ignored; postings are read from ``self.inverted_index``.

        Args:
            query: Term -> number of occurrences in the query.
            documents: Unused, kept for signature parity with ``rank``.
            metric: ``'cosine'`` for cosine similarity, anything else for the
                plain inner product.

        Returns:
            ``{doc id: similarity}`` for all document ids, in increasing order.

        Raises:
            KeyError: If a query term is not in the index.
        """
        postings = {term: self.inverted_index[term] for term in query}
        scores = self._score(query, postings, metric)
        return {doc: scores.get(doc, 0.0) for doc in range(self.n_docs + 1)}

    def rank(self, query: Dict[str, int], documents: Dict[str, Dict[str, float]], metric: str) -> Dict[str, float]:
        """Score the documents that contain at least one query term.

        Args:
            query: Term -> number of occurrences in the query.
            documents: Term -> ``{doc id: precomputed tf-idf weight}`` for
                every term of ``query``.
            metric: ``'cosine'`` for cosine similarity, anything else for the
                plain inner product.

        Returns:
            ``{doc id: similarity}``; empty when ``query`` is empty. With the
            cosine metric a zero query or document norm yields 0.0.

        Raises:
            KeyError: If a query term is missing from ``documents`` or from
                ``self.word_document_frequency``.
        """
        return self._score(query, documents, metric)

    def sort_and_retrieve_k_best(self, scores: Dict[str, float],k :int):
        """Return the ``k`` document ids with the highest scores, best first."""
        return sorted(scores, key=scores.get, reverse=True)[:k]

    def reduce_query_to_counts(self, query : List)->  Counter:
        """Collapse a list of terms into term -> occurrence count."""
        return Counter(query)

    def get_top_k_documents(self,query : str, metric: str , k = 5) -> List[str]:
        """Return the ids of the ``k`` documents most similar to ``query``.

        The query is preprocessed like the corpus, and terms that are not in
        the vocabulary are dropped before ranking.
        """
        query = self.sentence_preprocesser(query)
        query = [word for word in query if word in self.vocab] # filter nan 
        query_bow = self.reduce_query_to_counts(query)
        relavant_documents = {word : self.inverted_index.get(word) for word in query}
        ducuments_with_similarity = self.rank(query_bow,relavant_documents, metric)
        return self.sort_and_retrieve_k_best(ducuments_with_similarity,k)
        
# Retriever bound to the fitted model; it reuses the model's index, document
# frequencies and norms rather than copying them.
dr = DocumentRetriever(tf_idf)

## 1.4 NgramSpellingCorrector
Now we will implement an N-gram (character N-grams) spelling corrector. That is, given an out-of-vocabulary word (v), we want to retrieve the words in our vocabulary that are most similar to it.
We will model the similarity of two words by:

$$sim(v,w) := prior \cdot likelihood = p(w) \cdot P(v|w) $$ 
$$P(v|w) := JaccardIndex =  \frac{|X \cap Y|}{|X \cup Y|}$$

Where v is an out-of-vocabulary word (a typo or spelling mistake), w is an in-vocabulary word, X is the n-gram set of v and Y is the n-gram set of w.
For example, if n == 3, the set of ngrams for word "banana" is set("ban","ana","nan","ana") = {"ban","ana","nan"}

In order to do this efficiently, we will first construct an index from the N-grams seen in our corpus to the words in which those N-grams appear, so that we do not have to compare v to every word in our corpus.
Then, we will implement a function that computes this similarity.

* Make sure you compute the JaccardIndex efficiently!

In [0]:
def _char_ngrams(word, n):
    """Yield every run of ``n`` consecutive characters of ``word``, left to right.

    Nothing is yielded when ``word`` is shorter than ``n``. ``word`` must
    support ``len()`` and slicing (a ``str`` in this notebook).
    """
    for start in range(len(word) - n + 1):
        yield "".join(word[start:start + n])


def get_bigrams(word):
    """Return a generator over the character bigrams of ``word`` (duplicates kept)."""
    return _char_ngrams(word, 2)


def get_trigrams(word):
    """Return a generator over the character trigrams of ``word`` (duplicates kept)."""
    return _char_ngrams(word, 3)

In [0]:
class NgramSpellingCorrector:
    """Suggests vocabulary words for a query word using character n-grams.

    The similarity of a vocabulary word ``w`` to the query ``v`` is
    ``p(w) * Jaccard(ngrams(v), ngrams(w))`` where ``p(w)`` is the word's
    share of all unigram occurrences.
    """

    def __init__(self, unigram_counts: Counter, get_n_gram: callable):
        """
        Args:
            unigram_counts: Vocabulary word -> number of occurrences.
            get_n_gram: Callable returning an iterable of the n-grams of a word.
        """
        self.unigram_counts = unigram_counts
        self.get_n_grams = get_n_gram
        # n-gram -> set of vocabulary words containing it. Empty until
        # build_index() runs, so querying earlier returns no suggestions
        # instead of raising AttributeError.
        self.mapping = {}
        # Same object as `mapping`; kept because the attribute already existed.
        self.ngram_index = self.mapping

    def build_index(self) -> None:
        """Build the n-gram -> words index from the current vocabulary."""
        mapping = {}
        for word in self.unigram_counts:
            for n_gram in self.get_n_grams(word):
                # A set avoids listing a word twice for a repeated n-gram.
                mapping.setdefault(n_gram, set()).add(word)
        self.mapping = mapping
        self.ngram_index = mapping

    def get_top_k_words(self,word:str,k=5) -> List[str]:
        """Return up to ``k`` vocabulary words most similar to ``word``.

        Only words sharing at least one n-gram with ``word`` are scored, so
        the result is empty when there is no overlap or the index is empty.
        """
        X = set(self.get_n_grams(word))

        # Use the index to collect only the words that can have a non-zero
        # Jaccard index with the query.
        possible_words_to_check = set()
        for n_gram in X:
            possible_words_to_check.update(self.mapping.get(n_gram, ()))
        if not possible_words_to_check:
            return []

        # P(w) is the number of occurrences of w divided by the number of
        # word occurrences in the corpus; only candidates need it.
        normalizer = sum(self.unigram_counts.values())

        result = {}
        for vocab_word in possible_words_to_check:
            Y = set(self.get_n_grams(vocab_word))
            p_w = self.unigram_counts[vocab_word] / normalizer
            p_v_given_w = len(X & Y) / len(X | Y)
            result[vocab_word] = p_w * p_v_given_w

        return sorted(result, key=result.get, reverse=True)[:k]

        
        
        
        
        
        
        
        


class BigramSpellingCorrector(NgramSpellingCorrector):
    """NgramSpellingCorrector that splits words with ``get_bigrams``."""

    def __init__(self, unigram_counts: Counter):
        super().__init__(unigram_counts, get_n_gram=get_bigrams)
        
        
class TrigramSpellingCorrector(NgramSpellingCorrector):
    """NgramSpellingCorrector that splits words with ``get_trigrams``."""

    def __init__(self, unigram_counts: Counter):
        super().__init__(unigram_counts, get_n_gram=get_trigrams)
        

In [0]:
# Demo: rank the corpus vocabulary against one query word using character bigrams.
out_of_vocab_word = 'supercalifragilisticexpialidocious'
bigram_spelling_corrector = BigramSpellingCorrector(tf_idf.unigram_count)
# The index must be built before querying.
bigram_spelling_corrector.build_index()
bigram_spelling_corrector.get_top_k_words(out_of_vocab_word)

['like', 'caus', 'life', 'still', 'time']

In [0]:
# Same demo with character trigrams; reuses out_of_vocab_word from the previous cell.
trigram_spelling_corrector = TrigramSpellingCorrector(tf_idf.unigram_count)
# The index must be built before querying.
trigram_spelling_corrector.build_index()
trigram_spelling_corrector.get_top_k_words(out_of_vocab_word)

['life', 'still', 'call', 'listen', 'hous']

## 1.5 Language model
Calculate the log likelihood of a sentence, once with a bigram Markov language model and once with a trigram model.
For example, the likelihood of the sentence "spiderman spiderman does whatever a spider can" under the bigram model is: 
$$p(spiderman)\cdot p(spiderman|spiderman) \cdot p(does|spiderman) \cdot p(whatever|does) \cdot p(a|whatever) \cdot p(spider|a) \cdot p(can|spider)$$

And for the trigram model:
$$p(spiderman,spiderman)\cdot p(does|spiderman,spiderman) \cdot p(whatever|spiderman,does) \cdot p(a|does,whatever) \cdot p(spider|whatever,a) \cdot p(can|a,spider)$$

Since we do not want a zero probability sentence use Laplace smoothing, as you have seen in the lecture, or here https://en.wikipedia.org/wiki/Additive_smoothing

In [0]:
## for the probability smoothing
NUMERATOR_SMOOTHING = 1
DENOMINATOR_SMOOTHING = 10**4


def _smoothed_ratio(count, context_count):
    """Return the smoothed estimate (count + NUMERATOR_SMOOTHING) / (context_count + DENOMINATOR_SMOOTHING)."""
    return (NUMERATOR_SMOOTHING + count) / (DENOMINATOR_SMOOTHING + context_count)


def sentence_log_probabilty(unigrams : Counter, bigrams  : Counter, trigrams: Counter, sentence: str, verbose: bool = True):
    """Log-likelihood of ``sentence`` under a bigram and a trigram Markov model.

    The sentence is split on whitespace and used as-is (no lower-casing,
    stop-word removal or stemming is applied here). Every factor is smoothed
    by adding NUMERATOR_SMOOTHING to the count and the constant
    DENOMINATOR_SMOOTHING to the context count.

    Bigram model:  p(w0) * prod_{i>=1} p(w_i | w_{i-1})
    Trigram model: p(w0, w1) * prod_{i>=2} p(w_i | w_{i-2}, w_{i-1})

    Args:
        unigrams: Term -> count.
        bigrams: (term, term) -> count.
        trigrams: (term, term, term) -> count.
        sentence: Whitespace separated terms.
        verbose: When True (the default) both log-likelihoods are printed.
            Pass False when calling in a loop to avoid flooding the output.

    Returns:
        ``(bigram_log_likelihood, trigram_log_likelihood)``. A sentence with a
        single word has no trigram factor, so its trigram value stays 0.
    """
    bigram_log_likelilhood, trigram_log_likelilhood = 0, 0
    words = sentence.split()
    for i, word in enumerate(words):
        if i == 0:
            # First word: unconditional unigram probability; no trigram factor yet.
            bigram_log_likelilhood += np.log(_smoothed_ratio(unigrams[word], sum(unigrams.values())))
            continue

        previous = words[i-1]
        last_pair = (previous, word)

        # Bigram factor p(w_i | w_{i-1}) is the same for every i >= 1.
        bigram_log_likelilhood += np.log(_smoothed_ratio(bigrams[last_pair], unigrams[previous]))

        if i == 1:
            # Trigram model starts with the unconditional pair probability p(w0, w1).
            trigram_factor = _smoothed_ratio(bigrams[last_pair], sum(bigrams.values()))
        else:
            # i >= 2: p(w_i | w_{i-2}, w_{i-1})
            context = (words[i-2], previous)
            trigram_factor = _smoothed_ratio(trigrams[context + (word,)], bigrams[context])
        trigram_log_likelilhood += np.log(trigram_factor)

    if verbose:
        print("Bigram log likelihood is {}".format(bigram_log_likelilhood))
        print("Trigram log likelihood is {}".format(trigram_log_likelilhood))
    return (bigram_log_likelilhood, trigram_log_likelilhood )
# Example sentence; sentence_log_probabilty only splits it on whitespace, so
# the words are looked up in the counters exactly as written here.
sentence = "spider man spider man does whatever a spider can"

bi_likelilhood, tri_likelilhood = sentence_log_probabilty(tf_idf.unigram_count, tf_idf.bigram_count, tf_idf.trigram_count, sentence)

Bigram log likelihood is 0.0
Trigram log likelihood is 0.0


## 1.51 Language model: B
For each model, what is the next-word prediction for the sentence "i am"?

In [0]:

""" Please notice that ['i', 'am'] does not exist in our BOW (due to stemming), obviously we will get an irrelevant result
    So there is no need to run the code below. Anyway This is a naive implementation of next word prediction.
    A more clever implementation could use the inverted_index to get the mutual documents that both words
    'i' and 'am' appear in. """

# Brute-force next-word search: append every vocabulary word to base_sentence,
# score the resulting sentence with both models and keep the best candidate
# per model.
next_word_bigram = ''
next_word_trigram = ''
# Sentinel "worse than anything" scores for the running maxima.
best_ll_bigram = -9999
best_ll_trigram = -9999
base_sentence = 'i am'
for i,word in enumerate(tf_idf.unigram_count):
    print('parsing word {} out of {}'.format(i+1, len(tf_idf.unigram_count)))
    sentence = base_sentence + (' ') + word
    # NOTE: sentence_log_probabilty prints both scores on every call, so this
    # loop emits three lines of output per vocabulary word.
    ll_bigram, ll_trigram = sentence_log_probabilty(tf_idf.unigram_count, tf_idf.bigram_count, tf_idf.trigram_count, sentence)
    if ll_bigram>best_ll_bigram:
        next_word_bigram = word
        best_ll_bigram = ll_bigram
    if ll_trigram>best_ll_trigram:
        next_word_trigram = word
        best_ll_trigram = ll_trigram
        
    
print ('Next word for Bigram Model is : ',next_word_bigram)
print ('Next word for Trigram Model is : ', next_word_trigram)



parsing word 1 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 2 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 3 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 4 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 5 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 6 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 7 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 8 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 9 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 10 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 11 out of 389141
Bigram log likelihood is 0.0
Trigram log likelihood is 0.0
parsing word 12 out of 389141


KeyboardInterrupt: ignored