In [1]:
import nltk
import numpy as np
from nltk import tokenize 
from nltk.util import ngrams
from nltk.corpus import machado
from nltk.tokenize import RegexpTokenizer, sent_tokenize


nltk.download('punkt')
nltk.download('machado')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/delmirodaladiersampaioneto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package machado to
[nltk_data]     /home/delmirodaladiersampaioneto/nltk_data...
[nltk_data]   Package machado is already up-to-date!


True

## Loading and tokenizing texts

In [2]:
train_text = machado.raw('romance/marm05.txt')
test_text = machado.raw('romance/marm02.txt')

In [3]:
train_text = train_text.lower()
test_text = test_text.lower()

In [20]:
def create_tokenized_text(train_text: str, file_name:str):

    word_tokenizer = RegexpTokenizer(r'[-\'\w]+')

    sentences = sent_tokenize(train_text, language='portuguese')

    with open(file_name, 'w+') as file_writer:

        for sentence in sentences:
            tokenized_sentence = word_tokenizer.tokenize(sentence.replace('\n', ' '))
            if tokenized_sentence:
                tokenized_sentence.append('[END]')
                tokenized_sentence = ['[START]'] + tokenized_sentence 
                tokenized_sentence = [token for token in tokenized_sentence if token != "\n"]
                file_writer.write(','.join(tokenized_sentence))
                file_writer.write('\n') 

In [48]:
def count_ngrams(filename):
    n_gram_size = 2
    sentences = []

    counts = {}
    prev_counts = {}
    num_sentences = 0
    num_tokens = 0
    
    
    with open(filename, 'r') as op:
        sentences = op.readlines()
        sentences = [sentence.replace('\n', '').split(',') for sentence in sentences]
        
    for sentence in sentences:
        num_sentences =+ 1 
        num_tokens =+ len(sentence)
        
        #get count for bigrams
        for index, ngram in enumerate(ngrams(sentence, n_gram_size)):
            counts[ngram] = counts.get(ngram, 0) + 1
            
        # get count of unigrams
        for index, ngram in enumerate(ngrams(sentence, n_gram_size - 1)): 
            prev_counts[ngram[0]] = prev_counts.get(ngram[0], 0) + 1
            
       
    return counts, prev_counts, num_sentences, num_tokens

In [55]:
def calculate_probabilities(counts, prev_counts):

    probs = {}
    vocabulary = counts.keys()
    vocabulary_size = len(vocabulary)

    for bigram in counts.keys():
        probs[bigram] = (counts[bigram] + 1)/(prev_counts[bigram[0]] + 1 * vocabulary_size)

    return probs


In [64]:
def model_evaluation(train_bigram_counts, test_bigram_counts, probabilities, test_num_tokens):
    
    vocabulary = train_bigram_counts.keys()
    test_log_likelihood = 0
    
    for bigram, bigram_count in test_bigram_counts.items():
        if bigram not in vocabulary:
            word = '[UNK]'
            
        train_prob = probabilities[bigram]
        log_likelihood = bigram_count * np.log2(train_prob)
        test_log_likelihood += log_likelihood

        avg_test_log_likelihood = test_log_likelihood / test_num_tokens
        
        return avg_test_log_likelihood

In [21]:
create_tokenized_text(train_text, 'tokenized_train.txt')
create_tokenized_text(test_text, 'tokenized_test.txt')

In [60]:
counts, prev_counts, num_sentences, num_tokens = count_ngrams('tokenized_train.txt')
test_counts, test_prev_counts, test_num_sentences, test_num_tokens = count_ngrams('tokenized_test.txt')
probs = calculate_probabilities(counts, prev_counts)

In [65]:
model_evaluation(counts, test_counts, probs, test_num_tokens)

-4.811249475410533