### Pregunta 1

#### Continuación Probabilidad de stupid BackOff

In [42]:
corpus = ['all models are wrong','a model is wrong','some models are useful']
vocab = ['<s>','</s>','a','all','are','model','models','some','useful','wrong']

#### a) Calcular todas las probabilidades de los bigramas sin suavisado

In [189]:
import re

class Ngrama:
    def __init__(self, corpus , vocab ):
        # Inicializamos el vocabulario
        self.vocab = vocab
        # Tokenizamos el corpus de acuerdo a la naturaleza del corpus
        if isinstance(corpus,list):
            self.corpus_tokenized = self.preprocesar_list_corpus(corpus)
        elif isinstance(corpus,str):
            corpus_tokenized = self.tokenized_text(corpus)
            self.corpus_tokenized = self.preprocesar_list_corpus(corpus_tokenized)
        else:
            print('El corpus debe ser un texto o una lista de oraciones')

        # Definimos los unigramas, bigramas del corpus y sus frecuencias
        self.unigrams =  self.get_freq_ngrams(self.corpus_tokenized,1)
        self.bigrams  =  self.get_freq_ngrams(self.corpus_tokenized,2)

        # Calculamos el total de bigramas
        self.total_bigrams = self.get_freq_t_bigrams(self.bigrams)

    def get_freq_t_bigrams(self,bigrams:dict)->dict:
        t_bigram_freq = {}
        for word1 in vocab:
            for word2 in vocab:
                bigram = (word1,word2)
                t_bigram_freq[bigram] = bigrams.get((word1,word2),0)
        return t_bigram_freq   

    def get_freq_ngrams(self,corpus_tokenized:list, n_type: int)->dict:
        ngrams_freq = {}
        n = len(corpus_tokenized)
        for i in range(n-n_type+1):
            ngram = tuple(corpus_tokenized[i:i+n_type])
            if ngram not in ngrams_freq:
                ngrams_freq[ngram]=1
            else:
                ngrams_freq[ngram]+=1
        return ngrams_freq

    def preprocesar_list_corpus(self,corpus:list):
        new_sentence = ['<s>']
        for sentence in corpus :
            # Preprocesamos una oracion en token de palabras
            sentence_tokenized = self.tokenized_sentence(sentence)            
            # Agregamos el tag de final de oracion
            new_sentence += (sentence_tokenized + ['</s>']) 
        return new_sentence
        
    def tokenized_sentence(self,sentence:str)->list:
        pattern = re.compile(r'\b\w+\b')
        words = pattern.findall(sentence)
        
        # Verificamos si cada palabra esta en el vocabulario
        tokenized_words = [word.lower() if word in self.vocab else '<unk>' for word in words]
        
        return tokenized_words

    def tokenized_text(self,text:str)->list:
        return re.split(r'(?<=[.!?])\s+',text)

    def get_prob_bigram(self,bigrams:dict,unigrams:dict):
        prob_bigrams = {}
        for bigram,freq in bigrams.items():
            prob_bigrams[bigram] = (freq)/(unigrams[(bigram[0],)])
        return prob_bigrams

    def get_prob_add_k_bigram(self,bigrams:dict,unigrams:dict,k:float):
        # Tamaño del vocabulario
        V = len(vocab)        
        # Cálculo de las probabilidades suavizadas
        add_k_probabilities = {}
        for (w1,w2), freq in bigrams.items():
            N = unigrams.get((w1,),0)
            # Aplicando la ecuación P_Add-k(w_i) = (c_i + k) / (N + kV)
            add_k_probabilities[(w1,w2)] = (freq + k) / (N + k * V)
            print(f'({w1},{w2}) = ({freq} + {k}) / ({N} + {k} * {V}) = {(freq + k) / (N + k * V)} ')
        
        return add_k_probabilities
        


In [190]:
ml = Ngrama(corpus,vocab)

In [198]:
print(ml.unigrams)

{('<s>',): 1, ('all',): 1, ('models',): 2, ('are',): 2, ('wrong',): 2, ('</s>',): 3, ('a',): 1, ('model',): 1, ('<unk>',): 1, ('some',): 1, ('useful',): 1}


In [199]:
total_freq = ml.get_freq_t_bigrams(ml.bigrams)
print(total_freq)

{('<s>', '<s>'): 0, ('<s>', '</s>'): 0, ('<s>', 'a'): 0, ('<s>', 'all'): 1, ('<s>', 'are'): 0, ('<s>', 'model'): 0, ('<s>', 'models'): 0, ('<s>', 'some'): 0, ('<s>', 'useful'): 0, ('<s>', 'wrong'): 0, ('</s>', '<s>'): 0, ('</s>', '</s>'): 0, ('</s>', 'a'): 1, ('</s>', 'all'): 0, ('</s>', 'are'): 0, ('</s>', 'model'): 0, ('</s>', 'models'): 0, ('</s>', 'some'): 1, ('</s>', 'useful'): 0, ('</s>', 'wrong'): 0, ('a', '<s>'): 0, ('a', '</s>'): 0, ('a', 'a'): 0, ('a', 'all'): 0, ('a', 'are'): 0, ('a', 'model'): 1, ('a', 'models'): 0, ('a', 'some'): 0, ('a', 'useful'): 0, ('a', 'wrong'): 0, ('all', '<s>'): 0, ('all', '</s>'): 0, ('all', 'a'): 0, ('all', 'all'): 0, ('all', 'are'): 0, ('all', 'model'): 0, ('all', 'models'): 1, ('all', 'some'): 0, ('all', 'useful'): 0, ('all', 'wrong'): 0, ('are', '<s>'): 0, ('are', '</s>'): 0, ('are', 'a'): 0, ('are', 'all'): 0, ('are', 'are'): 0, ('are', 'model'): 0, ('are', 'models'): 0, ('are', 'some'): 0, ('are', 'useful'): 1, ('are', 'wrong'): 1, ('model',

In [200]:
print(ml.bigrams)

{('<s>', 'all'): 1, ('all', 'models'): 1, ('models', 'are'): 2, ('are', 'wrong'): 1, ('wrong', '</s>'): 2, ('</s>', 'a'): 1, ('a', 'model'): 1, ('model', '<unk>'): 1, ('<unk>', 'wrong'): 1, ('</s>', 'some'): 1, ('some', 'models'): 1, ('are', 'useful'): 1, ('useful', '</s>'): 1}


In [201]:
print(ml.get_prob_bigram(total_freq,ml.unigrams))

{('<s>', '<s>'): 0.0, ('<s>', '</s>'): 0.0, ('<s>', 'a'): 0.0, ('<s>', 'all'): 1.0, ('<s>', 'are'): 0.0, ('<s>', 'model'): 0.0, ('<s>', 'models'): 0.0, ('<s>', 'some'): 0.0, ('<s>', 'useful'): 0.0, ('<s>', 'wrong'): 0.0, ('</s>', '<s>'): 0.0, ('</s>', '</s>'): 0.0, ('</s>', 'a'): 0.3333333333333333, ('</s>', 'all'): 0.0, ('</s>', 'are'): 0.0, ('</s>', 'model'): 0.0, ('</s>', 'models'): 0.0, ('</s>', 'some'): 0.3333333333333333, ('</s>', 'useful'): 0.0, ('</s>', 'wrong'): 0.0, ('a', '<s>'): 0.0, ('a', '</s>'): 0.0, ('a', 'a'): 0.0, ('a', 'all'): 0.0, ('a', 'are'): 0.0, ('a', 'model'): 1.0, ('a', 'models'): 0.0, ('a', 'some'): 0.0, ('a', 'useful'): 0.0, ('a', 'wrong'): 0.0, ('all', '<s>'): 0.0, ('all', '</s>'): 0.0, ('all', 'a'): 0.0, ('all', 'all'): 0.0, ('all', 'are'): 0.0, ('all', 'model'): 0.0, ('all', 'models'): 1.0, ('all', 'some'): 0.0, ('all', 'useful'): 0.0, ('all', 'wrong'): 0.0, ('are', '<s>'): 0.0, ('are', '</s>'): 0.0, ('are', 'a'): 0.0, ('are', 'all'): 0.0, ('are', 'are'): 

In [202]:
add_k_probabilities_1 =ml.get_prob_add_k_bigram(total_freq,ml.unigrams,k=0.05)
print(add_k_probabilities)

(<s>,<s>) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(<s>,</s>) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(<s>,a) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(<s>,all) = (1 + 0.05) / (1 + 0.05 * 10) = 0.7000000000000001 
(<s>,are) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(<s>,model) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(<s>,models) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(<s>,some) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(<s>,useful) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(<s>,wrong) = (0 + 0.05) / (1 + 0.05 * 10) = 0.03333333333333333 
(</s>,<s>) = (0 + 0.05) / (3 + 0.05 * 10) = 0.014285714285714287 
(</s>,</s>) = (0 + 0.05) / (3 + 0.05 * 10) = 0.014285714285714287 
(</s>,a) = (1 + 0.05) / (3 + 0.05 * 10) = 0.3 
(</s>,all) = (0 + 0.05) / (3 + 0.05 * 10) = 0.014285714285714287 
(</s>,are) = (0 + 0.05) / (3 + 0.05 * 10) = 0.014285714285714287 
(</s>,model) = (0 + 0.05) / (3 + 0.05 *

In [203]:
add_k_probabilities_2 =ml.get_prob_add_k_bigram(total_freq,ml.unigrams,k=0.15)
print(add_k_probabilities_2)

(<s>,<s>) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(<s>,</s>) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(<s>,a) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(<s>,all) = (1 + 0.15) / (1 + 0.15 * 10) = 0.45999999999999996 
(<s>,are) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(<s>,model) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(<s>,models) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(<s>,some) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(<s>,useful) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(<s>,wrong) = (0 + 0.15) / (1 + 0.15 * 10) = 0.06 
(</s>,<s>) = (0 + 0.15) / (3 + 0.15 * 10) = 0.03333333333333333 
(</s>,</s>) = (0 + 0.15) / (3 + 0.15 * 10) = 0.03333333333333333 
(</s>,a) = (1 + 0.15) / (3 + 0.15 * 10) = 0.25555555555555554 
(</s>,all) = (0 + 0.15) / (3 + 0.15 * 10) = 0.03333333333333333 
(</s>,are) = (0 + 0.15) / (3 + 0.15 * 10) = 0.03333333333333333 
(</s>,model) = (0 + 0.15) / (3 + 0.15 * 10) = 0.03333333333333333 
(</s>,models) = (0 + 0.15) / (3 + 0.15 * 10) = 0.03333333333333333 
(</s>,some) = (1 + 0.15) /

In [197]:
add_k_probabilities_3 =ml.get_prob_add_k_bigram(total_freq,ml.unigrams,k=1)
add_k_probabilities_3

(<s>,<s>) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(<s>,</s>) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(<s>,a) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(<s>,all) = (1 + 1) / (1 + 1 * 10) = 0.18181818181818182 
(<s>,are) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(<s>,model) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(<s>,models) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(<s>,some) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(<s>,useful) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(<s>,wrong) = (0 + 1) / (1 + 1 * 10) = 0.09090909090909091 
(</s>,<s>) = (0 + 1) / (3 + 1 * 10) = 0.07692307692307693 
(</s>,</s>) = (0 + 1) / (3 + 1 * 10) = 0.07692307692307693 
(</s>,a) = (1 + 1) / (3 + 1 * 10) = 0.15384615384615385 
(</s>,all) = (0 + 1) / (3 + 1 * 10) = 0.07692307692307693 
(</s>,are) = (0 + 1) / (3 + 1 * 10) = 0.07692307692307693 
(</s>,model) = (0 + 1) / (3 + 1 * 10) = 0.07692307692307693 
(</s>,models) = (0 + 1) / (3 + 1 * 10) = 0.076923076923

{('<s>', '<s>'): 0.09090909090909091,
 ('<s>', '</s>'): 0.09090909090909091,
 ('<s>', 'a'): 0.09090909090909091,
 ('<s>', 'all'): 0.18181818181818182,
 ('<s>', 'are'): 0.09090909090909091,
 ('<s>', 'model'): 0.09090909090909091,
 ('<s>', 'models'): 0.09090909090909091,
 ('<s>', 'some'): 0.09090909090909091,
 ('<s>', 'useful'): 0.09090909090909091,
 ('<s>', 'wrong'): 0.09090909090909091,
 ('</s>', '<s>'): 0.07692307692307693,
 ('</s>', '</s>'): 0.07692307692307693,
 ('</s>', 'a'): 0.15384615384615385,
 ('</s>', 'all'): 0.07692307692307693,
 ('</s>', 'are'): 0.07692307692307693,
 ('</s>', 'model'): 0.07692307692307693,
 ('</s>', 'models'): 0.07692307692307693,
 ('</s>', 'some'): 0.15384615384615385,
 ('</s>', 'useful'): 0.07692307692307693,
 ('</s>', 'wrong'): 0.07692307692307693,
 ('a', '<s>'): 0.09090909090909091,
 ('a', '</s>'): 0.09090909090909091,
 ('a', 'a'): 0.09090909090909091,
 ('a', 'all'): 0.09090909090909091,
 ('a', 'are'): 0.09090909090909091,
 ('a', 'model'): 0.181818181818