In [138]:
import collections
from typing import List, Dict, Tuple
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Vocabulario
vocabulario = {'<s>', '</s>', 'a', 'all', 'model', 'models', 'some', 'useful', 'wrong'}

#corpus
frase_1 =  "all models are wrong"
frase_2 =  "a model is wrong"
frase_3 =  "some useful models are wrong"
lista_frases = [[frase_1], [frase_2], [frase_3]]

def construir_corpus(lista_frases: List[str]) -> List[str]:
    """
    Constructs a corpus as a flat list of tokens from a list of sentences.
    """
    corpus = []
    for frase in lista_frases:
        sentence = frase[0]
        tokens = word_tokenize(sentence)
        corpus.extend(tokens) # Extend corpus with tokens of this sentence
    return corpus

corpus = construir_corpus(lista_frases)
print(corpus)

['all', 'models', 'are', 'wrong', 'a', 'model', 'is', 'wrong', 'some', 'useful', 'models', 'are', 'wrong']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [139]:
class ModeloNGrama:
    def __init__(self, n:int):
        self.n = n
        self.cuenta_ngrama = collections.Counter()
        self.cuenta_contexto = collections.Counter()
        self.vocabulario = vocabulario = {'<s>', '</s>', 'a', 'all', 'model', 'models', 'some', 'useful', 'wrong'}
        self.total_ngrama = 0

    def entrenar(self, corpus: List[str]):
        for i in range(len(corpus) - self.n + 1):
          if corpus[i] not in self.vocabulario:
                self.vocabulario.add(corpus[i])
          ngrama = tuple(corpus[i:i+self.n])
          contexto = tuple(corpus[i:i+self.n-1])
          self.cuenta_ngrama[ngrama] += 1
          self.cuenta_contexto[contexto] += 1
          self.total_ngrama += 1

    def obtener_prob_ngram(self, ngrama: Tuple[str, ...]) -> float:
        cuenta = self.cuenta_ngrama.get(ngrama, 0)
        contexto = ngrama[:-1]
        cuenta_contexto = self.cuenta_contexto.get(contexto, 0)
        if cuenta_contexto == 0:
            return 0.0
        else:
            return cuenta / cuenta_contexto

In [140]:
modelo_bigrama = ModeloNGrama(n=2)
modelo_bigrama.entrenar(corpus=corpus)
#Probabilidades de todos los bigramas:
print(f'probabilidad del bigrama a model {modelo_bigrama.obtener_prob_ngram(("a", "model"))}')
print(f'probabilidad del bigrama model is {modelo_bigrama.obtener_prob_ngram(("model", "is"))}')
print(f'probabilidad del bigrama is wrong {modelo_bigrama.obtener_prob_ngram(("is", "wrong"))}')
print(f'probabilidad del bigrama are wrong {modelo_bigrama.obtener_prob_ngram(("are","wrong"))}')

print(f'probabilidad del bigrama inexistente a models {modelo_bigrama.obtener_prob_ngram(("a", "models"))}')

probabilidad del bigrama a model 1.0
probabilidad del bigrama model is 1.0
probabilidad del bigrama is wrong 1.0
probabilidad del bigrama are wrong 1.0
probabilidad del bigrama inexistente a models 0.0


In [141]:
#Suavizado add-k para bigrama
def add_k_smoothing_bigram(corpus, k):
    # Conteo de bigramas y unigrams
    bigram_counts = {}
    unigram_counts = {}

    # Construir bigramas
    for i in range(len(corpus) - 1):
        bigram = (corpus[i], corpus[i + 1])
        unigram = corpus[i]

        # Conteo de bigramas
        if bigram in bigram_counts:
            bigram_counts[bigram] += 1
        else:
            bigram_counts[bigram] = 1

        # Conteo de unigrams
        if unigram in unigram_counts:
            unigram_counts[unigram] += 1
        else:
            unigram_counts[unigram] = 1

    # Contar el último unigramo
    last_word = corpus[-1]
    if last_word in unigram_counts:
        unigram_counts[last_word] += 1
    else:
        unigram_counts[last_word] = 1

    # Tamaño del vocabulario
    V = len(unigram_counts)

    # Cálculo de las probabilidades suavizadas para bigramas
    add_k_probabilities = {}
    for bigram, bigram_count in bigram_counts.items():
        w_n_1 = bigram[0]  # w_{n-1}
        # Aplicando la ecuación P_Add-k(w_n | w_{n-1}) = (C(w_{n-1}w_n) + k) / (C(w_{n-1}) + kV)
        add_k_probabilities[bigram] = (bigram_count + k) / (unigram_counts[w_n_1] + k * V)

    # Probabilidad para un bigrama no visto
    add_k_probabilities[('<a>', '<models>')] = k / (V * (V + k))

    return add_k_probabilities

In [142]:
k = 0.1
add_k_prob_bigrams = add_k_smoothing_bigram(corpus, k)
print("\nProbabilidades de bigramas suavizadas con Add-k (k=0.1):")
for bigram, prob in add_k_prob_bigrams.items():
    print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")


Probabilidades de bigramas suavizadas con Add-k (k=0.1):
P(models | all) = 0.5789
P(are | models) = 0.7241
P(wrong | are) = 0.7241
P(a | wrong) = 0.2821
P(model | a) = 0.5789
P(is | model) = 0.5789
P(wrong | is) = 0.5789
P(some | wrong) = 0.2821
P(useful | some) = 0.5789
P(models | useful) = 0.5789
P(<models> | <a>) = 0.0012


In [145]:
def backoff_unigram(unigram_counts, total_tokens):
    # Calcula las probabilidades de unigramas
    unigram_probs = {}
    for word, count in unigram_counts.items():
        unigram_probs[word] = count / total_tokens
    return unigram_probs

def backoff_bigram(bigram_counts, unigram_counts, total_tokens):
    # Calcula las probabilidades de bigramas con retroceso a unigramas
    bigram_probs = {}
    for (w1, w2), count in bigram_counts.items():
        if unigram_counts[w1] > 0:
            bigram_probs[(w1, w2)] = count / unigram_counts[w1]
        else:
            bigram_probs[(w1, w2)] = backoff_unigram(unigram_counts, total_tokens).get(w2, 1 / total_tokens)
    return bigram_probs

def ngram_backoff(unigram_probs, bigram_probs, alpha):
    def backoff_prob(w_n, w_n1):
        # Bigram
        if (w_n1, w_n) in bigram_probs:
            return alpha * bigram_probs[(w_n1, w_n)]
        # Unigram
        else:
            return alpha * unigram_probs.get(w_n, 1 / len(unigram_probs))
    return backoff_prob



In [147]:
print("probabilidades con backoff")
alpha = 0.4
backoff_prob = ngram_backoff(backoff_unigram, backoff_bigram, alpha)
#Probabilidades de todos los bigramas:


probabilidades con backoff


AttributeError: 'str' object has no attribute 'items'

In [None]:
#ejercicio 2

In [None]:
import re
def leer_archivo(ruta):
    with open(ruta, 'r', encoding='utf-8') as file:
        return file.readlines()

ruta = 'infopankki.en-es.es'
archivo = leer_archivo(ruta)

def tokenizar_linea(linea):
    palabras = []
    for palabra in linea.split():
      #eliminacion de Stopwords
      if palabra.endswith(','):
        palabra = palabra[:-1]
      if palabra.endswith('.'):
        palabra = palabra[:-1]
      if palabra.endswith('!'):
        palabra = palabra[:-1]
      if palabra.endswith('?'):
        palabra = palabra[:-1]
      if palabra.endswith(':'):
        palabra = palabra[:-1]
      if palabra.endswith(';'):
        palabra = palabra[:-1]
      if palabra.endswith('('):
        palabra = palabra[:-1]
      if palabra.endswith(')'):
        palabra = palabra[:-1]
      if palabra.endswith('['):
        palabra = palabra[:-1]
      if palabra.endswith(']'):
        palabra = palabra[:-1]
      if palabra.endswith('{'):
        palabra = palabra[:-1]
      if palabra.endswith('}'):
        palabra = palabra[:-1]
      if palabra.endswith('"'):
        palabra = palabra[:-1]
      if palabra.endswith("'"):
        palabra = palabra[:-1]
      if palabra.endswith('`'):
        palabra = palabra[:-1]
      if palabra.endswith('`'):
        palabra = palabra[:-1]
      if palabra.endswith('`'):
        palabra = palabra[:-1]
      if palabra.endswith('`'):
        palabra = palabra[:-1]
      if palabra.endswith('`'):
        palabra = palabra[:-1]
      if len(palabra) not in ['a', 'al', 'de', 'o', 'e' 'del']:
          palabras.append(palabra)
    return palabras

def lematizar_tokens(tokens):
  patron_amar = re.compile(r'amar[a-z]*')
  patron_estudiar = re.compile(r'estudi[a-z]*')
  for token in tokens:
    if re.find(patron_amar, token): #implementar una expresion regular mas precisa
      token = 'amar'
    if re.find(patron_estudiar, token): #implementar una expresion regular mas precisa
      token = 'estudiar'

def stemizar_tokens(tokens):
  patron_terminacion_s = re.compile(r'[a-z]*s')
  patron_terminacion_es = re.compile(r'[a-z]*es')
  for token in tokens:
    if re.find(patron_terminacion_s, token): #implementar una expresion regular mas precisa
      token = token[:-1]
    if re.find(patron_terminacion_es, token): #implementar una expresion regular mas precisa
      token = token[:-2]
  return tokens


#Preprosesamiento de texto
def preprocesar_archivo(archivo, num_muestras=10000):
    vocabulario_inicial = []
    num_lineas = 0
    for linea in archivo:
        tokens = tokenizar_linea(linea)
        tokens = lematizar_tokens(tokens)
        tokens = stemizar_tokens(tokens)
        for token in tokens:
          vocabulario_inicial.append(token)
        num_lineas += 1
        if num_lineas >= num_muestras:
            break
    for palabra in vocabulario:
      apariciones = vocabulario.count(palabra)
      if apariciones < 5:
        vocabulario.remove(palabra)
    return vocabulario

