In [None]:
from math import inf
from collections import Counter


# 1. Código de Norvig

In [2]:
"""
    Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html

    Copyright (c) 2007-2016 Peter Norvig
    MIT license: www.opensource.org/licenses/mit-license.php
"""
################ Spelling Corrector ################
####################################################
import re
from collections import Counter

def words(text):
    "Lower-case `text` and return its runs of word characters, in order."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word -> occurrence count for the whole corpus. The file is read through a
# context manager so the handle is closed once the counts are built.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word`: its count in WORDS divided by N.

    The default for N is evaluated once, when this `def` statement runs,
    so it reflects the contents of WORDS at that moment.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    "Return the candidate spelling of `word` that scores highest under P."
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the first non-empty group of known spellings for `word`.

    Groups are tried in order: the word itself, known words one edit away,
    known words two edits away. If all are empty, `[word]` is returned.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]

def known(words):
    "Return, as a set, the items of `words` that are keys of WORDS."
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings reachable from `word` with a single edit.

    An edit is a deletion, an adjacent transposition, a replacement by a
    lowercase ASCII letter, or an insertion of a lowercase ASCII letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # An insertion is possible at every cut point, including the end.
        for ch in alphabet:
            results.add(head + ch + tail)
        if not tail:
            continue
        rest = tail[1:]
        results.add(head + rest)                              # delete tail[0]
        for ch in alphabet:
            results.add(head + ch + rest)                     # replace tail[0]
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])  # swap neighbours
    return results

def edits2(word):
    "Lazily yield every string obtained by applying edits1 twice to `word`."
    first_round = edits1(word)
    return (second for first in first_round for second in edits1(first))

################ Test Code 

def unit_tests():
    """Assert expected corrections, tokenisation and corpus statistics.

    Returns the string 'unit_tests pass' when every assertion holds.
    """
    assert correction('speling') == 'spelling','Err: insert'

    # (misspelling, expected correction), checked in this order.
    spelling_cases = [
        ('korrectud', 'corrected'),            # replace 2
        ('bycycle', 'bicycle'),                # replace
        ('inconvient', 'inconvenient'),        # insert 2
        ('arrainged', 'arranged'),             # delete
        ('peotry', 'poetry'),                  # transpose
        ('peotryy', 'poetry'),                 # transpose + delete
        ('word', 'word'),                      # known
        ('quintessential', 'quintessential'),  # unknown
    ]
    for typo, expected in spelling_cases:
        assert correction(typo) == expected

    # Tokenisation and counting.
    assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
    sample_counts = Counter(words('This is a test. 123; A TEST this is.'))
    assert sample_counts == Counter(
        {'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2})

    # Corpus statistics.
    assert len(WORDS) == 32198
    assert sum(WORDS.values()) == 1115585
    top_ten = [
        ('the', 79809),
        ('of', 40024),
        ('and', 38312),
        ('to', 28765),
        ('in', 22023),
        ('a', 21124),
        ('that', 12512),
        ('he', 12401),
        ('was', 11410),
        ('it', 10681),
    ]
    assert WORDS.most_common(10) == top_ten
    assert WORDS['the'] == 79809

    # Probability helper.
    assert P('quintessential') == 0
    assert 0.07 < P('the') < 0.08
    return 'unit_tests pass'

In [3]:
# Run the assertion suite, then print the correction chosen for three
# sample misspellings.
print(unit_tests())
print(correction('speling'))
print(correction('korrectud'))
print(correction('thu'))

unit_tests pass
spelling
corrected
the


# 2. La siguiente palabra más probable

Usando _big.txt_, crear una función que estime la siguiente palabra más probable dada una anterior. La función debe calcular
    $$\hat{w}_{i+1} = \text{argmax}_{w_{i+1}} P(w_{i+1} \mid w_i)$$
Para este trabajo
1. Podemos asumir que ambas palabras siempre existirán en la colección
2. Requerimos una función similar a $P$, que calcule $P(w_1|w_2)$

In [13]:
################################
### Funciones para trabajar  ###
################################

def words_from_file( fileName ):
    """Return the lower-cased word tokens found in a text file.

    Args:
        fileName: path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        list[str]: every run of word characters, in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even if read() fails.
    with open(fileName) as handle:
        contents = handle.read()
    return re.findall(r'\w+', contents.lower())

def create_dict(texto):
    """Count, for every token, which tokens immediately precede it.

    Returns a nested dict in which result[current][previous] is the number
    of times `previous` appears directly before `current` in `texto`.
    """
    table = {}
    for previous, current in zip(texto, texto[1:]):
        preceding = table.setdefault(current, {})
        preceding[previous] = preceding.get(previous, 0) + 1
    return table

def prob_cond(a, b, dic):
    """Estimate P(a | b) from the nested count table `dic`.

    The result is dic[a][b] divided by the sum of the counts stored under
    dic[b]. If any of those lookups raises KeyError, -1 is returned.
    """
    try:
        joint = dic[a][b]
        total = sum(dic[b].values())
    except KeyError:
        return -1
    return joint / total

def next_word(word, dic):
    """Return the word that most frequently follows `word`.

    `dic` is the nested table built by create_dict, where dic[w][word] is
    the number of times `word` appears immediately before `w`. The
    successors of `word` are therefore the keys `w` whose inner dict
    contains `word`. (The inner dict dic[word] lists the words that come
    *before* `word`, so it must not be used as the candidate set.)

    P(w | word) divides every such count by the same total, so the most
    probable successor is the one with the largest count.

    Args:
        word: the context (previous) word.
        dic: nested count table as returned by create_dict.

    Returns:
        The most frequent successor, or 'Err' when `word` is never
        followed by anything in `dic`. Ties keep the first successor
        encountered while iterating over `dic`.
    """
    best_word, best_count = 'Err', 0
    for successor, predecessors in dic.items():
        count = predecessors.get(word, 0)
        if count > best_count:
            best_word, best_count = successor, count
    return best_word

In [14]:
# Build the bigram table from the corpus and query it for the word 'new'.
dic = create_dict(words_from_file('big.txt'))
word = 'new'

print(f"{word} {next_word(word, dic)}")
print(prob_cond('york', 'new', dic))

new york
0.15811258278145696


## 2.1. Aquí la máquina juega al ahorcado
Se recomienda extender y mejorar de algún modo la función propuesta por __Norvig__.

In [21]:
def under(word):
    """Split a hangman pattern on its '_' placeholders.

    Returns the list of pieces, or prints a warning and returns None when
    the split produces more than five pieces.
    """
    pieces = word.split('_')
    if len(pieces) <= 5:
        return pieces
    print('Demasiadas letras desconocidas')
    return None

def candidatos(word):
    """Return the known words that fit a split hangman pattern.

    Args:
        word: the pattern already split on '_' (e.g. ['pe', 'p', 'e'] for
            'pe_p_e'), or None when the splitter rejected the pattern.
            Each gap between consecutive pieces stands for one unknown
            lowercase letter.

    Returns:
        set[str]: every filling of the gaps that is a key of WORDS. The
        set is empty for None or an empty list. A pattern with no gaps
        yields the word itself when it is known.
    """
    from itertools import product

    if not word:
        return set()

    letters = 'abcdefghijklmnopqrstuvwxyz'
    # Every piece except the last is followed by exactly one unknown letter.
    *prefixes, tail = word
    matches = set()
    # product(..., repeat=0) yields one empty tuple, so a pattern without
    # blanks is still checked against the dictionary.
    for fill in product(letters, repeat=len(prefixes)):
        candidate = ''.join(piece + ch for piece, ch in zip(prefixes, fill)) + tail
        if candidate in WORDS:
            matches.add(candidate)
    return matches

def dist_lev(source, target):
    """Levenshtein distance between `source` and `target`.

    Counts the minimum number of single-element deletions, insertions and
    substitutions that turn `source` into `target`.
    """
    if source == target:
        return 0
    # previous[j] is the distance between the processed prefix of `source`
    # and target[:j]; it starts as the row for the empty prefix.
    previous = list(range(len(target) + 1))
    for row, s_item in enumerate(source, start=1):
        current = [row]
        for col, t_item in enumerate(target, start=1):
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            substitution = previous[col - 1] + (0 if s_item == t_item else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]

def closest(word, options):
    """Return (option, distance) for the option nearest to `word`.

    Distance is measured with dist_lev. On ties the first option met while
    iterating wins. With no options the result is ('BrendA', inf).
    """
    best_option, best_distance = 'BrendA', inf
    for candidate in options:
        distance = dist_lev(word, candidate)
        if distance < best_distance:
            best_option, best_distance = candidate, distance
    return best_option, best_distance
    
def hangman(word):
    """Guess the word behind a hangman pattern such as 'pe_p_e'.

    Splits the pattern with `under`, collects the dictionary words that
    fit via `candidatos`, and returns the (word, distance) pair chosen by
    `closest`.
    """
    pieces = under(word)
    matches = candidatos(pieces)
    return closest(word, matches)

In [25]:
# Sample patterns; the trailing comment on each line is the expected guess.
print(hangman('s_e_l_c_')[0]) #sherlock
print(hangman('no_eb_o_')[0]) #notebook
print(hangman('he__o')[0])    #hello

print(hangman('pe_p_e')[0]) #people
print(hangman('phi__sop_y')[0]) #philosophy
print(hangman('si_nif_c_nc_')[0]) #significance
print(hangman('kn__l_d_e')[0])      #knowledge

sherlock
notebook
hello
people
philosophy
significance
knowledge


## 2.2. Ahorcado al extremo
Unir las funciones de _2_ y _2.1_ para, utilizando una palabra de contexto, completar palabras con mayor precisión

In [11]:
# create_dict expects the token list, not the file name: passing the string
# 'big.txt' would build a table from the characters of the file name.
dic = create_dict(words_from_file('big.txt'))

def super_under(word):
    """Split a hangman pattern on '_' if it has at least one known letter.

    Prints a warning and returns None when every character of `word` is
    '_' (or `word` is empty); otherwise returns the list of pieces.
    """
    known_letters = sum(1 for ch in word if ch != '_')
    if known_letters < 1:
        print('Demasiadas letras desconocidas')
        return None
    return word.split('_')

def super_closest( context, options):
    """Pick the option most likely to follow `context`.

    Each option that is a dictionary word is scored with
    prob_cond(option, context, dic), using the module-level `dic`. The
    highest score wins. When two options score the same, the one with the
    smaller dist_lev to `context` is kept.

    Args:
        context: the word that precedes the one being guessed.
        options: iterable of candidate words.

    Returns:
        (word, score) for the best option, or ('BrendA', -10) when no
        option is a known word.
    """
    best = ('BrendA', -10)
    for opt in options:
        # known() expects an iterable of words. Passing the bare string
        # would test its individual characters instead of the word.
        if not known([opt]):
            continue
        prob = prob_cond(opt, context, dic)
        wins_tie = (prob == best[1]
                    and dist_lev(context, opt) < dist_lev(context, best[0]))
        if prob > best[1] or wins_tie:
            best = (opt, prob)
    return best

def super_hangman(context, word):
    """Complete the hangman pattern `word` using the preceding `context`.

    Splits the pattern with `super_under`, gathers the dictionary words
    that fit via `candidatos`, and returns the (word, score) pair selected
    by `super_closest`.
    """
    pieces = super_under(word)
    matches = candidatos(pieces)
    return super_closest(context, matches)

In [17]:
# Context word + pattern; the trailing comment is the expected completion.
# Each call prints the (word, score) tuple returned by super_hangman.
print(super_hangman('sherlock', '_____s'))  #holmes
print(super_hangman('united', '_t_t__'))    #states
print(super_hangman('white', '___s_'))      #house
print(super_hangman('new', 'y___'))         #york
print(super_hangman('abraham', 'l_____n'))  #lincoln

('holmes', 1.0)
('states', 0.7620751341681574)
('house', 0.037142857142857144)
('york', 0.15811258278145696)
('lincoln', 0.6666666666666666)


# 3. Corrección ortográfica simple

### Funciones auxiliares

In [None]:
import os, re

# Simple word extraction.
def words(text):
    "Lower-case `text` and return its runs of word characters, in order."
    tokens = re.findall(r'\w+', text.lower())
    return tokens

# siple loading of the documents
from keras.preprocessing.text import Tokenizer
def get_texts_from_catdir( catdir ):
    """Read every entry of a directory as UTF-8 text.

    Args:
        catdir: path of the directory to read, e.g. "./target".

    Returns:
        list[str]: the contents of each entry, ordered by sorted entry
        name.

    Side effects:
        Prints the number of files loaded.
    """
    texts = [ ]
    for f_name in sorted( os.listdir( catdir )) :
        f_path = os.path.join( catdir, f_name )
        # The context manager closes the file even if read() raises.
        with open( f_path , 'r', encoding='utf8' ) as f:
            texts.append( f.read( ) )
    print( '%d files loaded . ' %len(texts) )
    return texts

# Load the raw text of every file under ./target (one string per file).
target_txt = get_texts_from_catdir( './target' )

# Show the first 10 tokens of the first document as a quick sanity check.
print( words(target_txt[0])[:10] )

### Mezclar diccionarios

In [None]:
import json

# Rebuild the base vocabulary from the corpus. The context manager closes
# the file handle once the text has been read.
with open('big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))
with open('WORDS_IN_NEWS.txt', 'r') as infile: # load the saved news word counts
    WORDS_IN_NEWS = json.load( infile )
WORDS_IN_NEWS = Counter(WORDS_IN_NEWS)

# Merge both vocabularies by adding their counts.
# NOTE(review): P's default N was computed when P was defined, so it is not
# updated by this merge -- confirm whether P should be redefined here.
WORDS = WORDS + WORDS_IN_NEWS
print(WORDS['the'])
print(WORDS.most_common(5))

### Detectar las palabras mal escritas

In [None]:
def mispelled_and_candidates( target_words ):
    """Pair each suspicious word with its ranked correction candidates.

    A word is reported when `candidates` offers more than one option, or
    exactly one option that differs from the word itself. Options are
    ordered by dist_lev to the word, and at most the ten nearest are kept.

    Args:
        target_words: iterable of tokens to inspect.

    Returns:
        list[tuple[str, list[str]]]: (word, ranked candidates) pairs, in
        input order.
    """
    mispelled_candidates = []

    for word in target_words:
        ranked = sorted(candidates(word), key=lambda option: dist_lev(word, option))
        if len(ranked) > 1:
            # Several options: keep the ten nearest.
            mispelled_candidates.append((word, ranked[:10]))
        elif len(ranked) == 1 and word not in ranked:
            # A single option that is not the word itself.
            mispelled_candidates.append((word, ranked))

    return mispelled_candidates

# Single-document variant, kept for quick manual checks:
#print ( mispelled_and_candidates( words( target_txt[0] )))

# For every document in target_txt, print the (word, candidates) pairs
# reported by mispelled_and_candidates.
for text in target_txt:
    print ( mispelled_and_candidates ( words ( text ) ) )

### Corrección completa

Para este ejercicio supondremos que la primera palabra está bien escrita y tiene sentido. A partir de la segunda palabra, consideraremos la función `next_word`: en caso de que esta arroje una palabra distinta y que podamos _"considerar"_ que existen posibles candidatos con `candidates`, entonces haremos el cambio a la palabra de `next_word` (en caso de que esté entre los candidatos) o, si no, al más cercano de los candidatos obtenidos con `candidates`.

In [None]:
# Bigram table built from the corpus; read by spell_correction through
# next_word.
dic = create_dict(words_from_file('big.txt'))

def candidates(word):
    """Generate possible spelling corrections for `word`.

    If the word is known, or has a known word one edit away, the result is
    the union of the known words at zero, one and two edits. Otherwise the
    word is returned unchanged as `[word]`.

    Returns:
        set[str] of known words, or the one-element list `[word]` when
        nothing is known within one edit.
    """
    exact = known([word])
    near = known(edits1(word))
    if not (exact or near):
        return [word]
    # The two-edit expansion is by far the most expensive step, so it is
    # only computed on the branch that uses it.
    return exact | near | known(edits2(word))

def maybe_bad(word, max_candidates=15):
    """Decide whether `word` looks misspelled.

    A word is "possibly bad" when `candidates` yields more than one option,
    or a single option other than the word itself.

    Returns:
        (True, options) with the options ordered by dist_lev to `word`
        (cut to `max_candidates` when there are several), or
        (False, None) when the word is not suspicious.
    """
    ranked = sorted(candidates(word), key=lambda option: dist_lev(word, option))
    if len(ranked) > 1:
        return True, ranked[:max_candidates]
    if len(ranked) == 1 and word not in ranked:
        return True, ranked
    return False, None
    

def spell_correction( input_text ):
    """Correct a tokenised sentence using bigram context.

    The first token is assumed correct. Every later token flagged by
    maybe_bad is compared with next_word's prediction for the previously
    corrected token (looked up in the module-level `dic`):
      * prediction equals the token -> keep the token;
      * prediction is among the candidates -> use the prediction;
      * otherwise -> use the nearest candidate.
    Tokens that are not flagged are kept as they are.

    Args:
        input_text: list of tokens.

    Returns:
        list[str]: the corrected tokens; an empty list for empty input.
    """
    if not input_text:
        return []

    corrected_text = [input_text[0]]
    for word in input_text[1:]:
        flagged, options = maybe_bad(word)
        if not flagged:
            corrected_text.append(word)
            continue

        # The prediction is only needed for flagged words. The previous
        # token may be missing from `dic` (for example an unknown word
        # kept as-is), in which case there is no prediction to use.
        try:
            predicted = next_word(corrected_text[-1], dic)
        except KeyError:
            predicted = None

        if predicted == word:
            corrected_text.append(word)
        elif predicted in options:
            corrected_text.append(predicted)
        else:
            corrected_text.append(options[0])

    return corrected_text

# Hand-written token lists containing deliberate misspellings.
tests = [['i', 'hav', 'a', 'ham'],
     ['my', 'countr', 'is', 'biig'],
     ['i', 'want', 't00', 'eat'],
     ['the', 'science', '0ff', 'computer'],
     ['the', 'science', 'off', 'computer'],
     [ 'i', 'want' , 'too' , 'eat']
    ]
# For each sentence, print the flagged words with their candidates and
# then the corrected token list.
for s in tests:
    # break
    print(mispelled_and_candidates( s ))
    print( spell_correction( s ))
    print()

In [None]:
# Inspect the bigram prediction for the word following 'want'.
next_word('want', dic)

In [None]:
# Inspect the candidates produced for the misspelling 't00'.
candidates('t00')

In [None]:
# Reference (correct) documents, and the documents to be corrected.
# bad_txt is read from the same './target' directory as target_txt.
golden_txt = get_texts_from_catdir( './golden' )
bad_txt = get_texts_from_catdir( './target' )

In [None]:
# Compare up to the first ten target documents with their golden
# counterparts, token by token, and print the candidates offered for each
# differing token. Slicing both lists keeps the loop safe when fewer than
# ten documents were loaded.
for i, (bad_doc, golden_doc) in enumerate(zip(target_txt[:10], golden_txt[:10])):
    print(f'<-------{i+1}------->')
    for bw, gw in zip(words(bad_doc), words(golden_doc)):
        if bw != gw:
            print(f'bad =( {bw} != {gw} => {candidates(bw)}')