In [1]:
from math import inf
from collections import Counter
from collections import OrderedDict 

# 1.Codigo de norving

In [2]:
"""
    Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html

    Copyright (c) 2007-2016 Peter Norvig
    MIT license: www.opensource.org/licenses/mit-license.php
"""
################ Spelling Corrector ################
####################################################
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('big.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

################ Test Code 

def unit_tests():
    assert correction('speling') == 'spelling','Err: insert'# insert
    assert correction('korrectud') == 'corrected'           # replace 2
    assert correction('bycycle') == 'bicycle'               # replace
    assert correction('inconvient') == 'inconvenient'       # insert 2
    assert correction('arrainged') == 'arranged'            # delete
    assert correction('peotry') =='poetry'                  # transpose
    assert correction('peotryy') =='poetry'                 # transpose + delete
    assert correction('word') == 'word'                     # known
    assert correction('quintessential') == 'quintessential' # unknown
    assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
    assert Counter(words('This is a test. 123; A TEST this is.')) == (
           Counter({'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2}))
    assert len(WORDS) == 32198
    assert sum(WORDS.values()) == 1115585
    assert WORDS.most_common(10) == [
        ('the', 79809),
        ('of', 40024),
        ('and', 38312),
        ('to', 28765),
        ('in', 22023),
        ('a', 21124),
        ('that', 12512),
        ('he', 12401),
        ('was', 11410),
        ('it', 10681)]
    assert WORDS['the'] == 79809
    assert P('quintessential') == 0
    assert 0.07 < P('the') < 0.08
    return 'unit_tests pass'

In [3]:
print(unit_tests())
print(correction('speling'))
print(correction('korrectud'))
print(correction('thu'))

unit_tests pass
spelling
corrected
the


# 2.La siguiente palabra m\'as probable

Usando _big.txt_ crear una funci\'on que estime la siguiente palabra m\'as probable dada una anterior. La funci\'on debbe calcular 
    $$w_{i+1} = \text{argmax}_{w_{i+1}}P(W_{i+1}|w_i)$$
Para este trabajo
1. Podemos asumir que ambas palabras siempre existir\'an en la colecci\'on
2. Requerimos una funci\'on similar a $P$, que calcule $P(w_1|w_2)$

In [4]:
################################
### Funciones para trabajar  ###
################################

def words_from_file( fileName ):
    """ Obtenemos las palabras de un archivo. """
    file = open(fileName).read()
    return re.findall(r'\w+', file.lower())

def create_dict(texto):
    """ Funcion para crear el diccionario auxiliar para 
    calcular las probabilidades necesarias. 
    """
    ret = {}
    for i in range(1,len(texto)):
        if texto[i] not in ret:
            ret[texto[i]] = {}
        if texto[i-1] not in ret[texto[i]]:
            (ret[texto[i]])[texto[i-1]] = 0
            
        (ret[texto[i]])[texto[i-1]] += 1
    
    # Pre-ordenado
    for word in ret:
        ret[word] = OrderedDict(sorted(ret[word].items(), 
                                       key=lambda x: 
                                       prob_cond(x[0], word, ret),
                                       reverse=True))
    
    return ret

def prob_cond(a, b, dic):
    """ Probabilidad de A dado B en funcion de dic """
    try:
        return ((dic[a])[b])/sum(dic[b].values())
    except KeyError:
        return -1

def next_word(word, dic):
    """ Obtenemos la siguiente palabra mas probable en funcion
    del dicionario y sus probabiliodades. """
    try:
        return next(iter(dic[word]))
    except:
        return word

In [5]:
dic = create_dict(words_from_file('big.txt'))
word = 'new'

print( word +' '+next_word( word, dic) )
print( prob_cond('york','new', dic) )

new york
0.15811258278145696


## 2.1.Aqu\'i la maquina juega al ahorcado
Se recomienda extender y mejorar de alg\'un modo la funci\'on propuesta por __Norving__.

In [6]:
def under(word):
    word = word.split('_')
    if len(word) > 5:
        print('Demasiadas letras desconocidas')
        return None
    return word

def candidatos(word):
    '''  Recibe a word ya con el 'split' aplicado 
        y regresamos las posibles palabras 
    '''    
    letters = 'abcdefghijklmnopqrstuvwxyz'
    n_letters = len(letters)
    flag = word[-1] if word[-1] != '' else 'BrendA'
    
    # Creamos los posibles 'pedacitos' de la palabra
    words = [ele + letter 
             for ele in word[:len(word)-1] 
             for letter in letters]
    
    # Variables auxiliares
    options = words[:n_letters]
    options_t = []
    
    # Concatenamos los posibles 'pedacitos'
    for k in range( 1, len(words)//n_letters ):
        for option in options:
            for i in range(n_letters):
                options_t.append(option + words[n_letters*k + i])
        options = options_t; options_t = []
        
    if flag != 'BrendA': # Checamos si al final hay un '_' o una letra
        for i in range(len(options)): 
            options[i] = options[i] + flag
    
    # Regresamos unicamente las palabras que esten en el diccionario
    return set(opt for opt in options if opt in WORDS)

def dist_lev(source, target):
    if source == target: return 0
    # Crear matriz
    n_s, n_t = len(source), len(target)
    dist = [[0 for i in range(n_t+1)] for x in range(n_s+1)]
    for i in range(n_s+1): dist[i][0] = i
    for j in range(n_t+1): dist[0][j] = j
    # Calculando la distancia
    for i in range(n_s):
        for j in range(n_t):
            cost = 0 if source[i] == target[j] else 1
            dist[i+1][j+1] = min(
                                    dist[i][j+1] + 1,   # deletion
                                    dist[i+1][j] + 1,   # insertion
                                    dist[i][j] + cost   # substitution
                                )
    return dist[-1][-1]

def closest(word, options):
    ret = 'BrendA', inf
    for opt in options:
        dist = dist_lev(word, opt)
        ret = (opt, dist) if dist < ret[1] else ret
    return ret
    
def hangman(word):
    options = candidatos( under(word) )
    return closest(word, options)

In [7]:
print(hangman('s_e_l_c_')[0]) #sherlock
print(hangman('no_eb_o_')[0]) #notebook
print(hangman('he__o')[0])    #hello

print(hangman('pe_p_e')[0]) #people
print(hangman('phi__sop_y')[0]) #philospphy
print(hangman('si_nif_c_nc_')[0]) #significance
print(hangman('kn__l_d_e')[0])      #sun

sherlock
notebook
hello
people
philosophy
significance
knowledge


## 2.2.Ahorcado al extremo
Unir la funci\'on de _2_ y _2.1_ para, utilizando una palabra de contexto, completar palabras con mayor precisi\'on

In [8]:
def super_under(word):
    ct = Counter(word)
    if len(word) - ct['_'] < 1:
        print('Demasiadas letras desconocidas')
        return None
    word = word.split('_')
    return word

def super_closest( context, options):
    ret = 'BrendA', -inf
    for opt in options:  # Buscando el ret adecuado
        # Esta es la misma funcion de probabilidad del ejercicio anterior
        prob = prob_cond(opt, context, dic)
        #  En caso de que las proabilidades empaten
        # utilizamos las distancia entre las palabras
        # para responder.
        ret = ((opt, prob) if dist_lev(context, opt) < dist_lev(context, ret[0]) else ret) if prob == ret[1] else ret
        ret = (opt, prob) if prob > ret[1] else ret               
    return ret

def super_hangman(context, word):
    options = candidatos( super_under(word) )
    return super_closest(context, options)

In [9]:
print(super_hangman('sherlock', '_____s'))  #holmes
print(super_hangman('united', '_t_t__'))    #states
print(super_hangman('white', '___s_'))      #house
print(super_hangman('new', 'y___'))         #york
print(super_hangman('abraham', 'l_____n'))  #lincoln

('holmes', 1.0)
('states', 0.7620751341681574)
('house', 0.037142857142857144)
('york', 0.15811258278145696)
('lincoln', 0.6666666666666666)


# 3.Correci\'on ortografica simple

### Funciones auxiliares

In [10]:
import os, re

# simple extraction of words
def words (text) : 
    return re.findall(r'\w+', text.lower())

# siple loading of the documents
from keras.preprocessing.text import Tokenizer
def get_texts_from_catdir( catdir ):
    texts = [ ]
    TARGET_DIR = catdir # "./target"
    for f_name in sorted( os.listdir( TARGET_DIR )) :
        if f_name.endswith('.txt'):
            f_path = os.path.join( TARGET_DIR, f_name )
            #print(f_name)
            #print(f_path)
            f = open( f_path , 'r', encoding='utf8' )
            #print( f_name )
            texts += [ f.read( ) ]
            f.close( )
    print( '%d files loaded . ' %len(texts) )
    return texts

# Load the RAW text
target_txt = get_texts_from_catdir( './target' )

# Print first 10 words in document0
print( words(target_txt[0])[:10] )

10 files loaded . 
['scientists', 'witness', 'huge', 'cosmic', 'crash', 'find', 'origins', 'of', 'gold', 'even']


### Mezclar diccionarios

In [11]:
import json

WORDS = Counter(words(open('big.txt').read()))
with open('WORDS_IN_NEWS.txt', 'r') as infile: # Exportando WORDS_IN_NEWS
    WORDS_IN_NEWS = json.load( infile )
WORDS_IN_NEWS = Counter(WORDS_IN_NEWS)

WORDS = WORDS + WORDS_IN_NEWS
print(WORDS['the'])
print(WORDS.most_common(5))

80337
[('the', 80337), ('of', 40265), ('and', 38564), ('to', 29063), ('in', 22262)]


### Detectar las plabras mal escritas

In [12]:
def mispelled_and_candidates( target_words ):
    mispelled_candidates = []
    for word in target_words:
        temp = list(candidates(word))  # candidates de Norving
        if len(temp) > 1:
            temp.sort(key=lambda x: dist_lev(word, x))
            mispelled_candidates.append((word, temp[:10])) #Tomamos las primeras 10
    return mispelled_candidates

def mispelled_and_candidates( target_words ):
    mispelled_candidates = []
    
    for word in target_words:
        candidatos = list(candidates(word))
        candidatos.sort(key=lambda x: dist_lev(word, x))
        if len(candidatos) > 1:
            # En caso de que haya una opcion
            mispelled_candidates.append((word, candidatos[:10]))
        elif len(candidatos) == 1 and word not in candidatos:
            # En caso de que la unica opcion sea distinta
            mispelled_candidates.append((word, candidatos))

    return mispelled_candidates

#print ( mispelled_and_candidates( words( target_txt[0] )))

# Print misspelled words and candidates for each document in
# target_txt  list
for text in target_txt:
    print ( mispelled_and_candidates ( words ( text ) ) )
    pass

[('detcted', ['detected']), ('intoo', ['into'])]
[('conttinue', ['continue'])]
[('thhe', ['thee', 'the'])]
[('statment', ['statement'])]
[('watchng', ['watching'])]
[('possiblle', ['possible'])]
[('saiid', ['said'])]
[('addresss', ['address', 'addresses'])]
[('essetially', ['essentially'])]
[('gennerral', ['general'])]


### Correccion completa

Para este ejercicio supondremos que la primera palabra esta bien escrita y tiene sentido. 

La funcion `spell_correction` tiene una caracteristica que puede o no mejorar dependiendo de ciertos casos. De manera general, primero pasamos por la funcion del iniciso anterior al texto e identificamos todas las palabras mal escritas, luego, priorizando la probabilidad que ofrece la palabra anterior, escogemos la mejor opcione de entre aquellas que se generen por `candidates` _de Norvng_.

Esta forma de actuar tiene la principal desventaja de que no detectara problemas como las ultimas dos pruebas (ejemplos) que se proponen. Donde son palabras bien escritas pero que no necesariamente son las correctas, para solucionar esto podemos dar una propuesta mas agresva donde, en caso de que la palabra que probabilisticamente halbando (y en funcion con el corpus) deberia de seguir, la ponemos sin preguntar. Esto permite solucionar mas incisos del ejemplo, pero tambien descompone otras partes (como se puede ver en las pruebas de las noticias)

En general creo que aqui es donde podemos darle la opcion al humano para que escoja la palabra que mejor se acomode. Para superar esto podriamos ampliar el corpus o considerar la palabra que mejor se complemente con la que sigue. En caso de empezar con estasconsideraciones me parece que seria mejor primero arreglar todos las palabras que estan claramente mal escritas y luego hacer otra pasada con probabilidades.


#### Nota

Dado que _ham_ no parece estar en el corpus, causa problemas

In [13]:
# Creacion de diccionario ampliado
# Aunque no sirve de mucho
nbig = open('big.txt').read()
for text in target_txt:
    nbig += text
    
dic = create_dict(words(nbig))

In [14]:

def spell_correction( input_text, max_dist=2, profundo=False):
    """ Profundo le da mas libertad a la maquia para mejorar el texto. """
    corrected_text = input_text
    mispeled = dict(mispelled_and_candidates(input_text))
    
    for iw in range(1, len(input_text)):
        pword = corrected_text[iw-1]
        
        word = input_text[iw]
        nword = next_word(pword, dic)
        
        # En otro caso consideramos las probabilidades
        if word in mispeled:
            corrected_text[iw] = max(mispeled[word], 
                              key=lambda x: prob_cond(x, pword, dic))
        # Si se parecem cambiamos sin preguntar
        if profundo and dist_lev(nword, word) <= max_dist:
            corrected_text[iw] = nword

    return corrected_text

tests = [['i', 'hav', 'a', 'ham'],
     ['my', 'countr', 'is', 'biig'],
     ['i', 'want', 't00', 'eat'],
     ['the', 'science', '0ff', 'computer'],
     ['the', 'science', 'off', 'computer'],
     [ 'i', 'want' , 'too' , 'eat']
    ]
for s in tests:
    #print(mispelled_and_candidates(s))
    print(s)
    print( spell_correction( s, profundo=True ))
    print()




['i', 'hav', 'a', 'ham']
['i', 'have', 'a', 'man']

['my', 'countr', 'is', 'biig']
['my', 'country', 'is', 'big']

['i', 'want', 't00', 'eat']
['i', 'want', 'to', 'eat']

['the', 'science', '0ff', 'computer']
['the', 'science', 'of', 'computer']

['the', 'science', 'off', 'computer']
['the', 'science', 'of', 'computer']

['i', 'want', 'too', 'eat']
['i', 'want', 'to', 'eat']



#### Chequeo con Golden

In [15]:
golden_txt = get_texts_from_catdir( './golden' )
golden_words = words(" ".join(golden_txt))
target_words = words(" ".join(target_txt))

i = 0
for gword, tword in zip(golden_words, target_words):
    if gword != tword:
        print(f"{i} => {gword} != {tword}")
        i+=1

10 files loaded . 
0 => detected != detcted
1 => into != intoo
2 => continue != conttinue
3 => the != thhe
4 => statement != statment
5 => watching != watchng
6 => possible != possiblle
7 => said != saiid
8 => address != addresss
9 => essentially != essetially
10 => general != gennerral


In [16]:
new_text = spell_correction(target_words)
new_words = words(" ".join(new_text))

i = 0
for gword, nword in zip(golden_words, new_words):
    if gword != nword:
        print(f"{i} => {gword} != {nword}")
        i+=1
else:
    if i==0:
        print("<-----|!!! No hay errores =D !!!|----->")

<-----|!!! No hay errores =D !!!|----->


In [18]:
new_text = spell_correction(target_words, profundo=True)
new_words = words(" ".join(new_text))

i = 0
for gword, nword in zip(golden_words, new_words):
    if gword != nword:
        print(f"{i} => {gword} != {nword}")
        i+=1
else:
    if i==0:
        print("<-----|!!! No hay errores =D !!!|----->")
    else:
        print(" ='( Ahora si )'=")

0 => in != if
1 => ago != and
2 => the != that
3 => two != the
4 => to != as
5 => of != a
6 => a != man
7 => star != stars
8 => a != to
9 => a != it
10 => an != a
11 => to != in
12 => on != i
13 => the != to
14 => one != be
15 => we != the
16 => in != of
17 => would != gold
18 => s != of
19 => at != of
20 => we != to
21 => ve != be
22 => this != the
23 => the != to
24 => on != and
25 => 5 != to
26 => 88 != be
27 => the != to
28 => in != and
29 => how != to
30 => are != and
31 => for != to
32 => 4 != in
33 => the != he
34 => this != the
35 => s != of
36 => out != you
37 => the != are
38 => 1 != he
39 => are != he
40 => a != had
41 => was != is
42 => to != s
43 => of != a
44 => this != the
45 => to != of
46 => the != her
47 => that != the
48 => into != it
49 => said != and
50 => has != he
51 => 60 != in
52 => do != he
53 => that != the
54 => have != he
55 => was != is
56 => said != david
57 => in != and
58 => we != he
59 => in != and
60 => said != and
61 => the != to
62 => one != the
63 