## Spell checking

This workbook holds some exploration of the spell checking code.

---

Exercise 1: read `wsj_with_errors.txt` into Python. Find every word in it that's not in the NLTK word corpus and write those words out to a new file. Remember to remove punctuation and numbers and to convert everything to lowercase.

In [None]:
from nltk.corpus import words
from string import punctuation
import re

# File to spell-check, and a pattern matching any single character that is
# neither a lowercase ASCII letter nor a space.
input_file = "wsj_with_errors.txt"
non_alpha = re.compile(r"[^a-z ]")

# `str.isalpha` is another option: it is True only when every character is
# a letter, so the second sample (which contains digits) prints False.
for sample in ("abcde", "ab123"):
    print(sample.isalpha())

In [None]:
# Scratch cell: try the clean-up steps on one sentence before running them
# over a whole file.

line = "Welcome to ADA. This is week 15!"

# Target result: a list of lowercase words with punctuation and digits gone.
# One regex substitution removes every character that `non_alpha` matches.
line = non_alpha.sub("", line.lower())

# split() with no arguments already ignores leading/trailing whitespace.
line.split()

In [None]:
# Lowercased copy of the nltk word list, held in a set so that membership
# tests in the cells below are constant-time.
word_set = set(map(str.lower, words.words()))

In [None]:
# Collect, in file order and with repeats, every cleaned token from the
# input file that does not appear in `word_set`.
all_missing_words = []

with open(input_file) as ifile:
    for row in ifile:
        # Lowercase the row, delete every character `non_alpha` matches,
        # then break what remains into whitespace-separated tokens.
        row = non_alpha.sub("", row.lower()).split()

        missing_words = [token for token in row if token not in word_set]
        all_missing_words += missing_words


In [None]:
# Write each distinct unknown word on its own line.
# The words are sorted so the output file is reproducible: iterating a bare
# set of strings can yield a different order on every run, because string
# hashing is randomized per interpreter process.
with open("found_missing.txt", 'w') as ofile:
    ofile.writelines(word + "\n" for word in sorted(set(all_missing_words)))

Now do the same with the 1M word corpus in `big.txt`.

In [None]:
# Build the vocabulary of big.txt: the set of distinct cleaned tokens.
big_word_set = set()

with open("big.txt") as infile:
    for row in infile:
        # Lowercase the row, delete every character `non_alpha` matches,
        # then split on whitespace.
        tokens = non_alpha.sub("", row.lower()).split()

        # The tokens are deliberately NOT stored in a variable named `words`:
        # that would overwrite the nltk `words` corpus reader imported at the
        # top of the notebook and break any later call to `words.words()`.
        # `set.update` already ignores tokens that are present, so no
        # "not in big_word_set" pre-filter is needed.
        big_word_set.update(tokens)
        

In [None]:
# Same check as before, but against the vocabulary in `big_word_set`.
all_missing_words = []

with open(input_file) as ifile:
    for row in ifile:
        # Lowercase the row, delete every character `non_alpha` matches,
        # then split on whitespace.
        row = non_alpha.sub("", row.lower()).split()

        missing_words = [w for w in row if w not in big_word_set]
        all_missing_words.extend(missing_words)

# Write each distinct unknown word on its own line.
# The words are sorted so the output file is reproducible: iterating a bare
# set of strings can yield a different order on every run, because string
# hashing is randomized per interpreter process.
with open("found_missing_2.txt", 'w') as ofile:
    ofile.writelines(word + "\n" for word in sorted(set(all_missing_words)))

In [None]:
# Number of distinct entries currently held in big_word_set.
len(big_word_set)

Next exercise: write a function that takes a word and returns every string that can be made by deleting a single character from it.

In [None]:
def get_deletions(word):
    """Return every string formed by removing exactly one character from *word*.

    Results are ordered by the position of the removed character, so the
    list always has ``len(word)`` entries. Duplicates are kept: ``"aa"``
    yields ``["a", "a"]``. An empty word yields an empty list.
    """
    # Slicing a str already produces a str, so no str() conversion is needed.
    return [word[:i] + word[i + 1:] for i in range(len(word))]


print(get_deletions("test"))
print(get_deletions("longer"))
print(get_deletions("sesquipedalian"))

---

Now some code related to the stuff we talked about in class.

In [None]:
# Generate every string one simple edit away from `word`.
letters = 'abcdefghijklmnopqrstuvwxyz'
word = "monkey"

# All (head, tail) cuts of the word, including the two with an empty side.
splits = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]

# Deletion: drop the first character of each non-empty tail.
deletes = [head + tail[1:] for head, tail in splits if tail]

# Transposition: swap the first two characters of each tail that has two.
transposes = [
    head + tail[1] + tail[0] + tail[2:]
    for head, tail in splits
    if len(tail) > 1
]

# Replacement: overwrite the first character of the tail with each letter.
replaces = [
    head + ch + tail[1:]
    for head, tail in splits
    if tail
    for ch in letters
]

# Insertion: put each letter between head and tail.
inserts = [head + ch + tail for head, tail in splits for ch in letters]

In [None]:
# Display the list of (left, right) pieces built for `word`.
splits

In [None]:
# Peek at a 20-item slice from the middle of the replacement candidates.
replaces[100:120]

In [None]:
# NOTE(review): `known` and `edits1` are not defined in any cell visible here;
# presumably they come from the in-class spell-checker code. Confirm they are
# defined or imported before running this cell.
known(edits1("monkey"))

In [None]:
# NOTE(review): `WORDS` is not defined in any cell visible here; the
# `.most_common` call suggests a collections.Counter. Confirm.
WORDS.most_common(10)

In [None]:
# NOTE(review): `correction` is not defined in any cell visible here;
# presumably it returns the suggested spelling for a word. Confirm.
correction("tasts")

In [None]:
# Try a second input. (`correction` is not defined in the cells shown here.)
correction("thew")

In [None]:
# List every entry of WORDS whose lowercased form contains "tk".
[a for a in WORDS if "tk" in a.lower()]

---

## Running over Twitter Descriptions

1. Run the spell checker over the first 1000 descriptions from one of your Twitter files. 

1. Print the misspelled words along with their correction. 

1. How many seem to be legit misspellings?


In [None]:
# Read in the twitter data
file_location = "C:\\Users\\jchan\\Dropbox\\Teaching\\2017_Spring\\UnstructuredData\\PreWork\\"
file_name = "20170305_GeneralMills_followers.txt"

# `descs` ends up holding the individual whitespace-separated tokens of the
# descriptions, not whole descriptions.
descs = []
with open(file_location + file_name,'r') as ifile :
    # Skip the first line (presumably a header row; confirm against the file).
    # The None default keeps an empty file from raising StopIteration.
    next(ifile, None)

    # Iterate the file lazily instead of calling readlines(): only the first
    # couple of hundred rows are used, so there is no reason to load it all.
    for idx, line in enumerate(ifile) :
        line = line.strip().split("\t")

        # spot 6 has the description
        if len(line) >= 7 : # sometimes we don't have descriptions
            descs.extend(line[6].split())

        # Stops after the row with idx 251, i.e. 252 data rows are read.
        # NOTE(review): the exercise text asks for the first 1000
        # descriptions. Confirm which limit is intended.
        if idx > 250 :
            break

In [None]:
# Number of entries in WORDS (not defined in the cells shown here).
len(WORDS)

In [None]:
# Run the spell checker over every token in `descs`, print each token whose
# suggested correction differs from it, and report how many there were.
misspell_count = 0

# Matches any single character that is not a lowercase ASCII letter.
# NOTE(review): this rebinds `non_alpha`, which an earlier cell defined with a
# pattern that also allowed spaces; re-running those cells afterwards will
# pick up this pattern instead.
non_alpha = re.compile(r"[^a-z]")

for idx, word in enumerate(descs):
    word = non_alpha.sub("", word.lower())

    # Tokens made only of digits/punctuation/etc. are empty now; skip them.
    if not word:
        continue

    c = correction(word)
    if c == word:
        continue

    misspell_count += 1
    print(" : ".join((word, c)))

    # To stop early while experimenting, add here:
    #     if idx > 100: break

print(f"Found {misspell_count} misspellings.")