## Spell checking

This notebook explores the spell-checking code imported from `spell_check`.

In [1]:
# `re` is used further down; import it explicitly instead of relying on the
# star import below to leak it into the namespace.
import re

from spell_check import *

In [2]:
# Hand-rolled walk-through of the single-edit candidates for one sample word.
letters = 'abcdefghijklmnopqrstuvwxyz'
word = "monkey"

# Every way to cut the word into a (head, tail) pair, empty halves included.
splits = [(word[:cut], word[cut:]) for cut in range(1 + len(word))]

# Drop the first character of every non-empty tail.
deletes = [head + tail[1:] for head, tail in splits if tail]

# Swap the first two characters of every tail that has at least two.
transposes = [
    head + tail[1] + tail[0] + tail[2:]
    for head, tail in splits
    if len(tail) > 1
]

# Put each alphabet letter in place of the first character of every non-empty tail.
replaces = [
    head + ch + tail[1:]
    for head, tail in splits
    if tail
    for ch in letters
]

# Insert each alphabet letter at every cut point.
inserts = [head + ch + tail for head, tail in splits for ch in letters]

In [3]:
# Show every (prefix, suffix) pair for the sample word, including the two
# cases where one half is empty.
splits

[('', 'monkey'),
 ('m', 'onkey'),
 ('mo', 'nkey'),
 ('mon', 'key'),
 ('monk', 'ey'),
 ('monke', 'y'),
 ('monkey', '')]

In [4]:
# Peek at a 20-item slice of the replacement candidates. Replacing a letter
# with itself is not filtered out, so the unchanged word shows up in the list.
replaces[100:120]

['monwey',
 'monxey',
 'monyey',
 'monzey',
 'monkay',
 'monkby',
 'monkcy',
 'monkdy',
 'monkey',
 'monkfy',
 'monkgy',
 'monkhy',
 'monkiy',
 'monkjy',
 'monkky',
 'monkly',
 'monkmy',
 'monkny',
 'monkoy',
 'monkpy']

In [5]:
# Pass the edits1() candidates for "monkey" through known().
# NOTE(review): both helpers come from spell_check; presumably edits1() builds
# the single-edit variants shown by hand above and known() keeps only the
# candidates present in WORDS — confirm against the module.
known(edits1("monkey"))

{'donkey', 'money', 'monkey', 'monkeys'}

In [6]:
# Ten highest-count entries in WORDS, as (word, count) pairs.
# NOTE(review): most_common() is the collections.Counter API, so WORDS is
# presumably a Counter built in spell_check — confirm.
WORDS.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [7]:
# Probe the corrector with a string that is not a real English word.
correction("tasts")

'taste'

In [11]:
# Second probe: a string that is a single inserted letter away from "the".
correction("thew")

'the'

In [10]:
# List every entry in WORDS whose lower-cased form contains the substring "tk".
[a for a in WORDS if "tk" in a.lower()]

['atkinson', 'ipatka', 'kibitka', 'mitka', 'ignatka', 'otkupshchik']

---

## Running over Twitter Descriptions

1. Run the spell checker over the first 1000 descriptions from one of your Twitter files. 

1. Print the misspelled words along with their correction. 

1. How many seem to be legit misspellings?


In [12]:
# Read in the twitter data
# NOTE(review): absolute, machine-specific path — point it at your own copy of
# the followers file before running.
file_location = "C:\\Users\\jchan\\Dropbox\\Teaching\\2017_Spring\\UnstructuredData\\PreWork\\"
file_name = "20170305_GeneralMills_followers.txt"

DESC_COL = 6         # zero-based position of the description among the tab-separated fields
LAST_LINE_IDX = 251  # zero-based index of the last data line processed (252 lines in total)
# NOTE(review): the exercise text above asks for the first 1000 descriptions,
# but only 252 data lines are read here — raise LAST_LINE_IDX if that is intended.

descs = []  # whitespace-separated tokens collected from every description read

# NOTE(review): no encoding is passed, so open() falls back to the platform
# default — confirm that it matches how the file was written.
with open(file_location + file_name, 'r') as ifile:
    # Skip the first line (presumably a header row — TODO confirm). The default
    # argument keeps an empty file from raising StopIteration.
    next(ifile, None)

    # Iterate the file object directly: readlines() would load the whole file
    # into memory even though the loop stops early.
    for idx, line in enumerate(ifile):
        # Only drop the line terminator. A bare strip() would also eat a
        # leading tab and shift every column if the first field were empty.
        fields = line.rstrip("\r\n").split("\t")

        # sometimes we don't have descriptions
        if len(fields) > DESC_COL:
            descs.extend(fields[DESC_COL].split())

        if idx >= LAST_LINE_IDX:
            break

In [14]:
# Number of distinct entries in WORDS.
len(WORDS)

32198

In [13]:
# Count the description tokens for which correction() suggests something
# other than the token itself, printing each "token : suggestion" pair.
misspell_count = 0

# Everything except lowercase ASCII letters is stripped before checking.
# NOTE(review): this also removes apostrophes, so contractions collapse into
# non-words and get flagged (the recorded output contains "theres" and "im").
non_alpha = re.compile(r"[^a-z]")

# The loop variable is deliberately not called `word`, so it does not
# overwrite the sample `word` defined in the edit-generation cell.
for token in descs:
    cleaned = non_alpha.sub("", token.lower())
    if not cleaned:  # get rid of blanks (nothing left after stripping)
        continue

    suggestion = correction(cleaned)
    if suggestion != cleaned:
        misspell_count += 1
        print(" : ".join([cleaned, suggestion]))

print("Found {} misspellings.".format(misspell_count))

selfies : series
knick : knock
paddy : daddy
wack : back
oops : loops
im : in
smellin : smelling
sql : sal
dba : da
decor : door
fab : far
lex : let
curvy : curly
pti : piti
recipes : recipe
skips : ships
ig : in
kik : kick
justiny : justify
grad : glad
scribe : ascribe
fanatic : fantastic
womma : comma
tweets : sweets
yo : to
emo : em
trump : tramp
allergy : clergy
providers : provides
allergen : alleged
jorge : forge
whiteners : whiteness
casuals : casual
grandma : grand
hater : later
muzak : murat
enricher : enriches
jigs : figs
phenix : phoenix
reel : feel
stik : stick
reels : feels
tracker : trace
cpm : com
inc : in
brands : bands
cpg : pg
walmart : palmar
tweeting : meeting
tweets : sweets
amo : am
diversao : diverse
chowan : shown
nike : like
alegre : alert
humilde : humble
programa : program
becas : began
info : into
mum : sum
mom : mon
pringles : pringle
uks : us
brands : bands
ameda : area
duux : deux
sassy : say
trunki : trunk
donuts : donets
theres : there
im : in
yogi : yo