In [1]:
import pandas as pd
from textblob import Word, TextBlob
import stringdist
import numpy as np
from pyphonetics import Soundex
from nltk.corpus import words as nltk_words
import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import time

ModuleNotFoundError: No module named 'textblob'

In [2]:
ks = pd.read_csv('data/kidsspellingv2.csv')
ks = ks.drop(['Code', 'Semester'], axis=1)
ks = ks.dropna(subset=['Target', 'Spelling'])
ks["Target"] = ks.Target.apply(lambda x: x.strip())
ks[0:5]

Unnamed: 0,Target,Spelling,Level,Grade,Unnamed: 6
0,favorite,favtit,Early Within Word Pattern,1,
1,throw,thow,Early Within Word Pattern,1,
2,catch,cach,Early Within Word Pattern,1,
3,touchdown,tuchdone,Early Within Word Pattern,1,
4,dance,dans,Early Within Word Pattern,1,


In [3]:
len(ks)

1358

In [4]:
word_freq = pd.read_csv('data/word_freq.csv')

In [5]:
def sum_list(list):
    total = 0
    for el in list:
        if el:
            total = total + 1
    return total

## Levenshtein Distance

In [6]:
# Levenshtein distance for each spelling
ks["l_dist"] = ks.apply(lambda x: stringdist.levenshtein(x['Target'], x['Spelling']), axis=1)

In [7]:
# How often the Levenshtein distance is less than two
distance_less_than_two = []
for distance in ks["l_dist"]:
    distance_less_than_two.append(distance <= 1)
sum(distance_less_than_two)/len(distance_less_than_two)

0.5228276877761414

## TextBlob Single correction

In [8]:
# Top correction from textblob
ks["textblob_correct"] = ks.Spelling.apply(lambda x : Word(x).correct())

In [9]:
# How often correction is correct
correct = []
for target, correction in zip(ks["Target"], ks["textblob_correct"]):
    correct.append(target == correction)

sum_list(correct)/len(correct)

0.2599410898379971

## TextBlob top 5 suggestions

In [11]:
# Top 5 suggestions from textblob
ks["textblob_suggestions"] = ks.Spelling.apply(lambda x: Word(x).spellcheck()[0:5])

In [12]:
ks["textblob_suggestions"] = ks.textblob_suggestions.apply(lambda x: [word[0] for word in x])

In [13]:
# How often the correct word is found in the suggestions
correct = []
for target, suggestions in zip(ks["Target"], ks["textblob_suggestions"]):
    correct.append(target in suggestions)
    
sum_list(correct)/len(correct)

0.43519882179675995

## SoundEx

In [14]:
soundex = Soundex()

In [15]:
soundex.phonetics('touchdown'), soundex.phonetics('tuchdone')

('T235', 'T235')

In [16]:
soundex_dict = {}
for word in word_freq['word']:
    word = str(word).lower()
    try:
        word_phone = soundex.phonetics(word)
    except:
        continue
    if word_phone in soundex_dict:
        soundex_dict[word_phone].append(word)
    else:
        soundex_dict[word_phone] = [word]

In [17]:
len(word_freq)/len(soundex_dict)

10.800150829562595

In [18]:
#soundex_dict[soundex.phonetics('tuchdone')]
correct = []
for target, spelling in zip(ks["Target"], ks["Spelling"]):
    spelling_phone = soundex.phonetics(spelling)
    if spelling_phone in soundex_dict:
        correct.append(target in soundex_dict[spelling_phone])
    else:
        correct.append(False)
        
sum_list(correct)/len(correct)

0.759941089837997

## Metaphones

In [19]:
rules = [
            (r'[^a-z]', r''),
            (r'([bcdfhjklmnpqrstvwxyz])\1+', r'\1'),
            (r'ck', r'K'),
            (r'^ocea', r'A2'),
            (r'^ae', r'A'),
            (r'^[aeiou]+', r'A'),
            (r'^[gkp]n', r'N'),
            (r'^wr', r'R'),
            (r'^x', r'S'),
            (r'^wh', r'W'),
            (r'^w', r'W'),
            (r'^gh', r'G'),
            (r'mb$', r'M'),
            #(r'(nc)e$', r'NS'),
            #(r'([aeiou][^aeiou])e$', r'\1'),
            #(r'(ng|st|pl|bl|tt|rs|cl)e$', r'\1'),
            (r'(?!^)sch', r'SK'),
            (r'th', r'0'),
            (r'^y', r'Y'),
            (r't?ch',r'1'),
            (r't?sh', r'2'),
            (r'c(?=ia|io)', r'2'),
            (r'ture$', r'1R'),
            (r'[st](?=i[ao])', r'2'),
            (r's?c(?=[iey])', r'S'),
            (r'[q]', r'Q'),
            (r'[c]', r'K'),
            (r'dg(?=[iey])', r'J'),
            (r'd', r'D'),
            (r'g(?=h[^aeiou])', r''),
            #(r'gh$', r''),
            (r'[y]$', r'A'),
            (r'gn(ed)?', r'N'),
            (r'([^g]|^)g(?=[iey])', r'\1G'),
            (r'g+', r'G'),
            (r'ph', r'F'),
            (r'([aeiou])h(?=\b|[^aeiou])', r'\1'),
            (r'[wy](?![aeiou])', r''),
            (r'[aeiou]w', r''),
            #(r'x', r'KS'),
            (r'z', r'S'),
            (r'v', r'V'),
            (r'y', r''),
            #(r'([aiou]+$)', r'A'),
            #(r'([aeiou]+)', r'A')
            (r'(?!^)[aeiou]+', r''),
        ]

def mphone(word):
    code = word.lower()
    for rule in rules:
        code = re.sub(rule[0], rule[1], code)
    return code.upper()

print(mphone('tuchdone'))
print(mphone('touchdown'))

T1DN
T1DN


In [20]:
metaphone_dict = {}
for word in word_freq['word']:
    word = str(word).lower()
    try:
        word_phone = mphone(word)
    except:
        continue
    if word_phone in metaphone_dict:
        metaphone_dict[word_phone].append(word)
    else:
        metaphone_dict[word_phone] = [word]

In [21]:
len(word_freq)/len(metaphone_dict)

1.506786378143303

In [22]:
# how often does the mispelling metaphone match the target spelling metaphone?
correct = []
for target, spelling in zip(ks["Target"], ks["Spelling"]):
    spelling_phone = mphone(spelling)
    if spelling_phone in metaphone_dict:
        correct.append(target in metaphone_dict[spelling_phone])
    else:
        correct.append(False)
        
sum_list(correct)/len(correct)

0.7253313696612665

In [23]:
def edit_distance_1(word):
    word = word.lower()
    letters = list('abc1dfghjklmnpqrs2t0vwxyz')
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(transposes + deletes + replaces + inserts)

def priority_replaces(word):
    priorities = {'1':['2'],'2':['1'], 'b':['p'], 'c':['s','k','1'], 'd':['t'], 'g':['j'], 'j':['g'], 'k':['q'], 'm':['n'], 'n':['m'], 'p':['b'], 'q':['k'], 's':['c','2'], 't':['d'], 'x':['s','c','1','2']}
    word = word.lower()
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    replaces = [L + c + R[1:] for L, R in splits if R and R[0] in priorities for c in priorities[R[0]]]
    return replaces
    
def priority_edits(word):
    word = word.lower()
    letters = list('abc1dfghjklmnpqrs2t0vwxyz')
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(replaces+ inserts)

In [24]:
def get_count(x):
    return word_freq.at[x.upper(), 'count']

In [25]:
def metaphone_suggestions(word, count):
    spelling_phone = mphone(word)
    suggestions = []
    if spelling_phone in metaphone_dict:
        suggestions.extend(metaphone_dict[spelling_phone])
    #suggestions.sort(key=lambda x: stringdist.levenshtein_norm(x, word))
    
    #priority_suggestions = []
    #for pword in priority_edits(spelling_phone):
    #    if pword.upper() in metaphone_dict:
    #        priority_suggestions.extend(metaphone_dict[pword.upper()])
    #priority_suggestions.sort(key=lambda x: stringdist.levenshtein_norm(x, word))
    #suggestions.extend(priority_suggestions)
    
    
    additional_suggestions = []
    for eword in edit_distance_1(spelling_phone):
        if eword.upper() in metaphone_dict:
            additional_suggestions.extend(metaphone_dict[eword.upper()])
    additional_suggestions.sort(key=lambda x: stringdist.levenshtein_norm(x, word))
    #suggestions.sort(key=lambda x: stringdist.levenshtein_norm(x, word))
    suggestions.extend(additional_suggestions)
    
    #return list(dict.fromkeys(suggestions))[0:5]
    suggestions = [sug[0].upper() + sug[1:] if word[0].upper() == word[0] else sug for sug in list(dict.fromkeys(suggestions)) if len(sug) > 1]
    return suggestions[:count]

In [26]:
metaphone_suggestions('now',5)

['new', 'no', 'now', 'know', 'ne']

In [27]:
ks["metaphone_suggestions"] = ks.Spelling.apply(lambda x: metaphone_suggestions(x, 5))

In [28]:
ks[:5]

Unnamed: 0,Target,Spelling,Level,Grade,Unnamed: 6,l_dist,textblob_correct,textblob_suggestions,metaphone_suggestions
0,favorite,favtit,Early Within Word Pattern,1,,3,fait,[fait],"[favorite, favourite]"
1,throw,thow,Early Within Word Pattern,1,,1,how,"[how, show, throw, thou, thaw]","[the, thou, thaw, throw, show]"
2,catch,cach,Early Within Word Pattern,1,,1,each,"[each, catch, coach, cash]","[coach, catch, cache, couch, ketch]"
3,touchdown,tuchdone,Early Within Word Pattern,1,,3,tuchdone,[tuchdone],"[touchdown, touchdowns, techno, tendon, trodden]"
4,dance,dans,Early Within Word Pattern,1,,2,dans,[dans],"[dance, dennis, dense, downs, diagnose]"


In [29]:
#dev, test = train_test_split(ks, test_size=0.3)
#dev.to_pickle("./dev.pkl")
#test.to_pickle("./test.pkl")
dev = pd.read_pickle("./dev.pkl")
test = pd.read_pickle("./test.pkl")
len(dev), len(test)

(950, 408)

In [30]:
dev[50:150]

Unnamed: 0,Target,Spelling,l_dist,textblob_correct,textblob_suggestions,metaphone_suggestions
487,games,gams,1,game,"[game, gas, gems, gums, games]","[games, gems, gyms, gums, grams]"
309,following,fouling,3,feeling,"[feeling, forming, falling, filling, failing]","[following, feeling, flying, filing, falling]"
190,friends,frinds,1,friends,"[friends, finds]","[friends, fronds, grinds, finds, forints]"
1077,through,though,1,though,[though],"[though, thigh, through, tough, thorough]"
289,chocolate,chokelet,4,chokelet,[chokelet],"[chocolate, collet, chalet, booklet, chaplet]"
484,with,wheth,2,wyeth,[wyeth],"[with, whet, whether, sheath, wreath]"
1055,some,sum,2,sum,[sum],"[some, same, seem, zoom, sam]"
48,cookie,coockie,1,coockie,[coockie],"[cock, cook, cake, kick, cookie]"
663,climbed,clamed,2,claimed,"[claimed, blamed, flamed, clamped, calmed]","[claimed, calmed, clamped, flamed, clawed]"
1247,breathe,breevu,4,breeze,"[breeze, breed, breech, breezy, breeds]","[brave, bravo, brava, breeze, breech]"


In [31]:
correct = []
problem_words = []
for target, spelling, suggestions in zip(dev["Target"], dev["Spelling"], dev["metaphone_suggestions"]):
    if target in suggestions or target.lower() in suggestions or target[0].upper() + target[1:].lower() in suggestions:
        correct.append(True)
    else:
        problem_words.append((target, spelling, mphone(target),mphone(spelling), suggestions))
        correct.append(False)
        
sum_list(correct)/len(correct)

0.7757894736842105

In [32]:
print(len(problem_words))
problem_words
#for thing in problem_words:
    #print(thing[0])

213


[('pink', 'peik', 'PNK', 'PK', ['pc', 'pack', 'pick', 'peak', 'pac']),
 ('gills', 'gils', 'GLS', 'GLS', ['glass', 'goals', 'gals', 'gloss', 'glaze']),
 ('go on', 'gowen', 'GN', 'GN', ['gone', 'gene', 'gain', 'gun', 'guinea']),
 ('pictures',
  'pitchers',
  'PKTRS',
  'P1RS',
  ['pitchers', 'poachers', 'pitcher', 'pitches', 'watchers']),
 ('simple',
  'sipl',
  'SMPL',
  'SPL',
  ['spell', 'spill', 'spool', 'spoil', 'supple']),
 ('princess',
  'priins',
  'PRNSS',
  'PRNS',
  ['prince', 'prawns', 'prunes', 'prunus', 'sprains']),
 ('climbed',
  'clamed',
  'KLMBD',
  'KLMD',
  ['claimed', 'calmed', 'clamped', 'flamed', 'clawed']),
 ('breathe',
  'breevu',
  'BR0',
  'BRV',
  ['brave', 'bravo', 'brava', 'breeze', 'breech']),
 ('of', 'ove', 'AF', 'AV', ["i've", 'eve', 'hove', 'oven', 'over']),
 ('carrot',
  'carit',
  'KRT',
  'KRT',
  ['cart', 'create', 'court', 'carat', 'karate']),
 ('walking',
  'woking',
  'WLKNG',
  'WKNG',
  ['waking', 'whacking', 'working', 'joking', 'wooing']),
 ('

In [33]:
bad_sort=[]
for t in problem_words:
    if t[2] == t[3]:
        bad_sort.append((t[0], t[1], t[2],t[3], t[4]))

In [34]:
print(len(bad_sort))
bad_sort

35


[('gills', 'gils', 'GLS', 'GLS', ['glass', 'goals', 'gals', 'gloss', 'glaze']),
 ('go on', 'gowen', 'GN', 'GN', ['gone', 'gene', 'gain', 'gun', 'guinea']),
 ('carrot',
  'carit',
  'KRT',
  'KRT',
  ['cart', 'create', 'court', 'carat', 'karate']),
 ('fell', 'fel', 'FL', 'FL', ['full', 'file', 'feel', 'follow', 'fall']),
 ('mule', 'muole', 'ML', 'ML', ['mail', 'male', 'mile', 'mall', 'mill']),
 ('fries',
  'frice',
  'FRS',
  'FRS',
  ['force', 'phrase', 'fires', 'fears', 'freeze']),
 ('dumb', 'dume', 'DM', 'DM', ['demo', 'dam', 'dom', 'doom', 'dame']),
 ('a lot',
  'a lout',
  'ALT',
  'ALT',
  ['elite', 'alt', 'alto', 'eyelet', 'allot']),
 ('soars',
  'sors',
  'SRS',
  'SRS',
  ['source', 'series', 'serious', 'zeros', 'sores']),
 ('eat', 'eta', 'AT', 'AT', ['it', 'at', 'out', 'et', 'auto']),
 ('fell', 'fel', 'FL', 'FL', ['full', 'file', 'feel', 'follow', 'fall']),
 ('fright',
  'frait',
  'FRT',
  'FRT',
  ['fort', 'fruit', 'freight', 'ferret', 'fart']),
 ('becasue',
  'bicos',
  'BK

## Test set evaluation

In [35]:
#Test set
correct = []
problem_words = []
for target, spelling, suggestions in zip(test["Target"], test["Spelling"], test["metaphone_suggestions"]):
    suggestions = suggestions [:5]
    correct.append(target in suggestions or target.lower() in suggestions or target[0].upper() + target[1:].lower() in suggestions)
sum_list(correct)/len(correct)

0.7671568627450981

In [36]:
#TextBlob
correct = []
for target, suggestions in zip(test["Target"], test["textblob_suggestions"]):
    suggestions = suggestions[:5]
    correct.append(target in suggestions or target.lower() in suggestions or target[0].upper() + target[1:].lower() in suggestions)
    
sum_list(correct)/len(correct)

0.44607843137254904

In [37]:
#Full set
correct = []
problem_words = []
for target, spelling, suggestions in zip(ks["Target"], ks["Spelling"], ks["metaphone_suggestions"]):
    correct.append(target in suggestions or target.lower() in suggestions or target[0].upper() + target[1:].lower() in suggestions)
         
sum_list(correct)/len(correct)

0.7739322533136966

## Extra spelling tests and performance

In [38]:
se = pd.read_csv('data/spell-testset2.txt',sep=': ', names=['correct', 'wrong'])
se['wrong'] = se.wrong.apply(lambda x: [re.sub(r'\*\d', '', word).strip() for word in x.split(" ")])
se[:50]

  """Entry point for launching an IPython kernel.


Unnamed: 0,correct,wrong
0,appeal,[apeal]
1,employees,[emploies]
2,encourage,[encorage]
3,permanent,[perminant]
4,mathematically,[mathematicaly]
5,data,[dsata]
6,permanently,[perminantly]
7,hierarchal,[hierachial]
8,proviso,[provisoe]
9,moving,[moveing]


In [39]:
# our suggestions
corrections = []
start=time.time()
for correct, wrong in tqdm(zip(se["correct"], se["wrong"]), total=len(se["correct"])):
    wrong = wrong[0].split(" ")
    for w in wrong:
        sug = metaphone_suggestions(w, 5)
        corrections.append(correct in sug)
end=time.time()
print(sum_list(corrections)/len(corrections))
print(len(se)/(end-start))

100%|██████████| 363/363 [00:03<00:00, 112.08it/s]


0.9228650137741047
111.26978985255695


In [40]:
corrections = []
start = time.time()
for correct, wrong in tqdm(zip(se["correct"], se["wrong"]), total=len(se["correct"])):
    for w in wrong:
        sug =  [word[0] for word in Word(w).spellcheck()[0:5]]
        #print(sug)
        corrections.append(correct in sug)
end=time.time()
print(sum_list(corrections)/len(corrections))
print(len(se)/(end-start))

100%|██████████| 363/363 [01:21<00:00,  4.46it/s]


0.7825
4.456730027063354


## Additional evaluations

In [111]:
set(ks['Level'].tolist())

{'Early Within Word Pattern',
 'Early Within Word Pattern ',
 'Early letter-Name Alphabetic',
 'Early-Middle Letter Name Alphabetic ',
 'Early-Middle within Word Pattern',
 'Late Letter-Name Alphabetic',
 'Late Syllables and Affixes',
 'Late emergent',
 'Late within word pattern',
 'Letter name alphabetic',
 'Letter-Name Alphabetic',
 'Middle Letter Name- Alphabetic',
 'Middle Within Word Pattern',
 'Middle to Late Syllables and Affixes',
 'Within Word Pattern',
 'early Derivational Relations ',
 'early Letter Name- Alphabetic',
 'early Letter Name-Alphabetic',
 'early Letter-Name Alphabetic',
 'early Syllables and Affixes',
 'early Syllables and Affixes ',
 'early Within Word Pattern',
 'early Within Word Pattern ',
 'early derivational relations',
 'early letter-name alphabetic',
 'early syllables and affixes',
 'early within word pattern',
 'early within word patterns ',
 'late Letter Name- Alphabetic',
 'late Letter Name-- Alphabetic',
 'late Syllables and Affixes',
 'late Syllable

In [123]:
ks["Level"] = ks.Level.apply(lambda x: str(x).lower().strip())

In [124]:
ks["Level"] = ks.Level.apply(lambda x: re.sub("-", " ", x))

In [125]:
ks["Level"] = ks.Level.apply(lambda x: re.sub("[ ]+", " ", x))

In [127]:
ks["Level"] = ks.Level.apply(lambda x: x.strip("s"))

In [128]:
set(ks['Level'].tolist())

{'early derivational relation',
 'early letter name alphabetic',
 'early middle letter name alphabetic',
 'early middle within word pattern',
 'early syllables and affixe',
 'early within word pattern',
 'late emergent',
 'late letter name alphabetic',
 'late syllables and affixe',
 'late within word pattern',
 'letter name alphabetic',
 'middle derivational relation',
 'middle letter name alphabetic',
 'middle syllables and affixe',
 'middle to late syllables and affixe',
 'middle to late within word pattern',
 'middle within word pattern',
 'nan',
 'within word pattern'}

In [163]:
levels = ['emergent', 'letter name', 'word pattern', 'syllables', 'derivational']

In [164]:
print(len(ks[ks["Level"].str.contains("nan")]))
for level in levels:
    sub_df = ks[ks["Level"].str.contains(level)]
    print(level, ":", len(sub_df), "samples")
    t_correct = []
    m_correct = []
    for target, spelling, m_suggestions, t_suggestions in zip(sub_df["Target"], sub_df["Spelling"], sub_df["metaphone_suggestions"], sub_df["textblob_suggestions"]):
        m_correct.append(target in m_suggestions or target.lower() in m_suggestions or target[0].upper() + target[1:].lower() in m_suggestions)
        t_correct.append(target in t_suggestions or target.lower() in t_suggestions or target[0].upper() + target[1:].lower() in t_suggestions)
    print("\tMetaphones:",sum_list(m_correct)/len(m_correct))
    print("\tTextBlob:",sum_list(t_correct)/len(t_correct))
    print("\tDifference:",sum_list(m_correct)/len(m_correct) - sum_list(t_correct)/len(t_correct))

17
emergent : 5 samples
	Metaphones: 0.8
	TextBlob: 0.2
	Difference: 0.6000000000000001
letter name : 490 samples
	Metaphones: 0.763265306122449
	TextBlob: 0.3816326530612245
	Difference: 0.3816326530612245
word pattern : 677 samples
	Metaphones: 0.7518463810930576
	TextBlob: 0.4638109305760709
	Difference: 0.28803545051698665
syllables : 158 samples
	Metaphones: 0.8734177215189873
	TextBlob: 0.5316455696202531
	Difference: 0.3417721518987342
derivational : 11 samples
	Metaphones: 0.8181818181818182
	TextBlob: 0.36363636363636365
	Difference: 0.4545454545454546


In [184]:
levels = [
    'late emergent', 
    'early letter name alphabetic',
    'early middle letter name alphabetic', 
    'middle letter name alphabetic',
    'letter name alphabetic',
    'late letter name alphabetic',
    'early within word pattern',
    'early middle within word pattern',
    'middle within word pattern',
    'within word pattern',
    'middle to late within word pattern',
    'late within word pattern',
    'early syllables and affixe',
    'middle syllables and affixe',
    'middle to late syllables and affixe',
    'late syllables and affixe',
    'early derivational relation',
    'middle derivational relation'
    ]

In [186]:
for level in levels:
    sub_df = ks[ks.Level == level]
    if len(sub_df) >= 12:
        print(level, ":", len(sub_df), "samples")
        t_correct = []
        m_correct = []
        for target, spelling, m_suggestions, t_suggestions in zip(sub_df["Target"], sub_df["Spelling"], sub_df["metaphone_suggestions"], sub_df["textblob_suggestions"]):
            m_correct.append(target in m_suggestions or target.lower() in m_suggestions or target[0].upper() + target[1:].lower() in m_suggestions)
            t_correct.append(target in t_suggestions or target.lower() in t_suggestions or target[0].upper() + target[1:].lower() in t_suggestions)
        print("\tMetaphones:",sum_list(m_correct)/len(m_correct))
        print("\tTextBlob:",sum_list(t_correct)/len(t_correct))
        print("\tDifference:",sum_list(m_correct)/len(m_correct) - sum_list(t_correct)/len(t_correct))

early letter name alphabetic : 70 samples
	Metaphones: 0.7285714285714285
	TextBlob: 0.4142857142857143
	Difference: 0.3142857142857142
middle letter name alphabetic : 257 samples
	Metaphones: 0.754863813229572
	TextBlob: 0.3540856031128405
	Difference: 0.4007782101167315
letter name alphabetic : 38 samples
	Metaphones: 0.8421052631578947
	TextBlob: 0.4473684210526316
	Difference: 0.3947368421052631
late letter name alphabetic : 114 samples
	Metaphones: 0.8070175438596491
	TextBlob: 0.4298245614035088
	Difference: 0.3771929824561403
early within word pattern : 348 samples
	Metaphones: 0.7442528735632183
	TextBlob: 0.43103448275862066
	Difference: 0.3132183908045977
middle within word pattern : 227 samples
	Metaphones: 0.7312775330396476
	TextBlob: 0.4581497797356828
	Difference: 0.27312775330396477
within word pattern : 20 samples
	Metaphones: 0.75
	TextBlob: 0.65
	Difference: 0.09999999999999998
middle to late within word pattern : 12 samples
	Metaphones: 0.75
	TextBlob: 0.41666666666

In [169]:
set(ks['Grade'].tolist())

{'1', '2', '3', '4', '5', '6', '7', '8', 'k', nan}

In [170]:
for grade in ['k','1', '2', '3', '4', '5', '6', '7', '8']:
    sub_df = ks[ks.Grade == grade]
    print(grade, ":", len(sub_df), "samples")
    t_correct = []
    m_correct = []
    for target, spelling, m_suggestions, t_suggestions in zip(sub_df["Target"], sub_df["Spelling"], sub_df["metaphone_suggestions"], sub_df["textblob_suggestions"]):
        m_correct.append(target in m_suggestions or target.lower() in m_suggestions or target[0].upper() + target[1:].lower() in m_suggestions)
        t_correct.append(target in t_suggestions or target.lower() in t_suggestions or target[0].upper() + target[1:].lower() in t_suggestions)
    print("\tMetaphones:",sum_list(m_correct)/len(m_correct))
    print("\tTextBlob:",sum_list(t_correct)/len(t_correct))
    print("\tDifference:",sum_list(m_correct)/len(m_correct) - sum_list(t_correct)/len(t_correct))

k : 15 samples
	Metaphones: 0.7333333333333333
	TextBlob: 0.4666666666666667
	Difference: 0.2666666666666666
1 : 144 samples
	Metaphones: 0.6458333333333334
	TextBlob: 0.3402777777777778
	Difference: 0.3055555555555556
2 : 386 samples
	Metaphones: 0.7461139896373057
	TextBlob: 0.4170984455958549
	Difference: 0.3290155440414508
3 : 277 samples
	Metaphones: 0.8122743682310469
	TextBlob: 0.44765342960288806
	Difference: 0.36462093862815886
4 : 315 samples
	Metaphones: 0.7841269841269841
	TextBlob: 0.46984126984126984
	Difference: 0.3142857142857143
5 : 95 samples
	Metaphones: 0.8421052631578947
	TextBlob: 0.5052631578947369
	Difference: 0.33684210526315783
6 : 44 samples
	Metaphones: 0.7954545454545454
	TextBlob: 0.5227272727272727
	Difference: 0.2727272727272727
7 : 44 samples
	Metaphones: 0.8409090909090909
	TextBlob: 0.5227272727272727
	Difference: 0.31818181818181823
8 : 21 samples
	Metaphones: 0.8571428571428571
	TextBlob: 0.3333333333333333
	Difference: 0.5238095238095237


In [178]:
lengths = set([len(word) for word in ks['Target'].tolist()])

In [183]:
for length in lengths:
    sub_df = ks[ks["Target"].str.len() == length]
    if len(sub_df) >= 10:
        print(length, ":", len(sub_df), "samples")
        t_correct = []
        m_correct = []
        for target, spelling, m_suggestions, t_suggestions in zip(sub_df["Target"], sub_df["Spelling"], sub_df["metaphone_suggestions"], sub_df["textblob_suggestions"]):
            m_correct.append(target in m_suggestions or target.lower() in m_suggestions or target[0].upper() + target[1:].lower() in m_suggestions)
            t_correct.append(target in t_suggestions or target.lower() in t_suggestions or target[0].upper() + target[1:].lower() in t_suggestions)
        print("\tMetaphones:",sum_list(m_correct)/len(m_correct))
        print("\tTextBlob:",sum_list(t_correct)/len(t_correct))
        print("\tDifference:",sum_list(m_correct)/len(m_correct) - sum_list(t_correct)/len(t_correct))

2 : 50 samples
	Metaphones: 0.84
	TextBlob: 0.18
	Difference: 0.6599999999999999
3 : 79 samples
	Metaphones: 0.7341772151898734
	TextBlob: 0.43037974683544306
	Difference: 0.3037974683544304
4 : 281 samples
	Metaphones: 0.7864768683274022
	TextBlob: 0.4626334519572954
	Difference: 0.3238434163701068
5 : 318 samples
	Metaphones: 0.7421383647798742
	TextBlob: 0.4276729559748428
	Difference: 0.3144654088050314
6 : 217 samples
	Metaphones: 0.7695852534562212
	TextBlob: 0.4608294930875576
	Difference: 0.3087557603686636
7 : 208 samples
	Metaphones: 0.7548076923076923
	TextBlob: 0.5096153846153846
	Difference: 0.2451923076923077
8 : 90 samples
	Metaphones: 0.8555555555555555
	TextBlob: 0.45555555555555555
	Difference: 0.39999999999999997
9 : 77 samples
	Metaphones: 0.9090909090909091
	TextBlob: 0.37662337662337664
	Difference: 0.5324675324675324
10 : 20 samples
	Metaphones: 0.75
	TextBlob: 0.4
	Difference: 0.35


In [181]:
ks[ks["Target"].str.len() == 8]

Unnamed: 0,Target,Spelling,Level,Grade,Unnamed: 6,metaphone_suggestions,textblob_suggestions
0,favorite,favtit,early within word pattern,1,,"[favorite, favourite]",[fait]
5,villager,villajg,early within word pattern,1,,"[village, valuing, voltage]","[village, villain, villa, villas]"
23,exploded,xspladed,early within word pattern,1,,[suspended],[xspladed]
32,probably,probly,early within word pattern,1,,"[probably, problem, proudly, probity, poorly]","[probably, problem, probe, proudly, portly]"
47,infinite,infinit,middle within word pattern,2,,"[infant, infinite, infante, infinity, infinitive]","[infinite, infinity]"
65,computer,cumpiter,early within word pattern,2,,"[computer, sumpter, compiler, computers, compo...","[computer, jupiter]"
74,screamed,scrind,early within word pattern,2,,"[screened, scorned, scoring, scaring, screed]","[cried, spring, string, shrine, strand]"
146,favorite,favurite,middle letter name alphabetic,2,,"[favorite, favourite, favourites, favorites, f...","[favorite, favourite]"
220,lollipop,lolipop,middle letter name alphabetic,2,,"[lollipop, lollipops, lilliput]",[lolipop]
276,question,quasion,middle within word pattern,3,,"[equation, question, fusion, quotation, tuition]","[question, fusion, quasi, equation]"


In [196]:
def averageLen(lst):
    lengths = [len(i) for i in lst]
    return 0 if len(lengths) == 0 else (float(sum(lengths)) / len(lengths)) 

In [203]:
len(ks["metaphone_suggestions"].tolist()[1])

5

In [204]:
lengths = [len(i) for i in ks["metaphone_suggestions"].tolist()]

In [221]:
averageLen(ks["metaphone_suggestions"].tolist())

4.924889543446245

In [207]:
averageLen(ks["textblob_suggestions"].tolist())

3.0537555228276876