In [42]:
import re
import nltk
import pandas as pd
nltk.download('gutenberg')

corrupted_file = open('ausen-sense-corrupted.txt').read()
correct_file = nltk.corpus.gutenberg.raw('austen-sense.txt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/erictay1997/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
def tokenizer(textFile):
    words = re.findall(r"[\w']+|[\n-.\",!?:;]", textFile)
    tokens = []
    for word in words:
        foundContraction = re.search(r"(\w+)('ll|'LL|'re|'RE|'ve|'VE|n't|N'T|'s|'S|'d|'D|'m|'M|'a|')", word)
        if foundContraction:
            tokens.append(foundContraction.group(1))
            tokens.append(foundContraction.group(2))
        else:
            tokens.append(word)
    
    print("finished tokenizing")
    print(len(tokens))
    return tokens

In [3]:
corrupted_tokens = tokenizer(corrupted_file)
correct_tokens = tokenizer(correct_file)

finished tokenizing
267642
finished tokenizing
267704


In [4]:
for i in range(62):
    corrupted_tokens.append('0')

In [5]:
# These spaces are to handle misalignment issues
# This occurs when the corrupted file deletes words, which we cannot correct
counter = 0
for i in range(len(correct_tokens)):
    if corrupted_tokens[i] != correct_tokens[i]:
        if correct_tokens[i] == " ":
            counter += 1
            corrupted_tokens.insert(i, " ")
            corrupted_tokens.pop()
print(counter)

61


In [6]:
# There are 18929 tokens that are different
df = pd.DataFrame({'corrupted' : corrupted_tokens, 'correct' : correct_tokens }, columns=['corrupted','correct'])
misaligned = df[df['corrupted'] != df['correct']]
len(misaligned)

18929

In [7]:
# Print for Sanity Check
# for i in range(len(misaligned)):
#     print(misaligned.iloc[i,])

In [8]:
all_words = open('dictionary.txt').read().split('\n')
# dictionary.txt is a txt file of all valid words taken from the link on Piazza
# https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt

In [9]:
import pandas as pd
df = pd.read_csv('unigram_freq.csv', index_col = False)
# unigram_freq.csv is a csv file from Kaggle, containing english words and word frequency
# Data is derived from the Google Web Trillion Word Corpus.
# https://www.kaggle.com/rtatman/english-word-frequency

In [10]:
df = df[df['word'].isin(all_words)] # Clean kaggle dataset

In [11]:
word_counter = {}
for i in range(len(df)):
    word_counter[df.iloc[i,0]] = df.iloc[i,1]

In [12]:
for word in all_words:
    if word not in word_counter:
        word_counter[word] = 1

In [13]:
def spell_corrector(word_list):
    return [correct(word) for word in word_list]

In [14]:
# Corrects word if it's alphanumeric
# Else, do nothing
# Capitalizes it accordingly
def correct(word):
    if not word.isalnum():
        return word
    if word.lower() in word_counter:
        return word
    corrected_word = best_candidate(word.lower())
    if word.isupper():
        return corrected_word.upper()
    if word[0].isupper():
        return corrected_word[0].upper() + corrected_word[1:]
    return corrected_word

In [15]:
# Returns best candidate for a given word
# Prioritizes lower Levenshtein distance, and then word frequency
# If there are no words with Levenshtein distance ≤ 2 in word_counter, return the word itself
def best_candidate(word):
    return (best_candidate_from_list(distance1(word)) or best_candidate_from_list(distance2(word)) or word)

In [16]:
# Returns best candidate from a list of words
# Weighted by word frequency
def best_candidate_from_list(words):
    count = -1
    candidate = None
    for w in words:
        if w in word_counter and word_counter[w] > count:
            count = word_counter[w]
            candidate = w
    return candidate

In [17]:
# Set of words of one Levenshtein distance from word
def distance1(word):
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    ret = set()
    for i in range(len(word)):
        ret.add(word[:i] + word[i+1:]) #Deletion
        for letter in letters:
            ret.add(word[:i] + letter + word[i:]) #Insertion
            ret.add(word[:i] + letter + word[i+1:]) #Substitution
    return ret

In [18]:
# Set of words of two Levenshtein distance from word
def distance2(word): 
    ret = set()
    for distance1_word in distance1(word):
        ret.update(distance1(distance1_word))
    return ret

In [20]:
# These are the tokens are are different
corrupted_tokens_inaccurate = []
correct_tokens_shortlist = []

In [21]:
for i in range(len(corrupted_tokens)):
    if corrupted_tokens[i] != correct_tokens[i]:
        corrupted_tokens_inaccurate.append(corrupted_tokens[i])
        correct_tokens_shortlist.append(correct_tokens[i])

In [22]:
corrected_tokens = spell_corrector(corrupted_tokens_inaccurate)

In [47]:
counter = 0
for i in range(len(correct_tokens_shortlist)):
    if correct_tokens_shortlist[i] == corrected_tokens[i]:
        counter += 1

In [48]:
# We correct 63% of tokens
counter/len(correct_tokens_shortlist)

0.6260235617306777

In [27]:
# 7079 tokens are still different
df = pd.DataFrame({'corrupted' : corrected_tokens, 'correct' : correct_tokens_shortlist }, columns=['corrupted','correct'])
misaligned = df[df['corrupted'] != df['correct']]
len(misaligned)

7079

In [28]:
for i in range(len(misaligned)):
    print(misaligned.iloc[i,])

corrupted     i
correct      in
Name: 1, dtype: object
corrupted      stat
correct      estate
Name: 5, dtype: object
corrupted     nd
correct      and
Name: 7, dtype: object
corrupted     te
correct      the
Name: 15, dtype: object
corrupted     o
correct      of
Name: 18, dtype: object
corrupted     das
correct      days
Name: 22, dtype: object
corrupted     Diswood
correct      Dashwood
Name: 25, dtype: object
corrupted     for
correct      from
Name: 26, dtype: object
corrupted     hi
correct      him
Name: 28, dtype: object
corrupted     n
correct      to
Name: 31, dtype: object
corrupted    has
correct      his
Name: 33, dtype: object
corrupted     te
correct      the
Name: 35, dtype: object
corrupted     ms
correct      his
Name: 37, dtype: object
corrupted     herefore
correct      therefore
Name: 40, dtype: object
corrupted    forsung
correct      fortune
Name: 43, dtype: object
corrupted    right
correct      might
Name: 44, dtype: object
corrupted     bt
correct      but
Nam

Name: 489, dtype: object
corrupted     taa
correct      than
Name: 494, dtype: object
corrupted    hoey
correct      They
Name: 496, dtype: object
corrupted      
correct      no
Name: 499, dtype: object
corrupted    haze
correct      have
Name: 500, dtype: object
corrupted     an
correct      any
Name: 502, dtype: object
corrupted     the
correct      they
Name: 503, dtype: object
corrupted     hem
correct      them
Name: 506, dtype: object
corrupted     mor
correct      more
Name: 507, dtype: object
corrupted     gie
correct      give
Name: 510, dtype: object
corrupted    ara
correct      are
Name: 512, dtype: object
corrupted     noting
correct      nothing
Name: 514, dtype: object
corrupted    mn
correct      me
Name: 516, dtype: object
corrupted     really
correct      readily
Name: 521, dtype: object
corrupted     even
correct      given
Name: 522, dtype: object
corrupted     e
correct      be
Name: 523, dtype: object
corrupted     Ms
correct      Mrs
Name: 524, dtype: object
cor

Name: 838, dtype: object
corrupted     hm
correct      him
Name: 839, dtype: object
corrupted     worry
correct      worthy
Name: 840, dtype: object
corrupted     ure
correct      sure
Name: 841, dtype: object
corrupted     minor
correct      Elinor
Name: 846, dtype: object
corrupted    worts
correct      worth
Name: 852, dtype: object
corrupted     del
correct      deal
Name: 856, dtype: object
corrupted    been
correct      seen
Name: 860, dtype: object
corrupted    YI
correct       I
Name: 862, dtype: object
corrupted     Is
correct      His
Name: 865, dtype: object
corrupted     no
correct      not
Name: 866, dtype: object
corrupted     o
correct      of
Name: 868, dtype: object
corrupted    this
correct       his
Name: 869, dtype: object
corrupted    hia
correct      him
Name: 872, dtype: object
corrupted     ad
correct      and
Name: 882, dtype: object
corrupted     women
correct      moment
Name: 884, dtype: object
corrupted    ho
correct      to
Name: 885, dtype: object
corrupt

Name: 1393, dtype: object
corrupted     seems
correct      seemed
Name: 1396, dtype: object
corrupted    ge
correct      be
Name: 1399, dtype: object
corrupted     o
correct      so
Name: 1402, dtype: object
corrupted     thy
correct      they
Name: 1404, dtype: object
corrupted     preset
correct      present
Name: 1413, dtype: object
corrupted    penguin
correct      sending
Name: 1416, dtype: object
corrupted     iddleton
correct      Middleton
Name: 1417, dtype: object
corrupted    re
correct      be
Name: 1419, dtype: object
corrupted    ob
correct      of
Name: 1427, dtype: object
corrupted     hr
correct      her
Name: 1431, dtype: object
corrupted      went
correct      twenty
Name: 1435, dtype: object
corrupted    hall
correct      tall
Name: 1436, dtype: object
corrupted     f
correct      of
Name: 1438, dtype: object
corrupted     y
correct      by
Name: 1441, dtype: object
corrupted      othin
correct      nothing
Name: 1444, dtype: object
corrupted    verby
correct       v

corrupted     other
correct      mother
Name: 1971, dtype: object
corrupted     i
correct      in
Name: 1972, dtype: object
corrupted     hile
correct      while
Name: 1973, dtype: object
corrupted     a
correct      an
Name: 1974, dtype: object
corrupted      event
correct      evident
Name: 1975, dtype: object
corrupted     chic
correct      which
Name: 1976, dtype: object
corrupted    with
correct       its
Name: 1981, dtype: object
corrupted    grateful
correct      graceful
Name: 1982, dtype: object
corrupted    ani
correct      and
Name: 1985, dtype: object
corrupted     f
correct      of
Name: 1987, dtype: object
corrupted     Dashpot
correct      Dashwood
Name: 1988, dtype: object
corrupted     ct
correct      act
Name: 1991, dtype: object
corrupted     Be
correct      She
Name: 1994, dtype: object
corrupted     he
correct      her
Name: 1996, dtype: object
corrupted    Dashwoud
correct      Dashwood
Name: 1998, dtype: object
corrupted     wulk
correct      would
Name: 2001, dt

corrupted    paths
correct      pains
Name: 2374, dtype: object
corrupted     s
correct      is
Name: 2377, dtype: object
corrupted    has
correct      his
Name: 2379, dtype: object
corrupted     bein
correct      being
Name: 2383, dtype: object
corrupted    ba
correct       a
Name: 2385, dtype: object
corrupted    Middletone
correct       Middleton
Name: 2387, dtype: object
corrupted    Kennings
correct      Jennings
Name: 2388, dtype: object
corrupted      difference
correct      indifference
Name: 2390, dtype: object
corrupted     Lidderon
correct      Middleton
Name: 2393, dtype: object
corrupted     my
correct      may
Name: 2394, dtype: object
corrupted    nox
correct      not
Name: 2395, dtype: object
corrupted      ety
correct      forty
Name: 2399, dtype: object
corrupted     work
correct      world
Name: 2401, dtype: object
corrupted      re
correct      read
Name: 2402, dtype: object
corrupted      answer
correct      answered
Name: 2406, dtype: object
corrupted       troubl

corrupted     the
correct      they
Name: 2833, dtype: object
corrupted     belief
correct      believe
Name: 2834, dtype: object
corrupted    wis
correct       is
Name: 2837, dtype: object
corrupted     hs
correct      has
Name: 2838, dtype: object
corrupted     hir
correct      hair
Name: 2839, dtype: object
corrupted    ts
correct      is
Name: 2841, dtype: object
corrupted     op
correct      you
Name: 2842, dtype: object
corrupted     ama
correct      mama
Name: 2843, dtype: object
corrupted     wen
correct      went
Name: 2844, dtype: object
corrupted     wer
correct      were
Name: 2845, dtype: object
corrupted     f
correct      of
Name: 2848, dtype: object
corrupted    has
correct      her
Name: 2849, dtype: object
corrupted    folder
correct      folded
Name: 2851, dtype: object
corrupted     is
correct      his
Name: 2852, dtype: object
corrupted    way
correct      was
Name: 2858, dtype: object
corrupted     house
correct      course
Name: 2865, dtype: object
corrupted     

Name: 3207, dtype: object
corrupted     a
correct      as
Name: 3208, dtype: object
corrupted    ways
correct       was
Name: 3209, dtype: object
corrupted     nt
correct      not
Name: 3215, dtype: object
corrupted     aways
correct      always
Name: 3216, dtype: object
corrupted      prophet
correct      propriety
Name: 3217, dtype: object
corrupted     i
correct      in
Name: 3222, dtype: object
corrupted     wha
correct      what
Name: 3223, dtype: object
corrupted     e
correct      we
Name: 3225, dtype: object
corrupted    tould
correct      could
Name: 3227, dtype: object
corrupted     pleaser
correct      pleasure
Name: 3228, dtype: object
corrupted    pore
correct      more
Name: 3234, dtype: object
corrupted     n
correct      in
Name: 3236, dtype: object
corrupted    talking
correct      walking
Name: 3237, dtype: object
corrupted     he
correct      her
Name: 3238, dtype: object
corrupted    Willouvghby
correct       Willoughby
Name: 3239, dtype: object
corrupted    ia
corr

Name: 3919, dtype: object
corrupted    heer
correct       her
Name: 3922, dtype: object
corrupted      compose
correct      composure
Name: 3925, dtype: object
corrupted     even
correct      every
Name: 3929, dtype: object
corrupted     a
correct      at
Name: 3931, dtype: object
corrupted     th
correct      the
Name: 3941, dtype: object
corrupted     chef
correct      chief
Name: 3943, dtype: object
corrupted     one
correct      over
Name: 3946, dtype: object
corrupted     tat
correct      that
Name: 3952, dtype: object
corrupted      tolly
correct      totally
Name: 3956, dtype: object
corrupted    ver
correct      her
Name: 3957, dtype: object
corrupted    years
correct      tears
Name: 3958, dtype: object
corrupted     sorted
correct      courted
Name: 3960, dtype: object
corrupted      be
correct      been
Name: 3965, dtype: object
corrupted     ead
correct      read
Name: 3966, dtype: object
corrupted    nog
correct      not
Name: 3969, dtype: object
corrupted      dal
correct

Name: 4332, dtype: object
corrupted     so
correct      two
Name: 4335, dtype: object
corrupted     page
correct      large
Name: 4342, dtype: object
corrupted     alice
correct      apiece
Name: 4344, dtype: object
corrupted      animato
correct      animation
Name: 4345, dtype: object
corrupted    cheers
correct      cheeks
Name: 4346, dtype: object
corrupted    call
correct       all
Name: 4348, dtype: object
corrupted     e
correct      be
Name: 4355, dtype: object
corrupted     puzzle
correct      puzzled
Name: 4356, dtype: object
corrupted     minor
correct      Elinor
Name: 4358, dtype: object
corrupted     pint
correct      print
Name: 4365, dtype: object
corrupted    ships
correct      shops
Name: 4366, dtype: object
corrupted      Aswoon
correct      Dashwood
Name: 4368, dtype: object
corrupted     ive
correct      give
Name: 4369, dtype: object
corrupted     fo
correct      for
Name: 4371, dtype: object
corrupted     fo
correct      for
Name: 4372, dtype: object
corrupted   

corrupted     action
correct      actions
Name: 4800, dtype: object
corrupted     Wiloughby
correct      Willoughby
Name: 4801, dtype: object
corrupted     f
correct      of
Name: 4805, dtype: object
corrupted     displosion
correct      disposition
Name: 4806, dtype: object
corrupted      me
correct      same
Name: 4810, dtype: object
corrupted      app
correct      happy
Name: 4816, dtype: object
corrupted      in
correct      vain
Name: 4817, dtype: object
corrupted    the
correct       he
Name: 4824, dtype: object
corrupted    Dashzwood
correct       Dashwood
Name: 4825, dtype: object
corrupted     ny
correct      any
Name: 4829, dtype: object
corrupted      convenience
correct      inconvenience
Name: 4833, dtype: object
corrupted      need
correct      indeed
Name: 4835, dtype: object
corrupted      sure
correct      assure
Name: 4839, dtype: object
corrupted     bee
correct      been
Name: 4841, dtype: object
corrupted     necessar
correct      necessary
Name: 4842, dtype: objec

Name: 5356, dtype: object
corrupted       sued
correct      succeed
Name: 5357, dtype: object
corrupted    this
correct       his
Name: 5358, dtype: object
corrupted    Dashworod
correct       Dashwood
Name: 5361, dtype: object
corrupted     ave
correct      have
Name: 5362, dtype: object
corrupted     you
correct      your
Name: 5364, dtype: object
corrupted    aid
correct      and
Name: 5366, dtype: object
corrupted    td
correct      to
Name: 5367, dtype: object
corrupted    oer
correct      her
Name: 5368, dtype: object
corrupted     dn
correct      don
Name: 5369, dtype: object
corrupted     ave
correct      have
Name: 5370, dtype: object
corrupted     mus
correct      must
Name: 5373, dtype: object
corrupted     are
correct      sure
Name: 5374, dtype: object
corrupted     yo
correct      you
Name: 5375, dtype: object
corrupted     f
correct      of
Name: 5376, dtype: object
corrupted    Westerns
correct       Westons
Name: 5378, dtype: object
corrupted      gnu
correct      goin

corrupted     ad
correct      and
Name: 5731, dtype: object
corrupted     r
correct      or
Name: 5738, dtype: object
corrupted     throw
correct      thrown
Name: 5739, dtype: object
corrupted    play
correct       pay
Name: 5742, dtype: object
corrupted    other
correct      their
Name: 5743, dtype: object
corrupted     f
correct      of
Name: 5745, dtype: object
corrupted     paise
correct      praise
Name: 5746, dtype: object
corrupted    in
correct      is
Name: 5749, dtype: object
corrupted     hr
correct      her
Name: 5751, dtype: object
corrupted     re
correct      are
Name: 5753, dtype: object
corrupted    Smiddletojn
correct        Middleton
Name: 5758, dtype: object
corrupted    tracks
correct      tricks
Name: 5760, dtype: object
corrupted    aah
correct      and
Name: 5769, dtype: object
corrupted     o
correct      so
Name: 5774, dtype: object
corrupted    guttle
correct      little
Name: 5783, dtype: object
corrupted     o
correct      of
Name: 5787, dtype: object
corr

Name: 6185, dtype: object
corrupted     he
correct      the
Name: 6186, dtype: object
corrupted    in
correct      if
Name: 6190, dtype: object
corrupted     elt
correct      felt
Name: 6191, dtype: object
corrupted    did
correct      and
Name: 6194, dtype: object
corrupted     abut
correct      about
Name: 6195, dtype: object
corrupted    Ferrara
correct      Ferrars
Name: 6196, dtype: object
corrupted    io
correct      do
Name: 6199, dtype: object
corrupted    books
correct      looks
Name: 6201, dtype: object
corrupted      up
correct      upon
Name: 6202, dtype: object
corrupted    rew
correct      few
Name: 6205, dtype: object
corrupted      remain
correct      remained
Name: 6206, dtype: object
corrupted    silsegnft
correct         silent
Name: 6207, dtype: object
corrupted    ay
correct      at
Name: 6209, dtype: object
corrupted     hocked
correct      shocked
Name: 6215, dtype: object
corrupted    nob
correct      not
Name: 6216, dtype: object
corrupted    Pirate
correct   

corrupted     o
correct      to
Name: 6611, dtype: object
corrupted     afer
correct      after
Name: 6613, dtype: object
corrupted    jon
correct      one
Name: 6618, dtype: object
corrupted     obstacle
correct      obstacles
Name: 6620, dtype: object
corrupted     
correct      a
Name: 6623, dtype: object
corrupted     who
correct      whom
Name: 6626, dtype: object
corrupted     ad
correct      and
Name: 6629, dtype: object
corrupted      dishes
correct      distress
Name: 6633, dtype: object
corrupted    tp
correct      to
Name: 6634, dtype: object
corrupted    bs
correct      be
Name: 6636, dtype: object
corrupted     phil
correct      while
Name: 6644, dtype: object
corrupted    bt
correct      be
Name: 6647, dtype: object
corrupted    rest
correct      felt
Name: 6648, dtype: object
corrupted    kit
correct       it
Name: 6650, dtype: object
corrupted     Lcd
correct      Lucy
Name: 6655, dtype: object
corrupted     Award
correct      Edward
Name: 6656, dtype: object
corrupted 

Name: 7020, dtype: object
corrupted    ce
correct      he
Name: 7021, dtype: object
corrupted      Marine
correct      Marianne
Name: 7025, dtype: object
corrupted     f
correct      of
Name: 7026, dtype: object
corrupted     NO
correct      NOT
Name: 7028, dtype: object
corrupted    lit
correct       it
Name: 7030, dtype: object
corrupted     Mss
correct      Miss
Name: 7032, dtype: object
corrupted      uc
correct      Lucy
Name: 7037, dtype: object
corrupted     looing
correct      looking
Name: 7041, dtype: object
corrupted     Eliot
correct      Elinor
Name: 7042, dtype: object
corrupted     going
correct      giving
Name: 7049, dtype: object
corrupted     scene
correct      scheme
Name: 7050, dtype: object
corrupted    aor
correct      for
Name: 7052, dtype: object
corrupted     o
correct      to
Name: 7053, dtype: object
corrupted    ff
correct      of
Name: 7054, dtype: object
corrupted     y
correct      my
Name: 7058, dtype: object
corrupted     us
correct      use
Name: 7060

Name: 7637, dtype: object
corrupted     ge
correct      get
Name: 7639, dtype: object
corrupted    lor
correct      for
Name: 7640, dtype: object
corrupted    flatter
correct      flutter
Name: 7645, dtype: object
corrupted       ring
correct      drawing
Name: 7648, dtype: object
corrupted     stair
correct      stairs
Name: 7656, dtype: object
corrupted     ll
correct      all
Name: 7658, dtype: object
corrupted    dand
correct       and
Name: 7667, dtype: object
corrupted     a
correct      at
Name: 7668, dtype: object
corrupted      hard
correct      hardly
Name: 7675, dtype: object
corrupted     wab
correct      what
Name: 7677, dtype: object
corrupted    soe
correct      she
Name: 7678, dtype: object
corrupted    seeking
correct       seeing
Name: 7689, dtype: object
corrupted    AIn
correct       In
Name: 7695, dtype: object
corrupted     tis
correct      this
Name: 7696, dtype: object
corrupted    put
correct      but
Name: 7703, dtype: object
corrupted    wab
correct      was


corrupted    fer
correct      her
Name: 8299, dtype: object
corrupted      e
correct      the
Name: 8303, dtype: object
corrupted     day
correct      days
Name: 8308, dtype: object
corrupted    the
correct      she
Name: 8309, dtype: object
corrupted    Willoughbby
correct       Willoughby
Name: 8310, dtype: object
corrupted    tume
correct      time
Name: 8311, dtype: object
corrupted    partly
correct       party
Name: 8313, dtype: object
corrupted    hone
correct      hope
Name: 8318, dtype: object
corrupted     Sh
correct      She
Name: 8319, dtype: object
corrupted      string
correct      stirring
Name: 8320, dtype: object
corrupted    sebat
correct       seat
Name: 8321, dtype: object
corrupted     thing
correct      string
Name: 8330, dtype: object
corrupted      all
correct      allow
Name: 8332, dtype: object
corrupted     ha
correct      had
Name: 8338, dtype: object
corrupted    tribune
correct      tribute
Name: 8339, dtype: object
corrupted     an
correct      and
Name: 

corrupted     relead
correct      release
Name: 8753, dtype: object
corrupted     o
correct      of
Name: 8756, dtype: object
corrupted    lino
correct      line
Name: 8759, dtype: object
corrupted    in
correct      it
Name: 8763, dtype: object
corrupted     main
correct      again
Name: 8764, dtype: object
corrupted     he
correct      her
Name: 8767, dtype: object
corrupted     true
correct      trust
Name: 8768, dtype: object
corrupted     s
correct      as
Name: 8770, dtype: object
corrupted    ale
correct      all
Name: 8772, dtype: object
corrupted     I
correct      In
Name: 8775, dtype: object
corrupted     hat
correct      that
Name: 8777, dtype: object
corrupted    give
correct      gave
Name: 8781, dtype: object
corrupted      ergo
correct      forgot
Name: 8783, dtype: object
corrupted     hat
correct      that
Name: 8789, dtype: object
corrupted    windrow
correct       window
Name: 8791, dtype: object
corrupted     rs
correct      Mrs
Name: 8793, dtype: object
corrupted 

corrupted     cam
correct      came
Name: 9308, dtype: object
corrupted     
correct      I
Name: 9310, dtype: object
corrupted    hid
correct      had
Name: 9311, dtype: object
corrupted    wa
correct       a
Name: 9312, dtype: object
corrupted     ot
correct      not
Name: 9314, dtype: object
corrupted     fr
correct      for
Name: 9315, dtype: object
corrupted     t
correct      at
Name: 9319, dtype: object
corrupted     Lod
correct      Lord
Name: 9320, dtype: object
corrupted    hade
correct      have
Name: 9324, dtype: object
corrupted    Im
correct      It
Name: 9328, dtype: object
corrupted    al
correct      am
Name: 9329, dtype: object
corrupted    Xwilloughb
correct      Willoughby
Name: 9334, dtype: object
corrupted    ony
correct      any
Name: 9336, dtype: object
corrupted    bd
correct      be
Name: 9338, dtype: object
corrupted    itself
correct      myself
Name: 9339, dtype: object
corrupted    sparged
correct       spared
Name: 9342, dtype: object
corrupted     hat
co

corrupted    home
correct      some
Name: 9793, dtype: object
corrupted       friends
correct      friendship
Name: 9799, dtype: object
corrupted     an
correct      can
Name: 9800, dtype: object
corrupted    and
correct      any
Name: 9801, dtype: object
corrupted     lt
correct      let
Name: 9802, dtype: object
corrupted    small
correct      shall
Name: 9803, dtype: object
corrupted    weka
correct      idea
Name: 9807, dtype: object
corrupted    bill
correct      will
Name: 9811, dtype: object
corrupted    hame
correct      have
Name: 9814, dtype: object
corrupted     may
correct      make
Name: 9818, dtype: object
corrupted    anc
correct      any
Name: 9819, dtype: object
corrupted     on
correct      one
Name: 9821, dtype: object
corrupted     lay
correct      lady
Name: 9824, dtype: object
corrupted     b
correct      by
Name: 9829, dtype: object
corrupted      near
correct      nearly
Name: 9835, dtype: object
corrupted    us
correct      up
Name: 9841, dtype: object
corrupte

Name: 10437, dtype: object
corrupted     assembles
correct      assemblies
Name: 10438, dtype: object
corrupted     pat
correct      past
Name: 10447, dtype: object
corrupted    for
correct       or
Name: 10452, dtype: object
corrupted    pas
correct       as
Name: 10457, dtype: object
corrupted    nod
correct      not
Name: 10462, dtype: object
corrupted     Mis
correct      Miss
Name: 10466, dtype: object
corrupted    trek
correct      tree
Name: 10470, dtype: object
corrupted     thin
correct      think
Name: 10474, dtype: object
corrupted      Fears
correct      Ferrars
Name: 10476, dtype: object
corrupted    Willoughjy
correct      Willoughby
Name: 10478, dtype: object
corrupted       pain
correct      painful
Name: 10479, dtype: object
corrupted     s
correct      as
Name: 10485, dtype: object
corrupted     wa
correct      was
Name: 10486, dtype: object
corrupted     ade
correct      made
Name: 10491, dtype: object
corrupted    left
correct      less
Name: 10494, dtype: object
co

Name: 11008, dtype: object
corrupted     air
correct      girl
Name: 11010, dtype: object
corrupted     
correct      I
Name: 11012, dtype: object
corrupted     mary
correct      marry
Name: 11015, dtype: object
corrupted    ho
correct      do
Name: 11020, dtype: object
corrupted      Lino
correct      Elinor
Name: 11025, dtype: object
corrupted     hm
correct      him
Name: 11027, dtype: object
corrupted     tat
correct      that
Name: 11028, dtype: object
corrupted    uit
correct       it
Name: 11029, dtype: object
corrupted     e
correct      he
Name: 11031, dtype: object
corrupted     y
correct      by
Name: 11034, dtype: object
corrupted     hd
correct      had
Name: 11037, dtype: object
corrupted     haven
correct      having
Name: 11039, dtype: object
corrupted     Endings
correct      Jennings
Name: 11043, dtype: object
corrupted     mean
correct      means
Name: 11046, dtype: object
corrupted    abt
correct       at
Name: 11048, dtype: object
corrupted    Air
correct      Sir


Name: 11533, dtype: object
corrupted      obstet
correct      obstacle
Name: 11536, dtype: object
corrupted     pon
correct      upon
Name: 11538, dtype: object
corrupted     sh
correct      she
Name: 11541, dtype: object
corrupted     creole
correct      rejoice
Name: 11543, dtype: object
corrupted    Are
correct      She
Name: 11546, dtype: object
corrupted     shirts
correct      spirits
Name: 11548, dtype: object
corrupted    mobility
correct      civility
Name: 11549, dtype: object
corrupted     hat
correct      that
Name: 11556, dtype: object
corrupted    Mxiddleton
correct       Middleton
Name: 11561, dtype: object
corrupted     zone
correct      alone
Name: 11562, dtype: object
corrupted    home
correct       how
Name: 11563, dtype: object
corrupted     wa
correct      was
Name: 11565, dtype: object
corrupted     luck
correct      lucky
Name: 11566, dtype: object
corrupted     fo
correct      for
Name: 11567, dtype: object
corrupted    Mors
correct       Mrs
Name: 11568, dtype:

Name: 12111, dtype: object
corrupted     an
correct      and
Name: 12113, dtype: object
corrupted     ame
correct      name
Name: 12114, dtype: object
corrupted     M
correct      Mr
Name: 12115, dtype: object
corrupted     work
correct      words
Name: 12117, dtype: object
corrupted     ka
correct      was
Name: 12118, dtype: object
corrupted    ae
correct      be
Name: 12119, dtype: object
corrupted     in
correct      own
Name: 12121, dtype: object
corrupted     o
correct      of
Name: 12122, dtype: object
corrupted     '
correct      's
Name: 12123, dtype: object
corrupted     short
correct      stroke
Name: 12125, dtype: object
corrupted    aid
correct      and
Name: 12126, dtype: object
corrupted      site
correct      sister
Name: 12127, dtype: object
corrupted    rf
correct      of
Name: 12132, dtype: object
corrupted     corse
correct      course
Name: 12138, dtype: object
corrupted     te
correct      the
Name: 12139, dtype: object
corrupted     believe
correct      believed


Name: 12670, dtype: object
corrupted     over
correct      never
Name: 12673, dtype: object
corrupted     For
correct      Four
Name: 12676, dtype: object
corrupted     wa
correct      was
Name: 12680, dtype: object
corrupted     for
correct      from
Name: 12681, dtype: object
corrupted    vittle
correct      little
Name: 12682, dtype: object
corrupted    remotion
correct       emotion
Name: 12683, dtype: object
corrupted     y
correct      my
Name: 12685, dtype: object
corrupted      suff
correct      suffer
Name: 12686, dtype: object
corrupted      upper
correct      support
Name: 12689, dtype: object
corrupted     nt
correct      not
Name: 12690, dtype: object
corrupted     o
correct      of
Name: 12694, dtype: object
corrupted    has
correct      his
Name: 12698, dtype: object
corrupted    cense
correct      sense
Name: 12699, dtype: object
corrupted    thinkling
correct       thinking
Name: 12711, dtype: object
corrupted     o
correct      so
Name: 12712, dtype: object
corrupted 

corrupted    Wioloughbys
correct      Willoughbys
Name: 13156, dtype: object
corrupted     Siege
correct      Steele
Name: 13162, dtype: object
corrupted    Kennings
correct      Jennings
Name: 13166, dtype: object
corrupted     on
correct      own
Name: 13167, dtype: object
corrupted     hort
correct      short
Name: 13168, dtype: object
corrupted      in
correct      join
Name: 13169, dtype: object
corrupted     Jennies
correct      Jennings
Name: 13171, dtype: object
corrupted      thing
correct      nothing
Name: 13175, dtype: object
corrupted     sad
correct      said
Name: 13178, dtype: object
corrupted     Jennies
correct      Jennings
Name: 13183, dtype: object
corrupted    in
correct      it
Name: 13184, dtype: object
corrupted    Ps
correct      Is
Name: 13185, dtype: object
corrupted     belief
correct      believe
Name: 13187, dtype: object
corrupted    ys
correct      is
Name: 13188, dtype: object
corrupted    Ziddleton
correct      Middleton
Name: 13189, dtype: object
cor

Name: 13627, dtype: object
corrupted    Somersjtshire
correct      Somersetshire
Name: 13634, dtype: object
corrupted    Flavor
correct      Elinor
Name: 13637, dtype: object
corrupted    be
correct      me
Name: 13638, dtype: object
corrupted     o
correct      on
Name: 13640, dtype: object
corrupted     dea
correct      dear
Name: 13645, dtype: object
corrupted     the
correct      than
Name: 13652, dtype: object
corrupted    faq
correct      for
Name: 13653, dtype: object
corrupted    kit
correct       it
Name: 13654, dtype: object
corrupted      els
correct      evils
Name: 13655, dtype: object
corrupted      start
correct      started
Name: 13656, dtype: object
corrupted    faq
correct      far
Name: 13657, dtype: object
corrupted      press
correct      pressed
Name: 13661, dtype: object
corrupted    statesmen
correct      statement
Name: 13670, dtype: object
corrupted     are
correct      from
Name: 13672, dtype: object
corrupted     Denning
correct      Jennings
Name: 13675, dt

Name: 14206, dtype: object
corrupted     th
correct      the
Name: 14207, dtype: object
corrupted     any
correct      away
Name: 14211, dtype: object
corrupted      he
correct      head
Name: 14212, dtype: object
corrupted      Clone
correct      Colonel
Name: 14214, dtype: object
corrupted     lodge
correct      lodges
Name: 14215, dtype: object
corrupted    win
correct       in
Name: 14216, dtype: object
corrupted     
correct      I
Name: 14220, dtype: object
corrupted     ot
correct      not
Name: 14222, dtype: object
corrupted     Minor
correct      Elinor
Name: 14225, dtype: object
corrupted    upturn
correct      return
Name: 14230, dtype: object
corrupted     we
correct      see
Name: 14232, dtype: object
corrupted    tout
correct       out
Name: 14235, dtype: object
corrupted     Ad
correct      And
Name: 14236, dtype: object
corrupted     for
correct      from
Name: 14243, dtype: object
corrupted     o
correct      of
Name: 14246, dtype: object
corrupted     se
correct      

corrupted    file
correct      five
Name: 14844, dtype: object
corrupted     awa
correct      away
Name: 14846, dtype: object
corrupted     Many
correct      Magna
Name: 14850, dtype: object
corrupted     feeing
correct      feeling
Name: 14858, dtype: object
corrupted      ju
correct      just
Name: 14862, dtype: object
corrupted     n
correct      in
Name: 14863, dtype: object
corrupted     home
correct      house
Name: 14864, dtype: object
corrupted    din
correct       in
Name: 14867, dtype: object
corrupted     o
correct      to
Name: 14869, dtype: object
corrupted     hr
correct      her
Name: 14872, dtype: object
corrupted    ripped
correct      nipped
Name: 14873, dtype: object
corrupted    gn
correct      in
Name: 14878, dtype: object
corrupted     fund
correct      found
Name: 14880, dtype: object
corrupted    ind
correct      and
Name: 14881, dtype: object
corrupted      la
correct      plan
Name: 14882, dtype: object
corrupted     a
correct      at
Name: 14887, dtype: objec

Name: 15370, dtype: object
corrupted     ha
correct      had
Name: 15372, dtype: object
corrupted     een
correct      been
Name: 15373, dtype: object
corrupted     to
correct      too
Name: 15376, dtype: object
corrupted     nd
correct      and
Name: 15377, dtype: object
corrupted     five
correct      fever
Name: 15385, dtype: object
corrupted     ore
correct      more
Name: 15386, dtype: object
corrupted     ears
correct      fears
Name: 15390, dtype: object
corrupted     i
correct      it
Name: 15392, dtype: object
corrupted    hao
correct      had
Name: 15394, dtype: object
corrupted     tr
correct      try
Name: 15395, dtype: object
corrupted     coul
correct      could
Name: 15399, dtype: object
corrupted     bs
correct      was
Name: 15402, dtype: object
corrupted      scarcy
correct      scarcely
Name: 15405, dtype: object
corrupted      most
correct      utmost
Name: 15408, dtype: object
corrupted     o
correct      of
Name: 15410, dtype: object
corrupted    Mac
correct      

Name: 15879, dtype: object
corrupted     wit
correct      with
Name: 15880, dtype: object
corrupted    thad
correct       had
Name: 15883, dtype: object
corrupted     list
correct      visit
Name: 15884, dtype: object
corrupted    ir
correct      in
Name: 15885, dtype: object
corrupted       ford
correct      forgive
Name: 15889, dtype: object
corrupted     ende
correct      ended
Name: 15893, dtype: object
corrupted    boon
correct      soon
Name: 15894, dtype: object
corrupted     Mariana
correct      Marianne
Name: 15896, dtype: object
corrupted    alg
correct      all
Name: 15899, dtype: object
corrupted     o
correct      of
Name: 15901, dtype: object
corrupted      nasally
correct      naturally
Name: 15904, dtype: object
corrupted     o
correct      to
Name: 15910, dtype: object
corrupted     have
correct      heavy
Name: 15913, dtype: object
corrupted    ie
correct      me
Name: 15914, dtype: object
corrupted     i
correct      it
Name: 15920, dtype: object
corrupted     fet
co

Name: 16423, dtype: object
corrupted     tess
correct      these
Name: 16424, dtype: object
corrupted     red
correct      died
Name: 16428, dtype: object
corrupted     te
correct      the
Name: 16431, dtype: object
corrupted    bo
correct      to
Name: 16432, dtype: object
corrupted     ao
correct      ago
Name: 16434, dtype: object
corrupted    Willounghby
correct       Willoughby
Name: 16436, dtype: object
corrupted     exceed
correct      excited
Name: 16438, dtype: object
corrupted    make
correct      made
Name: 16439, dtype: object
corrupted    aes
correct       as
Name: 16442, dtype: object
corrupted     er
correct      her
Name: 16444, dtype: object
corrupted     hi
correct      his
Name: 16447, dtype: object
corrupted     he
correct      She
Name: 16449, dtype: object
corrupted    iq
correct      it
Name: 16455, dtype: object
corrupted     low
correct      love
Name: 16457, dtype: object
corrupted     t
correct      it
Name: 16458, dtype: object
corrupted     log
correct     

Name: 16989, dtype: object
corrupted      the
correct      there
Name: 16996, dtype: object
corrupted     posting
correct      pointing
Name: 16997, dtype: object
corrupted    protecting
correct      projecting
Name: 16999, dtype: object
corrupted     fist
correct      first
Name: 17002, dtype: object
corrupted     pay
correct      pain
Name: 17004, dtype: object
corrupted    nou
correct      now
Name: 17008, dtype: object
corrupted    ho
correct      to
Name: 17011, dtype: object
corrupted     A
correct      As
Name: 17012, dtype: object
corrupted     hav
correct      have
Name: 17013, dtype: object
corrupted     fr
correct      far
Name: 17014, dtype: object
corrupted     yo
correct      you
Name: 17016, dtype: object
corrupted     y
correct      my
Name: 17017, dtype: object
corrupted     e
correct      me
Name: 17020, dtype: object
corrupted     issued
correct      assured
Name: 17021, dtype: object
corrupted     e
correct      be
Name: 17025, dtype: object
corrupted    double
corr

Name: 17641, dtype: object
corrupted     sel
correct      self
Name: 17644, dtype: object
corrupted    hins
correct       his
Name: 17655, dtype: object
corrupted     ore
correct      more
Name: 17658, dtype: object
corrupted     luring
correct      lurking
Name: 17662, dtype: object
corrupted     hat
correct      That
Name: 17664, dtype: object
corrupted     marred
correct      married
Name: 17670, dtype: object
corrupted     Edgar
correct      Edward
Name: 17672, dtype: object
corrupted     moter
correct      mother
Name: 17674, dtype: object
corrupted     pace
correct      place
Name: 17676, dtype: object
corrupted     et
correct      yet
Name: 17679, dtype: object
corrupted     i
correct      in
Name: 17681, dtype: object
corrupted     biting
correct      uniting
Name: 17683, dtype: object
corrupted     o
correct      of
Name: 17691, dtype: object
corrupted    Kennings
correct      Jennings
Name: 17692, dtype: object
corrupted    ani
correct      and
Name: 17693, dtype: object
corr

Name: 18274, dtype: object
corrupted     honor
correct      honour
Name: 18281, dtype: object
corrupted       amid
correct      admired
Name: 18282, dtype: object
corrupted     yoursel
correct      yourself
Name: 18285, dtype: object
corrupted    HI
correct       I
Name: 18286, dtype: object
corrupted     pleaser
correct      pleasure
Name: 18294, dtype: object
corrupted    mat
correct       at
Name: 18295, dtype: object
corrupted     e
correct      be
Name: 18297, dtype: object
corrupted     mut
correct      must
Name: 18305, dtype: object
corrupted     elt
correct      felt
Name: 18307, dtype: object
corrupted    the
correct       he
Name: 18310, dtype: object
corrupted     any
correct      land
Name: 18313, dtype: object
corrupted    tithed
correct      tithes
Name: 18314, dtype: object
corrupted     herd
correct      heard
Name: 18317, dtype: object
corrupted     hey
correct      They
Name: 18321, dtype: object
corrupted     intimae
correct      intimate
Name: 18323, dtype: object


corrupted     event
correct      extent
Name: 18785, dtype: object
corrupted     eery
correct      every
Name: 18791, dtype: object
corrupted    tis
correct      his
Name: 18792, dtype: object
corrupted     o
correct      of
Name: 18794, dtype: object
corrupted       upped
correct      supposed
Name: 18797, dtype: object
corrupted     minor
correct      Elinor
Name: 18800, dtype: object
corrupted      tells
correct      useless
Name: 18804, dtype: object
corrupted     the
correct      than
Name: 18805, dtype: object
corrupted     had
correct      half
Name: 18806, dtype: object
corrupted    colonel
correct      Colonel
Name: 18810, dtype: object
corrupted    other
correct        her
Name: 18814, dtype: object
corrupted      thing
correct      nothing
Name: 18815, dtype: object
corrupted    co
correct      to
Name: 18816, dtype: object
corrupted     sealed
correct      settled
Name: 18819, dtype: object
corrupted     obligation
correct      obligations
Name: 18823, dtype: object
corrupt

In [38]:
corrected_tokens_all = spell_corrector(corrupted_tokens)

In [39]:
# 7833 are different, so our spell corrector doesn't change too many 'correct' words
df = pd.DataFrame({'corrupted' : corrected_tokens_all, 'correct' : correct_tokens }, columns=['corrupted','correct'])
misaligned = df[df['corrupted'] != df['correct']]
len(misaligned)

7833

In [49]:
# There are 267704 tokens altogether, of which 7833 are different
len(corrected_tokens_all)

267704

In [43]:
df = pd.DataFrame(corrected_tokens_all, columns=["corrected_tokens"])
df.to_csv('corrected_tokens.csv', index=False)

In [None]:
# Time to detokenize and see output!