In [1]:
import re
import nltk
import csv
import pandas as pd
nltk.download('gutenberg')

corrupted_file = open('ausen-sense-corrupted.txt').read()
correct_file = nltk.corpus.gutenberg.raw('austen-sense.txt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/erictay1997/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
def tokenizer(textFile):
    words = re.findall(r"[\w']+|[\n-.\",!?:;\[\]]", textFile)
    tokens = []
    for word in words:
        foundContraction = re.search(r"(\w+)('ll|'LL|'re|'RE|'ve|'VE|n't|N'T|'s|'S|'d|'D|'m|'M|'a|')", word)
        if foundContraction:
            tokens.append(foundContraction.group(1))
            tokens.append(foundContraction.group(2))
        else:
            tokens.append(word)
    
    print("finished tokenizing")
    print(len(tokens))
    return tokens

In [3]:
corrupted_tokens = tokenizer(corrupted_file)
correct_tokens = tokenizer(correct_file)

finished tokenizing
267648
finished tokenizing
267710


In [4]:
# Make same length for alignment
for i in range(62):
    corrupted_tokens.append('0')

In [5]:
# These spaces are to handle misalignment issues
# This occurs when the corrupted file deletes words, which we cannot correct
counter = 0
for i in range(len(correct_tokens)):
    if corrupted_tokens[i] != correct_tokens[i]:
        if correct_tokens[i] == " ":
            counter += 1
            corrupted_tokens.insert(i, " ")
            corrupted_tokens.pop()
print(counter)

61


In [6]:
# There are 18929 tokens that are different/corrupted
df = pd.DataFrame({'corrupted' : corrupted_tokens, 'correct' : correct_tokens }, columns=['corrupted','correct'])
misaligned = df[df['corrupted'] != df['correct']]
len(misaligned)

18929

In [7]:
# Print for Sanity Check
# for i in range(len(misaligned)):
#     print(misaligned.iloc[i,])

In [8]:
all_words = open('dictionary.txt').read().split('\n')
# dictionary.txt is a txt file of all valid words taken from the link on Piazza
# https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt

In [9]:
df = pd.read_csv('unigram_freq.csv', index_col = False)
# unigram_freq.csv is a csv file from Kaggle, containing english words and word frequency
# Data is derived from the Google Web Trillion Word Corpus.
# https://www.kaggle.com/rtatman/english-word-frequency

In [10]:
df = df[df['word'].isin(all_words)] # Clean kaggle dataset

In [11]:
word_counter = {}
for i in range(len(df)):
    word_counter[df.iloc[i,0]] = df.iloc[i,1]

In [12]:
for word in all_words:
    if word not in word_counter:
        word_counter[word] = 1

In [13]:
def spell_corrector(word_list):
    return [correct(word) for word in word_list]

In [14]:
# Corrects word if it's alphanumeric
# Else, do nothing
# Do not correct valid words or numbers
# Capitalizes it accordingly
def correct(word):
    if not word.isalnum():
        return word
    if word.lower() in word_counter:
        return word
    if word.isdigit():
        return word
    corrected_word = best_candidate(word.lower())
    if word.isupper():
        return corrected_word.upper()
    if word[0].isupper():
        return corrected_word[0].upper() + corrected_word[1:]
    return corrected_word

In [15]:
# Returns best candidate for a given word
# Prioritizes lower Levenshtein distance, and then word frequency
# If there are no words with Levenshtein distance ≤ 2 in word_counter, return the word itself
# We return the word because increasing the max distance would tend to correct proper nouns, etc.
def best_candidate(word):
    return (best_candidate_from_list(distance1(word)) or best_candidate_from_list(distance2(word)) or word)

In [16]:
# Returns best candidate from a list of words
# Weighted by word frequency
def best_candidate_from_list(words):
    count = -1
    candidate = None
    for w in words:
        if w in word_counter and word_counter[w] > count:
            count = word_counter[w]
            candidate = w
    return candidate

In [17]:
# Set of words of one Levenshtein distance from word
def distance1(word):
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    ret = set()
    for i in range(len(word)):
        ret.add(word[:i] + word[i+1:]) #Deletion
        for letter in letters:
            ret.add(word[:i] + letter + word[i:]) #Insertion
            ret.add(word[:i] + letter + word[i+1:]) #Substitution
    return ret

In [18]:
# Set of words of two Levenshtein distance from word
def distance2(word): 
    ret = set()
    for distance1_word in distance1(word):
        ret.update(distance1(distance1_word))
    return ret

In [19]:
# These are the tokens are are different
corrupted_tokens_inaccurate = []
correct_tokens_shortlist = []

In [20]:
for i in range(len(corrupted_tokens)):
    if corrupted_tokens[i] != correct_tokens[i]:
        corrupted_tokens_inaccurate.append(corrupted_tokens[i])
        correct_tokens_shortlist.append(correct_tokens[i])

In [21]:
corrected_tokens = spell_corrector(corrupted_tokens_inaccurate)

In [22]:
counter = 0
for i in range(len(correct_tokens_shortlist)):
    if correct_tokens_shortlist[i] == corrected_tokens[i]:
        counter += 1

In [23]:
# We correct 63% of tokens that were corrupted
counter/len(correct_tokens_shortlist)

0.6259707327381266

In [24]:
# 7080 tokens are still different
df = pd.DataFrame({'corrupted' : corrected_tokens, 'correct' : correct_tokens_shortlist }, columns=['corrupted','correct'])
misaligned = df[df['corrupted'] != df['correct']]
len(misaligned)

7080

In [25]:
# Our spell corrector works, but either the corrupted word is a valid word
# Or we correct to another word, not the correct word
misaligned.head()

Unnamed: 0,corrupted,correct
1,i,in
5,stat,estate
7,nd,and
15,te,the
18,o,of


In [26]:
corrected_tokens_all = spell_corrector(corrupted_tokens)

In [27]:
# 7783 are different, so our spell corrector doesn't change too many 'correct' words
df = pd.DataFrame({'corrupted' : corrected_tokens_all, 'correct' : correct_tokens }, columns=['corrupted','correct'])
misaligned = df[df['corrupted'] != df['correct']]
len(misaligned)

7783

In [28]:
# There are 267710 tokens altogether, of which 7833 are different
len(corrected_tokens_all)

267710

In [29]:
def detokenizer(file_name):
    f = open(file_name, "w")
    for token in corrected_tokens_all:
        f.write(token)
    f.close()

In [30]:
detokenizer("Spell_Corrector_corrected_file.txt")

In [42]:
f = open("Spell_Corrector_corrected_file.txt", "r")
lines = f.readlines()
print("".join(lines[:10]))
f.close()

[Sense and Sensibility by Jane Austin 1811]

CHAPTER 1


The family of Basswood had long been settled i Sussex.
Their estate was large, and their residence was at Norland Park,
in the centre of their property, where, for many generations,
they had lived in so respectable a manner as to engage
the general good opinion of their surrounding acquaintance.



In [38]:
print(correct_file[:355])

[Sense and Sensibility by Jane Austen 1811]

CHAPTER 1


The family of Dashwood had long been settled in Sussex.
Their estate was large, and their residence was at Norland Park,
in the centre of their property, where, for many generations,
they had lived in so respectable a manner as to engage
the general good opinion of their surrounding acquaintance.

